@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  11. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  12. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  13. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  14. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  15. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  22. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  23. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  24. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  28. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  29. package/dist/_vendor/ailf-core/services/index.js +5 -0
  30. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  31. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  32. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  33. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  34. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  35. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  36. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  37. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  38. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  39. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  40. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  41. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  42. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  43. package/dist/_vendor/ailf-core/types/index.js +15 -1
  44. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  45. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  47. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  48. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  49. package/dist/adapters/api-client/build-request.d.ts +1 -0
  50. package/dist/adapters/api-client/build-request.js +3 -0
  51. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  52. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  53. package/dist/adapters/attribution/index.d.ts +9 -0
  54. package/dist/adapters/attribution/index.js +8 -0
  55. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  56. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  57. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  58. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  59. package/dist/adapters/grader-outputs/index.js +8 -0
  60. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  61. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  62. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  63. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  64. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  65. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  66. package/dist/adapters/index.d.ts +3 -0
  67. package/dist/adapters/index.js +4 -0
  68. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  69. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  70. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  71. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  72. package/dist/commands/calculate-scores.js +1 -1
  73. package/dist/commands/explain-handler.js +1 -1
  74. package/dist/commands/lookup-doc.d.ts +1 -1
  75. package/dist/commands/lookup-doc.js +3 -3
  76. package/dist/commands/pipeline-action.d.ts +6 -0
  77. package/dist/commands/pipeline-action.js +2 -0
  78. package/dist/commands/remote-pipeline.js +1 -0
  79. package/dist/composition-root.d.ts +36 -0
  80. package/dist/composition-root.js +48 -0
  81. package/dist/config/rubrics.ts +38 -2
  82. package/dist/grader/agent-harness.d.ts +14 -0
  83. package/dist/grader/agent-harness.js +17 -0
  84. package/dist/grader/common.d.ts +17 -0
  85. package/dist/grader/common.js +21 -0
  86. package/dist/grader/index.d.ts +38 -0
  87. package/dist/grader/index.js +75 -0
  88. package/dist/grader/knowledge-probe.d.ts +14 -0
  89. package/dist/grader/knowledge-probe.js +18 -0
  90. package/dist/grader/literacy.d.ts +13 -0
  91. package/dist/grader/literacy.js +17 -0
  92. package/dist/grader/mcp.d.ts +14 -0
  93. package/dist/grader/mcp.js +18 -0
  94. package/dist/orchestration/build-app-context.js +1 -0
  95. package/dist/orchestration/build-step-sequence.js +5 -0
  96. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  97. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  98. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  99. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  100. package/dist/orchestration/steps/index.d.ts +1 -0
  101. package/dist/orchestration/steps/index.js +1 -0
  102. package/dist/pipeline/attribution.d.ts +15 -0
  103. package/dist/pipeline/attribution.js +18 -9
  104. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  105. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  106. package/dist/pipeline/borderline-detector.d.ts +24 -0
  107. package/dist/pipeline/borderline-detector.js +26 -0
  108. package/dist/pipeline/calculate-scores.d.ts +114 -3
  109. package/dist/pipeline/calculate-scores.js +426 -24
  110. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  111. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  112. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  113. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  114. package/dist/pipeline/compute-attribution.d.ts +80 -0
  115. package/dist/pipeline/compute-attribution.js +196 -0
  116. package/dist/pipeline/failure-modes.d.ts +52 -17
  117. package/dist/pipeline/failure-modes.js +178 -117
  118. package/dist/pipeline/map-request-to-config.js +1 -0
  119. package/package.json +6 -4
@@ -29,9 +29,11 @@
29
29
  */
30
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
31
31
  import { join } from "path";
32
- import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
32
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
33
33
  import { calculateCost } from "../agent-observer/pricing.js";
34
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
+ import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
36
+ import { validateFailureMode } from "./failure-modes.js";
35
37
  import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
36
38
  import { checkResultsExist } from "./checks.js";
37
39
  import { loadRubricTemplates } from "./rubric-loader.js";
@@ -99,6 +101,54 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
99
101
  // ---------------------------------------------------------------------------
100
102
  // URL extraction from assertion metadata
101
103
  // ---------------------------------------------------------------------------
/**
 * Build the fallback judgment used when a grader's reason payload does not
 * pass the strict-schema parse.
 *
 * The caller-supplied identity (`taskId`, `modelId`, `dimension`), `score`
 * and `reason` are copied through unchanged. Every other field is a fixed
 * placeholder: empty `subJudgments` / `docCitations` /
 * `hallucinationCheckedAgainst` arrays, `failureMode: "unclassified"`, a
 * low-confidence stamp, and metadata with an "unknown" grader model.
 *
 * `judgmentId` comes from `generateJudgmentId`, which receives the identity
 * triple plus `runId` whenever a truthy `runId` is supplied.
 *
 * @param input - `{ taskId, modelId, dimension, score, reason,
 *   outputFailure, runId }`; `outputFailure` and `runId` are optional.
 * @returns A judgment object whose key order matches the parsed-judgment
 *   path so the serialized JSON stays stable.
 */
function synthesizeUnparsedJudgment(input) {
    const identity = {
        taskId: input.taskId,
        modelId: input.modelId,
        dimension: input.dimension,
    };
    // Only salt the id with runId when one was actually threaded through.
    const judgmentId = generateJudgmentId(input.runId ? { ...identity, runId: input.runId } : identity);
    // The flag is emitted only when set; a falsy value adds no key at all.
    const failureFlag = input.outputFailure ? { outputFailure: true } : {};
    // "synthesized-pre-cross-check" is a sentinel derivation tag:
    // validateGraderJudgmentsCalibration reads it to recognize parse-fallback
    // judgments before it overwrites `confidence` with its own stamp.
    const placeholderConfidence = {
        level: "low",
        signalsPresent: 0,
        derivation: "synthesized-pre-cross-check",
    };
    return {
        dimension: input.dimension,
        modelId: input.modelId,
        ...failureFlag,
        reason: input.reason,
        score: input.score,
        taskId: input.taskId,
        judgmentId,
        subJudgments: [],
        docCitations: [],
        failureMode: "unclassified",
        confidence: placeholderConfidence,
        hallucinationCheckedAgainst: [],
        metadata: {
            graderModel: "unknown",
            graderJudgmentsVersion,
        },
    };
}
102
152
  /**
103
153
  * Extract grader judgments (reason text + scores) from evaluation results.
104
154
  *
@@ -107,8 +157,15 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
107
157
  * assertion produces one GraderJudgment entry.
108
158
  *
109
159
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
160
+ *
161
+ * @param resultsPath - Path to the Promptfoo results JSON file.
162
+ * @param telemetry - Optional reliability counter (Plan 03-03). When passed,
163
+ * `parseFailures` is incremented on every strict-schema rejection so the
164
+ * live pipeline can surface schema drift over time.
165
+ * `runId` (when supplied) is threaded into synthesized fall-back judgment
166
+ * ids so dedup keys are unique per-run.
110
167
  */
111
- export function extractGraderJudgments(resultsPath) {
168
+ export function extractGraderJudgments(resultsPath, telemetry) {
112
169
  const results = readAndNormalizeResults(resultsPath);
113
170
  const judgments = [];
114
171
  for (const result of results) {
@@ -127,36 +184,242 @@ export function extractGraderJudgments(resultsPath) {
127
184
  continue;
128
185
  }
129
186
  const score = parseRubricScore(comp);
130
- // Extract the reason text — the grader's reasoning
131
- let reason = comp.reason ?? "";
132
- if (reason) {
133
- // Try to parse JSON reason to extract the reason field
187
+ // Extract the reason text — the grader's reasoning. Plan 03-01
188
+ // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
189
+ // this site is replaced with `GraderJudgmentSchema.safeParse`
190
+ // so that grader output flows through a validated schema before
191
+ // it enters the scoring pipeline. On parse failure we fall to
192
+ // an `unclassified`-shape Phase 1 judgment built from the raw
193
+ // reason string — NEVER fall back to the legacy parser (Pitfall
194
+ // 4: strict and legacy schemas are deliberate siblings, not a
195
+ // fall-through chain).
196
+ const reasonRaw = comp.reason ?? "";
197
+ let parsedJudgment = null;
198
+ let reason = reasonRaw;
199
+ if (reasonRaw) {
134
200
  try {
135
- const parsed = JSON.parse(reason);
136
- const obj = parsed;
137
- if (typeof obj.reason === "string") {
138
- ;
139
- ({ reason } = obj);
201
+ const candidate = JSON.parse(reasonRaw);
202
+ // The strict schema asserts the full GraderJudgment surface.
203
+ // Non-object JSON (number, string, null) is coerced to {} below so
204
+ // safeParse can reject it on the missing required fields.
205
+ const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
206
+ const result = GraderJudgmentSchema.safeParse({
207
+ ...candidateObj,
208
+ taskId,
209
+ modelId,
210
+ dimension: kind,
211
+ });
212
+ if (result.success) {
213
+ parsedJudgment = result.data;
214
+ reason = result.data.reason;
215
+ }
216
+ else {
217
+ // Parse failure — drop to failureMode='unclassified' below.
218
+ // Plan 03-03: increment graderReliability.parseFailures so the
219
+ // live pipeline surfaces schema drift over time. When no
220
+ // telemetry sink is passed, the increment is a no-op (preserves
221
+ // the pre-Plan-03-03 calling shape for any callers that don't
222
+ // care about reliability counters).
223
+ if (telemetry) {
224
+ telemetry.reliability.parseFailures =
225
+ (telemetry.reliability.parseFailures ?? 0) + 1;
226
+ }
227
+ reason = reasonRaw;
140
228
  }
141
229
  }
142
230
  catch {
143
- // Not JSON — use raw reason string
231
+ // Not JSON (legacy free-prose) — keep raw reason. The
232
+ // unclassified-fall-back path below covers this case.
144
233
  }
145
234
  }
146
235
  // Also flag synthesized api-error judgments as output failures
147
236
  const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
148
- judgments.push({
149
- dimension: kind,
150
- modelId,
151
- ...(outputFailure && { outputFailure: true }),
152
- reason,
153
- score,
154
- taskId,
155
- });
237
+ if (parsedJudgment !== null) {
238
+ // Strict-shape parse succeeded — preserve the parsed structured
239
+ // surface (additive GRAD-02 fields if the grader emitted them)
240
+ // and overlay pipeline-side semantics (outputFailure flag, the
241
+ // post-extraction score, and outer-context fields).
242
+ //
243
+ // The grader's emitted `score` from the JSON envelope is
244
+ // intentionally discarded — `parseRubricScore(comp)` (the
245
+ // component-level score Promptfoo computed) wins. Phase 3 keeps
246
+ // this asymmetry intentionally; the GRAD-06 cutover plan
247
+ // revisits whether the grader-emitted score should win or
248
+ // surface a tolerance-mismatch parseFailure variant.
249
+ judgments.push({
250
+ ...parsedJudgment,
251
+ dimension: kind,
252
+ modelId,
253
+ taskId,
254
+ score,
255
+ ...(outputFailure && { outputFailure: true }),
256
+ reason,
257
+ });
258
+ }
259
+ else {
260
+ // Phase 3 GRAD-05 fallback — strict-schema parse failed. The
261
+ // GRAD-02 surface is now required, so we synthesize sensible
262
+ // defaults that mark this judgment as unparsed:
263
+ // - failureMode: "unclassified" (per per-dimension taxonomy in
264
+ // packages/eval/src/grader/common.ts).
265
+ // - confidence: low / 0 signals / "synthesized-pre-cross-check"
266
+ // (D0049 — sentinel placeholder; overwritten by
267
+ // validateGraderJudgmentsCalibration with "ceiling-cross-check"
268
+ // along the live pipeline path. The placeholder is a distinct
269
+ // tag so the unparsed state is obvious if it leaks downstream).
270
+ // - hallucinationCheckedAgainst, subJudgments, docCitations:
271
+ // empty arrays.
272
+ // - judgmentId: deterministic taskId::modelId::dimension stamp
273
+ // so consumers can dedupe on it.
274
+ judgments.push(synthesizeUnparsedJudgment({
275
+ taskId,
276
+ modelId,
277
+ dimension: kind,
278
+ score,
279
+ reason,
280
+ outputFailure,
281
+ ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
282
+ }));
283
+ }
156
284
  }
157
285
  }
158
286
  return judgments;
159
287
  }
/**
 * Stamp each grader judgment with the confidence returned by
 * `validateFailureMode` and count calibration disagreements.
 *
 * For every judgment the function:
 *   1. resolves the feature area from `judgment.taskId`,
 *   2. looks up that area's `ceilingScore` / `floorScore` (defaulting to
 *      100 / 0 when the area or either value is missing),
 *   3. replaces `judgment.confidence` with the stamp from
 *      `validateFailureMode(judgment, ceilingScore, floorScore)`,
 *   4. increments `reliability.failureModeCalibration` when the stamp is
 *      `level === "medium"` with `signalsPresent === 2`, unless the
 *      judgment was a synthesized parse-fallback.
 *
 * Both `judgments` (each entry's `confidence`) and `reliability` are mutated
 * in place; nothing is returned. `hallucinationCheckedAgainst` is not
 * touched here.
 *
 * @param judgments - Grader judgments produced by `extractGraderJudgments`.
 * @param scores - Per-area feature scores, keyed here by their `feature`.
 * @param reliability - Sink whose `failureModeCalibration` counter is
 *   incremented on disagreement (treated as 0 when unset).
 *
 * @see docs/decisions/D0005-grader-model-separation.md
 * @see docs/decisions/D0049-shared-confidence-contract.md
 */
export function validateGraderJudgmentsCalibration(judgments, scores, reliability) {
    // Index the per-area scores by feature name.
    const areaScores = new Map();
    for (const entry of scores) {
        areaScores.set(entry.feature, entry);
    }
    // Prefer the detected feature area when it has a score entry; otherwise
    // take the first known area name that prefixes the task id.
    const resolveArea = (taskId) => {
        const detected = detectFeatureArea(taskId);
        if (detected !== "other" && areaScores.has(detected)) {
            return detected;
        }
        for (const name of areaScores.keys()) {
            if (taskId.startsWith(name)) {
                return name;
            }
        }
        return undefined;
    };
    for (const judgment of judgments) {
        const area = resolveArea(judgment.taskId);
        const matched = area ? areaScores.get(area) : undefined;
        const ceilingScore = matched?.ceilingScore ?? 100;
        const floorScore = matched?.floorScore ?? 0;
        // Capture provenance BEFORE the confidence is replaced: the
        // "synthesized-pre-cross-check" derivation is the only marker that
        // this judgment came from a parse failure, and the assignment just
        // below erases it.
        const cameFromParseFallback = judgment.confidence?.derivation === "synthesized-pre-cross-check";
        const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
        judgment.confidence = stamp;
        // A calibration miss needs both signals present and disagreeing
        // (medium level with two signals). A single-signal stamp has nothing
        // to cross-check against, so it is not counted.
        const signalsDisagree = stamp.level === "medium" && stamp.signalsPresent === 2;
        // Parse-fallback judgments carry a hard-coded "unclassified" mode;
        // counting them would report parse failures as calibration drift.
        // The derivation sentinel is used instead of the failureMode value
        // so a grader that genuinely emits "unclassified" is still counted.
        if (signalsDisagree && !cameFromParseFallback) {
            reliability.failureModeCalibration =
                (reliability.failureModeCalibration ?? 0) + 1;
        }
    }
}
379
+ /**
380
+ * Populate Pitfall #11 hallucination cross-check fields on grader
381
+ * judgments (Plan 03-04 GRAD-05).
382
+ *
383
+ * For each judgment, sets `hallucinationCheckedAgainst` to the union of
384
+ * (a) the slugs of docs the task declared in `context.docs` and (b) the
385
+ * run's full document manifest. For each entry in `judgment.docCitations`
386
+ * that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
387
+ * that does not appear in either set is a fabrication, not a real
388
+ * resolvable doc.
389
+ *
390
+ * Mutates `judgments` in place. Slug-less citations are left with
391
+ * `hallucinated` undefined since the lookup key is the slug per the
392
+ * GraderJudgment domain doc-comment ("slug does not resolve against the
393
+ * task's contextDocs set").
394
+ *
395
+ * Called from `gap-analysis-step` (literacy mode only) after
396
+ * `descToDocRefs` and `documentManifest` are built — both inputs are
397
+ * unavailable at extract time. Non-literacy modes skip this step
398
+ * entirely (the runtime contract has no canonical doc set to check
399
+ * against).
400
+ *
401
+ * @param judgments - Grader judgments to enrich. Mutated in place.
402
+ * @param taskDocSlugs - Map from base task description (the form judgment
403
+ * `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
404
+ * to the slugs declared in that task's `context.docs`.
405
+ * @param manifestSlugs - All slugs in the run's document manifest.
406
+ */
407
+ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlugs) {
408
+ const manifestSet = new Set(manifestSlugs);
409
+ for (const judgment of judgments) {
410
+ const baseDesc = judgment.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
411
+ const taskSlugs = taskDocSlugs.get(baseDesc) ?? [];
412
+ const union = new Set(manifestSet);
413
+ for (const slug of taskSlugs)
414
+ union.add(slug);
415
+ judgment.hallucinationCheckedAgainst = [...union];
416
+ for (const citation of judgment.docCitations) {
417
+ if (typeof citation.slug === "string" && citation.slug.length > 0) {
418
+ citation.hallucinated = !union.has(citation.slug);
419
+ }
420
+ }
421
+ }
422
+ }
160
423
  /**
161
424
  * Maximum characters (JS string length, not bytes) to store for model
162
425
  * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
@@ -998,7 +1261,56 @@ export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptio
998
1261
  */
999
1262
  // mergeScores — imported from @sanity/ailf-core above
1000
1263
  const CRITICAL_THRESHOLD = 40;
1001
- export function calculateAndWriteScores(options) {
1264
+ /**
1265
+ * Build a regrade callback for the borderline runner from the original
1266
+ * Promptfoo results file. Resolves each judgment back to its
1267
+ * `(taskId, modelId, dimension)` component result, then calls the
1268
+ * supplied `regradeOnce` to score the response against the rubric
1269
+ * again. Returns the original judgment score when the lookup or
1270
+ * regrade call fails — the runner treats that as a degenerate replica
1271
+ * (the consensus median absorbs it).
1272
+ */
1273
+ function buildBorderlineRegrader(input) {
1274
+ // Lazily build the lookup map on first call so we don't pay the
1275
+ // file-read cost when the runner short-circuits on no borderline
1276
+ // judgments. Merges componentResults across every supplied results
1277
+ // file so full-mode (baseline + agentic) judgments resolve to their
1278
+ // original (responseText, rubricText) pair regardless of source file.
1279
+ let lookup = null;
1280
+ return async (judgment) => {
1281
+ if (lookup === null) {
1282
+ lookup = new Map();
1283
+ for (const path of input.resultsPaths) {
1284
+ const results = readAndNormalizeResults(path);
1285
+ for (const result of results) {
1286
+ const taskId = result.description;
1287
+ const modelId = result.providerId ?? result.providerLabel ?? "unknown";
1288
+ for (const comp of result.gradingResult.componentResults) {
1289
+ if (comp.assertion?.type !== "llm-rubric")
1290
+ continue;
1291
+ const dimension = classifyRubric(comp);
1292
+ if (!dimension)
1293
+ continue;
1294
+ const rubricText = typeof comp.assertion.value === "string"
1295
+ ? comp.assertion.value
1296
+ : "";
1297
+ const responseText = result.response?.output ?? "";
1298
+ lookup.set(`${taskId}::${modelId}::${dimension}`, {
1299
+ responseText,
1300
+ rubricText,
1301
+ });
1302
+ }
1303
+ }
1304
+ }
1305
+ }
1306
+ const ctx = lookup.get(`${judgment.taskId}::${judgment.modelId}::${judgment.dimension}`);
1307
+ if (!ctx)
1308
+ return judgment.score;
1309
+ const replica = await input.regradeOnce(ctx.responseText, ctx.rubricText);
1310
+ return replica ?? judgment.score;
1311
+ };
1312
+ }
1313
+ export async function calculateAndWriteScores(options) {
1002
1314
  const ROOT = options.rootDir;
1003
1315
  const log = options.logger ?? new ConsoleLogger();
1004
1316
  const sourceName = options.source;
@@ -1026,6 +1338,44 @@ export function calculateAndWriteScores(options) {
1026
1338
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
1027
1339
  // Agentic results path (only used in full mode)
1028
1340
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
1341
+ // Borderline-consensus pass — shared across all four scoring modes
1342
+ // (literacy, agent-harness, knowledge-probe, mcp-server). Each mode's
1343
+ // persist site invokes this AFTER `extractGraderJudgments` and BEFORE
1344
+ // `writeFileSync(grader-judgments.json)` so the persisted file carries
1345
+ // the consensus median rather than single-replica scores. Returns the
1346
+ // per-judgment consistency map (or null) so the literacy branch can
1347
+ // also persist `borderline-consistency.json`.
1348
+ const runBorderlinePass = async (judgments, resultsPaths) => {
1349
+ if (judgments.length === 0 ||
1350
+ !options.borderlineConsensusRunner ||
1351
+ !options.borderlineRegradeOnce) {
1352
+ return null;
1353
+ }
1354
+ const regrade = buildBorderlineRegrader({
1355
+ resultsPaths,
1356
+ regradeOnce: options.borderlineRegradeOnce,
1357
+ logger: log,
1358
+ });
1359
+ try {
1360
+ const { judgments: regraded, consistencyByJudgment } = await options.borderlineConsensusRunner({
1361
+ judgments,
1362
+ regrade,
1363
+ logger: log,
1364
+ });
1365
+ // Mutate-in-place so subsequent steps (validateGraderJudgmentsCalibration,
1366
+ // persist) see the consensus-merged scores.
1367
+ judgments.length = 0;
1368
+ judgments.push(...regraded);
1369
+ if (consistencyByJudgment.size > 0) {
1370
+ log.info(`Borderline consensus merged ${consistencyByJudgment.size} judgment(s)`);
1371
+ }
1372
+ return consistencyByJudgment;
1373
+ }
1374
+ catch (err) {
1375
+ log.warn(`Borderline consensus pass failed — falling back to single-replica scores: ${err instanceof Error ? err.message : String(err)}`);
1376
+ return null;
1377
+ }
1378
+ };
1029
1379
  // Validate baseline results file
1030
1380
  const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
1031
1381
  const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
@@ -1079,7 +1429,14 @@ export function calculateAndWriteScores(options) {
1079
1429
  log.info("Score summary written to results/latest/score-summary.json");
1080
1430
  // Extract and persist grader judgments
1081
1431
  const judgments = extractGraderJudgments(baselineResultsPath);
1432
+ const borderlineConsistency = await runBorderlinePass(judgments, [
1433
+ baselineResultsPath,
1434
+ ]);
1082
1435
  if (judgments.length > 0) {
1436
+ if (borderlineConsistency && borderlineConsistency.size > 0) {
1437
+ writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
1438
+ log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
1439
+ }
1083
1440
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1084
1441
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1085
1442
  }
@@ -1130,7 +1487,14 @@ export function calculateAndWriteScores(options) {
1130
1487
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1131
1488
  log.info("Score summary written to results/latest/score-summary.json");
1132
1489
  const judgments = extractGraderJudgments(baselineResultsPath);
1490
+ const borderlineConsistency = await runBorderlinePass(judgments, [
1491
+ baselineResultsPath,
1492
+ ]);
1133
1493
  if (judgments.length > 0) {
1494
+ if (borderlineConsistency && borderlineConsistency.size > 0) {
1495
+ writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
1496
+ log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
1497
+ }
1134
1498
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1135
1499
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1136
1500
  }
@@ -1232,16 +1596,54 @@ export function calculateAndWriteScores(options) {
1232
1596
  mkdirSync(outDir, { recursive: true });
1233
1597
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1234
1598
  log.info("Score summary written to results/latest/score-summary.json");
1235
- // Extract and persist grader judgments (Phase 3a: failure mode extraction)
1236
- const judgments = extractGraderJudgments(baselineResultsPath);
1599
+ // Extract and persist grader judgments (Phase 3a: failure mode extraction).
1600
+ //
1601
+ // Plan 03-03 — wire calibration telemetry. The strict-schema parse
1602
+ // failure counter (`parseFailures`) is incremented during extraction;
1603
+ // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
1604
+ // is incremented during the post-extraction validation pass below.
1605
+ const reliability = { graderModel: "unknown" };
1606
+ const judgments = extractGraderJudgments(baselineResultsPath, {
1607
+ reliability,
1608
+ ...(options.runId ? { runId: options.runId } : {}),
1609
+ });
1237
1610
  // In full mode, also extract judgments from agentic results
1238
1611
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1239
- const agenticJudgments = extractGraderJudgments(agenticResultsPath);
1612
+ const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
1613
+ reliability,
1614
+ ...(options.runId ? { runId: options.runId } : {}),
1615
+ });
1240
1616
  judgments.push(...agenticJudgments);
1241
1617
  }
1618
+ // Borderline-consensus pass — re-grade the ±5 borderline subset N times
1619
+ // and merge medians back into the canonical judgments BEFORE
1620
+ // `validateGraderJudgmentsCalibration` runs, so the calibration counter
1621
+ // sees the consensus-merged scores rather than single-replica noise.
1622
+ // In full mode, the merged `judgments` array includes entries sourced
1623
+ // from both baseline and agentic result files — pass both paths so the
1624
+ // regrader's lookup map can resolve agentic-sourced judgments back to
1625
+ // their (responseText, rubricText) pair instead of falling through to
1626
+ // the unchanged-score branch.
1627
+ const borderlineConsistency = await runBorderlinePass(judgments, mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
1628
+ ? [baselineResultsPath, agenticResultsPath]
1629
+ : [baselineResultsPath]);
1242
1630
  if (judgments.length > 0) {
1631
+ // Stamp each judgment with the D0049 ceiling-cross-check confidence
1632
+ // triple and accumulate `failureModeCalibration` disagreement counts.
1633
+ validateGraderJudgmentsCalibration(judgments, scores, reliability);
1634
+ if (borderlineConsistency && borderlineConsistency.size > 0) {
1635
+ writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
1636
+ log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
1637
+ }
1243
1638
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1244
1639
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1640
+ if (reliability.parseFailures !== undefined ||
1641
+ reliability.failureModeCalibration !== undefined) {
1642
+ log.debug("Grader reliability telemetry", {
1643
+ parseFailures: reliability.parseFailures ?? 0,
1644
+ failureModeCalibration: reliability.failureModeCalibration ?? 0,
1645
+ });
1646
+ }
1245
1647
  }
1246
1648
  // Extract and persist per-test results (D0029: model output + metadata)
1247
1649
  const testResults = extractStoredTestResults(baselineResultsPath);
@@ -18,7 +18,7 @@
18
18
  *
19
19
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
20
20
  */
21
- import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
21
+ import { type LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
22
22
  import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
23
23
  import type { PreflightRubricContext } from "./rubric-resolution.js";
24
24
  import { type LiteracyEvalSubMode } from "../normalize-mode.js";
@@ -18,6 +18,7 @@
18
18
  *
19
19
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
20
20
  */
21
+ import { RubricConfigSchema, formatZodErrors, } from "../../_vendor/ailf-core/index.js";
21
22
  import { compileLiteracyTask, } from "./mode-handlers/literacy/index.js";
22
23
  import { tryLoadConfigFile } from "./config-loader.js";
23
24
  import { buildTaskGraph } from "./task-graph-builder.js";
@@ -152,23 +153,40 @@ function loadRubricResolutionInput(rootDir) {
152
153
  const result = tryLoadConfigFile("rubrics", rootDir);
153
154
  if (!result)
154
155
  return undefined;
155
- try {
156
- const parsed = result.data;
157
- if (!parsed?.templates)
158
- return undefined;
159
- const templates = {};
160
- for (const [key, val] of Object.entries(parsed.templates)) {
161
- const t = val;
162
- templates[key] = {
163
- header: String(t.header ?? ""),
164
- scale: t.scale ?? [],
165
- dimension: t.dimension ? String(t.dimension) : undefined,
166
- criteria_label: t.criteria_label ? String(t.criteria_label) : undefined,
167
- };
168
- }
169
- return { templates };
156
+ // Run the canonical RubricConfigSchema (Plan 03-01) at the trust
157
+ // boundary. Hand-rolled .ailf/rubrics.{yaml,json} configs and test
158
+ // fixtures previously bypassed validation: tryLoadConfigFile only
159
+ // ran the format-specific parser, then per-field unsafe casts read
160
+ // the result. With the strict GraderJudgmentSchema from Plan 03-04,
161
+ // a missing footer silently became "" and every grader emission
162
+ // fell through to the synthesized 'unclassified' fallback (CR-03).
163
+ // Fail loudly with a Zod-formatted message instead so config drift
164
+ // is caught at load time, not at every grader emission.
165
+ const parseResult = RubricConfigSchema.safeParse(result.data);
166
+ if (!parseResult.success) {
167
+ const lines = formatZodErrors(parseResult.error);
168
+ throw new Error(`Invalid rubric config at ${result.filePath}:\n${lines.join("\n")}`);
170
169
  }
171
- catch {
172
- return undefined;
170
+ const parsed = parseResult.data;
171
+ const templates = {};
172
+ for (const [key, t] of Object.entries(parsed.templates)) {
173
+ // Plan 03-02 — thread per-dimension failureModes through to the
174
+ // runtime prompt assembler when the config stamped the field via
175
+ // `failureModesForDimension(dimension)`.
176
+ templates[key] = {
177
+ header: t.header,
178
+ scale: t.scale,
179
+ ...(t.dimension !== undefined ? { dimension: t.dimension } : {}),
180
+ ...(t.criteria_label !== undefined && t.criteria_label !== null
181
+ ? { criteria_label: t.criteria_label }
182
+ : {}),
183
+ ...(t.failureModes && t.failureModes.length > 0
184
+ ? { failureModes: t.failureModes }
185
+ : {}),
186
+ };
173
187
  }
188
+ // Plan 03-01 — footer is min(1) per RubricConfigSchema, so safeParse
189
+ // above guarantees a non-empty string here. No defensive empty-string
190
+ // fall-back needed.
191
+ return { templates, footer: parsed.footer };
174
192
  }
@@ -23,7 +23,22 @@ export interface RubricResolutionInput {
23
23
  dimension?: string;
24
24
  header: string;
25
25
  scale: string[];
26
+ /**
27
+ * Plan 03-02 — per-dimension legal failure-mode list emitted into the
28
+ * rubric prompt body before `${rubricConfig.footer}`. Sourced from
29
+ * `failureModesForDimension(dimension)` in
30
+ * `packages/eval/src/grader/index.ts` so the grader is told which modes
31
+ * are legal for this dimension family.
32
+ */
33
+ failureModes?: readonly string[];
26
34
  }>;
35
+ /**
36
+ * Plan 03-01 — formerly hard-coded literal at L98-102; now sourced
37
+ * from RubricConfig.footer. The footer documents the target wire
38
+ * format the grader emits (structured GraderJudgment shape sketch in
39
+ * Phase 3+; legacy `{score, reason}` JSON pre-Phase-3).
40
+ */
41
+ footer: string;
27
42
  }
28
43
  /**
29
44
  * Resolve a templated LLM-rubric assertion into a fully assembled
@@ -54,10 +54,18 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
54
54
  const preflightSection = preflightContext && template.dimension === "code-correctness"
55
55
  ? buildPreflightSection(preflightContext)
56
56
  : "";
57
+ // Plan 03-02 — when the template carries a per-dimension legal-mode
58
+ // list, announce it to the grader before the structured-shape footer
59
+ // (Plan 03-01). Empty list => no announcement (preserves prior wording
60
+ // for templates that haven't yet been wired to a taxonomy family).
61
+ const failureModesLine = template.failureModes && template.failureModes.length > 0
62
+ ? `\nThe "failureMode" must be one of: ${template.failureModes.join(", ")}.\n\n`
63
+ : "";
57
64
  const rubricValue = preflightSection +
58
65
  `${template.header}\n${scaleText}\n\n` +
59
66
  `${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
60
- `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
67
+ failureModesLine +
68
+ `${rubricConfig.footer}`;
61
69
  const rubricPrompt = canonicalReference
62
70
  ? buildRubricPromptWithReference(rubricValue, canonicalReference)
63
71
  : undefined;