@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  11. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  12. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  13. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  14. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  15. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  22. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  23. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  24. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  28. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  29. package/dist/_vendor/ailf-core/services/index.js +5 -0
  30. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  31. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  32. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  33. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  34. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  35. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  36. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  37. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  38. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  39. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  40. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  41. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  42. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  43. package/dist/_vendor/ailf-core/types/index.js +15 -1
  44. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  45. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  47. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  48. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  49. package/dist/adapters/api-client/build-request.d.ts +1 -0
  50. package/dist/adapters/api-client/build-request.js +3 -0
  51. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  52. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  53. package/dist/adapters/attribution/index.d.ts +9 -0
  54. package/dist/adapters/attribution/index.js +8 -0
  55. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  56. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  57. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  58. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  59. package/dist/adapters/grader-outputs/index.js +8 -0
  60. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  61. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  62. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  63. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  64. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  65. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  66. package/dist/adapters/index.d.ts +3 -0
  67. package/dist/adapters/index.js +4 -0
  68. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  69. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  70. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  71. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  72. package/dist/commands/calculate-scores.js +1 -1
  73. package/dist/commands/explain-handler.js +1 -1
  74. package/dist/commands/lookup-doc.d.ts +1 -1
  75. package/dist/commands/lookup-doc.js +3 -3
  76. package/dist/commands/pipeline-action.d.ts +6 -0
  77. package/dist/commands/pipeline-action.js +2 -0
  78. package/dist/commands/remote-pipeline.js +1 -0
  79. package/dist/composition-root.d.ts +36 -0
  80. package/dist/composition-root.js +48 -0
  81. package/dist/config/rubrics.ts +38 -2
  82. package/dist/grader/agent-harness.d.ts +14 -0
  83. package/dist/grader/agent-harness.js +17 -0
  84. package/dist/grader/common.d.ts +17 -0
  85. package/dist/grader/common.js +21 -0
  86. package/dist/grader/index.d.ts +38 -0
  87. package/dist/grader/index.js +75 -0
  88. package/dist/grader/knowledge-probe.d.ts +14 -0
  89. package/dist/grader/knowledge-probe.js +18 -0
  90. package/dist/grader/literacy.d.ts +13 -0
  91. package/dist/grader/literacy.js +17 -0
  92. package/dist/grader/mcp.d.ts +14 -0
  93. package/dist/grader/mcp.js +18 -0
  94. package/dist/orchestration/build-app-context.js +1 -0
  95. package/dist/orchestration/build-step-sequence.js +5 -0
  96. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  97. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  98. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  99. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  100. package/dist/orchestration/steps/index.d.ts +1 -0
  101. package/dist/orchestration/steps/index.js +1 -0
  102. package/dist/pipeline/attribution.d.ts +15 -0
  103. package/dist/pipeline/attribution.js +18 -9
  104. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  105. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  106. package/dist/pipeline/borderline-detector.d.ts +24 -0
  107. package/dist/pipeline/borderline-detector.js +26 -0
  108. package/dist/pipeline/calculate-scores.d.ts +114 -3
  109. package/dist/pipeline/calculate-scores.js +426 -24
  110. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  111. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  112. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  113. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  114. package/dist/pipeline/compute-attribution.d.ts +80 -0
  115. package/dist/pipeline/compute-attribution.js +196 -0
  116. package/dist/pipeline/failure-modes.d.ts +52 -17
  117. package/dist/pipeline/failure-modes.js +178 -117
  118. package/dist/pipeline/map-request-to-config.js +1 -0
  119. package/package.json +6 -4
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Pipeline step: Per-judgment attribution ensemble (v0).
3
+ *
4
+ * Reads `grader-judgments.json` and `score-summary.json` from the latest
5
+ * results, calls the pure `computeJudgmentAttribution(...)` helper for each
6
+ * judgment, and emits:
7
+ *
8
+ * - One `perEntryAttribution` artifact per judgment at
9
+ * `runs/{runId}/attribution/{entryKey}.json`
10
+ * - One `attributionMeta` artifact at
11
+ * `runs/{runId}/attribution-meta.json`
12
+ *
13
+ * Additionally, when any hallucinated citations are detected, the step
14
+ * atomically rewrites `score-summary.json` to persist
15
+ * `graderReliability.hallucinationCount` (D-05 — only this one direct-
16
+ * mutation path uses the temp+rename pattern; all artifact emissions go
17
+ * through `ctx.artifactWriter.emit` which handles atomicity internally).
18
+ *
19
+ * This step is `optional: true` — it self-skips when either
20
+ * `grader-judgments.json` or `score-summary.json` is missing, so
21
+ * non-graded runs are unaffected.
22
+ *
23
+ * Task → judgment join (D-10): `judgment.taskId` is the promptfoo row
24
+ * description, which for literacy mode is `"${task.title} (gold|baseline)"`.
25
+ * The join strips the variant suffix and looks up in a triple-keyed cache
26
+ * by `task.title` (primary), `task.description`, and `task.id` (defensive
27
+ * fallbacks for non-literacy modes).
28
+ *
29
+ * Retrieved signal (D-11/D-12): `FeatureAgentBehavior.feature` is the join
30
+ * key — for literacy mode it equals `task.area` exactly (compiler propagates
31
+ * `task.area → __featureArea → ab.feature`).
32
+ *
33
+ * @see docs/decisions/D0033-unified-artifact-writer.md
34
+ * @see docs/decisions/D0049-shared-confidence-contract.md
35
+ * @see docs/decisions/D0050-per-entry-attribution-layout.md
36
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
37
+ */
38
+ import { existsSync, readFileSync, renameSync, unlinkSync, writeFileSync, } from "node:fs";
39
+ import { resolve } from "node:path";
40
+ import { isSlugRef } from "../../_vendor/ailf-core/index.js";
41
+ import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
42
+ import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
43
+ // ---------------------------------------------------------------------------
44
+ // Step implementation
45
+ // ---------------------------------------------------------------------------
46
+ export class ComputeAttributionStep {
47
+ name = "compute-attribution";
48
+ optional = true;
49
+ check(ctx) {
50
+ const issues = [];
51
+ const judgmentsPath = resolve(ctx.config.rootDir, "results", "latest", "grader-judgments.json");
52
+ const summaryPath = resolve(ctx.config.rootDir, "results", "latest", "score-summary.json");
53
+ if (!existsSync(judgmentsPath)) {
54
+ issues.push({
55
+ message: "No grader-judgments.json — attribution computation will skip",
56
+ severity: "warning",
57
+ source: "compute-attribution",
58
+ });
59
+ return issues;
60
+ }
61
+ // WARN 5: documentManifest must be present and non-empty (gap-analysis
62
+ // enriches it). Without it, the canonical signal is permanently 0 and
63
+ // the three-signal ensemble silently degrades to citation-only.
64
+ if (existsSync(summaryPath)) {
65
+ try {
66
+ const summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
67
+ const dm = summary.documentManifest;
68
+ if (!dm || dm.length === 0) {
69
+ issues.push({
70
+ message: "documentManifest is empty — attribution canonical signal will be permanently 0. Ensure gap-analysis runs before compute-attribution.",
71
+ severity: "warning",
72
+ source: "compute-attribution",
73
+ });
74
+ }
75
+ }
76
+ catch {
77
+ // Surfaces at execute() with a failed StepResult.
78
+ }
79
+ }
80
+ return issues;
81
+ }
82
+ async execute(ctx, _state) {
83
+ const start = Date.now();
84
+ const root = ctx.config.rootDir;
85
+ const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
86
+ const summaryPath = resolve(root, "results", "latest", "score-summary.json");
87
+ if (!existsSync(judgmentsPath)) {
88
+ return { status: "skipped", reason: "No grader-judgments.json" };
89
+ }
90
+ if (!existsSync(summaryPath)) {
91
+ return { status: "skipped", reason: "No score-summary.json" };
92
+ }
93
+ try {
94
+ const judgments = JSON.parse(readFileSync(judgmentsPath, "utf-8"));
95
+ const summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
96
+ // D-10: judgment.taskId is the promptfoo row description, sometimes
97
+ // suffixed "(gold)" / "(baseline)" by the literacy compiler
98
+ // (literacy/compiler.ts:184). Build a triple-keyed task cache:
99
+ // - task.title (primary for literacy — the compiler binds task.title
100
+ // as the row description before appending the suffix)
101
+ // - task.description (defensive fallback for future modes)
102
+ // - task.id (defensive fallback for KP / agent-harness modes; inert for those modes while only literacy tasks are loaded below)
103
+ // Strip the variant suffix before lookup so the join matches.
104
+ const tasksByKey = new Map();
105
+ if (ctx.taskSource) {
106
+ try {
107
+ const tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
108
+ for (const t of tasks) {
109
+ if (t.title)
110
+ tasksByKey.set(t.title, t);
111
+ if (t.description)
112
+ tasksByKey.set(t.description, t);
113
+ tasksByKey.set(t.id, t);
114
+ }
115
+ }
116
+ catch (err) {
117
+ // Surface the failure so operators see why the canonical signal
118
+ // is permanently 0 for this run. Behavior is unchanged — we
119
+ // degrade to an empty tasksByKey rather than failing the step.
120
+ ctx.logger.warn(`[compute-attribution] taskSource.loadTasks() failed; canonical signal will be 0 for all judgments. ${err instanceof Error ? err.message : String(err)}`);
121
+ }
122
+ }
123
+ // D-12: FeatureAgentBehavior.feature is the correct field (NOT .area).
124
+ // Verified at packages/core/src/types/index.ts:1180-1190.
125
+ // D-11: for literacy mode, ab.feature === task.area via the compiler's
126
+ // __featureArea propagation (literacy/compiler.ts:180).
127
+ const visitedByFeature = new Map();
128
+ for (const ab of summary.agentBehavior ?? []) {
129
+ visitedByFeature.set(ab.feature, new Set(ab.docSlugsVisited ?? []));
130
+ }
131
+ // Build a slug→DocumentRef map from documentManifest for candidate resolution.
132
+ const manifestBySlug = new Map();
133
+ for (const ref of summary.documentManifest ?? []) {
134
+ if (ref.slug)
135
+ manifestBySlug.set(ref.slug, ref);
136
+ }
137
+ const reliability = { hallucinationCount: 0 };
138
+ for (const j of judgments) {
139
+ // D-10: strip variant suffix before lookup — mirrors calculate-scores.ts:696.
140
+ const baseDesc = stripVariantSuffix(j.taskId);
141
+ const task = tasksByKey.get(baseDesc) ?? tasksByKey.get(j.taskId);
142
+ // Resolve context docs → DocumentRef[] via manifest lookup.
143
+ const contextSlugs = extractContextSlugs(task);
144
+ const contextDocs = contextSlugs
145
+ .map((s) => manifestBySlug.get(s))
146
+ .filter((r) => r !== undefined);
147
+ const contextDocIds = new Set(contextDocs.map((r) => r.documentId));
148
+ // D-11: retrieved signal keys by task.area (= ab.feature in literacy mode).
149
+ // When agentBehavior is absent (baseline run), drop the retrieved signal
150
+ // (pass undefined — Pitfall #4 / locked D-04).
151
+ const area = task?.area;
152
+ const retrievedSlugs = summary.agentBehavior === undefined || summary.agentBehavior === null
153
+ ? undefined
154
+ : area === undefined
155
+ ? new Set()
156
+ : (visitedByFeature.get(area) ?? new Set());
157
+ // Candidate set: contextDocs ∪ docs in the manifest cited by the judgment.
158
+ const citedDocIds = new Set(j.docCitations.map((c) => c.documentId));
159
+ const candidates = [
160
+ ...contextDocs,
161
+ ...Array.from(manifestBySlug.values()).filter((r) => !contextDocIds.has(r.documentId) &&
162
+ (citedDocIds.has(r.documentId) ||
163
+ (r.slug !== undefined &&
164
+ (retrievedSlugs?.has(r.slug) ?? false)))),
165
+ ];
166
+ const judgmentAttribution = computeJudgmentAttribution(j, candidates, contextDocIds, retrievedSlugs, reliability, V0_WEIGHTS);
167
+ // D-06: pass { run, name } where name is the entry key.
168
+ // formatKeyFromAxes requires assoc.name for per-entry descriptors —
169
+ // forgetting it causes a hard throw at emit time.
170
+ const entryKey = `${j.taskId}--${j.modelId}--${j.dimension}`;
171
+ await ctx.artifactWriter.emit("perEntryAttribution", { run: ctx.runId, name: entryKey }, judgmentAttribution);
172
+ }
173
+ await ctx.artifactWriter.emit("attributionMeta", { run: ctx.runId }, {
174
+ ensembleVersion,
175
+ embeddingModel,
176
+ weights: V0_WEIGHTS,
177
+ calibrationSetVersion,
178
+ });
179
+ // Atomically persist reliability.hallucinationCount onto score-summary.json
180
+ // (D-05: the atomic write applies ONLY to this direct-mutation path;
181
+ // ctx.artifactWriter.emit handles atomicity for all other writes).
182
+ if (reliability.hallucinationCount > 0) {
183
+ // Build the updated graderReliability by merging only the new count.
184
+ // graderReliability.graderModel is a required field — spread to preserve
185
+ // all existing required fields before adding/updating hallucinationCount.
186
+ const existingReliability = summary.graderReliability;
187
+ const updatedReliability = existingReliability
188
+ ? {
189
+ ...existingReliability,
190
+ hallucinationCount: (existingReliability.hallucinationCount ?? 0) +
191
+ reliability.hallucinationCount,
192
+ }
193
+ : {
194
+ // When there is no existing graderReliability, we cannot provide
195
+ // the required graderModel field from the run context here.
196
+ // Persist only the new counter under a minimal shape.
197
+ graderModel: "unknown",
198
+ hallucinationCount: reliability.hallucinationCount,
199
+ };
200
+ const updated = {
201
+ ...summary,
202
+ graderReliability: updatedReliability,
203
+ };
204
+ const tmp = `${summaryPath}.tmp`;
205
+ try {
206
+ writeFileSync(tmp, JSON.stringify(updated, null, 2));
207
+ renameSync(tmp, summaryPath);
208
+ }
209
+ catch (err) {
210
+ // Best-effort cleanup of the .tmp file so a stale leftover does
211
+ // not confuse subsequent runs (cross-device move, EACCES, ENOSPC
212
+ // after writeFileSync). Re-throw to surface the underlying error
213
+ // via the outer try/catch's failed StepResult.
214
+ try {
215
+ unlinkSync(tmp);
216
+ }
217
+ catch {
218
+ /* best-effort cleanup */
219
+ }
220
+ throw err;
221
+ }
222
+ }
223
+ return {
224
+ durationMs: Date.now() - start,
225
+ status: "success",
226
+ summary: `Attribution computed for ${judgments.length} judgments (${reliability.hallucinationCount} hallucinated citations)`,
227
+ };
228
+ }
229
+ catch (err) {
230
+ return {
231
+ durationMs: Date.now() - start,
232
+ error: err instanceof Error ? err.message : String(err),
233
+ status: "failed",
234
+ };
235
+ }
236
+ }
237
+ }
238
+ // ---------------------------------------------------------------------------
239
+ // Internal helpers
240
+ // ---------------------------------------------------------------------------
241
+ /**
242
+ * Strip the literacy compiler's variant suffix from a row description.
243
+ * Mirrors the canonical Phase-3 strip site at
244
+ * packages/eval/src/pipeline/calculate-scores.ts:696 (D-10).
245
+ */
246
+ function stripVariantSuffix(taskId) {
247
+ return taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
248
+ }
249
+ /** True for any ref whose shape carries a string-valued `slug` field. */
250
+ function hasSlugField(ref) {
251
+ return "slug" in ref && typeof ref.slug === "string";
252
+ }
253
+ /**
254
+ * Extract slug strings from a task's context.docs array (D-09).
255
+ *
256
+ * Uses isSlugRef for SlugDocRef and falls back to the informational
257
+ * `slug` annotation on IdDocRef. PathDocRef refs (and the `path`
258
+ * annotation on IdDocRef) are intentionally NOT included — paths are
259
+ * filesystem-style and never match manifestBySlug, which is keyed by
260
+ * article slug. A PathDocRef contributes zero canonical signal (correct
261
+ * fallback) rather than polluting the slug lookup with stale keys.
262
+ */
263
+ function extractContextSlugs(task) {
264
+ if (!task?.context?.docs)
265
+ return [];
266
+ const out = [];
267
+ for (const ref of task.context.docs) {
268
+ if (isSlugRef(ref)) {
269
+ out.push(ref.slug);
270
+ }
271
+ else if (hasSlugField(ref)) {
272
+ // IdDocRef carries an informational `slug` annotation — use it
273
+ // for the manifest lookup. PerspectiveDocRef has neither slug
274
+ // nor a usable lookup key and is skipped.
275
+ out.push(ref.slug);
276
+ }
277
+ }
278
+ return out;
279
+ }
@@ -14,7 +14,7 @@
14
14
  *
15
15
  * This is an optional step — failure doesn't stop the pipeline.
16
16
  */
17
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
17
+ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from "fs";
18
18
  import { join, resolve } from "path";
19
19
  import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
20
20
  import { emitFileContents } from "../../artifact-capture/emit-file.js";
@@ -151,19 +151,44 @@ export class GapAnalysisStep {
151
151
  ...s,
152
152
  documents: areaToDocRefs.get(s.feature),
153
153
  }));
154
+ // Pitfall #11 hallucination cross-check (GRAD-05). The required
155
+ // `hallucinationCheckedAgainst` and per-citation `hallucinated`
156
+ // fields are populated here because `extractGraderJudgments`
157
+ // (the upstream emitter) does not have access to either the
158
+ // task contextDocs map or the run's document manifest. The
159
+ // populator mutates `judgments` in place and the rewrite below
160
+ // persists the enrichment back to disk so downstream consumers
161
+ // (Studio, gap-analysis followups) see populated fields.
162
+ const taskDocSlugs = new Map();
163
+ for (const [desc, refs] of descToDocRefs) {
164
+ taskDocSlugs.set(desc, refs
165
+ .map((r) => r.slug)
166
+ .filter((s) => typeof s === "string" && s.length > 0));
167
+ }
168
+ const manifestSlugs = (documentManifest ?? [])
169
+ .map((d) => d.slug)
170
+ .filter((s) => typeof s === "string" && s.length > 0);
171
+ const { populateHallucinationFields } = await import("../../pipeline/calculate-scores.js");
172
+ populateHallucinationFields(judgments, taskDocSlugs, manifestSlugs);
173
+ // Atomic write — POSIX rename is atomic on the same filesystem,
174
+ // so a mid-write failure leaves either the prior file or the new
175
+ // file intact, never a half-written JSON document.
176
+ const tmpPath = `${judgmentsPath}.tmp`;
177
+ writeFileSync(tmpPath, JSON.stringify(judgments, null, 2));
178
+ renameSync(tmpPath, judgmentsPath);
154
179
  }
155
180
  // ── Per-test results (D0029: model output + metadata) ──────
156
181
  const testResultsPath = resolve(root, "results", "latest", "test-results.json");
157
182
  let testResults;
158
183
  if (existsSync(testResultsPath)) {
159
184
  const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
160
- // Enrich with canonical docs (literacy mode only)
185
+ // Enrich with context docs (literacy mode only)
161
186
  testResults = rawTestResults.map((tr) => {
162
187
  if (!isLiteracyMode)
163
188
  return tr;
164
189
  const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
165
- const canonicalDocs = descToDocRefs.get(baseDesc);
166
- return canonicalDocs ? { ...tr, canonicalDocs } : tr;
190
+ const contextDocs = descToDocRefs.get(baseDesc);
191
+ return contextDocs ? { ...tr, contextDocs } : tr;
167
192
  });
168
193
  }
169
194
  // ── Low-scoring judgments ────────────────────────────────────
@@ -182,8 +207,8 @@ export class GapAnalysisStep {
182
207
  return j;
183
208
  // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
184
209
  const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
185
- const canonicalDocs = descToDocRefs.get(baseDesc);
186
- return canonicalDocs ? { ...j, canonicalDocs } : j;
210
+ const contextDocs = descToDocRefs.get(baseDesc);
211
+ return contextDocs ? { ...j, contextDocs } : j;
187
212
  });
188
213
  const enrichedSummary = {
189
214
  ...scoreSummary,
@@ -194,7 +219,10 @@ export class GapAnalysisStep {
194
219
  scores: enrichedScores,
195
220
  ...(testResults !== undefined && { testResults }),
196
221
  };
197
- writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
222
+ // Atomic write — see judgmentsPath above for rationale.
223
+ const scoreSummaryTmpPath = `${scoreSummaryPath}.tmp`;
224
+ writeFileSync(scoreSummaryTmpPath, JSON.stringify(enrichedSummary, null, 2));
225
+ renameSync(scoreSummaryTmpPath, scoreSummaryPath);
198
226
  // W0051 Slice 2 — failureModes is per-entry keyed by {mode, category};
199
227
  // one entry per classified FailureModeType. Zero-count categories are
200
228
  // skipped to keep the manifest honest about what the run surfaced.
@@ -5,6 +5,7 @@
5
5
  * AppContext instead of positional parameters.
6
6
  */
7
7
  export { CalculateScoresStep } from "./calculate-scores-step.js";
8
+ export { ComputeAttributionStep } from "./compute-attribution-step.js";
8
9
  export { CompareStep } from "./compare-step.js";
9
10
  export { FetchDocsStep } from "./fetch-docs-step.js";
10
11
  export { GapAnalysisStep } from "./gap-analysis-step.js";
@@ -5,6 +5,7 @@
5
5
  * AppContext instead of positional parameters.
6
6
  */
7
7
  export { CalculateScoresStep } from "./calculate-scores-step.js";
8
+ export { ComputeAttributionStep } from "./compute-attribution-step.js";
8
9
  export { CompareStep } from "./compare-step.js";
9
10
  export { FetchDocsStep } from "./fetch-docs-step.js";
10
11
  export { GapAnalysisStep } from "./gap-analysis-step.js";
@@ -17,6 +17,21 @@
17
17
  */
18
18
  import type { AttributionReport, ComparisonReport } from "./types.js";
19
19
  import type { ResolvedMappings } from "./resolve-mappings.js";
20
+ /** v0 sentinel — no embedding call in v0; v1.2's flip to a real
21
+ * model name (e.g., "text-embedding-3-small") mechanically forces
22
+ * ensembleVersion's right segment to change, invalidating cached
23
+ * weights downstream. */
24
+ export declare const embeddingModel = "none";
25
+ /** VER-01 D-02 — co-located ensemble version. Compound semver-ish
26
+ * shape: "{algorithmVersion}+{embeddingModel}" (per D-02 of Phase
27
+ * 4 CONTEXT.md). Phase 1 landed the constant; Phase 4 wires the
28
+ * compound shape and the bump-by-calibration discipline
29
+ * (calibrate-attribution.ts is the ONLY allowed bumper). */
30
+ export declare const ensembleVersion: "0.1.0+none";
31
+ /** Version tag for the calibration set fixture co-located in the repo.
32
+ * Bump when the fixture structure changes (e.g. when v1 moves the
33
+ * calibration set to Content Lake or expands to ~30 stratified rows). */
34
+ export declare const calibrationSetVersion = "v0-fixture";
20
35
  /**
21
36
  * Attribute score changes to individual documents.
22
37
  *
@@ -15,6 +15,21 @@
15
15
  * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
16
  * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
17
  */
18
+ /** v0 sentinel — no embedding call in v0; v1.2's flip to a real
19
+ * model name (e.g., "text-embedding-3-small") mechanically forces
20
+ * ensembleVersion's right segment to change, invalidating cached
21
+ * weights downstream. */
22
+ export const embeddingModel = "none";
23
+ /** VER-01 D-02 — co-located ensemble version. Compound semver-ish
24
+ * shape: "{algorithmVersion}+{embeddingModel}" (per D-02 of Phase
25
+ * 4 CONTEXT.md). Phase 1 landed the constant; Phase 4 wires the
26
+ * compound shape and the bump-by-calibration discipline
27
+ * (calibrate-attribution.ts is the ONLY allowed bumper). */
28
+ export const ensembleVersion = `0.1.0+${embeddingModel}`;
29
+ /** Version tag for the calibration set fixture co-located in the repo.
30
+ * Bump when the fixture structure changes (e.g. when v1 moves the
31
+ * calibration set to Content Lake or expands to ~30 stratified rows). */
32
+ export const calibrationSetVersion = "v0-fixture";
18
33
  // ---------------------------------------------------------------------------
19
34
  // Public API
20
35
  // ---------------------------------------------------------------------------
@@ -59,11 +74,9 @@ export function attributeChanges(comparison, changedSlugs, mappings, noiseThresh
59
74
  // (shouldn't happen in practice, but handle gracefully)
60
75
  continue;
61
76
  }
62
- // For area-level attribution: check which changed docs overlap
63
- // with any task's canonical docs in this area
64
- const areaCanonicalSlugs = new Set(areaTasks.flatMap(([, info]) => info.slugs));
65
- const matchingSlugs = changedSlugs.filter((s) => areaCanonicalSlugs.has(s));
66
- // Classify each task
77
+ // Classify each task. When no task in this area has a matching
78
+ // changed doc, every per-task classification will fall through to
79
+ // `uncorrelated` here — no separate area-level record is needed.
67
80
  for (const [taskId, taskInfo] of areaTasks) {
68
81
  const taskMatchingSlugs = taskInfo.slugs.filter((s) => changedSet.has(s));
69
82
  const classification = classifyAttribution(taskMatchingSlugs.length);
@@ -76,10 +89,6 @@ export function attributeChanges(comparison, changedSlugs, mappings, noiseThresh
76
89
  withinNoiseFloor: Math.abs(areaDelta.delta) <= noiseThreshold,
77
90
  });
78
91
  }
79
- // If no task-level matches but area has a delta, record area-level
80
- if (areaTasks.length > 0 && matchingSlugs.length === 0) {
81
- // All tasks in this area are uncorrelated — already handled above
82
- }
83
92
  }
84
93
  // Find untracked documents: changed slugs not in ANY task's canonical docs
85
94
  const allTrackedSlugs = new Set([...taskCanonicalDocs.values()].flatMap((info) => info.slugs));
@@ -0,0 +1,63 @@
1
+ /**
2
+ * pipeline/borderline-consensus-runner.ts
3
+ *
4
+ * GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
5
+ * of runGraderConsistency — re-grades ONLY judgments where
6
+ * `isBorderline(score, thresholds)` returns true. Non-borderline
7
+ * judgments pass through unchanged.
8
+ *
9
+ * Per D0005 (grader-model separation), replicates the SAME pinned
10
+ * grader N times (default 3, configurable via
11
+ * RepoConfig.execution.borderlineReplications); NOT the inter-grader
12
+ * ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
13
+ * source bug" callout pinned this to intra-grader replication only.
14
+ *
15
+ * The re-grade hook is supplied by the caller as a `regrade` callback.
16
+ * The composition root wires it to `gradeOnce` from grader-api.js with
17
+ * the response/rubric text drawn from the original Promptfoo result.
18
+ * The runner itself imports `gradeOnce` only as the default regrader
19
+ * fallback so unit tests can spy/inject without re-wiring.
20
+ *
21
+ * @see docs/decisions/D0005-grader-model-separation.md
22
+ * @see ./borderline-detector.ts — pure predicate
23
+ * @see ./grader-consistency.ts — JudgmentConsistency shape we emit
24
+ */
25
+ import type { GraderJudgment, Logger } from "../_vendor/ailf-core/index.d.ts";
26
+ import { gradeOnce } from "./grader-api.js";
27
+ import { type JudgmentConsistency } from "./grader-consistency.js";
28
+ /**
29
+ * Re-export `gradeOnce` so callers that need to wire the default regrader
30
+ * (composition-root, integration tests) can import the grader entry point
31
+ * from this module rather than rediscovering grader-api.js. The runner
32
+ * itself does not invoke `gradeOnce` — the caller-supplied `regrade`
33
+ * callback owns the live grader call (Pitfall 6 — runner stays pure wrt
34
+ * provider config).
35
+ */
36
+ export { gradeOnce };
37
+ export interface BorderlineConsensusOptions {
38
+ judgments: GraderJudgment[];
39
+ logger?: Logger;
40
+ /** Callback that re-grades a single judgment once. Returns a fresh score. */
41
+ regrade: (judgment: GraderJudgment) => Promise<number>;
42
+ /** Default 3 — see RepoConfig.execution.borderlineReplications. */
43
+ replications: number;
44
+ /** Severity boundaries from config/thresholds.ts (default [30, 50, 60]). */
45
+ thresholds: readonly number[];
46
+ }
47
+ export interface BorderlineConsensusResult {
48
+ consistencyByJudgment: Map<string, JudgmentConsistency>;
49
+ judgments: GraderJudgment[];
50
+ }
51
+ /**
52
+ * Run intra-grader consensus on the borderline subset of `judgments`.
53
+ *
54
+ * - Borderline (per `isBorderline(score, thresholds)`): re-grade
55
+ * `replications` times via `regrade`; emit a `JudgmentConsistency`
56
+ * keyed by `${taskId}::${dimension}::${modelId}`; merge the consensus
57
+ * median back into the canonical judgment's `score`.
58
+ * - Non-borderline: pass through unchanged. Output array length == input.
59
+ *
60
+ * The function is order-preserving — the returned `judgments` array
61
+ * keeps the same element order as the input.
62
+ */
63
+ export declare function runBorderlineConsensus(options: BorderlineConsensusOptions): Promise<BorderlineConsensusResult>;
@@ -0,0 +1,124 @@
1
+ /**
2
+ * pipeline/borderline-consensus-runner.ts
3
+ *
4
+ * GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
5
+ * of runGraderConsistency — re-grades ONLY judgments where
6
+ * `isBorderline(score, thresholds)` returns true. Non-borderline
7
+ * judgments pass through unchanged.
8
+ *
9
+ * Per D0005 (grader-model separation), replicates the SAME pinned
10
+ * grader N times (default 3, configurable via
11
+ * RepoConfig.execution.borderlineReplications); NOT the inter-grader
12
+ * ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
13
+ * source bug" callout pinned this to intra-grader replication only.
14
+ *
15
+ * The re-grade hook is supplied by the caller as a `regrade` callback.
16
+ * The composition root wires it to `gradeOnce` from grader-api.js with
17
+ * the response/rubric text drawn from the original Promptfoo result.
18
+ * The runner itself never calls `gradeOnce`; it imports it only to
19
+ * re-export it so callers and unit tests can wire or spy on it from here.
20
+ *
21
+ * @see docs/decisions/D0005-grader-model-separation.md
22
+ * @see ./borderline-detector.ts — pure predicate
23
+ * @see ./grader-consistency.ts — JudgmentConsistency shape we emit
24
+ */
25
+ import { isBorderline } from "./borderline-detector.js";
26
+ // Imported only so it can be re-exported below; `regrade` is a required
27
+ // option, so the runner itself never invokes gradeOnce.
28
+ // Keeping the import on the public surface preserves the architectural
29
+ // rule that the runner's grader entry point lives in grader-api.js
30
+ // (Pitfall 6 — the inter-grader ensemble module is intentionally NOT
31
+ // reached for on this path).
32
+ import { gradeOnce } from "./grader-api.js";
33
+ import { analyzeJudgment, } from "./grader-consistency.js";
34
+ /**
35
+ * Re-export `gradeOnce` so callers that need to wire the default regrader
36
+ * (composition-root, integration tests) can import the grader entry point
37
+ * from this module rather than rediscovering grader-api.js. The runner
38
+ * itself does not invoke `gradeOnce` — the caller-supplied `regrade`
39
+ * callback owns the live grader call (Pitfall 6 — runner stays pure wrt
40
+ * provider config).
41
+ */
42
+ export { gradeOnce };
43
/**
 * Build the lookup key under which a judgment's consistency record is
 * stored: `taskId`, `dimension` and `modelId` joined by `::`.
 */
function consistencyKey(j) {
    const { taskId, dimension, modelId } = j;
    return `${taskId}::${dimension}::${modelId}`;
}
47
+ /**
48
+ * Run intra-grader consensus on the borderline subset of `judgments`.
49
+ *
50
+ * - Borderline (per `isBorderline(score, thresholds)`): re-grade
51
+ * `replications` times via `regrade`; emit a `JudgmentConsistency`
52
+ * keyed by `${taskId}::${dimension}::${modelId}`; merge the consensus
53
+ * median back into the canonical judgment's `score`.
54
+ * - Non-borderline: pass through unchanged. Output array length == input.
55
+ *
56
+ * The function is order-preserving — the returned `judgments` array
57
+ * keeps the same element order as the input.
58
+ */
59
+ export async function runBorderlineConsensus(options) {
60
+ const { judgments, logger, regrade, replications, thresholds } = options;
61
+ const consistencyByJudgment = new Map();
62
+ // Filter to borderline subset; bypass entirely if empty.
63
+ const borderlineKeys = new Set();
64
+ for (const j of judgments) {
65
+ if (isBorderline(j.score, thresholds)) {
66
+ borderlineKeys.add(consistencyKey(j));
67
+ }
68
+ }
69
+ if (borderlineKeys.size === 0) {
70
+ return { consistencyByJudgment, judgments };
71
+ }
72
+ const out = [];
73
+ for (const j of judgments) {
74
+ const key = consistencyKey(j);
75
+ if (!borderlineKeys.has(key)) {
76
+ out.push(j); // non-borderline — single replica
77
+ continue;
78
+ }
79
+ // Re-grade `replications` times via the same pinned grader. The
80
+ // replications carry network-bound side effects (LLM calls), so run
81
+ // them concurrently — `Promise.allSettled` preserves the per-replica
82
+ // try/catch shape (failures log + drop, surviving replicas still
83
+ // contribute to the consensus median). Worst-case wall time drops
84
+ // from `replications * roundTrip` to a single `roundTrip`.
85
+ const scores = [j.score];
86
+ const settled = await Promise.allSettled(Array.from({ length: replications }, () => regrade(j)));
87
+ settled.forEach((outcome, i) => {
88
+ if (outcome.status === "fulfilled") {
89
+ scores.push(outcome.value);
90
+ }
91
+ else {
92
+ const err = outcome.reason;
93
+ logger?.warn(`Borderline replication ${i + 1}/${replications} failed for ${key}: ` +
94
+ (err instanceof Error ? err.message : String(err)));
95
+ }
96
+ });
97
+ const grading = {
98
+ area: "",
99
+ dimension: j.dimension,
100
+ ...(j.modelId ? { providerId: j.modelId } : {}),
101
+ scores,
102
+ taskId: j.taskId,
103
+ };
104
+ const consistency = analyzeJudgment(grading);
105
+ consistencyByJudgment.set(key, consistency);
106
+ // Merge consensus (median across replicas) into the canonical judgment.
107
+ out.push({ ...j, score: median(scores) });
108
+ }
109
+ return { consistencyByJudgment, judgments: out };
110
+ }
111
/**
 * Compute the median of an array of numbers. The runner uses median
 * (not mean) so a single outlier replica doesn't drag the consensus
 * score across a severity threshold.
 *
 * `NaN` entries are ignored: they cannot be ordered, and leaving them in
 * would make the sort comparator inconsistent and the result arbitrary.
 * The input array is not mutated.
 *
 * @param values - Scores to summarise.
 * @returns The middle value (or the mean of the two middle values for an
 *   even count); `0` when no usable value remains.
 */
function median(values) {
    // `filter` already yields a fresh array, so sorting in place is safe.
    const sorted = values.filter((v) => !Number.isNaN(v)).sort((a, b) => a - b);
    if (sorted.length === 0) {
        return 0;
    }
    const mid = Math.floor(sorted.length / 2);
    if (sorted.length % 2 === 0) {
        return (sorted[mid - 1] + sorted[mid]) / 2;
    }
    return sorted[mid];
}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * pipeline/borderline-detector.ts
3
+ *
4
+ * GRAD-04 borderline-band predicate. Pure computation; no I/O.
5
+ *
6
+ * A judgment is "borderline" when its score lies within ±5 of any of
7
+ * the three rubric thresholds (severity boundaries 30 / 50 / 60 from
8
+ * packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
9
+ * info edges).
10
+ *
11
+ * Per D0005 (grader-model separation), borderline judgments trigger
12
+ * intra-grader consensus replication of the SAME pinned grader rather
13
+ * than inter-grader ensemble — preserving D0005's reproducibility
14
+ * posture.
15
+ *
16
+ * @see docs/decisions/D0005-grader-model-separation.md
17
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
18
+ */
19
+ export declare const BORDERLINE_BAND = 5;
20
+ /**
21
+ * Returns true when `score` lies within ±BORDERLINE_BAND of any
22
+ * configured threshold. Pure function — safe to call N×.
23
+ */
24
+ export declare function isBorderline(score: number, thresholds: readonly number[]): boolean;