@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
  14. package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
  15. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  16. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  17. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  18. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  19. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  22. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  28. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  33. package/dist/_vendor/ailf-core/services/index.js +5 -0
  34. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  35. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  36. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  37. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  38. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  39. package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
  40. package/dist/_vendor/ailf-core/types/confidence.js +56 -0
  41. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  42. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  44. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  45. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
  47. package/dist/_vendor/ailf-core/types/index.js +16 -1
  48. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  49. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  50. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  51. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  52. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  53. package/dist/adapters/api-client/build-request.d.ts +1 -0
  54. package/dist/adapters/api-client/build-request.js +3 -0
  55. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  56. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  57. package/dist/adapters/attribution/index.d.ts +9 -0
  58. package/dist/adapters/attribution/index.js +8 -0
  59. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  60. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  61. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  62. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  63. package/dist/adapters/grader-outputs/index.js +8 -0
  64. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  65. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  66. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  67. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  68. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  69. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  70. package/dist/adapters/index.d.ts +3 -0
  71. package/dist/adapters/index.js +4 -0
  72. package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
  73. package/dist/adapters/llm/anthropic-llm-client.js +205 -0
  74. package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
  75. package/dist/adapters/llm/fake-llm-client.js +63 -0
  76. package/dist/adapters/llm/index.d.ts +9 -0
  77. package/dist/adapters/llm/index.js +4 -0
  78. package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
  79. package/dist/adapters/llm/openai-llm-client.js +168 -0
  80. package/dist/adapters/llm/pricing.d.ts +12 -0
  81. package/dist/adapters/llm/pricing.js +8 -0
  82. package/dist/adapters/llm/retry.d.ts +56 -0
  83. package/dist/adapters/llm/retry.js +66 -0
  84. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  85. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  86. package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
  87. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  88. package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
  89. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
  90. package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
  91. package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/explain-handler.js +1 -1
  94. package/dist/commands/lookup-doc.d.ts +1 -1
  95. package/dist/commands/lookup-doc.js +3 -3
  96. package/dist/commands/pipeline-action.d.ts +6 -0
  97. package/dist/commands/pipeline-action.js +2 -0
  98. package/dist/commands/remote-pipeline.js +1 -0
  99. package/dist/composition-root.d.ts +59 -1
  100. package/dist/composition-root.js +95 -0
  101. package/dist/config/rubrics.ts +38 -2
  102. package/dist/grader/agent-harness.d.ts +14 -0
  103. package/dist/grader/agent-harness.js +17 -0
  104. package/dist/grader/common.d.ts +17 -0
  105. package/dist/grader/common.js +21 -0
  106. package/dist/grader/index.d.ts +38 -0
  107. package/dist/grader/index.js +75 -0
  108. package/dist/grader/knowledge-probe.d.ts +14 -0
  109. package/dist/grader/knowledge-probe.js +18 -0
  110. package/dist/grader/literacy.d.ts +13 -0
  111. package/dist/grader/literacy.js +17 -0
  112. package/dist/grader/mcp.d.ts +14 -0
  113. package/dist/grader/mcp.js +18 -0
  114. package/dist/orchestration/build-app-context.js +1 -0
  115. package/dist/orchestration/build-step-sequence.js +5 -0
  116. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  117. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  118. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  119. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  120. package/dist/orchestration/steps/index.d.ts +1 -0
  121. package/dist/orchestration/steps/index.js +1 -0
  122. package/dist/pipeline/attribution.d.ts +15 -0
  123. package/dist/pipeline/attribution.js +18 -9
  124. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  125. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  126. package/dist/pipeline/borderline-detector.d.ts +24 -0
  127. package/dist/pipeline/borderline-detector.js +26 -0
  128. package/dist/pipeline/calculate-scores.d.ts +114 -3
  129. package/dist/pipeline/calculate-scores.js +426 -24
  130. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  131. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  132. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  133. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  134. package/dist/pipeline/compute-attribution.d.ts +80 -0
  135. package/dist/pipeline/compute-attribution.js +196 -0
  136. package/dist/pipeline/failure-modes.d.ts +52 -17
  137. package/dist/pipeline/failure-modes.js +178 -117
  138. package/dist/pipeline/map-request-to-config.js +1 -0
  139. package/package.json +6 -4
@@ -0,0 +1,124 @@
1
+ /**
2
+ * pipeline/borderline-consensus-runner.ts
3
+ *
4
+ * GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
5
+ * of runGraderConsistency — re-grades ONLY judgments where
6
+ * `isBorderline(score, thresholds)` returns true. Non-borderline
7
+ * judgments pass through unchanged.
8
+ *
9
+ * Per D0005 (grader-model separation), replicates the SAME pinned
10
+ * grader N times (default 3, configurable via
11
+ * RepoConfig.execution.borderlineReplications); NOT the inter-grader
12
+ * ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
13
+ * source bug" callout pinned this to intra-grader replication only.
14
+ *
15
+ * The re-grade hook is supplied by the caller as a `regrade` callback.
16
+ * The composition root wires it to `gradeOnce` from grader-api.js with
17
+ * the response/rubric text drawn from the original Promptfoo result.
18
+ * The runner itself imports `gradeOnce` only to re-export it; it never
19
+ * calls it, so unit tests can inject a `regrade` spy without re-wiring.
20
+ *
21
+ * @see docs/decisions/D0005-grader-model-separation.md
22
+ * @see ./borderline-detector.ts — pure predicate
23
+ * @see ./grader-consistency.ts — JudgmentConsistency shape we emit
24
+ */
25
+ import { isBorderline } from "./borderline-detector.js";
26
+ // Imported only so it can be re-exported for callers that wire `regrade`.
27
+ // The runner never invokes gradeOnce itself; it always calls `regrade`.
28
+ // Keeping the import on the public surface preserves the architectural
29
+ // rule that the runner's grader entry point lives in grader-api.js
30
+ // (Pitfall 6 — the inter-grader ensemble module is intentionally NOT
31
+ // reached for on this path).
32
+ import { gradeOnce } from "./grader-api.js";
33
+ import { analyzeJudgment, } from "./grader-consistency.js";
34
+ /**
35
+ * Re-export `gradeOnce` so callers that need to wire the default regrader
36
+ * (composition-root, integration tests) can import the grader entry point
37
+ * from this module rather than rediscovering grader-api.js. The runner
38
+ * itself does not invoke `gradeOnce` — the caller-supplied `regrade`
39
+ * callback owns the live grader call (Pitfall 6 — runner stays pure wrt
40
+ * provider config).
41
+ */
42
+ export { gradeOnce };
43
/**
 * Build the lookup key under which a judgment's consistency record is
 * stored: `<taskId>::<dimension>::<modelId>`.
 */
function consistencyKey(j) {
    const { taskId, dimension, modelId } = j;
    return `${taskId}::${dimension}::${modelId}`;
}
47
/**
 * Run intra-grader consensus on the borderline subset of `judgments`.
 *
 * - Borderline (per `isBorderline(score, thresholds)`): re-grade
 *   `replications` times via `regrade`; emit a `JudgmentConsistency`
 *   keyed by `${taskId}::${dimension}::${modelId}`; merge the consensus
 *   median (original score + surviving replicas) back into the judgment's
 *   `score`.
 * - Non-borderline: pass through unchanged. Output array length == input.
 *
 * The function is order-preserving — the returned `judgments` array
 * keeps the same element order as the input. When no judgment is
 * borderline the input array itself is returned.
 *
 * Borderline status is decided per judgment, not per key: two judgments
 * may share the same `taskId::dimension::modelId` key, and a
 * non-borderline one must not be re-graded just because a sibling with
 * the same key is borderline. If several borderline judgments share a
 * key, the last one processed wins in `consistencyByJudgment`.
 *
 * @param options - `judgments` to inspect, optional `logger`, the
 *   caller-supplied `regrade` callback, the `replications` count and the
 *   `thresholds` handed to `isBorderline`.
 * @returns `consistencyByJudgment` (one record per re-graded key) and
 *   the `judgments` array with consensus scores merged in.
 */
export async function runBorderlineConsensus(options) {
    const { judgments, logger, regrade, replications, thresholds } = options;
    const consistencyByJudgment = new Map();
    // One predicate evaluation per judgment; bypass entirely if none match.
    const borderlineFlags = judgments.map((j) => isBorderline(j.score, thresholds));
    if (!borderlineFlags.includes(true)) {
        return { consistencyByJudgment, judgments };
    }
    const out = [];
    for (let index = 0; index < judgments.length; index++) {
        const j = judgments[index];
        if (!borderlineFlags[index]) {
            out.push(j); // non-borderline — single replica
            continue;
        }
        const key = consistencyKey(j);
        // Re-grade `replications` times via the same pinned grader. The
        // replicas are network-bound, so they run concurrently;
        // `Promise.allSettled` keeps per-replica failure isolation
        // (failures log + drop, surviving replicas still contribute).
        const scores = [j.score];
        const settled = await Promise.allSettled(Array.from({ length: replications }, () => regrade(j)));
        settled.forEach((outcome, i) => {
            if (outcome.status === "rejected") {
                const err = outcome.reason;
                logger?.warn(`Borderline replication ${i + 1}/${replications} failed for ${key}: ` +
                    (err instanceof Error ? err.message : String(err)));
                return;
            }
            const value = outcome.value;
            // A replica that resolves without a usable number (e.g. `null`
            // from a grader that could not parse its output) is treated
            // like a failed replica; otherwise it would be coerced to 0 or
            // NaN and corrupt the consensus median.
            if (typeof value !== "number" || !Number.isFinite(value)) {
                logger?.warn(`Borderline replication ${i + 1}/${replications} returned no usable score for ${key}: ` +
                    String(value));
                return;
            }
            scores.push(value);
        });
        const grading = {
            area: "",
            dimension: j.dimension,
            ...(j.modelId ? { providerId: j.modelId } : {}),
            scores,
            taskId: j.taskId,
        };
        consistencyByJudgment.set(key, analyzeJudgment(grading));
        // Merge consensus (median across replicas) into the canonical judgment.
        out.push({ ...j, score: median(scores) });
    }
    return { consistencyByJudgment, judgments: out };
}
111
/**
 * Compute the median of an array of numbers. The runner uses median
 * (not mean) so a single outlier replica doesn't drag the consensus
 * score across a severity threshold.
 *
 * Entries that are not finite numbers (`NaN`, `±Infinity`, `null`,
 * `undefined`) are ignored: a `NaN` makes the sort comparator return
 * `NaN`, which leaves the order unspecified, and `null` would be
 * coerced to 0 and silently pull the result down.
 *
 * @param values - Scores to summarise; the input array is not mutated.
 * @returns The median of the finite entries, or 0 when there are none.
 */
function median(values) {
    const sorted = values
        .filter((v) => typeof v === "number" && Number.isFinite(v))
        .sort((a, b) => a - b);
    if (sorted.length === 0)
        return 0;
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 0
        ? (sorted[mid - 1] + sorted[mid]) / 2
        : sorted[mid];
}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * pipeline/borderline-detector.ts
3
+ *
4
+ * GRAD-04 borderline-band predicate. Pure computation; no I/O.
5
+ *
6
+ * A judgment is "borderline" when its score lies within ±5 of any of
7
+ * the three rubric thresholds (severity boundaries 30 / 50 / 60 from
8
+ * packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
9
+ * info edges).
10
+ *
11
+ * Per D0005 (grader-model separation), borderline judgments trigger
12
+ * intra-grader consensus replication of the SAME pinned grader rather
13
+ * than inter-grader ensemble — preserving D0005's reproducibility
14
+ * posture.
15
+ *
16
+ * @see docs/decisions/D0005-grader-model-separation.md
17
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
18
+ */
19
+ export declare const BORDERLINE_BAND = 5; // Half-width, in score points, of the band around each threshold.
20
+ /**
21
+ * Returns true when `score` lies within ±BORDERLINE_BAND of any
22
+ * configured threshold. Pure function — safe to call N×.
+ *
+ * @param score - Judgment score to test.
+ * @param thresholds - Threshold values to compare against. The band is
+ * inclusive at both edges, and an empty list never matches.
+ * @returns `true` if at least one threshold is within the band.
23
+ */
24
+ export declare function isBorderline(score: number, thresholds: readonly number[]): boolean;
@@ -0,0 +1,26 @@
1
+ /**
2
+ * pipeline/borderline-detector.ts
3
+ *
4
+ * GRAD-04 borderline-band predicate. Pure computation; no I/O.
5
+ *
6
+ * A judgment is "borderline" when its score lies within ±5 of any of
7
+ * the three rubric thresholds (severity boundaries 30 / 50 / 60 from
8
+ * packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
9
+ * info edges).
10
+ *
11
+ * Per D0005 (grader-model separation), borderline judgments trigger
12
+ * intra-grader consensus replication of the SAME pinned grader rather
13
+ * than inter-grader ensemble — preserving D0005's reproducibility
14
+ * posture.
15
+ *
16
+ * @see docs/decisions/D0005-grader-model-separation.md
17
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
18
+ */
19
/** Half-width, in score points, of the band around each threshold. */
export const BORDERLINE_BAND = 5;
/**
 * Report whether `score` sits inside the inclusive ±BORDERLINE_BAND
 * window of at least one entry in `thresholds`. Pure — no I/O, no state.
 */
export function isBorderline(score, thresholds) {
    for (const threshold of thresholds) {
        const distance = Math.abs(score - threshold);
        if (distance <= BORDERLINE_BAND) {
            return true;
        }
    }
    return false;
}
@@ -1,4 +1,5 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type GraderReliability, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
+ import type { JudgmentConsistency } from "./grader-consistency.js";
2
3
  import { type ResolvedSourceConfig } from "../sources.js";
3
4
  import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
5
  import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
@@ -108,8 +109,84 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
108
109
  * assertion produces one GraderJudgment entry.
109
110
  *
110
111
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
112
+ *
113
+ * @param resultsPath - Path to the Promptfoo results JSON file.
114
+ * @param telemetry - Optional reliability counter (Plan 03-03). When passed,
115
+ * `parseFailures` is incremented on every strict-schema rejection so the
116
+ * live pipeline can surface schema drift over time.
117
+ * `runId` (when supplied) is threaded into synthesized fall-back judgment
118
+ * ids so dedup keys are unique per-run.
119
+ */
120
+ export declare function extractGraderJudgments(resultsPath: string, telemetry?: {
121
+ reliability: GraderReliability;
122
+ runId?: string;
123
+ }): GraderJudgment[];
124
+ /**
125
+ * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
126
+ * triple and increment `GraderReliability.failureModeCalibration` whenever
127
+ * the grader's emitted `failureMode` disagrees with the
128
+ * ceiling-decomposition mode.
129
+ *
130
+ * Plan 03-03 — the grader's emitted `failureMode` is the source of truth
131
+ * for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
132
+ * stamps confidence based on whether the structural ceiling signal agrees
133
+ * and surfaces calibration drift as a counter on `GraderReliability`.
134
+ *
135
+ * The function mutates `judgments` in place — it overlays
136
+ * `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
137
+ * already carries a confidence from the strict-schema parse (Plan 03-01),
138
+ * the ceiling-cross-check stamp REPLACES it because the validator's
139
+ * derivation tag is the live-pipeline contract; the parsed-shape
140
+ * confidence (if emitted by the grader) is preserved on the original
141
+ * `parsedJudgment` upstream of this site.
142
+ *
143
+ * `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
144
+ * does not have access to `task.contextDocs ∪ run.documentManifest` at this
145
+ * site (the union travels through a separate path in
146
+ * `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
147
+ * couple the doc-union population at the strict-schema flip site so the
148
+ * field is populated alongside the required-flip.
149
+ *
150
+ * @param judgments - Grader judgments produced by `extractGraderJudgments`.
151
+ * @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
152
+ * come from this lookup. Missing areas default to ceiling 100, floor 0
153
+ * (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
154
+ * @param reliability - `GraderReliability` sink whose
155
+ * `failureModeCalibration` counter is incremented on disagreement.
156
+ *
157
+ * @see docs/decisions/D0005-grader-model-separation.md
158
+ * @see docs/decisions/D0049-shared-confidence-contract.md
111
159
  */
112
- export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
160
+ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudgment[], scores: FeatureScore[], reliability: GraderReliability): void;
161
+ /**
162
+ * Populate Pitfall #11 hallucination cross-check fields on grader
163
+ * judgments (Plan 03-04 GRAD-05).
164
+ *
165
+ * For each judgment, sets `hallucinationCheckedAgainst` to the union of
166
+ * (a) the slugs of docs the task declared in `context.docs` and (b) the
167
+ * run's full document manifest. For each entry in `judgment.docCitations`
168
+ * that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
169
+ * that does not appear in either set is a fabrication, not a real
170
+ * resolvable doc.
171
+ *
172
+ * Mutates `judgments` in place. Slug-less citations are left with
173
+ * `hallucinated` undefined since the lookup key is the slug per the
174
+ * GraderJudgment domain doc-comment ("slug does not resolve against the
175
+ * task's contextDocs set").
176
+ *
177
+ * Called from `gap-analysis-step` (literacy mode only) after
178
+ * `descToDocRefs` and `documentManifest` are built — both inputs are
179
+ * unavailable at extract time. Non-literacy modes skip this step
180
+ * entirely (the runtime contract has no canonical doc set to check
181
+ * against).
182
+ *
183
+ * @param judgments - Grader judgments to enrich. Mutated in place.
184
+ * @param taskDocSlugs - Map from base task description (the form judgment
185
+ * `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
186
+ * to the slugs declared in that task's `context.docs`.
187
+ * @param manifestSlugs - All slugs in the run's document manifest.
188
+ */
189
+ export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
113
190
  /**
114
191
  * Extract per-test results with model output from evaluation results.
115
192
  *
@@ -165,9 +242,43 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
165
242
  */
166
243
  export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
167
244
  /** Options for the calculate-scores main() function. */
245
+ /**
246
+ * Pre-built runner closure for the GRAD-04 borderline-consensus pass.
247
+ *
248
+ * The composition root produces one of these via
249
+ * `createBorderlineConsensusRunner` so the threshold + replication
250
+ * defaults stay co-located with the rest of the pipeline wiring.
251
+ * `calculateAndWriteScores` invokes it after `extractGraderJudgments`
252
+ * and before persisting `grader-judgments.json` so the judgments file
253
+ * carries the consensus-merged scores rather than the original
254
+ * single-replica grader output (CR-01).
255
+ */
256
+ export type BorderlineConsensusRunner = (args: {
257
+ judgments: GraderJudgment[];
258
+ logger?: Logger;
259
+ regrade: (judgment: GraderJudgment) => Promise<number>;
260
+ }) => Promise<{
261
+ consistencyByJudgment: Map<string, JudgmentConsistency>;
262
+ judgments: GraderJudgment[];
263
+ }>;
168
264
  export interface CalculateScoresOptions {
169
265
  /** Allowed origins for source isolation reporting */
170
266
  allowedOrigins?: string[];
267
+ /**
268
+ * Pre-built borderline-consensus runner (CR-01). When provided and
269
+ * `borderlineReplications` is non-zero, it runs after extraction and
270
+ * persists `borderline-consistency.json` alongside
271
+ * `grader-judgments.json`. When omitted, the pipeline keeps the
272
+ * single-replica scores — preserving Phase 2 behavior for callers
273
+ * that haven't opted in.
274
+ */
275
+ borderlineConsensusRunner?: BorderlineConsensusRunner;
276
+ /**
277
+ * Optional regrade entry point used by the borderline runner. Wired
278
+ * in by the orchestration step from grader-api.ts; when absent, the
279
+ * runner is skipped because there's nothing to call.
280
+ */
281
+ borderlineRegradeOnce?: (responseText: string, rubricText: string) => Promise<null | number>;
171
282
  /** Logger instance (defaults to ConsoleLogger if not provided) */
172
283
  logger?: Logger;
173
284
  /** Evaluation mode (controls which result files are read) */
@@ -209,4 +320,4 @@ export interface CalculateScoresResult {
209
320
  /** Summary of test execution outcomes (total, passed, failed, errored). */
210
321
  testSummary?: TestSummary;
211
322
  }
212
- export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;
323
+ export declare function calculateAndWriteScores(options: CalculateScoresOptions): Promise<CalculateScoresResult>;