@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-detector.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-band predicate. Pure computation; no I/O.
|
|
5
|
+
*
|
|
6
|
+
* A judgment is "borderline" when its score lies within ±5 of any of
|
|
7
|
+
* the three rubric thresholds (severity boundaries 30 / 50 / 60 from
|
|
8
|
+
* packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
|
|
9
|
+
* info edges).
|
|
10
|
+
*
|
|
11
|
+
* Per D0005 (grader-model separation), borderline judgments trigger
|
|
12
|
+
* intra-grader consensus replication of the SAME pinned grader rather
|
|
13
|
+
* than inter-grader ensemble — preserving D0005's reproducibility
|
|
14
|
+
* posture.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
18
|
+
*/
|
|
19
|
+
/** Default half-width of the borderline band, in score points. */
export const BORDERLINE_BAND = 5;
/**
 * Returns true when `score` lies within ±`band` (inclusive) of any
 * entry in `thresholds`. Pure function — no I/O, safe to call N×.
 *
 * @param score - Judgment score to test.
 * @param thresholds - Threshold values to compare against. An empty
 *   list yields `false`.
 * @param band - Half-width of the band. Defaults to `BORDERLINE_BAND`,
 *   so existing two-argument callers behave exactly as before.
 * @returns `true` if at least one threshold is within `band` of `score`.
 */
export function isBorderline(score, thresholds, band = BORDERLINE_BAND) {
    // A NaN score never satisfies `<= band`, so it is reported as not borderline.
    return thresholds.some((threshold) => Math.abs(score - threshold) <= band);
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type GraderReliability, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
|
+
import type { JudgmentConsistency } from "./grader-consistency.js";
|
|
2
3
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
4
|
import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
5
|
import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
|
|
@@ -108,8 +109,84 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
|
|
|
108
109
|
* assertion produces one GraderJudgment entry.
|
|
109
110
|
*
|
|
110
111
|
* Phase 3a prerequisite: structured judgment data for failure mode extraction.
|
|
112
|
+
*
|
|
113
|
+
* @param resultsPath - Path to the Promptfoo results JSON file.
|
|
114
|
+
* @param telemetry - Optional reliability counter (Plan 03-03). When passed,
|
|
115
|
+
* `parseFailures` is incremented on every strict-schema rejection so the
|
|
116
|
+
* live pipeline can surface schema drift over time.
|
|
117
|
+
* `runId` (when supplied) is threaded into synthesized fall-back judgment
|
|
118
|
+
* ids so dedup keys are unique per-run.
|
|
119
|
+
*/
|
|
120
|
+
export declare function extractGraderJudgments(
    resultsPath: string,
    telemetry?: {
        /** Reliability counter; `parseFailures` is incremented on every strict-schema rejection. */
        reliability: GraderReliability;
        /** When supplied, threaded into synthesized fall-back judgment ids so dedup keys are unique per run. */
        runId?: string;
    }
): GraderJudgment[];
|
|
124
|
+
/**
|
|
125
|
+
* Stamp every grader judgment with a D0049 ceiling-cross-check confidence
|
|
126
|
+
* triple and increment `GraderReliability.failureModeCalibration` whenever
|
|
127
|
+
* the grader's emitted `failureMode` disagrees with the
|
|
128
|
+
* ceiling-decomposition mode.
|
|
129
|
+
*
|
|
130
|
+
* Plan 03-03 — the grader's emitted `failureMode` is the source of truth
|
|
131
|
+
* for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
|
|
132
|
+
* stamps confidence based on whether the structural ceiling signal agrees
|
|
133
|
+
* and surfaces calibration drift as a counter on `GraderReliability`.
|
|
134
|
+
*
|
|
135
|
+
* The function mutates `judgments` in place — it overlays
|
|
136
|
+
* `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
|
|
137
|
+
* already carries a confidence from the strict-schema parse (Plan 03-01),
|
|
138
|
+
* the ceiling-cross-check stamp REPLACES it because the validator's
|
|
139
|
+
* derivation tag is the live-pipeline contract; the parsed-shape
|
|
140
|
+
* confidence (if emitted by the grader) is preserved on the original
|
|
141
|
+
* `parsedJudgment` upstream of this site.
|
|
142
|
+
*
|
|
143
|
+
* `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
|
|
144
|
+
* does not have access to `task.contextDocs ∪ run.documentManifest` at this
|
|
145
|
+
* site (the union travels through a separate path in
|
|
146
|
+
* `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
|
|
147
|
+
* couple the doc-union population at the strict-schema flip site so the
|
|
148
|
+
* field is populated alongside the required-flip.
|
|
149
|
+
*
|
|
150
|
+
* @param judgments - Grader judgments produced by `extractGraderJudgments`.
|
|
151
|
+
* @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
|
|
152
|
+
* come from this lookup. Missing areas default to ceiling 100, floor 0
|
|
153
|
+
* (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
|
|
154
|
+
* @param reliability - `GraderReliability` sink whose
|
|
155
|
+
* `failureModeCalibration` counter is incremented on disagreement.
|
|
156
|
+
*
|
|
157
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
158
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
111
159
|
*/
|
|
112
|
-
export declare function
|
|
160
|
+
export declare function validateGraderJudgmentsCalibration(
    judgments: GraderJudgment[],
    scores: FeatureScore[],
    reliability: GraderReliability
): void;
|
|
161
|
+
/**
|
|
162
|
+
* Populate Pitfall #11 hallucination cross-check fields on grader
|
|
163
|
+
* judgments (Plan 03-04 GRAD-05).
|
|
164
|
+
*
|
|
165
|
+
* For each judgment, sets `hallucinationCheckedAgainst` to the union of
|
|
166
|
+
* (a) the slugs of docs the task declared in `context.docs` and (b) the
|
|
167
|
+
* run's full document manifest. For each entry in `judgment.docCitations`
|
|
168
|
+
* that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
|
|
169
|
+
* that does not appear in either set is a fabrication, not a real
|
|
170
|
+
* resolvable doc.
|
|
171
|
+
*
|
|
172
|
+
* Mutates `judgments` in place. Slug-less citations are left with
|
|
173
|
+
* `hallucinated` undefined since the lookup key is the slug per the
|
|
174
|
+
* GraderJudgment domain doc-comment ("slug does not resolve against the
|
|
175
|
+
* task's contextDocs set").
|
|
176
|
+
*
|
|
177
|
+
* Called from `gap-analysis-step` (literacy mode only) after
|
|
178
|
+
* `descToDocRefs` and `documentManifest` are built — both inputs are
|
|
179
|
+
* unavailable at extract time. Non-literacy modes skip this step
|
|
180
|
+
* entirely (the runtime contract has no canonical doc set to check
|
|
181
|
+
* against).
|
|
182
|
+
*
|
|
183
|
+
* @param judgments - Grader judgments to enrich. Mutated in place.
|
|
184
|
+
* @param taskDocSlugs - Map from base task description (the form judgment
|
|
185
|
+
* `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
|
|
186
|
+
* to the slugs declared in that task's `context.docs`.
|
|
187
|
+
* @param manifestSlugs - All slugs in the run's document manifest.
|
|
188
|
+
*/
|
|
189
|
+
export declare function populateHallucinationFields(
    judgments: GraderJudgment[],
    taskDocSlugs: Map<string, string[]>,
    manifestSlugs: Iterable<string>
): void;
|
|
113
190
|
/**
|
|
114
191
|
* Extract per-test results with model output from evaluation results.
|
|
115
192
|
*
|
|
@@ -165,9 +242,43 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
|
|
|
165
242
|
*/
|
|
166
243
|
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
|
|
167
244
|
/** Options for the calculate-scores main() function. */
|
|
245
|
+
/**
|
|
246
|
+
* Pre-built runner closure for the GRAD-04 borderline-consensus pass.
|
|
247
|
+
*
|
|
248
|
+
* The composition root produces one of these via
|
|
249
|
+
* `createBorderlineConsensusRunner` so the threshold + replication
|
|
250
|
+
* defaults stay co-located with the rest of the pipeline wiring.
|
|
251
|
+
* `calculateAndWriteScores` invokes it after `extractGraderJudgments`
|
|
252
|
+
* and before persisting `grader-judgments.json` so the judgments file
|
|
253
|
+
* carries the consensus-merged scores rather than the original
|
|
254
|
+
* single-replica grader output (CR-01).
|
|
255
|
+
*/
|
|
256
|
+
export type BorderlineConsensusRunner = (args: {
    /** Judgments the consensus pass runs over. */
    judgments: GraderJudgment[];
    /** Optional logger. */
    logger?: Logger;
    /** Re-grades a single judgment; resolves to a number (presumably the replica score — confirm against the runner). */
    regrade: (judgment: GraderJudgment) => Promise<number>;
}) => Promise<{
    /** Consistency record per judgment (presumably keyed by judgment id — confirm against the runner). */
    consistencyByJudgment: Map<string, JudgmentConsistency>;
    /** Judgments returned by the pass, carrying the consensus-merged scores. */
    judgments: GraderJudgment[];
}>;
|
|
168
264
|
export interface CalculateScoresOptions {
|
|
169
265
|
/** Allowed origins for source isolation reporting */
|
|
170
266
|
allowedOrigins?: string[];
|
|
267
|
+
/**
|
|
268
|
+
* Pre-built borderline-consensus runner (CR-01). When provided AND
|
|
269
|
+
* `borderlineReplications` is non-zero, it runs after extraction and
|
|
270
|
+
* persists `borderline-consistency.json` alongside
|
|
271
|
+
* `grader-judgments.json`. When omitted, the pipeline keeps the
|
|
272
|
+
* single-replica scores — preserving Phase 2 behavior for callers
|
|
273
|
+
* that haven't opted in.
|
|
274
|
+
*/
|
|
275
|
+
borderlineConsensusRunner?: BorderlineConsensusRunner;
|
|
276
|
+
/**
|
|
277
|
+
* Optional regrade entry point used by the borderline runner. Wired
|
|
278
|
+
* in by the orchestration step from grader-api.ts; when absent, the
|
|
279
|
+
* runner is skipped because there's nothing to call.
|
|
280
|
+
*/
|
|
281
|
+
borderlineRegradeOnce?: (responseText: string, rubricText: string) => Promise<null | number>;
|
|
171
282
|
/** Logger instance (defaults to ConsoleLogger if not provided) */
|
|
172
283
|
logger?: Logger;
|
|
173
284
|
/** Evaluation mode (controls which result files are read) */
|
|
@@ -209,4 +320,4 @@ export interface CalculateScoresResult {
|
|
|
209
320
|
/** Summary of test execution outcomes (total, passed, failed, errored). */
|
|
210
321
|
testSummary?: TestSummary;
|
|
211
322
|
}
|
|
212
|
-
export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult
|
|
323
|
+
/**
 * Asynchronous entry point: resolves to the `CalculateScoresResult` for
 * the given options. The return type is a `Promise`, so callers must
 * `await` it (or otherwise handle the promise) to obtain the result.
 *
 * @param options - Settings for the run; see `CalculateScoresOptions`.
 * @returns A promise resolving to the `CalculateScoresResult`.
 */
export declare function calculateAndWriteScores(options: CalculateScoresOptions): Promise<CalculateScoresResult>;
|