@sanity/ailf 7.2.1 → 7.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/orchestration/required-eval-runs.d.ts +35 -0
- package/dist/orchestration/required-eval-runs.js +41 -0
- package/dist/orchestration/steps/calculate-scores-step.js +15 -22
- package/dist/orchestration/steps/compute-attribution-step.js +6 -3
- package/dist/orchestration/steps/gap-analysis-step.js +8 -4
- package/dist/pipeline/assert-grader-judgments-persisted.d.ts +35 -0
- package/dist/pipeline/assert-grader-judgments-persisted.js +58 -0
- package/dist/pipeline/calculate-scores.js +18 -0
- package/package.json +1 -1
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Which eval sub-runs a pipeline configuration requires, and whether every one
|
|
3
|
+
* of them was satisfied by a remote-cache hit.
|
|
4
|
+
*
|
|
5
|
+
* The post-scoring enrichment steps (gap-analysis, compute-attribution) may
|
|
6
|
+
* skip benignly when grader judgments are absent because ALL required runs came
|
|
7
|
+
* from the remote cache — a cache hit restores `score-summary.json` but never
|
|
8
|
+
* writes `grader-judgments.json`. They must NOT skip when at least one required
|
|
9
|
+
* run was evaluated fresh this pipeline: a fresh run that scored tests yet
|
|
10
|
+
* persisted no judgments is a degraded outcome that has to fail loud.
|
|
11
|
+
*
|
|
12
|
+
* Mirrors the required-run derivation in `calculate-scores-step` so the
|
|
13
|
+
* "all required runs cached" judgement is defined in exactly one place.
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Minimal slice of a pipeline configuration that is needed to work out which
 * eval sub-runs are required. Accepted as `config` by `requiredEvalRuns` and
 * `allRequiredEvalRunsCached`.
 */
interface EvalRunSelector {
|
|
16
|
+
/** Pipeline mode. `"literacy"` selects runs by `variant`; any other mode is itself the single required run. */
mode: string;
|
|
17
|
+
/** Literacy variant. `null`/`undefined` falls back to `LiteracyVariant.STANDARD`; not consulted for non-literacy modes. */
variant?: string | null;
|
|
18
|
+
}
|
|
19
|
+
/**
 * Minimal slice of pipeline state consulted by `allRequiredEvalRunsCached`.
 */
interface RemoteCacheState {
|
|
20
|
+
/**
 * Keys of the eval sub-runs that were satisfied by a remote-cache hit (the
 * strings `RunEvalStep` records). When absent or empty, no required run is
 * considered cached.
 */
remoteCacheHits?: ReadonlySet<string>;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* The eval sub-runs a configuration requires, keyed by the same strings
|
|
24
|
+
* `RunEvalStep` records in `state.remoteCacheHits` (`"baseline"`, `"agentic"`,
|
|
25
|
+
* or the bare mode name for non-literacy modes).
|
|
26
|
+
*/
|
|
27
|
+
export declare function requiredEvalRuns(config: EvalRunSelector): string[];
|
|
28
|
+
/**
|
|
29
|
+
* True only when every eval sub-run the configuration requires was satisfied by
|
|
30
|
+
* a remote-cache hit. A cache hit on a subset of required runs (e.g. agentic
|
|
31
|
+
* cached, baseline fresh) returns false — the fresh run's outputs are still the
|
|
32
|
+
* pipeline's responsibility.
|
|
33
|
+
*/
|
|
34
|
+
export declare function allRequiredEvalRunsCached(config: EvalRunSelector, state: RemoteCacheState | undefined): boolean;
|
|
35
|
+
export {};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Which eval sub-runs a pipeline configuration requires, and whether every one
|
|
3
|
+
* of them was satisfied by a remote-cache hit.
|
|
4
|
+
*
|
|
5
|
+
* The post-scoring enrichment steps (gap-analysis, compute-attribution) may
|
|
6
|
+
* skip benignly when grader judgments are absent because ALL required runs came
|
|
7
|
+
* from the remote cache — a cache hit restores `score-summary.json` but never
|
|
8
|
+
* writes `grader-judgments.json`. They must NOT skip when at least one required
|
|
9
|
+
* run was evaluated fresh this pipeline: a fresh run that scored tests yet
|
|
10
|
+
* persisted no judgments is a degraded outcome that has to fail loud.
|
|
11
|
+
*
|
|
12
|
+
* Mirrors the required-run derivation in `calculate-scores-step` so the
|
|
13
|
+
* "all required runs cached" judgement is defined in exactly one place.
|
|
14
|
+
*/
|
|
15
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
16
|
+
/**
 * List the eval sub-runs that `config` requires.
 *
 * The returned strings use the same keys that `RunEvalStep` stores in
 * `state.remoteCacheHits`, so callers can test membership directly.
 *
 * @param config Object exposing `mode` and, for literacy, an optional `variant`.
 * @returns A non-empty array of run keys:
 *   - non-literacy mode: the mode name on its own;
 *   - literacy with the FULL variant: the STANDARD and AGENTIC runs, in that order;
 *   - literacy with any other variant: that variant alone (STANDARD when the
 *     variant is `null` or `undefined`).
 */
export function requiredEvalRuns(config) {
    // Anything that is not literacy runs exactly one eval, keyed by the mode.
    if (config.mode !== "literacy") {
        return [config.mode];
    }
    const selectedVariant = config.variant ?? LiteracyVariant.STANDARD;
    // FULL is a composite: it expands to both concrete literacy runs.
    if (selectedVariant === LiteracyVariant.FULL) {
        return [LiteracyVariant.STANDARD, LiteracyVariant.AGENTIC];
    }
    return [selectedVariant];
}
|
|
30
|
+
/**
|
|
31
|
+
* True only when every eval sub-run the configuration requires was satisfied by
|
|
32
|
+
* a remote-cache hit. A cache hit on a subset of required runs (e.g. agentic
|
|
33
|
+
* cached, baseline fresh) returns false — the fresh run's outputs are still the
|
|
34
|
+
* pipeline's responsibility.
|
|
35
|
+
*/
|
|
36
|
+
export function allRequiredEvalRunsCached(config, state) {
|
|
37
|
+
const hits = state?.remoteCacheHits;
|
|
38
|
+
if (!hits || hits.size === 0)
|
|
39
|
+
return false;
|
|
40
|
+
return requiredEvalRuns(config).every((run) => hits.has(run));
|
|
41
|
+
}
|
|
@@ -11,6 +11,7 @@ import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
|
11
11
|
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
12
12
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
13
13
|
import { buildCacheContext } from "../cache-context.js";
|
|
14
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
14
15
|
import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
|
|
15
16
|
import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
|
|
16
17
|
import { resultsFileForMode } from "../../pipeline/eval-constants.js";
|
|
@@ -27,30 +28,22 @@ export class CalculateScoresStep {
|
|
|
27
28
|
}
|
|
28
29
|
async execute(ctx, state) {
|
|
29
30
|
const start = Date.now();
|
|
30
|
-
// When all required eval
|
|
31
|
+
// When all required eval runs were satisfied by remote cache hits,
|
|
31
32
|
// score-summary.json was already restored from the cached report.
|
|
32
|
-
// Skip re-calculation — the raw eval-results files don't exist.
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
45
|
-
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
46
|
-
if (summaryErrors.length === 0) {
|
|
47
|
-
return {
|
|
48
|
-
reason: "Remote cache hit — score-summary.json restored from cached report",
|
|
49
|
-
status: "skipped",
|
|
50
|
-
};
|
|
51
|
-
}
|
|
52
|
-
// If the summary is invalid, fall through to normal calculation
|
|
33
|
+
// Skip re-calculation — the raw eval-results files don't exist. A partial
|
|
34
|
+
// cache hit (only some required runs cached) falls through to normal
|
|
35
|
+
// calculation: the freshly-run sub-evals produced raw results to score.
|
|
36
|
+
if (allRequiredEvalRunsCached(ctx.config, state)) {
|
|
37
|
+
// Verify the restored score-summary.json is valid
|
|
38
|
+
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
39
|
+
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
40
|
+
if (summaryErrors.length === 0) {
|
|
41
|
+
return {
|
|
42
|
+
reason: "Remote cache hit — score-summary.json restored from cached report",
|
|
43
|
+
status: "skipped",
|
|
44
|
+
};
|
|
53
45
|
}
|
|
46
|
+
// If the summary is invalid, fall through to normal calculation
|
|
54
47
|
}
|
|
55
48
|
// Primary results file to score.
|
|
56
49
|
// For literacy: "full" variant uses baseline as primary; others use variant directly.
|
|
@@ -41,6 +41,7 @@ import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
|
41
41
|
import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
|
|
42
42
|
import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
|
|
43
43
|
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
44
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
44
45
|
// ---------------------------------------------------------------------------
|
|
45
46
|
// Step implementation
|
|
46
47
|
// ---------------------------------------------------------------------------
|
|
@@ -89,10 +90,12 @@ export class ComputeAttributionStep {
|
|
|
89
90
|
// grader judgments is a degraded run, not a benign skip. Fail loud so the
|
|
90
91
|
// outcome surfaces in pipeline-result and on the job document. A remote
|
|
91
92
|
// cache hit restores score-summary.json without grader-judgments.json, so
|
|
92
|
-
//
|
|
93
|
-
|
|
93
|
+
// missing judgments are legitimate ONLY when every required sub-eval came
|
|
94
|
+
// from the cache — a hybrid full run with a freshly-evaluated sub-eval that
|
|
95
|
+
// persisted no judgments is still degraded.
|
|
96
|
+
const allCached = allRequiredEvalRunsCached(ctx.config, state);
|
|
94
97
|
const inputs = classifyEnrichmentInputs(root);
|
|
95
|
-
if (inputs.kind === "judgments-missing-after-eval" && !
|
|
98
|
+
if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
|
|
96
99
|
return {
|
|
97
100
|
durationMs: Date.now() - start,
|
|
98
101
|
status: "failed",
|
|
@@ -19,6 +19,7 @@ import { join, resolve } from "path";
|
|
|
19
19
|
import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
21
21
|
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
22
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
22
23
|
export class GapAnalysisStep {
|
|
23
24
|
name = "gap-analysis";
|
|
24
25
|
optional = true;
|
|
@@ -46,11 +47,14 @@ export class GapAnalysisStep {
|
|
|
46
47
|
// reports publish with a score but no test details.
|
|
47
48
|
//
|
|
48
49
|
// A remote cache hit restores score-summary.json (with testCount) from a
|
|
49
|
-
// prior report but never writes grader-judgments.json, so judgments
|
|
50
|
-
//
|
|
51
|
-
|
|
50
|
+
// prior report but never writes grader-judgments.json, so absent judgments
|
|
51
|
+
// are legitimate ONLY when every required sub-eval came from the cache. In a
|
|
52
|
+
// hybrid full run (e.g. agentic cached, baseline evaluated fresh) the fresh
|
|
53
|
+
// run's missing judgments are still degraded — gate the skip on ALL required
|
|
54
|
+
// runs being cached, not merely any.
|
|
55
|
+
const allCached = allRequiredEvalRunsCached(ctx.config, state);
|
|
52
56
|
const inputs = classifyEnrichmentInputs(root);
|
|
53
|
-
if (inputs.kind === "judgments-missing-after-eval" && !
|
|
57
|
+
if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
|
|
54
58
|
return {
|
|
55
59
|
durationMs: Date.now() - start,
|
|
56
60
|
status: "failed",
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/assert-grader-judgments-persisted.ts
|
|
3
|
+
*
|
|
4
|
+
* Post-persist guard for the grader-judgments write junction in
|
|
5
|
+
* `calculateAndWriteScores`.
|
|
6
|
+
*
|
|
7
|
+
* `extractGraderJudgmentsResilient` returns N judgments in memory, after which
|
|
8
|
+
* `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
|
|
9
|
+
* guard decides whether `grader-judgments.json` is written. A transient read
|
|
10
|
+
* anomaly or an unexpected in-place emptying can leave the file absent or empty
|
|
11
|
+
* even though extraction yielded judgments. Silently skipping the write strands
|
|
12
|
+
* gap-analysis and ships a scored report with no test details.
|
|
13
|
+
*
|
|
14
|
+
* This guard re-reads the file from disk — the same read gap-analysis performs
|
|
15
|
+
* — and fails loud when a non-empty extraction did not round-trip. The check is
|
|
16
|
+
* deliberately narrow: it fires only on the catastrophic "extracted N>0,
|
|
17
|
+
* persisted 0" divergence, never on a genuinely judgment-free run.
|
|
18
|
+
*/
|
|
19
|
+
/** Injectable seam — counts the grader judgments actually on disk. */
|
|
20
|
+
export interface PersistVerificationDeps {
|
|
21
|
+
/**
 * Returns how many grader judgments are persisted at `path`. The default
 * reader reports 0 for a missing, unreadable, invalid-JSON, or non-array file.
 */
countPersisted: (path: string) => number;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Fail loud when a non-empty grader-judgment extraction did not round-trip to
|
|
25
|
+
* disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
|
|
26
|
+
* / no llm-rubric) is valid and persists nothing by design.
|
|
27
|
+
*
|
|
28
|
+
* @param extractedCount Judgments returned by extraction, captured BEFORE any
|
|
29
|
+
* in-place mutation (e.g. the borderline-consensus pass) so the count
|
|
30
|
+
* reflects what extraction actually produced.
|
|
31
|
+
* @param judgmentsPath Absolute path to `grader-judgments.json`.
|
|
32
|
+
* @param deps Injectable disk reader; defaults to the real filesystem.
|
|
33
|
+
* @throws {Error} when `extractedCount > 0` but the persisted file holds 0.
|
|
34
|
+
*/
|
|
35
|
+
export declare function assertGraderJudgmentsPersisted(extractedCount: number, judgmentsPath: string, deps?: PersistVerificationDeps): void;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/assert-grader-judgments-persisted.ts
|
|
3
|
+
*
|
|
4
|
+
* Post-persist guard for the grader-judgments write junction in
|
|
5
|
+
* `calculateAndWriteScores`.
|
|
6
|
+
*
|
|
7
|
+
* `extractGraderJudgmentsResilient` returns N judgments in memory, after which
|
|
8
|
+
* `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
|
|
9
|
+
* guard decides whether `grader-judgments.json` is written. A transient read
|
|
10
|
+
* anomaly or an unexpected in-place emptying can leave the file absent or empty
|
|
11
|
+
* even though extraction yielded judgments. Silently skipping the write strands
|
|
12
|
+
* gap-analysis and ships a scored report with no test details.
|
|
13
|
+
*
|
|
14
|
+
* This guard re-reads the file from disk — the same read gap-analysis performs
|
|
15
|
+
* — and fails loud when a non-empty extraction did not round-trip. The check is
|
|
16
|
+
* deliberately narrow: it fires only on the catastrophic "extracted N>0,
|
|
17
|
+
* persisted 0" divergence, never on a genuinely judgment-free run.
|
|
18
|
+
*/
|
|
19
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
20
|
+
/**
 * Count the grader judgments stored in the JSON file at `path`.
 *
 * Any state that yields no usable judgments — file missing, read failure,
 * malformed JSON, or a top-level value that is not an array — is reported as
 * 0, mirroring how the downstream enrichment precondition reads the same file.
 *
 * @param path Location of `grader-judgments.json`.
 * @returns Length of the persisted array, or 0 when there is none.
 */
function defaultCountPersisted(path) {
    let persistedCount = 0;
    if (existsSync(path)) {
        try {
            const rawText = readFileSync(path, "utf-8");
            const decoded = JSON.parse(rawText);
            if (Array.isArray(decoded)) {
                persistedCount = decoded.length;
            }
        }
        catch {
            // Unreadable or invalid content counts as "nothing persisted".
            persistedCount = 0;
        }
    }
    return persistedCount;
}
|
|
36
|
+
/**
|
|
37
|
+
* Fail loud when a non-empty grader-judgment extraction did not round-trip to
|
|
38
|
+
* disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
|
|
39
|
+
* / no llm-rubric) is valid and persists nothing by design.
|
|
40
|
+
*
|
|
41
|
+
* @param extractedCount Judgments returned by extraction, captured BEFORE any
|
|
42
|
+
* in-place mutation (e.g. the borderline-consensus pass) so the count
|
|
43
|
+
* reflects what extraction actually produced.
|
|
44
|
+
* @param judgmentsPath Absolute path to `grader-judgments.json`.
|
|
45
|
+
* @param deps Injectable disk reader; defaults to the real filesystem.
|
|
46
|
+
* @throws {Error} when `extractedCount > 0` but the persisted file holds 0.
|
|
47
|
+
*/
|
|
48
|
+
export function assertGraderJudgmentsPersisted(extractedCount, judgmentsPath, deps = { countPersisted: defaultCountPersisted }) {
|
|
49
|
+
if (extractedCount <= 0)
|
|
50
|
+
return;
|
|
51
|
+
const persisted = deps.countPersisted(judgmentsPath);
|
|
52
|
+
if (persisted <= 0) {
|
|
53
|
+
throw new Error(`Grader judgments extract/persist divergence: extracted ${extractedCount} ` +
|
|
54
|
+
`judgment(s) but grader-judgments.json persisted 0. Refusing to finish ` +
|
|
55
|
+
`scoring — a scored report with no grader judgments would strand ` +
|
|
56
|
+
`gap-analysis and ship with no test details.`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -42,6 +42,7 @@ import { loadSource } from "../sources.js";
|
|
|
42
42
|
import { LiteracyVariant } from "./normalize-mode.js";
|
|
43
43
|
import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
|
|
44
44
|
import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
|
|
45
|
+
import { assertGraderJudgmentsPersisted } from "./assert-grader-judgments-persisted.js";
|
|
45
46
|
// Re-export from core for backward compatibility.
|
|
46
47
|
// Existing imports from this file continue to work unchanged.
|
|
47
48
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -1544,6 +1545,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1544
1545
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1545
1546
|
// Extract and persist grader judgments
|
|
1546
1547
|
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1548
|
+
const extractedJudgmentCount = judgments.length;
|
|
1547
1549
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1548
1550
|
baselineResultsPath,
|
|
1549
1551
|
]);
|
|
@@ -1555,6 +1557,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1555
1557
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1556
1558
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1557
1559
|
}
|
|
1560
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a
|
|
1561
|
+
// transient divergence at the persist junction): otherwise gap-analysis
|
|
1562
|
+
// skips and the report ships a score with no test details.
|
|
1563
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1558
1564
|
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1559
1565
|
// Agent-harness produces a single profile shared across detected variants
|
|
1560
1566
|
// (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
|
|
@@ -1607,6 +1613,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1607
1613
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1608
1614
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1609
1615
|
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1616
|
+
const extractedJudgmentCount = judgments.length;
|
|
1610
1617
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1611
1618
|
baselineResultsPath,
|
|
1612
1619
|
]);
|
|
@@ -1618,6 +1625,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1618
1625
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1619
1626
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1620
1627
|
}
|
|
1628
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a
|
|
1629
|
+
// transient divergence at the persist junction): otherwise gap-analysis
|
|
1630
|
+
// skips and the report ships a score with no test details.
|
|
1631
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1621
1632
|
// Knowledge-probe deletes vars.docs in the compiler, so every entry's
|
|
1622
1633
|
// detected variant is "baseline" — supply the probe profile under both
|
|
1623
1634
|
// keys so the composite is populated regardless of detection.
|
|
@@ -1744,6 +1755,9 @@ export async function calculateAndWriteScores(options) {
|
|
|
1744
1755
|
? [baselineResultsPath, agenticResultsPath]
|
|
1745
1756
|
: [baselineResultsPath];
|
|
1746
1757
|
const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
|
|
1758
|
+
// Capture the extracted count before the borderline pass mutates the array
|
|
1759
|
+
// in place — the persist guard below compares it against what lands on disk.
|
|
1760
|
+
const extractedJudgmentCount = judgments.length;
|
|
1747
1761
|
// Borderline-consensus pass — re-grade the ±5 borderline subset N times
|
|
1748
1762
|
// and merge medians back into the canonical judgments BEFORE
|
|
1749
1763
|
// `validateGraderJudgmentsCalibration` runs, so the calibration counter
|
|
@@ -1774,6 +1788,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1774
1788
|
});
|
|
1775
1789
|
}
|
|
1776
1790
|
}
|
|
1791
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a transient
|
|
1792
|
+
// divergence at the persist junction): otherwise gap-analysis skips and the
|
|
1793
|
+
// report ships a score with no test details.
|
|
1794
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1777
1795
|
// Extract and persist per-test results (D0029: model output + metadata).
|
|
1778
1796
|
// Literacy gold (with-docs) entries score against the default profile;
|
|
1779
1797
|
// baseline (without-docs) entries score against the output-only profile.
|