@sanity/ailf 7.1.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +4 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
- package/dist/_vendor/ailf-core/schemas/report.js +14 -0
- package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/user.js +23 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
- package/dist/_vendor/ailf-core/types/index.js +13 -0
- package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
- package/dist/_vendor/ailf-core/types/user.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
- package/dist/_vendor/ailf-shared/index.d.ts +1 -1
- package/dist/_vendor/ailf-shared/index.js +1 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +19 -6
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
- package/dist/adapters/task-sources/content-lake-task-source.js +12 -7
- package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
- package/dist/orchestration/steps/compute-attribution-step.js +17 -2
- package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
- package/dist/orchestration/steps/gap-analysis-step.js +29 -10
- package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
- package/dist/orchestration/steps/publish-report-step.js +63 -6
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +125 -22
- package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
- package/dist/pipeline/enrichment-preconditions.js +84 -0
- package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
- package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
|
@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
|
|
|
18
18
|
import { join, resolve } from "path";
|
|
19
19
|
import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
21
|
+
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
21
22
|
export class GapAnalysisStep {
|
|
22
23
|
name = "gap-analysis";
|
|
23
24
|
optional = true;
|
|
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
|
|
|
34
35
|
}
|
|
35
36
|
return [];
|
|
36
37
|
}
|
|
37
|
-
async execute(ctx) {
|
|
38
|
+
async execute(ctx, state) {
|
|
38
39
|
const root = ctx.config.rootDir;
|
|
39
40
|
const start = Date.now();
|
|
40
41
|
const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
|
|
41
42
|
const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
|
|
42
|
-
|
|
43
|
+
// Distinguish a legitimate skip (no graded eval ran this pipeline) from a
|
|
44
|
+
// degraded run where a full eval scored tests but no judgments persisted.
|
|
45
|
+
// The latter must fail loud — returning a benign `skipped` is what let
|
|
46
|
+
// reports publish with a score but no test details.
|
|
47
|
+
//
|
|
48
|
+
// A remote cache hit restores score-summary.json (with testCount) from a
|
|
49
|
+
// prior report but never writes grader-judgments.json, so judgments are
|
|
50
|
+
// legitimately absent — that is a benign skip, not a degraded full eval.
|
|
51
|
+
const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
|
|
52
|
+
const inputs = classifyEnrichmentInputs(root);
|
|
53
|
+
if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
|
|
54
|
+
return {
|
|
55
|
+
durationMs: Date.now() - start,
|
|
56
|
+
status: "failed",
|
|
57
|
+
error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
if (inputs.kind !== "ready") {
|
|
43
61
|
return {
|
|
44
62
|
status: "skipped",
|
|
45
63
|
reason: "No grader-judgments.json — run a full evaluation first",
|
|
@@ -82,14 +100,15 @@ export class GapAnalysisStep {
|
|
|
82
100
|
const resolveRefs = (slugs) => slugs
|
|
83
101
|
.map((slug) => {
|
|
84
102
|
const m = refBySlug.get(slug);
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
}
|
|
92
|
-
:
|
|
103
|
+
if (!m)
|
|
104
|
+
return { documentId: "", slug, title: slug };
|
|
105
|
+
return {
|
|
106
|
+
documentId: m._id,
|
|
107
|
+
revision: m._rev,
|
|
108
|
+
slug: m.slug,
|
|
109
|
+
...(m.path ? { path: m.path } : {}),
|
|
110
|
+
title: m.title,
|
|
111
|
+
};
|
|
93
112
|
})
|
|
94
113
|
.filter((r) => r.documentId !== "");
|
|
95
114
|
// ── Build description→docs mapping from TaskSource ─────────
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* - P5: Local-first (pipeline never fails because of a store write)
|
|
11
11
|
* - P6: Sinks are fire-and-forget (failures logged, not thrown)
|
|
12
12
|
*/
|
|
13
|
-
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
13
|
+
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
14
|
import { type ProvenanceInput } from "../../pipeline/provenance.js";
|
|
15
15
|
export declare class PublishReportStep implements PipelineStep {
|
|
16
16
|
private readonly pipelineStart;
|
|
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
|
|
|
25
25
|
check(): ValidationIssue[];
|
|
26
26
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
27
27
|
}
|
|
28
|
+
/**
|
|
29
|
+
* Detect whether a report should publish as degraded.
|
|
30
|
+
*
|
|
31
|
+
* The symptom is a scored run whose per-test details never landed: a full
|
|
32
|
+
* eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
|
|
33
|
+
* absent because gap-analysis skipped or failed. Such a report renders an
|
|
34
|
+
* empty "no tests" state in Studio despite carrying a score. Returns the
|
|
35
|
+
* marker enumerating which enrichment surfaces are missing, or `undefined`
|
|
36
|
+
* for a healthy report (or a run with no scored tests, where an empty report
|
|
37
|
+
* is legitimate).
|
|
38
|
+
*
|
|
39
|
+
* Exported for unit testing — production callers reach it via execute().
|
|
40
|
+
*/
|
|
41
|
+
export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
|
|
28
42
|
/**
|
|
29
43
|
* Assemble provenance input from the score summary and pipeline context.
|
|
30
44
|
*
|
|
@@ -110,9 +110,15 @@ export class PublishReportStep {
|
|
|
110
110
|
// agentBehavior arrays) point at their external artifacts via
|
|
111
111
|
// `id = manifestEntryKey`; Studio hydrates on drill-down.
|
|
112
112
|
const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
|
|
113
|
+
// Degraded-report detection (the "no tests on a scored report" symptom):
|
|
114
|
+
// a full eval scored tests but the gap-analysis enrichment never landed.
|
|
115
|
+
// Computed from the full summary read above — independent of which
|
|
116
|
+
// upstream step skipped — so the marker fires regardless of the cause.
|
|
117
|
+
const degraded = detectReportDegradation(summary);
|
|
113
118
|
const report = {
|
|
114
119
|
comparison: comparison ?? undefined,
|
|
115
120
|
completedAt: now,
|
|
121
|
+
...(degraded ? { degraded } : {}),
|
|
116
122
|
durationMs,
|
|
117
123
|
id: reportId,
|
|
118
124
|
provenance,
|
|
@@ -192,6 +198,45 @@ export class PublishReportStep {
|
|
|
192
198
|
// ---------------------------------------------------------------------------
|
|
193
199
|
// Helpers
|
|
194
200
|
// ---------------------------------------------------------------------------
|
|
201
|
+
/**
 * Detect whether a report should publish as degraded.
 *
 * The symptom is a scored run whose per-test details never landed: the score
 * entries count tests (`scores[].testCount > 0`) but `summary.testResults` is
 * absent or empty. Returns a marker enumerating which enrichment surfaces are
 * missing, or `undefined` for a healthy report (or a run with no scored tests,
 * where an empty report is legitimate).
 *
 * Only finite, positive `testCount` values are counted. A `NaN`, negative,
 * non-numeric or missing count — or a null entry in `scores` — contributes
 * nothing, so malformed score entries can neither throw nor produce a bogus
 * "scored NaN test(s)" marker.
 *
 * Exported for unit testing — production callers reach it via execute().
 *
 * @param {object} summary - Score summary; reads `scores`, `testResults`,
 *   `failureModes`, `lowScoringJudgments`, `documentManifest` and
 *   `recommendations`.
 * @returns {{reason: string, missing: string[], detail: string} | undefined}
 *   The degradation marker, or `undefined` when the report is not degraded.
 */
export function detectReportDegradation(summary) {
    // Tolerate a missing or non-array `scores` instead of throwing on `.reduce`.
    const scoreEntries = Array.isArray(summary.scores) ? summary.scores : [];
    const scoredTestCount = scoreEntries.reduce((total, entry) => {
        const count = entry?.testCount;
        // `typeof NaN === "number"`, so a plain typeof check would let NaN poison the sum.
        return Number.isFinite(count) && count > 0 ? total + count : total;
    }, 0);
    const hasTestResults = (summary.testResults?.length ?? 0) > 0;
    if (scoredTestCount === 0 || hasTestResults) {
        return undefined;
    }
    // `testResults` is the load-bearing signal (its absence is the rendered
    // "no tests" symptom). The remaining fields are best-effort detail: some
    // are literacy-only (e.g. documentManifest), so they may appear here for a
    // degraded non-literacy run even though that mode never produces them.
    const missing = ["testResults"];
    if (!summary.failureModes) {
        missing.push("failureModes");
    }
    if (!summary.lowScoringJudgments?.length) {
        missing.push("lowScoringJudgments");
    }
    if (!summary.documentManifest?.length) {
        missing.push("documentManifest");
    }
    if (!summary.recommendations) {
        missing.push("recommendations");
    }
    return {
        reason: "enrichment-missing",
        missing,
        detail: `Evaluation scored ${scoredTestCount} test(s) but enrichment did not ` +
            `complete; per-test details and failure analysis are unavailable for ` +
            `this report.`,
    };
}
|
|
195
240
|
/**
|
|
196
241
|
* Assemble provenance input from the score summary and pipeline context.
|
|
197
242
|
*
|
|
@@ -214,20 +259,32 @@ export function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
214
259
|
// summary.source undefined). Without this fallback, the report
|
|
215
260
|
// reads "production" regardless of what the dashboard sent.
|
|
216
261
|
// 3. "production" — last-resort built-in default.
|
|
217
|
-
|
|
262
|
+
//
|
|
263
|
+
// Per-field fallbacks (dataset/projectId/perspective) only fire when
|
|
264
|
+
// `summary.source` itself is absent — i.e. the loadSource throw was
|
|
265
|
+
// swallowed. When summary.source is present, trust what the fetch
|
|
266
|
+
// actually used; papering over a missing `perspective` from
|
|
267
|
+
// `ctx.config.perspectiveOverride` makes provenance claim a release
|
|
268
|
+
// was used when it wasn't (W0295).
|
|
269
|
+
const sourceResolved = summary.source?.name !== undefined;
|
|
270
|
+
if (!sourceResolved && ctx.config.source) {
|
|
218
271
|
ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
|
|
219
272
|
}
|
|
220
273
|
const source = {
|
|
221
274
|
baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
|
|
222
|
-
dataset:
|
|
275
|
+
dataset: sourceResolved
|
|
276
|
+
? (summary.source.dataset ?? "next")
|
|
277
|
+
: (ctx.config.datasetOverride ?? "next"),
|
|
223
278
|
documentIds: [],
|
|
224
279
|
llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
|
|
225
280
|
name: summary.source?.name ?? ctx.config.source ?? "production",
|
|
226
|
-
perspective:
|
|
227
|
-
|
|
228
|
-
undefined,
|
|
281
|
+
perspective: sourceResolved
|
|
282
|
+
? summary.source.perspective
|
|
283
|
+
: (ctx.config.perspectiveOverride ?? undefined),
|
|
229
284
|
priorityDomain: "sanity.io",
|
|
230
|
-
projectId:
|
|
285
|
+
projectId: sourceResolved
|
|
286
|
+
? summary.source.projectId
|
|
287
|
+
: (ctx.config.projectIdOverride ?? "3do82whm"),
|
|
231
288
|
studioOrigin: "https://admin.sanity.io",
|
|
232
289
|
urls: [],
|
|
233
290
|
};
|
|
@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
|
|
|
187
187
|
* @param manifestSlugs - All slugs in the run's document manifest.
|
|
188
188
|
*/
|
|
189
189
|
export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
|
|
190
|
+
/**
|
|
191
|
+
* Per-variant scoring profiles passed to {@link extractStoredTestResults}.
|
|
192
|
+
* Each profile maps dimension id → weight. Variants whose dimensions don't
|
|
193
|
+
* intersect the supplied keys yield `compositeScore: undefined` rather than
|
|
194
|
+
* a misleading 0.
|
|
195
|
+
*/
|
|
196
|
+
export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
|
|
190
197
|
/**
|
|
191
198
|
* Extract per-test results with model output from evaluation results.
|
|
192
199
|
*
|
|
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
|
|
|
194
201
|
* shape including response.output (truncated), latency, and cost.
|
|
195
202
|
* One StoredTestResult per test × model combination.
|
|
196
203
|
*
|
|
204
|
+
* When `profiles` is provided, each entry's `compositeScore` is computed as
|
|
205
|
+
* the weighted mean of its dimension scores using the profile matching its
|
|
206
|
+
* detected `variant`. Without profiles, `compositeScore` is omitted — legacy
|
|
207
|
+
* behavior preserved.
|
|
208
|
+
*
|
|
197
209
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
198
210
|
*/
|
|
199
|
-
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
211
|
+
export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
|
|
200
212
|
/**
|
|
201
213
|
* W0198 — aggregate every per-test `SymbolPreflightReport` into a single
|
|
202
214
|
* resolver-health summary. Returns `undefined` when the run had no
|
|
@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
|
|
|
41
41
|
import { loadSource } from "../sources.js";
|
|
42
42
|
import { LiteracyVariant } from "./normalize-mode.js";
|
|
43
43
|
import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
|
|
44
|
+
import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
|
|
44
45
|
// Re-export from core for backward compatibility.
|
|
45
46
|
// Existing imports from this file continue to work unchanged.
|
|
46
47
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
|
|
|
321
322
|
}
|
|
322
323
|
return judgments;
|
|
323
324
|
}
|
|
325
|
+
/**
 * Light parse of a results file's entry count — diagnostics only. Avoids the
 * full normalize + debug logging of `readAndNormalizeResults`.
 *
 * Accepts both the nested shape (`{ results: { results: [...] } }`) and the
 * flat shape (`{ results: [...] }`). The nested array wins when both are
 * present.
 *
 * @param {string} resultsPath - Path to the results JSON file.
 * @returns {number} Number of result entries; 0 when the file is missing,
 *   unparseable, or holds no results array in either shape.
 */
function countResultEntries(resultsPath) {
    try {
        const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
        const nested = file?.results?.results;
        if (Array.isArray(nested)) {
            return nested.length;
        }
        // Flat shape: `results` is itself the entries array.
        const flat = file?.results;
        return Array.isArray(flat) ? flat.length : 0;
    }
    catch {
        // Deliberate best-effort: this count only feeds diagnostics, so a missing
        // or malformed file is reported as "no entries" rather than raised.
        return 0;
    }
}
|
|
340
|
+
/**
 * Tally the llm-rubric components in a results file that `classifyRubric`
 * accepts — i.e. the number of judgments a healthy `extractGraderJudgments`
 * should produce. The tally only sets the severity of a persistent-empty
 * extraction: classifiable components with 0 extracted judgments is an error,
 * while a file with none is a benign empty.
 *
 * This is intentionally a separate counting path from `extractGraderJudgments`
 * so that comparing the two numbers is a meaningful cross-check.
 *
 * NOTE(review): a missing file yields 0 here; what happens for an unparseable
 * file depends on `readAndNormalizeResults`, which is not visible from this
 * block — confirm it does not throw.
 *
 * @param {string} resultsPath - Path to the results JSON file.
 * @returns {number} Count of classifiable llm-rubric components.
 */
function countClassifiableRubricComponents(resultsPath) {
    let classifiable = 0;
    if (existsSync(resultsPath)) {
        for (const entry of readAndNormalizeResults(resultsPath)) {
            for (const component of entry.gradingResult.componentResults) {
                const isRubric = component.assertion?.type === "llm-rubric";
                if (isRubric && classifyRubric(component)) {
                    classifiable += 1;
                }
            }
        }
    }
    return classifiable;
}
|
|
363
|
+
/**
 * Dependency bundle handed to `extractGraderJudgmentsResilient` via its
 * `deps` option. It binds the real extractor and the two file-based counters
 * defined in this module, and lives in one place so every persist site passes
 * the same wiring.
 */
const resilientJudgmentDeps = {
    // Independent count of components expected to yield a judgment.
    countClassifiable: countClassifiableRubricComponents,
    // Cheap entry count of the results file, for diagnostics.
    countResults: countResultEntries,
    // The real judgment extractor.
    extract: extractGraderJudgments,
};
|
|
324
373
|
/**
|
|
325
374
|
* Stamp every grader judgment with a D0049 ceiling-cross-check confidence
|
|
326
375
|
* triple and increment `GraderReliability.failureModeCalibration` whenever
|
|
@@ -469,6 +518,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
|
|
|
469
518
|
* `responseOutputTruncated` still flips for the extreme tail.
|
|
470
519
|
*/
|
|
471
520
|
const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
|
|
521
|
+
/**
 * Weighted mean of dimension scores, rounded to the nearest integer.
 *
 * Per the original note, this mirrors the dashboard's read-side fallback in
 * `apps/dashboard/src/data/projections/test-entries.ts` so writer and reader
 * stay aligned. NOTE(review): the guards below only affect malformed input
 * (non-numeric weights or scores); confirm the reader treats those the same.
 *
 * @param {Array<{dimension: string, score: number}>} dimensions - Scored
 *   dimensions for one test entry.
 * @param {Record<string, number>} weights - Profile mapping dimension id → weight.
 * @returns {number | undefined} The rounded weighted mean, or `undefined` when
 *   no dimension matches the profile, the matched weights sum to 0, or the
 *   result is not a finite number (caller decides whether that signals
 *   misconfiguration).
 */
function computeStoredCompositeScore(dimensions, weights) {
    let weightedSum = 0;
    let weightTotal = 0;
    for (const { dimension, score } of dimensions) {
        const weight = weights[dimension];
        // Skip dimensions absent from the profile. The typeof check also rejects
        // inherited Object.prototype members (e.g. a dimension named "toString"),
        // which a bare `=== undefined` test would treat as a weight.
        if (typeof weight !== "number" || !Number.isFinite(weight)) {
            continue;
        }
        weightedSum += score * weight;
        weightTotal += weight;
    }
    if (weightTotal === 0) {
        return undefined;
    }
    const composite = Math.round(weightedSum / weightTotal);
    // A missing or non-numeric score yields NaN; report "no composite" rather
    // than letting NaN through, since callers only test for `undefined`.
    return Number.isFinite(composite) ? composite : undefined;
}
|
|
472
541
|
/**
|
|
473
542
|
* Extract per-test results with model output from evaluation results.
|
|
474
543
|
*
|
|
@@ -476,9 +545,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
|
|
|
476
545
|
* shape including response.output (truncated), latency, and cost.
|
|
477
546
|
* One StoredTestResult per test × model combination.
|
|
478
547
|
*
|
|
548
|
+
* When `profiles` is provided, each entry's `compositeScore` is computed as
|
|
549
|
+
* the weighted mean of its dimension scores using the profile matching its
|
|
550
|
+
* detected `variant`. Without profiles, `compositeScore` is omitted — legacy
|
|
551
|
+
* behavior preserved.
|
|
552
|
+
*
|
|
479
553
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
480
554
|
*/
|
|
481
|
-
export function extractStoredTestResults(resultsPath) {
|
|
555
|
+
export function extractStoredTestResults(resultsPath, profiles) {
|
|
482
556
|
const results = readAndNormalizeResults(resultsPath);
|
|
483
557
|
const testResults = [];
|
|
484
558
|
for (const result of results) {
|
|
@@ -523,8 +597,13 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
523
597
|
dimensions.push({ dimension, reason, score });
|
|
524
598
|
}
|
|
525
599
|
const tokenUsage = result.response?.tokenUsage;
|
|
600
|
+
const profileForVariant = profiles?.[variant];
|
|
601
|
+
const compositeScore = profileForVariant
|
|
602
|
+
? computeStoredCompositeScore(dimensions, profileForVariant)
|
|
603
|
+
: undefined;
|
|
526
604
|
testResults.push({
|
|
527
605
|
area,
|
|
606
|
+
...(compositeScore !== undefined && { compositeScore }),
|
|
528
607
|
cost: result.cost || undefined,
|
|
529
608
|
dimensions,
|
|
530
609
|
latencyMs: result.latencyMs,
|
|
@@ -1464,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1464
1543
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1465
1544
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1466
1545
|
// Extract and persist grader judgments
|
|
1467
|
-
const judgments =
|
|
1546
|
+
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1468
1547
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1469
1548
|
baselineResultsPath,
|
|
1470
1549
|
]);
|
|
@@ -1477,7 +1556,12 @@ export async function calculateAndWriteScores(options) {
|
|
|
1477
1556
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1478
1557
|
}
|
|
1479
1558
|
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1480
|
-
|
|
1559
|
+
// Agent-harness produces a single profile shared across detected variants
|
|
1560
|
+
// (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
|
|
1561
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1562
|
+
gold: agentProfile,
|
|
1563
|
+
baseline: agentProfile,
|
|
1564
|
+
});
|
|
1481
1565
|
if (testResults.length > 0) {
|
|
1482
1566
|
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
1483
1567
|
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
@@ -1522,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1522
1606
|
mkdirSync(outDir, { recursive: true });
|
|
1523
1607
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1524
1608
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1525
|
-
const judgments =
|
|
1609
|
+
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1526
1610
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1527
1611
|
baselineResultsPath,
|
|
1528
1612
|
]);
|
|
@@ -1534,7 +1618,13 @@ export async function calculateAndWriteScores(options) {
|
|
|
1534
1618
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1535
1619
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1536
1620
|
}
|
|
1537
|
-
|
|
1621
|
+
// Knowledge-probe deletes vars.docs in the compiler, so every entry's
|
|
1622
|
+
// detected variant is "baseline" — supply the probe profile under both
|
|
1623
|
+
// keys so the composite is populated regardless of detection.
|
|
1624
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1625
|
+
gold: probeProfile,
|
|
1626
|
+
baseline: probeProfile,
|
|
1627
|
+
});
|
|
1538
1628
|
if (testResults.length > 0) {
|
|
1539
1629
|
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
1540
1630
|
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
@@ -1548,9 +1638,15 @@ export async function calculateAndWriteScores(options) {
|
|
|
1548
1638
|
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
1549
1639
|
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
1550
1640
|
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
1641
|
+
// Hoisted so the post-scoring extractStoredTestResults call against the
|
|
1642
|
+
// agentic results file can attach the matching profile (W0291).
|
|
1643
|
+
const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
|
|
1644
|
+
? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
|
|
1645
|
+
: undefined;
|
|
1551
1646
|
log.debug("Loaded scoring profiles", {
|
|
1552
1647
|
gold: goldProfile,
|
|
1553
1648
|
baseline: baselineProfileWeights,
|
|
1649
|
+
...(agenticProfile && { agentic: agenticProfile }),
|
|
1554
1650
|
});
|
|
1555
1651
|
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
|
|
1556
1652
|
log.debug("Baseline scores calculated", {
|
|
@@ -1577,7 +1673,8 @@ export async function calculateAndWriteScores(options) {
|
|
|
1577
1673
|
let evaluationMode;
|
|
1578
1674
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1579
1675
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
1580
|
-
|
|
1676
|
+
// Non-null assertion safe — the outer guard hoisting agenticProfile uses
|
|
1677
|
+
// the same condition; if we entered this block, the profile was resolved.
|
|
1581
1678
|
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
|
|
1582
1679
|
log.debug("Agentic scores calculated", {
|
|
1583
1680
|
featureCount: Object.keys(agenticScores).length,
|
|
@@ -1639,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
|
|
|
1639
1736
|
// the ceiling-cross-check disagreement counter (`failureModeCalibration`)
|
|
1640
1737
|
// is incremented during the post-extraction validation pass below.
|
|
1641
1738
|
const reliability = { graderModel: "unknown" };
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
...(options.runId ? { runId: options.runId } : {}),
|
|
1651
|
-
});
|
|
1652
|
-
judgments.push(...agenticJudgments);
|
|
1653
|
-
}
|
|
1739
|
+
// Extract through the resilient wrapper so an empty result from the transient
|
|
1740
|
+
// read anomaly is instrumented and self-healed rather than silently skipping
|
|
1741
|
+
// the grader-judgments persist. In full mode both the baseline and agentic
|
|
1742
|
+
// result files are graded against the shared telemetry.
|
|
1743
|
+
const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
|
|
1744
|
+
? [baselineResultsPath, agenticResultsPath]
|
|
1745
|
+
: [baselineResultsPath];
|
|
1746
|
+
const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
|
|
1654
1747
|
// Borderline-consensus pass — re-grade the ±5 borderline subset N times
|
|
1655
1748
|
// and merge medians back into the canonical judgments BEFORE
|
|
1656
1749
|
// `validateGraderJudgmentsCalibration` runs, so the calibration counter
|
|
@@ -1681,11 +1774,21 @@ export async function calculateAndWriteScores(options) {
|
|
|
1681
1774
|
});
|
|
1682
1775
|
}
|
|
1683
1776
|
}
|
|
1684
|
-
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1685
|
-
|
|
1686
|
-
//
|
|
1777
|
+
// Extract and persist per-test results (D0029: model output + metadata).
|
|
1778
|
+
// Literacy gold (with-docs) entries score against the default profile;
|
|
1779
|
+
// baseline (without-docs) entries score against the output-only profile.
|
|
1780
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1781
|
+
gold: goldProfile,
|
|
1782
|
+
baseline: baselineProfileWeights,
|
|
1783
|
+
});
|
|
1784
|
+
// In full mode, also extract test results from agentic results — the
|
|
1785
|
+
// agentic file's gold entries score against the agentic profile while
|
|
1786
|
+
// baseline entries (if any leak through) still use the literacy baseline.
|
|
1687
1787
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1688
|
-
const agenticTestResults = extractStoredTestResults(agenticResultsPath
|
|
1788
|
+
const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
|
|
1789
|
+
gold: agenticProfile,
|
|
1790
|
+
baseline: baselineProfileWeights,
|
|
1791
|
+
});
|
|
1689
1792
|
testResults.push(...agenticTestResults);
|
|
1690
1793
|
}
|
|
1691
1794
|
if (testResults.length > 0) {
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/enrichment-preconditions.ts
|
|
3
|
+
*
|
|
4
|
+
* Classifies the inputs the post-scoring enrichment steps (gap-analysis,
|
|
5
|
+
* compute-attribution) depend on, so a missing `grader-judgments.json` can be
|
|
6
|
+
* told apart as either a legitimate skip (no graded eval ran this pipeline) or
|
|
7
|
+
* a degraded outcome (a full eval scored tests but no judgments persisted).
|
|
8
|
+
*
|
|
9
|
+
* The degraded case is the failure these steps must stop swallowing:
|
|
10
|
+
* `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
|
|
11
|
+
* but never wrote `grader-judgments.json`, so the enrichment steps self-skip
|
|
12
|
+
* and the report ships with no test details while still showing a score.
|
|
13
|
+
* Distinguishing the two is what lets the steps fail loud instead of returning
|
|
14
|
+
* a benign `skipped`.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* Outcome of classifying the enrichment inputs under `results/latest/`.
|
|
18
|
+
*
|
|
19
|
+
* - `ready` — `grader-judgments.json` is present and non-empty; enrichment
|
|
20
|
+
* can run.
|
|
21
|
+
* - `no-full-eval` — no graded eval produced judgments this run. A legitimate
|
|
22
|
+
* skip: standalone gap-analysis on cached results, a non-graded run, or an
|
|
23
|
+
* eval that scored nothing.
|
|
24
|
+
* - `judgments-missing-after-eval` — a full eval scored tests
|
|
25
|
+
* (`score-summary.json` carries `testCount > 0`) yet `grader-judgments.json`
|
|
26
|
+
* is missing or empty. This is the degraded condition the steps surface.
|
|
27
|
+
*/
|
|
28
|
+
export type EnrichmentInputs = {
|
|
29
|
+
kind: "ready";
|
|
30
|
+
judgmentCount: number;
|
|
31
|
+
} | {
|
|
32
|
+
kind: "no-full-eval";
|
|
33
|
+
} | {
|
|
34
|
+
kind: "judgments-missing-after-eval";
|
|
35
|
+
scoredTestCount: number;
|
|
36
|
+
};
|
|
37
|
+
/**
|
|
38
|
+
* Classify the enrichment inputs for a run by inspecting
|
|
39
|
+
* `results/latest/grader-judgments.json` and `score-summary.json`.
|
|
40
|
+
*
|
|
41
|
+
* Pure read-only filesystem inspection — never throws on malformed input; a
|
|
42
|
+
* file that does not parse to the expected shape is treated as absent so that
|
|
43
|
+
* "no usable judgments" and "no usable summary" both collapse to a single
|
|
44
|
+
* branch.
|
|
45
|
+
*/
|
|
46
|
+
export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
|
|
47
|
+
/**
 * Build the fail-loud error message for the degraded
 * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
 * pipeline-result and job-document surfaces carry one consistent wording.
 *
 * @param step Name of the reporting step; it prefixes the message.
 * @param scoredTestCount Number of scored tests quoted in the message.
 * @returns The formatted, single-line error message.
 */
|
|
52
|
+
export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/enrichment-preconditions.ts
|
|
3
|
+
*
|
|
4
|
+
* Classifies the inputs the post-scoring enrichment steps (gap-analysis,
|
|
5
|
+
* compute-attribution) depend on, so a missing `grader-judgments.json` can be
|
|
6
|
+
* told apart as either a legitimate skip (no graded eval ran this pipeline) or
|
|
7
|
+
* a degraded outcome (a full eval scored tests but no judgments persisted).
|
|
8
|
+
*
|
|
9
|
+
* The degraded case is the failure these steps must stop swallowing:
|
|
10
|
+
* `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
|
|
11
|
+
* but never wrote `grader-judgments.json`, so the enrichment steps self-skip
|
|
12
|
+
* and the report ships with no test details while still showing a score.
|
|
13
|
+
* Distinguishing the two is what lets the steps fail loud instead of returning
|
|
14
|
+
* a benign `skipped`.
|
|
15
|
+
*/
|
|
16
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
17
|
+
import { resolve } from "node:path";
|
|
18
|
+
/**
 * Decide which enrichment situation a run is in, based on the files under
 * `results/latest/`.
 *
 * Judgments are looked at first: any persisted judgment means enrichment can
 * proceed. Only when none are found is `score-summary.json` consulted, to tell
 * a degraded run (tests were scored but no judgments persisted) apart from a
 * run where no graded eval happened at all.
 *
 * @param {string} rootDir Directory under which `results/latest/` is resolved.
 * @returns {{kind: "ready", judgmentCount: number}
 *   | {kind: "judgments-missing-after-eval", scoredTestCount: number}
 *   | {kind: "no-full-eval"}} The classification for this run.
 */
export function classifyEnrichmentInputs(rootDir) {
    const persistedJudgments = countGraderJudgments(rootDir);
    if (persistedJudgments > 0) {
        return { kind: "ready", judgmentCount: persistedJudgments };
    }
    // No judgments: the score summary decides between "degraded" and "skip".
    const scoredTests = scoredTestCountFromSummary(rootDir);
    return scoredTests > 0
        ? { kind: "judgments-missing-after-eval", scoredTestCount: scoredTests }
        : { kind: "no-full-eval" };
}
|
|
38
|
+
/**
 * Format the error text for the degraded `judgments-missing-after-eval`
 * outcome. Keeping the wording in one place means every enrichment step
 * reports the condition identically.
 *
 * @param {string} step Name of the reporting step; it prefixes the message.
 * @param {number} scoredTestCount Number of scored tests quoted in the message.
 * @returns {string} The single-line error message.
 */
export function degradedEnrichmentError(step, scoredTestCount) {
    const headline = `${step}: grader-judgments.json missing after a full eval`;
    const detail = `${scoredTestCount} test(s) scored but 0 grader judgments persisted.`;
    const consequence = "The report is marked degraded rather than published as healthy.";
    return `${headline} — ${detail} ${consequence}`;
}
|
|
48
|
+
/**
 * Report how many judgments `results/latest/grader-judgments.json` holds.
 *
 * Every "no usable judgments" situation — file absent, unreadable, invalid
 * JSON, or a JSON value that is not an array — yields 0, so callers branch on
 * a single number. An empty array is deliberately indistinguishable from a
 * missing file: both mean no judgments were persisted.
 *
 * @param {string} rootDir Directory under which `results/latest/` is resolved.
 * @returns {number} Length of the judgments array, or 0 when unusable.
 */
function countGraderJudgments(rootDir) {
    const judgmentsFile = resolve(rootDir, "results", "latest", "grader-judgments.json");
    let judgments = null;
    if (existsSync(judgmentsFile)) {
        try {
            judgments = JSON.parse(readFileSync(judgmentsFile, "utf-8"));
        }
        catch {
            // Read or parse failure: treat exactly like a missing file.
            judgments = null;
        }
    }
    if (!Array.isArray(judgments)) {
        return 0;
    }
    return judgments.length;
}
|
|
67
|
+
/**
 * Sum the per-area `testCount` from `results/latest/score-summary.json` — the
 * signal that a full eval scored tests this run.
 *
 * Malformed data is tolerated per entry rather than per file: an entry in
 * `scores` that is not an object, or whose `testCount` is not a finite
 * positive number, contributes nothing, while well-formed entries are still
 * counted. One bad entry must not zero the total, because a zero here makes a
 * degraded run look like a legitimate skip.
 *
 * @param {string} rootDir Directory under which `results/latest/` is resolved.
 * @returns {number} Total scored tests, or 0 when the summary is absent,
 *   unreadable, not valid JSON, or carries no scored tests.
 */
function scoredTestCountFromSummary(rootDir) {
    const path = resolve(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(path)) {
        return 0;
    }
    let summary;
    try {
        summary = JSON.parse(readFileSync(path, "utf-8"));
    }
    catch {
        // Unreadable file or invalid JSON: no usable summary.
        return 0;
    }
    const hasScores = summary !== null &&
        typeof summary === "object" &&
        Array.isArray(summary.scores);
    if (!hasScores) {
        return 0;
    }
    let total = 0;
    for (const entry of summary.scores) {
        // Skip null / primitive entries instead of letting them throw.
        if (entry === null || typeof entry !== "object") {
            continue;
        }
        const count = entry.testCount;
        if (typeof count === "number" && Number.isFinite(count) && count > 0) {
            total += count;
        }
    }
    return total;
}
|