@sanity/ailf 7.1.0 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
  2. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  3. package/dist/_vendor/ailf-core/schemas/index.js +4 -0
  4. package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
  5. package/dist/_vendor/ailf-core/schemas/report.js +14 -0
  6. package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
  7. package/dist/_vendor/ailf-core/schemas/user.js +23 -0
  8. package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
  9. package/dist/_vendor/ailf-core/types/index.js +13 -0
  10. package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
  11. package/dist/_vendor/ailf-core/types/user.js +1 -0
  12. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
  13. package/dist/_vendor/ailf-shared/document-ref.js +23 -1
  14. package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
  15. package/dist/_vendor/ailf-shared/index.d.ts +1 -1
  16. package/dist/_vendor/ailf-shared/index.js +1 -0
  17. package/dist/_vendor/ailf-shared/owner-teams.js +19 -6
  18. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
  19. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
  20. package/dist/adapters/task-sources/content-lake-task-source.js +12 -7
  21. package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
  22. package/dist/orchestration/steps/compute-attribution-step.js +17 -2
  23. package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
  24. package/dist/orchestration/steps/gap-analysis-step.js +29 -10
  25. package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
  26. package/dist/orchestration/steps/publish-report-step.js +63 -6
  27. package/dist/pipeline/calculate-scores.d.ts +13 -1
  28. package/dist/pipeline/calculate-scores.js +125 -22
  29. package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
  30. package/dist/pipeline/enrichment-preconditions.js +84 -0
  31. package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
  32. package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
  33. package/dist/report-store.d.ts +1 -0
  34. package/dist/report-store.js +2 -0
  35. package/dist/sanity/queries.d.ts +1 -1
  36. package/dist/sanity/queries.js +1 -0
  37. package/dist/sources.js +40 -2
  38. package/package.json +1 -1
@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
18
18
  import { join, resolve } from "path";
19
19
  import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
20
20
  import { emitFileContents } from "../../artifact-capture/emit-file.js";
21
+ import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
21
22
  export class GapAnalysisStep {
22
23
  name = "gap-analysis";
23
24
  optional = true;
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
34
35
  }
35
36
  return [];
36
37
  }
37
- async execute(ctx) {
38
+ async execute(ctx, state) {
38
39
  const root = ctx.config.rootDir;
39
40
  const start = Date.now();
40
41
  const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
41
42
  const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
42
- if (!existsSync(judgmentsPath)) {
43
+ // Distinguish a legitimate skip (no graded eval ran this pipeline) from a
44
+ // degraded run where a full eval scored tests but no judgments persisted.
45
+ // The latter must fail loud — returning a benign `skipped` is what let
46
+ // reports publish with a score but no test details.
47
+ //
48
+ // A remote cache hit restores score-summary.json (with testCount) from a
49
+ // prior report but never writes grader-judgments.json, so judgments are
50
+ // legitimately absent — that is a benign skip, not a degraded full eval.
51
+ const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
52
+ const inputs = classifyEnrichmentInputs(root);
53
+ if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
54
+ return {
55
+ durationMs: Date.now() - start,
56
+ status: "failed",
57
+ error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
58
+ };
59
+ }
60
+ if (inputs.kind !== "ready") {
43
61
  return {
44
62
  status: "skipped",
45
63
  reason: "No grader-judgments.json — run a full evaluation first",
@@ -82,14 +100,15 @@ export class GapAnalysisStep {
82
100
  const resolveRefs = (slugs) => slugs
83
101
  .map((slug) => {
84
102
  const m = refBySlug.get(slug);
85
- return m
86
- ? {
87
- documentId: m._id,
88
- revision: m._rev,
89
- slug: m.slug,
90
- title: m.title,
91
- }
92
- : { documentId: "", slug, title: slug };
103
+ if (!m)
104
+ return { documentId: "", slug, title: slug };
105
+ return {
106
+ documentId: m._id,
107
+ revision: m._rev,
108
+ slug: m.slug,
109
+ ...(m.path ? { path: m.path } : {}),
110
+ title: m.title,
111
+ };
93
112
  })
94
113
  .filter((r) => r.documentId !== "");
95
114
  // ── Build description→docs mapping from TaskSource ─────────
@@ -10,7 +10,7 @@
10
10
  * - P5: Local-first (pipeline never fails because of a store write)
11
11
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
12
12
  */
13
- import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
13
+ import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
14
14
  import { type ProvenanceInput } from "../../pipeline/provenance.js";
15
15
  export declare class PublishReportStep implements PipelineStep {
16
16
  private readonly pipelineStart;
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
25
25
  check(): ValidationIssue[];
26
26
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
27
27
  }
28
+ /**
29
+ * Detect whether a report should publish as degraded.
30
+ *
31
+ * The symptom is a scored run whose per-test details never landed: a full
32
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
33
+ * absent because gap-analysis skipped or failed. Such a report renders an
34
+ * empty "no tests" state in Studio despite carrying a score. Returns the
35
+ * marker enumerating which enrichment surfaces are missing, or `undefined`
36
+ * for a healthy report (or a run with no scored tests, where an empty report
37
+ * is legitimate).
38
+ *
39
+ * Exported for unit testing — production callers reach it via execute().
40
+ */
41
+ export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
28
42
  /**
29
43
  * Assemble provenance input from the score summary and pipeline context.
30
44
  *
@@ -110,9 +110,15 @@ export class PublishReportStep {
110
110
  // agentBehavior arrays) point at their external artifacts via
111
111
  // `id = manifestEntryKey`; Studio hydrates on drill-down.
112
112
  const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
113
+ // Degraded-report detection (the "no tests on a scored report" symptom):
114
+ // a full eval scored tests but the gap-analysis enrichment never landed.
115
+ // Computed from the full summary read above — independent of which
116
+ // upstream step skipped — so the marker fires regardless of the cause.
117
+ const degraded = detectReportDegradation(summary);
113
118
  const report = {
114
119
  comparison: comparison ?? undefined,
115
120
  completedAt: now,
121
+ ...(degraded ? { degraded } : {}),
116
122
  durationMs,
117
123
  id: reportId,
118
124
  provenance,
@@ -192,6 +198,45 @@ export class PublishReportStep {
192
198
  // ---------------------------------------------------------------------------
193
199
  // Helpers
194
200
  // ---------------------------------------------------------------------------
201
+ /**
202
+ * Detect whether a report should publish as degraded.
203
+ *
204
+ * The symptom is a scored run whose per-test details never landed: a full
205
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
206
+ * absent because gap-analysis skipped or failed. Such a report renders an
207
+ * empty "no tests" state in Studio despite carrying a score. Returns the
208
+ * marker enumerating which enrichment surfaces are missing, or `undefined`
209
+ * for a healthy report (or a run with no scored tests, where an empty report
210
+ * is legitimate).
211
+ *
212
+ * Exported for unit testing — production callers reach it via execute().
213
+ */
214
+ export function detectReportDegradation(summary) {
215
+ const scoredTestCount = (summary.scores ?? []).reduce((n, s) => n + (typeof s.testCount === "number" ? s.testCount : 0), 0);
216
+ const hasTestResults = (summary.testResults?.length ?? 0) > 0;
217
+ if (scoredTestCount === 0 || hasTestResults)
218
+ return undefined;
219
+ // `testResults` is the load-bearing signal (its absence is the rendered
220
+ // "no tests" symptom). The remaining fields are best-effort detail: some
221
+ // are literacy-only (e.g. documentManifest), so they may appear here for a
222
+ // degraded non-literacy run even though that mode never produces them.
223
+ const missing = ["testResults"];
224
+ if (!summary.failureModes)
225
+ missing.push("failureModes");
226
+ if (!summary.lowScoringJudgments?.length)
227
+ missing.push("lowScoringJudgments");
228
+ if (!summary.documentManifest?.length)
229
+ missing.push("documentManifest");
230
+ if (!summary.recommendations)
231
+ missing.push("recommendations");
232
+ return {
233
+ reason: "enrichment-missing",
234
+ missing,
235
+ detail: `Evaluation scored ${scoredTestCount} test(s) but enrichment did not ` +
236
+ `complete; per-test details and failure analysis are unavailable for ` +
237
+ `this report.`,
238
+ };
239
+ }
195
240
  /**
196
241
  * Assemble provenance input from the score summary and pipeline context.
197
242
  *
@@ -214,20 +259,32 @@ export function buildProvenanceInput(summary, ctx, options, autoScope) {
214
259
  // summary.source undefined). Without this fallback, the report
215
260
  // reads "production" regardless of what the dashboard sent.
216
261
  // 3. "production" — last-resort built-in default.
217
- if (summary.source?.name === undefined && ctx.config.source) {
262
+ //
263
+ // Per-field fallbacks (dataset/projectId/perspective) only fire when
264
+ // `summary.source` itself is absent — i.e. the loadSource throw was
265
+ // swallowed. When summary.source is present, trust what the fetch
266
+ // actually used; papering over a missing `perspective` from
267
+ // `ctx.config.perspectiveOverride` makes provenance claim a release
268
+ // was used when it wasn't (W0295).
269
+ const sourceResolved = summary.source?.name !== undefined;
270
+ if (!sourceResolved && ctx.config.source) {
218
271
  ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
219
272
  }
220
273
  const source = {
221
274
  baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
222
- dataset: summary.source?.dataset ?? ctx.config.datasetOverride ?? "next",
275
+ dataset: sourceResolved
276
+ ? (summary.source.dataset ?? "next")
277
+ : (ctx.config.datasetOverride ?? "next"),
223
278
  documentIds: [],
224
279
  llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
225
280
  name: summary.source?.name ?? ctx.config.source ?? "production",
226
- perspective: summary.source?.perspective ??
227
- ctx.config.perspectiveOverride ??
228
- undefined,
281
+ perspective: sourceResolved
282
+ ? summary.source.perspective
283
+ : (ctx.config.perspectiveOverride ?? undefined),
229
284
  priorityDomain: "sanity.io",
230
- projectId: summary.source?.projectId ?? ctx.config.projectIdOverride ?? "3do82whm",
285
+ projectId: sourceResolved
286
+ ? summary.source.projectId
287
+ : (ctx.config.projectIdOverride ?? "3do82whm"),
231
288
  studioOrigin: "https://admin.sanity.io",
232
289
  urls: [],
233
290
  };
@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
187
187
  * @param manifestSlugs - All slugs in the run's document manifest.
188
188
  */
189
189
  export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
190
+ /**
191
+ * Per-variant scoring profiles passed to {@link extractStoredTestResults}.
192
+ * Each profile maps dimension id → weight. Variants whose dimensions don't
193
+ * intersect the supplied keys yield `compositeScore: undefined` rather than
194
+ * a misleading 0.
195
+ */
196
+ export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
190
197
  /**
191
198
  * Extract per-test results with model output from evaluation results.
192
199
  *
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
194
201
  * shape including response.output (truncated), latency, and cost.
195
202
  * One StoredTestResult per test × model combination.
196
203
  *
204
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
205
+ * the weighted mean of its dimension scores using the profile matching its
206
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
207
+ * behavior preserved.
208
+ *
197
209
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
198
210
  */
199
- export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
211
+ export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
200
212
  /**
201
213
  * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
202
214
  * resolver-health summary. Returns `undefined` when the run had no
@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
41
41
  import { loadSource } from "../sources.js";
42
42
  import { LiteracyVariant } from "./normalize-mode.js";
43
43
  import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
44
+ import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
44
45
  // Re-export from core for backward compatibility.
45
46
  // Existing imports from this file continue to work unchanged.
46
47
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
321
322
  }
322
323
  return judgments;
323
324
  }
325
+ /**
326
+ * Light parse of a results file's entry count — diagnostics only. Avoids the
327
+ * full normalize + debug logging of `readAndNormalizeResults`. Returns 0 when
328
+ * the file is missing or unparseable.
329
+ */
330
+ function countResultEntries(resultsPath) {
331
+ try {
332
+ const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
333
+ const wrapper = file.results ?? file;
334
+ return Array.isArray(wrapper.results) ? wrapper.results.length : 0;
335
+ }
336
+ catch {
337
+ return 0;
338
+ }
339
+ }
340
+ /**
341
+ * Count classifiable llm-rubric components in a results file — i.e. the number
342
+ * of judgments a healthy `extractGraderJudgments` should produce. Used only to
343
+ * set the severity of a persistent-empty extraction: a file with classifiable
344
+ * components but 0 extracted judgments is an error; a file with none (all
345
+ * api-errors / no llm-rubric) is a benign empty.
346
+ *
347
+ * Deliberately an independent count path (not `extractGraderJudgments`) so the
348
+ * cross-check is meaningful. Returns 0 when the file is missing or unparseable.
349
+ */
350
+ function countClassifiableRubricComponents(resultsPath) {
351
+ if (!existsSync(resultsPath))
352
+ return 0;
353
+ let n = 0;
354
+ for (const result of readAndNormalizeResults(resultsPath)) {
355
+ for (const comp of result.gradingResult.componentResults) {
356
+ if (comp.assertion?.type === "llm-rubric" && classifyRubric(comp)) {
357
+ n += 1;
358
+ }
359
+ }
360
+ }
361
+ return n;
362
+ }
363
+ /**
364
+ * Shared dependency bundle for `extractGraderJudgmentsResilient` — wires the
365
+ * real extractor + fs counters. Defined once so all persist sites self-heal
366
+ * identically.
367
+ */
368
+ const resilientJudgmentDeps = {
369
+ countClassifiable: countClassifiableRubricComponents,
370
+ countResults: countResultEntries,
371
+ extract: extractGraderJudgments,
372
+ };
324
373
  /**
325
374
  * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
326
375
  * triple and increment `GraderReliability.failureModeCalibration` whenever
@@ -469,6 +518,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
469
518
  * `responseOutputTruncated` still flips for the extreme tail.
470
519
  */
471
520
  const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
521
+ /**
522
+ * Weighted mean of dimension scores. Mirrors the dashboard's read-side
523
+ * fallback in `apps/dashboard/src/data/projections/test-entries.ts` so writer
524
+ * and reader stay aligned. Returns `undefined` when no dimension matches the
525
+ * profile (caller decides whether that signals misconfiguration).
526
+ */
527
+ function computeStoredCompositeScore(dimensions, weights) {
528
+ let weighted = 0;
529
+ let totalWeight = 0;
530
+ for (const dim of dimensions) {
531
+ const w = weights[dim.dimension];
532
+ if (w === undefined)
533
+ continue;
534
+ weighted += dim.score * w;
535
+ totalWeight += w;
536
+ }
537
+ if (totalWeight === 0)
538
+ return undefined;
539
+ return Math.round(weighted / totalWeight);
540
+ }
472
541
  /**
473
542
  * Extract per-test results with model output from evaluation results.
474
543
  *
@@ -476,9 +545,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
476
545
  * shape including response.output (truncated), latency, and cost.
477
546
  * One StoredTestResult per test × model combination.
478
547
  *
548
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
549
+ * the weighted mean of its dimension scores using the profile matching its
550
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
551
+ * behavior preserved.
552
+ *
479
553
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
480
554
  */
481
- export function extractStoredTestResults(resultsPath) {
555
+ export function extractStoredTestResults(resultsPath, profiles) {
482
556
  const results = readAndNormalizeResults(resultsPath);
483
557
  const testResults = [];
484
558
  for (const result of results) {
@@ -523,8 +597,13 @@ export function extractStoredTestResults(resultsPath) {
523
597
  dimensions.push({ dimension, reason, score });
524
598
  }
525
599
  const tokenUsage = result.response?.tokenUsage;
600
+ const profileForVariant = profiles?.[variant];
601
+ const compositeScore = profileForVariant
602
+ ? computeStoredCompositeScore(dimensions, profileForVariant)
603
+ : undefined;
526
604
  testResults.push({
527
605
  area,
606
+ ...(compositeScore !== undefined && { compositeScore }),
528
607
  cost: result.cost || undefined,
529
608
  dimensions,
530
609
  latencyMs: result.latencyMs,
@@ -1464,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
1464
1543
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1465
1544
  log.info("Score summary written to results/latest/score-summary.json");
1466
1545
  // Extract and persist grader judgments
1467
- const judgments = extractGraderJudgments(baselineResultsPath);
1546
+ const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
1468
1547
  const borderlineConsistency = await runBorderlinePass(judgments, [
1469
1548
  baselineResultsPath,
1470
1549
  ]);
@@ -1477,7 +1556,12 @@ export async function calculateAndWriteScores(options) {
1477
1556
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1478
1557
  }
1479
1558
  // Extract and persist per-test results (D0029: model output + metadata)
1480
- const testResults = extractStoredTestResults(baselineResultsPath);
1559
+ // Agent-harness produces a single profile shared across detected variants
1560
+ // (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
1561
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1562
+ gold: agentProfile,
1563
+ baseline: agentProfile,
1564
+ });
1481
1565
  if (testResults.length > 0) {
1482
1566
  writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
1483
1567
  log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1522,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
1522
1606
  mkdirSync(outDir, { recursive: true });
1523
1607
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1524
1608
  log.info("Score summary written to results/latest/score-summary.json");
1525
- const judgments = extractGraderJudgments(baselineResultsPath);
1609
+ const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
1526
1610
  const borderlineConsistency = await runBorderlinePass(judgments, [
1527
1611
  baselineResultsPath,
1528
1612
  ]);
@@ -1534,7 +1618,13 @@ export async function calculateAndWriteScores(options) {
1534
1618
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1535
1619
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1536
1620
  }
1537
- const testResults = extractStoredTestResults(baselineResultsPath);
1621
+ // Knowledge-probe deletes vars.docs in the compiler, so every entry's
1622
+ // detected variant is "baseline" — supply the probe profile under both
1623
+ // keys so the composite is populated regardless of detection.
1624
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1625
+ gold: probeProfile,
1626
+ baseline: probeProfile,
1627
+ });
1538
1628
  if (testResults.length > 0) {
1539
1629
  writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
1540
1630
  log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1548,9 +1638,15 @@ export async function calculateAndWriteScores(options) {
1548
1638
  // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
1549
1639
  const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
1550
1640
  const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
1641
+ // Hoisted so the post-scoring extractStoredTestResults call against the
1642
+ // agentic results file can attach the matching profile (W0291).
1643
+ const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
1644
+ ? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
1645
+ : undefined;
1551
1646
  log.debug("Loaded scoring profiles", {
1552
1647
  gold: goldProfile,
1553
1648
  baseline: baselineProfileWeights,
1649
+ ...(agenticProfile && { agentic: agenticProfile }),
1554
1650
  });
1555
1651
  const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1556
1652
  log.debug("Baseline scores calculated", {
@@ -1577,7 +1673,8 @@ export async function calculateAndWriteScores(options) {
1577
1673
  let evaluationMode;
1578
1674
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1579
1675
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
1580
- const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
1676
+ // Non-null assertion is safe: the outer guard hoisting agenticProfile uses
1677
+ // the same condition; if we entered this block, the profile was resolved.
1581
1678
  const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
1582
1679
  log.debug("Agentic scores calculated", {
1583
1680
  featureCount: Object.keys(agenticScores).length,
@@ -1639,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
1639
1736
  // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
1640
1737
  // is incremented during the post-extraction validation pass below.
1641
1738
  const reliability = { graderModel: "unknown" };
1642
- const judgments = extractGraderJudgments(baselineResultsPath, {
1643
- reliability,
1644
- ...(options.runId ? { runId: options.runId } : {}),
1645
- });
1646
- // In full mode, also extract judgments from agentic results
1647
- if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1648
- const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
1649
- reliability,
1650
- ...(options.runId ? { runId: options.runId } : {}),
1651
- });
1652
- judgments.push(...agenticJudgments);
1653
- }
1739
+ // Extract through the resilient wrapper so an empty result from the transient
1740
+ // read anomaly is instrumented and self-healed rather than silently skipping
1741
+ // the grader-judgments persist. In full mode both the baseline and agentic
1742
+ // result files are graded against the shared telemetry.
1743
+ const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
1744
+ ? [baselineResultsPath, agenticResultsPath]
1745
+ : [baselineResultsPath];
1746
+ const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
1654
1747
  // Borderline-consensus pass — re-grade the ±5 borderline subset N times
1655
1748
  // and merge medians back into the canonical judgments BEFORE
1656
1749
  // `validateGraderJudgmentsCalibration` runs, so the calibration counter
@@ -1681,11 +1774,21 @@ export async function calculateAndWriteScores(options) {
1681
1774
  });
1682
1775
  }
1683
1776
  }
1684
- // Extract and persist per-test results (D0029: model output + metadata)
1685
- const testResults = extractStoredTestResults(baselineResultsPath);
1686
- // In full mode, also extract test results from agentic results
1777
+ // Extract and persist per-test results (D0029: model output + metadata).
1778
+ // Literacy gold (with-docs) entries score against the default profile;
1779
+ // baseline (without-docs) entries score against the output-only profile.
1780
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1781
+ gold: goldProfile,
1782
+ baseline: baselineProfileWeights,
1783
+ });
1784
+ // In full mode, also extract test results from agentic results — the
1785
+ // agentic file's gold entries score against the agentic profile while
1786
+ // baseline entries (if any leak through) still use the literacy baseline.
1687
1787
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1688
- const agenticTestResults = extractStoredTestResults(agenticResultsPath);
1788
+ const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
1789
+ gold: agenticProfile,
1790
+ baseline: baselineProfileWeights,
1791
+ });
1689
1792
  testResults.push(...agenticTestResults);
1690
1793
  }
1691
1794
  if (testResults.length > 0) {
@@ -0,0 +1,52 @@
1
+ /**
2
+ * pipeline/enrichment-preconditions.ts
3
+ *
4
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
5
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
6
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
7
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
8
+ *
9
+ * The degraded case is the failure these steps must stop swallowing:
10
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
11
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
12
+ * and the report ships with no test details while still showing a score.
13
+ * Distinguishing the two is what lets the steps fail loud instead of returning
14
+ * a benign `skipped`.
15
+ */
16
+ /**
17
+ * Outcome of classifying the enrichment inputs under `results/latest/`.
18
+ *
19
+ * - `ready` — `grader-judgments.json` is present and non-empty; enrichment
20
+ * can run.
21
+ * - `no-full-eval` — no graded eval produced judgments this run. A legitimate
22
+ * skip: standalone gap-analysis on cached results, a non-graded run, or an
23
+ * eval that scored nothing.
24
+ * - `judgments-missing-after-eval` — a full eval scored tests
25
+ * (`score-summary.json` carries `testCount > 0`) yet `grader-judgments.json`
26
+ * is missing or empty. This is the degraded condition the steps surface.
27
+ */
28
+ export type EnrichmentInputs = {
29
+ kind: "ready";
30
+ judgmentCount: number;
31
+ } | {
32
+ kind: "no-full-eval";
33
+ } | {
34
+ kind: "judgments-missing-after-eval";
35
+ scoredTestCount: number;
36
+ };
37
+ /**
38
+ * Classify the enrichment inputs for a run by inspecting
39
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
40
+ *
41
+ * Pure read-only filesystem inspection — never throws on malformed input; a
42
+ * file that does not parse to the expected shape is treated as absent so that
43
+ * "no usable judgments" and "no usable summary" both collapse to a single
44
+ * branch.
45
+ */
46
+ export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
47
+ /**
48
+ * Build the fail-loud error message for the degraded
49
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
50
+ * pipeline-result and job-document surfaces carry one consistent wording.
51
+ */
52
+ export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;
@@ -0,0 +1,84 @@
1
+ /**
2
+ * pipeline/enrichment-preconditions.ts
3
+ *
4
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
5
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
6
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
7
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
8
+ *
9
+ * The degraded case is the failure these steps must stop swallowing:
10
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
11
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
12
+ * and the report ships with no test details while still showing a score.
13
+ * Distinguishing the two is what lets the steps fail loud instead of returning
14
+ * a benign `skipped`.
15
+ */
16
+ import { existsSync, readFileSync } from "node:fs";
17
+ import { resolve } from "node:path";
18
+ /**
19
+ * Classify the enrichment inputs for a run by inspecting
20
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
21
+ *
22
+ * Pure read-only filesystem inspection — never throws on malformed input; a
23
+ * file that does not parse to the expected shape is treated as absent so that
24
+ * "no usable judgments" and "no usable summary" both collapse to a single
25
+ * branch.
26
+ */
27
+ export function classifyEnrichmentInputs(rootDir) {
28
+ const judgmentCount = countGraderJudgments(rootDir);
29
+ if (judgmentCount > 0) {
30
+ return { kind: "ready", judgmentCount };
31
+ }
32
+ const scoredTestCount = scoredTestCountFromSummary(rootDir);
33
+ if (scoredTestCount > 0) {
34
+ return { kind: "judgments-missing-after-eval", scoredTestCount };
35
+ }
36
+ return { kind: "no-full-eval" };
37
+ }
38
+ /**
39
+ * Build the fail-loud error message for the degraded
40
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
41
+ * pipeline-result and job-document surfaces carry one consistent wording.
42
+ */
43
+ export function degradedEnrichmentError(step, scoredTestCount) {
44
+ return (`${step}: grader-judgments.json missing after a full eval — ` +
45
+ `${scoredTestCount} test(s) scored but 0 grader judgments persisted. ` +
46
+ `The report is marked degraded rather than published as healthy.`);
47
+ }
48
+ /**
49
+ * Count the judgments in `grader-judgments.json`. Returns 0 when the file is
50
+ * absent, unreadable, not valid JSON, or not an array — every "no usable
51
+ * judgments" shape collapses to 0 so callers branch on a single number. An
52
+ * empty array is therefore indistinguishable from a missing file by design
53
+ * (both are "no judgments persisted").
54
+ */
55
+ function countGraderJudgments(rootDir) {
56
+ const path = resolve(rootDir, "results", "latest", "grader-judgments.json");
57
+ if (!existsSync(path))
58
+ return 0;
59
+ try {
60
+ const parsed = JSON.parse(readFileSync(path, "utf-8"));
61
+ return Array.isArray(parsed) ? parsed.length : 0;
62
+ }
63
+ catch {
64
+ return 0;
65
+ }
66
+ }
67
+ /**
68
+ * Sum the per-area `testCount` from `score-summary.json` — the signal that a
69
+ * full eval scored tests this run. Returns 0 when the summary is absent,
70
+ * unreadable, or carries no scored tests.
71
+ */
72
+ function scoredTestCountFromSummary(rootDir) {
73
+ const path = resolve(rootDir, "results", "latest", "score-summary.json");
74
+ if (!existsSync(path))
75
+ return 0;
76
+ try {
77
+ const parsed = JSON.parse(readFileSync(path, "utf-8"));
78
+ const scores = Array.isArray(parsed.scores) ? parsed.scores : [];
79
+ return scores.reduce((sum, s) => sum + (typeof s.testCount === "number" ? s.testCount : 0), 0);
80
+ }
81
+ catch {
82
+ return 0;
83
+ }
84
+ }