@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  22. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  23. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
  25. package/dist/_vendor/ailf-core/types/index.js +1 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  27. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  28. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  29. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  30. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  31. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  32. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  33. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  34. package/dist/adapters/api-client/build-request.d.ts +1 -0
  35. package/dist/adapters/api-client/build-request.js +3 -0
  36. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  37. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
  38. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
  39. package/dist/adapters/index.d.ts +1 -0
  40. package/dist/adapters/index.js +1 -0
  41. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  42. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  43. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  44. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  45. package/dist/adapters/package-surface/index.d.ts +9 -0
  46. package/dist/adapters/package-surface/index.js +8 -0
  47. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  48. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  49. package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
  50. package/dist/adapters/task-sources/repo-schemas.js +15 -0
  51. package/dist/commands/pipeline-action.d.ts +2 -0
  52. package/dist/commands/pipeline-action.js +12 -0
  53. package/dist/commands/remote-pipeline.js +10 -2
  54. package/dist/commands/remote-results.d.ts +12 -1
  55. package/dist/commands/remote-results.js +25 -5
  56. package/dist/composition-root.js +9 -0
  57. package/dist/config/package-surface.ts +37 -0
  58. package/dist/config/preflight-scoring.ts +26 -0
  59. package/dist/index.d.ts +2 -2
  60. package/dist/index.js +1 -1
  61. package/dist/orchestration/build-app-context.js +1 -0
  62. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  63. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  64. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  65. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  66. package/dist/orchestration/steps/run-eval-step.js +27 -0
  67. package/dist/pipeline/calculate-scores.d.ts +66 -5
  68. package/dist/pipeline/calculate-scores.js +141 -27
  69. package/dist/pipeline/compiler/index.d.ts +1 -1
  70. package/dist/pipeline/compiler/index.js +1 -1
  71. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  72. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/map-request-to-config.js +1 -0
  85. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  86. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  88. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  89. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  90. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  91. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  92. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  94. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  95. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  96. package/dist/pipeline/preflight/parse-imports.js +125 -0
  97. package/dist/report-store.d.ts +8 -0
  98. package/dist/report-store.js +55 -6
  99. package/dist/sanity/document-renderers.d.ts +45 -7
  100. package/dist/sanity/document-renderers.js +99 -13
  101. package/dist/sanity/queries.d.ts +11 -11
  102. package/dist/sanity/queries.js +7 -0
  103. package/dist/sanity/symbol-index.d.ts +98 -0
  104. package/dist/sanity/symbol-index.js +615 -0
  105. package/package.json +2 -1
@@ -8,6 +8,8 @@
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
10
  import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
11
+ import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
12
+ import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
11
13
  import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
12
14
  import { getStepInputPaths } from "../../pipeline/cache.js";
13
15
  import { buildCacheContext } from "../cache-context.js";
@@ -90,6 +92,7 @@ export class RunEvalStep {
90
92
  graderModel: loadGraderModel(rootDir).id,
91
93
  mode: this.mode,
92
94
  rootDir,
95
+ graderContext: ctx.config.graderContext,
93
96
  });
94
97
  // Share fingerprint with downstream steps (PublishReportStep)
95
98
  state.evalFingerprint = evalFingerprint;
@@ -224,6 +227,30 @@ export class RunEvalStep {
224
227
  const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
225
228
  if (existsSync(resultsPath)) {
226
229
  await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
230
+ // W0198 Phase 4 — deterministic-lane reports per (task, model).
231
+ // Loaded lazily so test contexts that don't wire the manifest /
232
+ // resolver pay nothing; the helper is itself a no-op when its
233
+ // inputs are missing.
234
+ const packageSurface = await loadPackageSurface(rootDir).catch((err) => {
235
+ console.warn(` ⚠️ W0198 preflight: failed to load package-surface manifest — ${err instanceof Error ? err.message : String(err)}`);
236
+ return undefined;
237
+ });
238
+ const preflight = await emitSymbolPreflight({
239
+ writer: ctx.artifactWriter,
240
+ ctx,
241
+ mode: this.mode,
242
+ resultsPath,
243
+ packageSurface,
244
+ resolver: ctx.packageSurfaceResolver,
245
+ });
246
+ if (preflight.reports.size > 0) {
247
+ if (!state.preflightReports) {
248
+ state.preflightReports = new Map();
249
+ }
250
+ for (const [k, v] of preflight.reports) {
251
+ state.preflightReports.set(k, v);
252
+ }
253
+ }
227
254
  }
228
255
  // Extract Promptfoo share URL from eval results (Step 3b)
229
256
  if (ctx.evalRunner.extractShareUrl) {
@@ -1,6 +1,7 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
3
  import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
+ import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
4
5
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
6
  export interface PromptfooResultsWrapper {
6
7
  results: RawTestResult[];
@@ -60,6 +61,34 @@ export interface RawTestResult {
60
61
  };
61
62
  vars: Record<string, string>;
62
63
  }
64
+ /**
65
+ * Aggregate counts across every preflight report emitted by the run.
66
+ *
67
+ * `unresolvedRate` is `unresolved / totalFindings` in `[0, 1]`, set only
68
+ * when `totalFindings > 0`. The vacuous case (`totalFindings === 0` —
69
+ * reports exist but every candidate cited zero in-scope bindings) is
70
+ * deliberately distinguished from "every binding resolved cleanly"
71
+ * (`unresolvedRate === 0`) so a CI threshold like `unresolvedRate > 0.1`
72
+ * doesn't fire green on a run that had nothing to resolve.
73
+ */
74
+ export interface PreflightSummary {
75
+ /** Number of per-test preflight reports the run emitted. */
76
+ reportCount: number;
77
+ /** Total findings across all reports. */
78
+ totalFindings: number;
79
+ /** Findings classified `exists`. */
80
+ exists: number;
81
+ /** Findings classified `missing` — the deterministic-deduction lane. */
82
+ missing: number;
83
+ /** Findings classified `unresolved` — the resolver-couldn't-answer lane. */
84
+ unresolved: number;
85
+ /**
86
+ * `unresolved / totalFindings` in `[0, 1]`. Absent when `totalFindings`
87
+ * is zero — distinguishes "nothing to resolve" from "all resolutions
88
+ * succeeded" so CI thresholds aren't vacuously green.
89
+ */
90
+ unresolvedRate?: number;
91
+ }
63
92
  /**
64
93
  * Calculate scores grouped by model. Each model gets its own FeatureScore[]
65
94
  * and model-level aggregates.
@@ -70,7 +99,7 @@ export interface RawTestResult {
70
99
  * @returns Record keyed by model ID, or null if only one model was used
71
100
  * (per-model breakdown is redundant when there's only one model).
72
101
  */
73
- export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
102
+ export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): null | PerModelEntry[];
74
103
  /**
75
104
  * Extract grader judgments (reason text + scores) from evaluation results.
76
105
  *
@@ -91,6 +120,19 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
91
120
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
92
121
  */
93
122
  export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
123
+ /**
124
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
125
+ * resolver-health summary. Returns `undefined` when the run had no
126
+ * preflight reports (manifest disabled, resolver missing, or every
127
+ * candidate output cited zero in-scope packages) so the consumer can
128
+ * cleanly omit the field from the score summary instead of writing a
129
+ * vacuous block of zeros.
130
+ *
131
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
132
+ * production calls go through `calculateAndWriteScores`, which threads
133
+ * the result into the `EvalScoreSummary.preflight` field.
134
+ */
135
+ export declare function summarizePreflight(reports: Map<string, SymbolPreflightReport> | undefined): PreflightSummary | undefined;
94
136
  /**
95
137
  * Score knowledge-probe evaluation results.
96
138
  *
@@ -105,7 +147,7 @@ export declare function extractStoredTestResults(resultsPath: string): StoredTes
105
147
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
106
148
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
107
149
  */
108
- export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
150
+ export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): FeatureScore[];
109
151
  /**
110
152
  * Score agentic evaluation results. In agentic mode, all test entries are
111
153
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -113,7 +155,7 @@ export declare function scoreKnowledgeProbeResults(results: TestResult[], profil
113
155
  *
114
156
  * Returns a record keyed by feature area with the composite actual score.
115
157
  */
116
- export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
158
+ export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, ActualScoreEntry>;
117
159
  /**
118
160
  * Score agentic results broken down by model.
119
161
  *
@@ -121,7 +163,7 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
121
163
  * producing a map of model → feature → ActualScoreEntry.
122
164
  * Used to enrich the per-model breakdown with actual scores in full mode.
123
165
  */
124
- export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
166
+ export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
125
167
  /** Options for the calculate-scores main() function. */
126
168
  export interface CalculateScoresOptions {
127
169
  /** Allowed origins for source isolation reporting */
@@ -130,12 +172,31 @@ export interface CalculateScoresOptions {
130
172
  logger?: Logger;
131
173
  /** Evaluation mode (controls which result files are read) */
132
174
  mode?: string;
175
+ /**
176
+ * W0198 — symbol-preflight reports keyed by `${runId}/${mode}/${task}/${model}`,
177
+ * populated by `RunEvalStep` via `emitSymbolPreflight`. When provided, the
178
+ * scoring engine merges deterministic preflight findings into the
179
+ * `code-correctness` dimension. Absence (or empty map) collapses cleanly
180
+ * to the pre-W0198 rubric-only path.
181
+ */
182
+ preflightReports?: Map<string, SymbolPreflightReport>;
183
+ /**
184
+ * W0198 — preflight's share of `code-correctness` in `[0, 1]`. Defaults
185
+ * to `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` when omitted.
186
+ */
187
+ preflightWeight?: number;
133
188
  /** Pre-resolved source config (skips loadSource() call) */
134
189
  resolvedSource?: ResolvedSourceConfig;
135
190
  /** Path to baseline results file (default: results/latest/eval-results.json) */
136
191
  resultsPath?: string;
137
192
  /** Root directory of the eval package (required) */
138
193
  rootDir: string;
194
+ /**
195
+ * W0198 — runId axis used to look up preflight reports. Required when
196
+ * `preflightReports` is provided; otherwise the lookup callback can't
197
+ * reconstruct the right key.
198
+ */
199
+ runId?: string;
139
200
  /** Search mode for source verification metadata */
140
201
  searchMode?: string;
141
202
  /** Documentation source name */
@@ -29,7 +29,7 @@
29
29
  */
30
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
31
31
  import { join } from "path";
32
- import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
32
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
33
33
  import { calculateCost } from "../agent-observer/pricing.js";
34
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
35
  import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
@@ -38,7 +38,7 @@ import { loadRubricTemplates } from "./rubric-loader.js";
38
38
  import { resolveProfile } from "./profile-resolution.js";
39
39
  import { loadSource } from "../sources.js";
40
40
  import { LiteracyVariant } from "./normalize-mode.js";
41
- import { scoreTestGroup } from "./compiler/scoring-bridge.js";
41
+ import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
42
42
  // Re-export from core for backward compatibility.
43
43
  // Existing imports from this file continue to work unchanged.
44
44
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -52,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
52
52
  * @returns Record keyed by model ID, or null if only one model was used
53
53
  * (per-model breakdown is redundant when there's only one model).
54
54
  */
55
- export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
55
+ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile, preflightOptions) {
56
56
  const results = readAndNormalizeResults(resultsPath);
57
57
  // Group results by provider
58
58
  const byModel = {};
@@ -72,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
72
72
  }
73
73
  const perModel = [];
74
74
  for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
75
- const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
75
+ const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId, preflightOptions);
76
76
  const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
77
77
  const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
78
78
  const avgScore = scores.length > 0
@@ -408,9 +408,111 @@ function buildSourceVerification(root, source, verificationCtx) {
408
408
  * Calculate overall scores (all models combined).
409
409
  * This is the original scoring path — backward compatible.
410
410
  */
411
- function calculateScores(resultsPath, goldProfile, baselineProfile) {
411
+ function calculateScores(resultsPath, goldProfile, baselineProfile, preflightOptions) {
412
412
  const results = readAndNormalizeResults(resultsPath);
413
- return scoreResults(results, goldProfile, baselineProfile);
413
+ return scoreResults(results, goldProfile, baselineProfile, undefined, preflightOptions);
414
+ }
415
+ /**
416
+ * W0198 — build a `ScoreTestGroupOptions` that the scoring bridge can
417
+ * use to look up a `SymbolPreflightReport` for any given `TestResult`.
418
+ *
419
+ * Mirrors the keying scheme `emitSymbolPreflight` uses:
420
+ * `${runId}/${mode}/${task}/${model}` where `(mode, task)` come from
421
+ * `resolveVariantMode(test.description, defaultMode)`.
422
+ *
423
+ * Returns `undefined` (effectively a no-op) when reports are absent,
424
+ * empty, or the runId hasn't been provided — those collapse cleanly
425
+ * to the pre-W0198 path. The runId branch logs a warning when reports
426
+ * exist but the caller forgot to wire `runId` so the silent
427
+ * preflight-disabled state doesn't go unobserved.
428
+ */
429
+ function makePreflightOptions(reports, runId, defaultMode, weight, logger) {
430
+ if (!reports || reports.size === 0)
431
+ return undefined;
432
+ if (!runId) {
433
+ logger?.warn(`[warn] W0198 preflight: ${reports.size} preflight report(s) provided but no runId — skipping merge into code-correctness`);
434
+ return undefined;
435
+ }
436
+ return {
437
+ preflightWeight: weight,
438
+ preflightForTest: (test) => {
439
+ const modelId = test.providerId ?? test.providerLabel ?? "unknown-model";
440
+ const { mode: axisMode, task } = resolveVariantMode(test.description, defaultMode);
441
+ const key = `${runId}/${axisMode}/${task}/${modelId}`;
442
+ return reports.get(key);
443
+ },
444
+ };
445
+ }
446
+ /**
447
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
448
+ * resolver-health summary. Returns `undefined` when the run had no
449
+ * preflight reports (manifest disabled, resolver missing, or every
450
+ * candidate output cited zero in-scope packages) so the consumer can
451
+ * cleanly omit the field from the score summary instead of writing a
452
+ * vacuous block of zeros.
453
+ *
454
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
455
+ * production calls go through `calculateAndWriteScores`, which threads
456
+ * the result into the `EvalScoreSummary.preflight` field.
457
+ */
458
+ export function summarizePreflight(reports) {
459
+ if (!reports || reports.size === 0)
460
+ return undefined;
461
+ let totalFindings = 0;
462
+ let exists = 0;
463
+ let missing = 0;
464
+ let unresolved = 0;
465
+ for (const report of reports.values()) {
466
+ for (const finding of report.findings) {
467
+ totalFindings++;
468
+ if (finding.result === "exists") {
469
+ exists++;
470
+ }
471
+ else if (finding.result === "missing") {
472
+ missing++;
473
+ }
474
+ else if (finding.result === "unresolved") {
475
+ unresolved++;
476
+ }
477
+ else {
478
+ // Exhaustiveness guard: a future fourth `result` variant lands
479
+ // here and surfaces as a build error rather than silently
480
+ // counting into `unresolved`.
481
+ const _exhaustive = finding;
482
+ void _exhaustive;
483
+ }
484
+ }
485
+ }
486
+ return {
487
+ reportCount: reports.size,
488
+ totalFindings,
489
+ exists,
490
+ missing,
491
+ unresolved,
492
+ ...(totalFindings > 0 && { unresolvedRate: unresolved / totalFindings }),
493
+ };
494
+ }
495
+ /**
496
+ * Print the preflight summary to the run log. Format mirrors the other
497
+ * single-line health signals (URL fetch, agent isolation) so CI grep can
498
+ * extract `unresolvedRate` directly from the log when score-summary.json
499
+ * isn't already in scope.
500
+ */
501
+ function printPreflightSummary(summary, log) {
502
+ if (!summary)
503
+ return;
504
+ // `unresolvedRate` is absent when the run produced reports but no
505
+ // findings — distinguish vacuous-green from all-resolved so CI doesn't
506
+ // misread the threshold.
507
+ const rateLabel = summary.unresolvedRate === undefined
508
+ ? "n/a (no findings)"
509
+ : `${(summary.unresolvedRate * 100).toFixed(1)}%`;
510
+ log.info("-".repeat(80));
511
+ log.info("SYMBOL PREFLIGHT (W0198)");
512
+ log.info("-".repeat(80));
513
+ log.info(` ${summary.reportCount} report(s), ${summary.totalFindings} finding(s): ${summary.exists} exists / ${summary.missing} missing / ${summary.unresolved} unresolved`);
514
+ log.info(` unresolvedRate: ${rateLabel} (resolver-health signal — not a candidate score factor)`);
515
+ log.info("");
414
516
  }
415
517
  /**
416
518
  * Extracts agent behavior summary from a test result's metadata.
@@ -644,7 +746,7 @@ function readAndNormalizeResults(resultsPath, log) {
644
746
  * @param baselineProfile Weight profile for baseline (without-docs) entries
645
747
  * @param modelId Optional model identifier to tag each FeatureScore
646
748
  */
647
- function scoreResults(results, goldProfile, baselineProfile, modelId) {
749
+ function scoreResults(results, goldProfile, baselineProfile, modelId, preflightOptions) {
648
750
  // Group by feature + docs/no-docs
649
751
  const byFeature = {};
650
752
  for (const result of results) {
@@ -663,12 +765,12 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
663
765
  const scores = [];
664
766
  for (const [feature, data] of Object.entries(byFeature)) {
665
767
  // --- With docs (gold / ceiling) — scored via 4-tier engine ---
666
- const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
768
+ const gold = scoreTestGroup(data.withDocs, goldProfile, feature, preflightOptions);
667
769
  // --- Without docs (baseline / floor) ---
668
770
  // Uses the baseline profile (e.g. "output-only") which may exclude
669
771
  // dimensions like doc-coverage that are undefined without docs.
670
772
  // See docs/design-docs/named-scoring-profiles.md.
671
- const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
773
+ const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature, preflightOptions);
672
774
  const featureCost = gold.totalCost + baseline.totalCost;
673
775
  const ceilingScore = gold.composite;
674
776
  const floorScore = baseline.composite;
@@ -709,7 +811,7 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
709
811
  * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
710
812
  * are set to 0 for backward compatibility with downstream consumers.
711
813
  */
712
- function scoreAgentHarnessResults(results, profile) {
814
+ function scoreAgentHarnessResults(results, profile, preflightOptions) {
713
815
  // Group by task ID (extracted from description: "task-id — Title")
714
816
  const byTask = {};
715
817
  for (const result of results) {
@@ -721,7 +823,7 @@ function scoreAgentHarnessResults(results, profile) {
721
823
  }
722
824
  const scores = [];
723
825
  for (const [taskId, taskResults] of Object.entries(byTask)) {
724
- const scored = scoreTestGroup(taskResults, profile, taskId);
826
+ const scored = scoreTestGroup(taskResults, profile, taskId, preflightOptions);
725
827
  const totalCost = scored.totalCost;
726
828
  // Detect feature area for backward compat (used by report grouping)
727
829
  const feature = taskResults[0]?.vars.__featureArea ??
@@ -774,7 +876,7 @@ function extractTaskId(description) {
774
876
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
775
877
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
776
878
  */
777
- export function scoreKnowledgeProbeResults(results, profile) {
879
+ export function scoreKnowledgeProbeResults(results, profile, preflightOptions) {
778
880
  const byFeature = {};
779
881
  for (const result of results) {
780
882
  const feature = result.vars.__featureArea || detectFeatureArea(result.description);
@@ -785,7 +887,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
785
887
  }
786
888
  const scores = [];
787
889
  for (const [feature, featureResults] of Object.entries(byFeature)) {
788
- const scored = scoreTestGroup(featureResults, profile, feature);
890
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
789
891
  scores.push({
790
892
  assertionPassRate: scored.dimensions.assertionPassRate,
791
893
  ceilingScore: 0,
@@ -817,7 +919,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
817
919
  * Returns a record keyed by feature area with the composite actual score.
818
920
  */
819
921
  // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
820
- export function scoreAgenticResults(resultsPath, profile) {
922
+ export function scoreAgenticResults(resultsPath, profile, preflightOptions) {
821
923
  const results = readAndNormalizeResults(resultsPath);
822
924
  // Group by feature area
823
925
  const byFeature = {};
@@ -830,7 +932,7 @@ export function scoreAgenticResults(resultsPath, profile) {
830
932
  }
831
933
  const entries = {};
832
934
  for (const [feature, featureResults] of Object.entries(byFeature)) {
833
- const scored = scoreTestGroup(featureResults, profile, feature);
935
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
834
936
  entries[feature] = {
835
937
  actualScore: scored.composite,
836
938
  codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -849,7 +951,7 @@ export function scoreAgenticResults(resultsPath, profile) {
849
951
  * producing a map of model → feature → ActualScoreEntry.
850
952
  * Used to enrich the per-model breakdown with actual scores in full mode.
851
953
  */
852
- export function scoreAgenticResultsPerModel(resultsPath, profile) {
954
+ export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptions) {
853
955
  const results = readAndNormalizeResults(resultsPath);
854
956
  // Group by model, then feature
855
957
  const byModel = {};
@@ -866,7 +968,7 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
866
968
  for (const [modelId, features] of Object.entries(byModel)) {
867
969
  perModel[modelId] = {};
868
970
  for (const [feature, featureResults] of Object.entries(features)) {
869
- const scored = scoreTestGroup(featureResults, profile, feature);
971
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
870
972
  perModel[modelId][feature] = {
871
973
  actualScore: scored.composite,
872
974
  codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -912,6 +1014,15 @@ export function calculateAndWriteScores(options) {
912
1014
  }
913
1015
  // Determine mode — controls which result files are read
914
1016
  const mode = options.mode ?? LiteracyVariant.STANDARD;
1017
+ // W0198 — assemble preflight options once. The helper returns
1018
+ // `undefined` when reports / runId are missing, so all downstream
1019
+ // callers handle the no-preflight case uniformly.
1020
+ const preflightOptions = makePreflightOptions(options.preflightReports, options.runId, mode, options.preflightWeight, log);
1021
+ // W0198 — resolver-health summary. Independent of `preflightOptions`
1022
+ // (which gates the score merge): when reports exist but the runId is
1023
+ // missing, scoring stays on the rubric-only path while telemetry still
1024
+ // surfaces, so the resolver's drift remains visible.
1025
+ const preflightSummary = summarizePreflight(options.preflightReports);
915
1026
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
916
1027
  // Agentic results path (only used in full mode)
917
1028
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -940,7 +1051,7 @@ export function calculateAndWriteScores(options) {
940
1051
  const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
941
1052
  log.debug("Agent-harness scoring profile", agentProfile);
942
1053
  const results = readAndNormalizeResults(baselineResultsPath);
943
- const scores = scoreAgentHarnessResults(results, agentProfile);
1054
+ const scores = scoreAgentHarnessResults(results, agentProfile, preflightOptions);
944
1055
  log.debug("Agent-harness scores calculated", {
945
1056
  taskCount: scores.length,
946
1057
  tasks: scores.map((s) => ({
@@ -960,7 +1071,7 @@ export function calculateAndWriteScores(options) {
960
1071
  const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
961
1072
  graderCost, null, // no per-model breakdown
962
1073
  null, // no source isolation
963
- sourceVerification, "agent-harness", log);
1074
+ sourceVerification, "agent-harness", log, preflightSummary);
964
1075
  // Persist
965
1076
  const outDir = join(ROOT, "results", "latest");
966
1077
  mkdirSync(outDir, { recursive: true });
@@ -992,7 +1103,7 @@ export function calculateAndWriteScores(options) {
992
1103
  const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
993
1104
  log.debug("Knowledge-probe scoring profile", probeProfile);
994
1105
  const results = readAndNormalizeResults(baselineResultsPath);
995
- const scores = scoreKnowledgeProbeResults(results, probeProfile);
1106
+ const scores = scoreKnowledgeProbeResults(results, probeProfile, preflightOptions);
996
1107
  log.debug("Knowledge-probe scores calculated", {
997
1108
  featureCount: scores.length,
998
1109
  features: scores.map((s) => ({
@@ -1012,7 +1123,7 @@ export function calculateAndWriteScores(options) {
1012
1123
  const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
1013
1124
  graderCost, null, // no per-model breakdown for now
1014
1125
  null, // no source isolation — KP doesn't fetch sources
1015
- sourceVerification, "knowledge-probe", log);
1126
+ sourceVerification, "knowledge-probe", log, preflightSummary);
1016
1127
  // Persist
1017
1128
  const outDir = join(ROOT, "results", "latest");
1018
1129
  mkdirSync(outDir, { recursive: true });
@@ -1041,7 +1152,7 @@ export function calculateAndWriteScores(options) {
1041
1152
  gold: goldProfile,
1042
1153
  baseline: baselineProfileWeights,
1043
1154
  });
1044
- const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
1155
+ const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1045
1156
  log.debug("Baseline scores calculated", {
1046
1157
  featureCount: baselineScores.length,
1047
1158
  features: baselineScores.map((s) => ({
@@ -1051,7 +1162,7 @@ export function calculateAndWriteScores(options) {
1051
1162
  docLift: s.docLift,
1052
1163
  })),
1053
1164
  });
1054
- const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
1165
+ const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1055
1166
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
1056
1167
  const sourceVerification = buildSourceVerification(ROOT, source, {
1057
1168
  allowedOrigins: options.allowedOrigins,
@@ -1067,7 +1178,7 @@ export function calculateAndWriteScores(options) {
1067
1178
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1068
1179
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
1069
1180
  const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
1070
- const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
1181
+ const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
1071
1182
  log.debug("Agentic scores calculated", {
1072
1183
  featureCount: Object.keys(agenticScores).length,
1073
1184
  features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -1080,7 +1191,7 @@ export function calculateAndWriteScores(options) {
1080
1191
  evaluationMode = LiteracyVariant.FULL;
1081
1192
  // Merge agentic actual scores into the per-model breakdown
1082
1193
  if (perModel) {
1083
- const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
1194
+ const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile, preflightOptions);
1084
1195
  for (const entry of perModel) {
1085
1196
  const modelAgentic = agenticPerModel[entry.modelId];
1086
1197
  if (modelAgentic) {
@@ -1115,7 +1226,7 @@ export function calculateAndWriteScores(options) {
1115
1226
  ? LiteracyVariant.OBSERVED
1116
1227
  : LiteracyVariant.STANDARD;
1117
1228
  }
1118
- const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
1229
+ const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary);
1119
1230
  // Persist
1120
1231
  const outDir = join(ROOT, "results", "latest");
1121
1232
  mkdirSync(outDir, { recursive: true });
@@ -1269,7 +1380,7 @@ function printPerModelReport(perModel, log) {
1269
1380
  // ---------------------------------------------------------------------------
1270
1381
  // Main
1271
1382
  // ---------------------------------------------------------------------------
1272
- function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
1383
+ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary) {
1273
1384
  const _log = log ?? new ConsoleLogger();
1274
1385
  _log.info("\n" + "=".repeat(80));
1275
1386
  _log.info(" SANITY AI LITERACY SCORE REPORT");
@@ -1428,6 +1539,8 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1428
1539
  if (perModel) {
1429
1540
  printPerModelReport(perModel, _log);
1430
1541
  }
1542
+ // W0198 — symbol preflight resolver-health summary
1543
+ printPreflightSummary(preflightSummary, _log);
1431
1544
  // URL References
1432
1545
  printUrlReport(urlRefs, _log);
1433
1546
  // Agent Behavior (only present when run with instrumented provider)
@@ -1557,6 +1670,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1557
1670
  }
1558
1671
  : undefined,
1559
1672
  ...(perModel && { perModel }),
1673
+ ...(preflightSummary && { preflight: preflightSummary }),
1560
1674
  ...(sourceIsolation && { sourceIsolation }),
1561
1675
  ...(sourceVerification && { sourceVerification }),
1562
1676
  timestamp: new Date().toISOString(),
@@ -21,6 +21,6 @@ export { checkBudget, classifyToolCall, classifyToolCalls, collectTrace, compute
21
21
  export { registerSanityLiteracyPreset, sanityLiteracyPreset, } from "./presets/index.js";
22
22
  export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from "./ignore-fields.js";
23
23
  export { simpleHash } from "./hash.js";
24
- export { scoreTestGroup, type BridgedScoreResult } from "./scoring-bridge.js";
24
+ export { preflightToScore, scoreTestGroup, type BridgedScoreResult, type ScoreTestGroupOptions, } from "./scoring-bridge.js";
25
25
  export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
26
26
  export type { ConfigLoadResult } from "./config-loader.js";
@@ -37,6 +37,6 @@ export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from
37
37
  // Hash utility
38
38
  export { simpleHash } from "./hash.js";
39
39
  // Scoring bridge — 4-tier engine integration
40
- export { scoreTestGroup } from "./scoring-bridge.js";
40
+ export { preflightToScore, scoreTestGroup, } from "./scoring-bridge.js";
41
41
  // Unified config loader
42
42
  export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
@@ -20,6 +20,7 @@
20
20
  */
21
21
  import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
22
22
  import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
23
+ import type { PreflightRubricContext } from "./rubric-resolution.js";
23
24
  import { type LiteracyEvalSubMode } from "../normalize-mode.js";
24
25
  /** Options for compiling all literacy tasks via the new compiler */
25
26
  export interface LiteracyBridgeOptions {
@@ -35,6 +36,14 @@ export interface LiteracyBridgeOptions {
35
36
  label: string;
36
37
  config?: Record<string, unknown>;
37
38
  }[];
39
+ /** Grader context policy passed through to `compileLiteracyTask`. */
40
+ graderContext?: "rubric-only" | "with-docs";
41
+ /**
42
+ * W0198 Phase 6 — preflight context passed through to every task's
43
+ * `code-correctness` rubric so the grader treats the deterministic
44
+ * lane's existence verdicts as ground truth.
45
+ */
46
+ preflightContext?: PreflightRubricContext;
38
47
  }
39
48
  /** Result of compiling all literacy tasks */
40
49
  export interface LiteracyBridgeResult {
@@ -73,6 +73,8 @@ export function compileLiteracyTasks(tasks, options) {
73
73
  evalMode: options.evalMode,
74
74
  models: options.models,
75
75
  rubricConfig,
76
+ graderContext: options.graderContext,
77
+ preflightContext: options.preflightContext,
76
78
  };
77
79
  for (const node of orderedNodes) {
78
80
  const task = taskMap.get(node.taskId);
@@ -7,7 +7,7 @@
7
7
  import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
9
  import type { LiteracyCompileOptions } from "./types.js";
10
- export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[];
10
+ export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
11
11
  /**
12
12
  * Build baseline assertions matching the legacy expand-tasks behavior.
13
13
  *