npm - @sanity/ailf - Versions diffs - 4.2.0 → 4.3.1 - Mend

@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

package/config/package-surface.ts +37 -0
package/config/preflight-scoring.ts +26 -0
package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
package/dist/_vendor/ailf-core/config-helpers.js +67 -0
package/dist/_vendor/ailf-core/index.d.ts +1 -1
package/dist/_vendor/ailf-core/index.js +1 -1
package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
package/dist/_vendor/ailf-core/ports/index.js +1 -0
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/index.js +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
package/dist/_vendor/ailf-core/types/index.js +1 -0
package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
package/dist/adapters/api-client/build-request.d.ts +1 -0
package/dist/adapters/api-client/build-request.js +3 -0
package/dist/adapters/config-sources/file-config-adapter.js +1 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
package/dist/adapters/index.d.ts +1 -0
package/dist/adapters/index.js +1 -0
package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
package/dist/adapters/package-surface/dts-package-surface.js +173 -0
package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
package/dist/adapters/package-surface/index.d.ts +9 -0
package/dist/adapters/package-surface/index.js +8 -0
package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
package/dist/adapters/task-sources/repo-schemas.js +15 -0
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +12 -0
package/dist/commands/remote-pipeline.js +10 -2
package/dist/commands/remote-results.d.ts +12 -1
package/dist/commands/remote-results.js +25 -5
package/dist/composition-root.js +9 -0
package/dist/config/package-surface.ts +37 -0
package/dist/config/preflight-scoring.ts +26 -0
package/dist/index.d.ts +2 -2
package/dist/index.js +1 -1
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
package/dist/orchestration/pipeline-orchestrator.js +38 -0
package/dist/orchestration/steps/calculate-scores-step.js +11 -0
package/dist/orchestration/steps/generate-configs-step.js +16 -1
package/dist/orchestration/steps/run-eval-step.js +27 -0
package/dist/pipeline/calculate-scores.d.ts +66 -5
package/dist/pipeline/calculate-scores.js +141 -27
package/dist/pipeline/compiler/index.d.ts +1 -1
package/dist/pipeline/compiler/index.js +1 -1
package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
package/dist/pipeline/compiler/literacy-bridge.js +2 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
package/dist/pipeline/compiler/rubric-resolution.js +78 -2
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
package/dist/pipeline/compiler/scoring-bridge.js +104 -10
package/dist/pipeline/eval-fingerprint.d.ts +9 -0
package/dist/pipeline/eval-fingerprint.js +7 -1
package/dist/pipeline/map-request-to-config.js +1 -0
package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
package/dist/pipeline/preflight/compute-preflight.js +118 -0
package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
package/dist/pipeline/preflight/load-package-surface.js +19 -0
package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
package/dist/pipeline/preflight/load-preflight-context.js +25 -0
package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
package/dist/pipeline/preflight/parse-imports.js +125 -0
package/dist/report-store.d.ts +8 -0
package/dist/report-store.js +55 -6
package/dist/sanity/document-renderers.d.ts +45 -7
package/dist/sanity/document-renderers.js +99 -13
package/dist/sanity/queries.d.ts +11 -11
package/dist/sanity/queries.js +7 -0
package/dist/sanity/symbol-index.d.ts +98 -0
package/dist/sanity/symbol-index.js +615 -0
package/package.json +2 -1

package/dist/orchestration/steps/run-eval-step.js CHANGED Viewed

@@ -8,6 +8,8 @@
 import { existsSync, mkdirSync, writeFileSync } from "fs";
 import { resolve } from "path";
 import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
+import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
+import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
 import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
@@ -90,6 +92,7 @@ export class RunEvalStep {
                     graderModel: loadGraderModel(rootDir).id,
                     mode: this.mode,
                     rootDir,
+                    graderContext: ctx.config.graderContext,
                 });
                 // Share fingerprint with downstream steps (PublishReportStep)
                 state.evalFingerprint = evalFingerprint;
@@ -224,6 +227,30 @@ export class RunEvalStep {
         const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
         if (existsSync(resultsPath)) {
             await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
+            // W0198 Phase 4 — deterministic-lane reports per (task, model).
+            // Loaded lazily so test contexts that don't wire the manifest /
+            // resolver pay nothing; the helper is itself a no-op when its
+            // inputs are missing.
+            const packageSurface = await loadPackageSurface(rootDir).catch((err) => {
+                console.warn(`  ⚠️  W0198 preflight: failed to load package-surface manifest — ${err instanceof Error ? err.message : String(err)}`);
+                return undefined;
+            });
+            const preflight = await emitSymbolPreflight({
+                writer: ctx.artifactWriter,
+                ctx,
+                mode: this.mode,
+                resultsPath,
+                packageSurface,
+                resolver: ctx.packageSurfaceResolver,
+            });
+            if (preflight.reports.size > 0) {
+                if (!state.preflightReports) {
+                    state.preflightReports = new Map();
+                }
+                for (const [k, v] of preflight.reports) {
+                    state.preflightReports.set(k, v);
+                }
+            }
         }
         // Extract Promptfoo share URL from eval results (Step 3b)
         if (ctx.evalRunner.extractShareUrl) {

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
-import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
 import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
+import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
@@ -60,6 +61,34 @@ export interface RawTestResult {
     };
     vars: Record<string, string>;
 }
+/**
+ * Aggregate counts across every preflight report emitted by the run.
+ *
+ * `unresolvedRate` is `unresolved / totalFindings` in `[0, 1]` and is set
+ * only when `totalFindings > 0`. The vacuous case (`totalFindings === 0` —
+ * reports exist but every candidate cited zero in-scope bindings) leaves
+ * the field absent, which keeps it distinct from "every binding resolved
+ * cleanly" (`unresolvedRate === 0`). A CI gate such as `unresolvedRate > 0.1`
+ * can therefore detect, rather than silently pass, a run with nothing to resolve.
+ */
+export interface PreflightSummary {
+    /** Number of per-test preflight reports the run emitted. */
+    reportCount: number;
+    /** Total findings across all reports, whatever their classification. */
+    totalFindings: number;
+    /** Findings classified `exists`. */
+    exists: number;
+    /** Findings classified `missing` — the deterministic-deduction lane. */
+    missing: number;
+    /** Findings classified `unresolved` — the resolver-couldn't-answer lane. */
+    unresolved: number;
+    /**
+     * `unresolved / totalFindings` in `[0, 1]`. Absent when `totalFindings`
+     * is zero — distinguishes "nothing to resolve" from "all resolutions
+     * succeeded" so CI thresholds aren't vacuously green.
+     */
+    unresolvedRate?: number;
+}
 /**
  * Calculate scores grouped by model. Each model gets its own FeatureScore[]
  * and model-level aggregates.
@@ -70,7 +99,7 @@ export interface RawTestResult {
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
+export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): null | PerModelEntry[];
 /**
  * Extract grader judgments (reason text + scores) from evaluation results.
  *
@@ -91,6 +120,19 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
 export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
+/**
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
+ * resolver-health summary. Returns `undefined` when the run had no
+ * preflight reports (manifest disabled, resolver missing, or every
+ * candidate output cited zero in-scope packages) so the consumer can
+ * cleanly omit the field from the score summary instead of writing a
+ * vacuous block of zeros.
+ *
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
+ * production calls go through `calculateAndWriteScores`, which threads
+ * the result into the `EvalScoreSummary.preflight` field.
+ */
+export declare function summarizePreflight(reports: Map<string, SymbolPreflightReport> | undefined): PreflightSummary | undefined;
 /**
  * Score knowledge-probe evaluation results.
  *
@@ -105,7 +147,7 @@ export declare function extractStoredTestResults(resultsPath: string): StoredTes
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
  */
-export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
+export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): FeatureScore[];
 /**
  * Score agentic evaluation results. In agentic mode, all test entries are
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -113,7 +155,7 @@ export declare function scoreKnowledgeProbeResults(results: TestResult[], profil
  *
  * Returns a record keyed by feature area with the composite actual score.
  */
-export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
+export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, ActualScoreEntry>;
 /**
  * Score agentic results broken down by model.
  *
@@ -121,7 +163,7 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
+export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
 /** Options for the calculate-scores main() function. */
 export interface CalculateScoresOptions {
     /** Allowed origins for source isolation reporting */
@@ -130,12 +172,31 @@ export interface CalculateScoresOptions {
     logger?: Logger;
     /** Evaluation mode (controls which result files are read) */
     mode?: string;
+    /**
+     * W0198 — symbol-preflight reports keyed by `${runId}/${mode}/${task}/${model}`,
+     * populated by `RunEvalStep` via `emitSymbolPreflight`. When provided, the
+     * scoring engine merges deterministic preflight findings into the
+     * `code-correctness` dimension. Absence (or empty map) collapses cleanly
+     * to the pre-W0198 rubric-only path.
+     */
+    preflightReports?: Map<string, SymbolPreflightReport>;
+    /**
+     * W0198 — preflight's share of `code-correctness` in `[0, 1]`. Defaults
+     * to `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` when omitted.
+     */
+    preflightWeight?: number;
     /** Pre-resolved source config (skips loadSource() call) */
     resolvedSource?: ResolvedSourceConfig;
     /** Path to baseline results file (default: results/latest/eval-results.json) */
     resultsPath?: string;
     /** Root directory of the eval package (required) */
     rootDir: string;
+    /**
+     * W0198 — runId axis used to look up preflight reports. Required when
+     * `preflightReports` is provided; otherwise the lookup callback can't
+     * reconstruct the right key.
+     */
+    runId?: string;
     /** Search mode for source verification metadata */
     searchMode?: string;
     /** Documentation source name */

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -29,7 +29,7 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join } from "path";
-import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
+import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
 import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
@@ -38,7 +38,7 @@ import { loadRubricTemplates } from "./rubric-loader.js";
 import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
 import { LiteracyVariant } from "./normalize-mode.js";
-import { scoreTestGroup } from "./compiler/scoring-bridge.js";
+import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
 // Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -52,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
+export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group results by provider
     const byModel = {};
@@ -72,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
     }
     const perModel = [];
     for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
-        const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
+        const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId, preflightOptions);
         const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
         const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
         const avgScore = scores.length > 0
@@ -408,9 +408,111 @@ function buildSourceVerification(root, source, verificationCtx) {
  * Calculate overall scores (all models combined).
  * This is the original scoring path — backward compatible.
  */
-function calculateScores(resultsPath, goldProfile, baselineProfile) {
+function calculateScores(resultsPath, goldProfile, baselineProfile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
-    return scoreResults(results, goldProfile, baselineProfile);
+    return scoreResults(results, goldProfile, baselineProfile, undefined, preflightOptions);
+}
+/**
+ * W0198 — build the `ScoreTestGroupOptions` the scoring bridge uses to
+ * look up the `SymbolPreflightReport` for a given `TestResult`.
+ *
+ * Lookup key: `${runId}/${mode}/${task}/${model}`, intended to mirror the
+ * keys `emitSymbolPreflight` writes. `(mode, task)` come from
+ * `resolveVariantMode(test.description, defaultMode)`; `model` is
+ * `providerId`, else `providerLabel`, else `"unknown-model"`.
+ *
+ * Returns `undefined` (the pre-W0198 rubric-only path) when `reports` is
+ * absent or empty, or when `runId` is falsy. The `runId` branch also logs
+ * a warning (if a logger was passed) so the skipped merge is visible.
+ * `weight` is forwarded as `preflightWeight` unchanged.
+ */
+function makePreflightOptions(reports, runId, defaultMode, weight, logger) {
+    if (!reports || reports.size === 0)
+        return undefined;
+    if (!runId) { // falsy check: an empty-string runId is rejected too
+        logger?.warn(`[warn] W0198 preflight: ${reports.size} preflight report(s) provided but no runId — skipping merge into code-correctness`);
+        return undefined;
+    }
+    return {
+        preflightWeight: weight,
+        preflightForTest: (test) => {
+            const modelId = test.providerId ?? test.providerLabel ?? "unknown-model";
+            const { mode: axisMode, task } = resolveVariantMode(test.description, defaultMode);
+            const key = `${runId}/${axisMode}/${task}/${modelId}`;
+            return reports.get(key); // undefined when no report is stored under this key
+        },
+    };
+}
+/**
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
+ * resolver-health summary. Returns `undefined` when `reports` is absent
+ * or empty (manifest disabled, resolver missing, or every candidate
+ * output cited zero in-scope packages) so the consumer can omit the
+ * field instead of writing a vacuous block of zeros. Reports that exist
+ * but hold no findings yield zero counts and no `unresolvedRate` key.
+ *
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
+ * production calls go through `calculateAndWriteScores`, which threads
+ * the result into the `EvalScoreSummary.preflight` field.
+ */
+export function summarizePreflight(reports) {
+    if (!reports || reports.size === 0)
+        return undefined;
+    let totalFindings = 0;
+    let exists = 0;
+    let missing = 0;
+    let unresolved = 0;
+    for (const report of reports.values()) {
+        for (const finding of report.findings) {
+            totalFindings++; // counted before classification, so every finding lands here
+            if (finding.result === "exists") {
+                exists++;
+            }
+            else if (finding.result === "missing") {
+                missing++;
+            }
+            else if (finding.result === "unresolved") {
+                unresolved++;
+            }
+            else {
+                // Exhaustiveness guard: meant to make a future fourth `result`
+                // a build error (typing not visible in this emitted JS). At
+                // runtime it is a no-op; such a finding only bumps `totalFindings`.
+                const _exhaustive = finding;
+                void _exhaustive;
+            }
+        }
+    }
+    return {
+        reportCount: reports.size,
+        totalFindings,
+        exists,
+        missing,
+        unresolved,
+        ...(totalFindings > 0 && { unresolvedRate: unresolved / totalFindings }), // spreading `false` adds no key
+    };
+}
+/**
+ * Log the preflight summary via `log.info`; no-op when `summary` is falsy.
+ * Format mirrors the other single-line health signals (URL fetch, agent
+ * isolation) so CI grep can extract `unresolvedRate` directly from the log
+ * when score-summary.json isn't already in scope.
+ */
+function printPreflightSummary(summary, log) {
+    if (!summary)
+        return;
+    // `unresolvedRate` is absent when the run produced reports but no
+    // findings — distinguish vacuous-green from all-resolved so CI doesn't
+    // misread the threshold.
+    const rateLabel = summary.unresolvedRate === undefined
+        ? "n/a (no findings)"
+        : `${(summary.unresolvedRate * 100).toFixed(1)}%`; // percentage with one decimal place
+    log.info("-".repeat(80));
+    log.info("SYMBOL PREFLIGHT (W0198)");
+    log.info("-".repeat(80));
+    log.info(`  ${summary.reportCount} report(s), ${summary.totalFindings} finding(s): ${summary.exists} exists / ${summary.missing} missing / ${summary.unresolved} unresolved`);
+    log.info(`  unresolvedRate: ${rateLabel}  (resolver-health signal — not a candidate score factor)`);
+    log.info(""); // blank line closes the block
+}
 /**
  * Extracts agent behavior summary from a test result's metadata.
@@ -644,7 +746,7 @@ function readAndNormalizeResults(resultsPath, log) {
  * @param baselineProfile Weight profile for baseline (without-docs) entries
  * @param modelId         Optional model identifier to tag each FeatureScore
  */
-function scoreResults(results, goldProfile, baselineProfile, modelId) {
+function scoreResults(results, goldProfile, baselineProfile, modelId, preflightOptions) {
     // Group by feature + docs/no-docs
     const byFeature = {};
     for (const result of results) {
@@ -663,12 +765,12 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
     const scores = [];
     for (const [feature, data] of Object.entries(byFeature)) {
         // --- With docs (gold / ceiling) — scored via 4-tier engine ---
-        const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
+        const gold = scoreTestGroup(data.withDocs, goldProfile, feature, preflightOptions);
         // --- Without docs (baseline / floor) ---
         // Uses the baseline profile (e.g. "output-only") which may exclude
         // dimensions like doc-coverage that are undefined without docs.
         // See docs/design-docs/named-scoring-profiles.md.
-        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
+        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature, preflightOptions);
         const featureCost = gold.totalCost + baseline.totalCost;
         const ceilingScore = gold.composite;
         const floorScore = baseline.composite;
@@ -709,7 +811,7 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
  * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
  * are set to 0 for backward compatibility with downstream consumers.
  */
-function scoreAgentHarnessResults(results, profile) {
+function scoreAgentHarnessResults(results, profile, preflightOptions) {
     // Group by task ID (extracted from description: "task-id — Title")
     const byTask = {};
     for (const result of results) {
@@ -721,7 +823,7 @@ function scoreAgentHarnessResults(results, profile) {
     }
     const scores = [];
     for (const [taskId, taskResults] of Object.entries(byTask)) {
-        const scored = scoreTestGroup(taskResults, profile, taskId);
+        const scored = scoreTestGroup(taskResults, profile, taskId, preflightOptions);
         const totalCost = scored.totalCost;
         // Detect feature area for backward compat (used by report grouping)
         const feature = taskResults[0]?.vars.__featureArea ??
@@ -774,7 +876,7 @@ function extractTaskId(description) {
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
  */
-export function scoreKnowledgeProbeResults(results, profile) {
+export function scoreKnowledgeProbeResults(results, profile, preflightOptions) {
     const byFeature = {};
     for (const result of results) {
         const feature = result.vars.__featureArea || detectFeatureArea(result.description);
@@ -785,7 +887,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
     }
     const scores = [];
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        const scored = scoreTestGroup(featureResults, profile, feature);
+        const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
         scores.push({
             assertionPassRate: scored.dimensions.assertionPassRate,
             ceilingScore: 0,
@@ -817,7 +919,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
  * Returns a record keyed by feature area with the composite actual score.
  */
 // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
-export function scoreAgenticResults(resultsPath, profile) {
+export function scoreAgenticResults(resultsPath, profile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group by feature area
     const byFeature = {};
@@ -830,7 +932,7 @@ export function scoreAgenticResults(resultsPath, profile) {
     }
     const entries = {};
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        const scored = scoreTestGroup(featureResults, profile, feature);
+        const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
         entries[feature] = {
             actualScore: scored.composite,
             codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -849,7 +951,7 @@ export function scoreAgenticResults(resultsPath, profile) {
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export function scoreAgenticResultsPerModel(resultsPath, profile) {
+export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group by model, then feature
     const byModel = {};
@@ -866,7 +968,7 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
     for (const [modelId, features] of Object.entries(byModel)) {
         perModel[modelId] = {};
         for (const [feature, featureResults] of Object.entries(features)) {
-            const scored = scoreTestGroup(featureResults, profile, feature);
+            const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
             perModel[modelId][feature] = {
                 actualScore: scored.composite,
                 codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -912,6 +1014,15 @@ export function calculateAndWriteScores(options) {
     }
     // Determine mode — controls which result files are read
     const mode = options.mode ?? LiteracyVariant.STANDARD;
+    // W0198 — assemble preflight options once. The helper returns
+    // `undefined` when reports / runId are missing, so all downstream
+    // callers handle the no-preflight case uniformly.
+    const preflightOptions = makePreflightOptions(options.preflightReports, options.runId, mode, options.preflightWeight, log);
+    // W0198 — resolver-health summary. Independent of `preflightOptions`
+    // (which gates the score merge): when reports exist but the runId is
+    // missing, scoring stays on the rubric-only path while telemetry still
+    // surfaces, so the resolver's drift remains visible.
+    const preflightSummary = summarizePreflight(options.preflightReports);
     const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
     // Agentic results path (only used in full mode)
     const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -940,7 +1051,7 @@ export function calculateAndWriteScores(options) {
         const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
         log.debug("Agent-harness scoring profile", agentProfile);
         const results = readAndNormalizeResults(baselineResultsPath);
-        const scores = scoreAgentHarnessResults(results, agentProfile);
+        const scores = scoreAgentHarnessResults(results, agentProfile, preflightOptions);
         log.debug("Agent-harness scores calculated", {
             taskCount: scores.length,
             tasks: scores.map((s) => ({
@@ -960,7 +1071,7 @@ export function calculateAndWriteScores(options) {
         const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
         graderCost, null, // no per-model breakdown
         null, // no source isolation
-        sourceVerification, "agent-harness", log);
+        sourceVerification, "agent-harness", log, preflightSummary);
         // Persist
         const outDir = join(ROOT, "results", "latest");
         mkdirSync(outDir, { recursive: true });
@@ -992,7 +1103,7 @@ export function calculateAndWriteScores(options) {
         const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
         log.debug("Knowledge-probe scoring profile", probeProfile);
         const results = readAndNormalizeResults(baselineResultsPath);
-        const scores = scoreKnowledgeProbeResults(results, probeProfile);
+        const scores = scoreKnowledgeProbeResults(results, probeProfile, preflightOptions);
         log.debug("Knowledge-probe scores calculated", {
             featureCount: scores.length,
             features: scores.map((s) => ({
@@ -1012,7 +1123,7 @@ export function calculateAndWriteScores(options) {
         const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
         graderCost, null, // no per-model breakdown for now
         null, // no source isolation — KP doesn't fetch sources
-        sourceVerification, "knowledge-probe", log);
+        sourceVerification, "knowledge-probe", log, preflightSummary);
         // Persist
         const outDir = join(ROOT, "results", "latest");
         mkdirSync(outDir, { recursive: true });
@@ -1041,7 +1152,7 @@ export function calculateAndWriteScores(options) {
         gold: goldProfile,
         baseline: baselineProfileWeights,
     });
-    const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
+    const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     log.debug("Baseline scores calculated", {
         featureCount: baselineScores.length,
         features: baselineScores.map((s) => ({
@@ -1051,7 +1162,7 @@ export function calculateAndWriteScores(options) {
             docLift: s.docLift,
         })),
     });
-    const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
+    const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     const urlRefs = aggregateUrlReferences(baselineResultsPath);
     const sourceVerification = buildSourceVerification(ROOT, source, {
         allowedOrigins: options.allowedOrigins,
@@ -1067,7 +1178,7 @@ export function calculateAndWriteScores(options) {
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
         const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
-        const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
+        const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
             features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -1080,7 +1191,7 @@ export function calculateAndWriteScores(options) {
         evaluationMode = LiteracyVariant.FULL;
         // Merge agentic actual scores into the per-model breakdown
         if (perModel) {
-            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
+            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile, preflightOptions);
             for (const entry of perModel) {
                 const modelAgentic = agenticPerModel[entry.modelId];
                 if (modelAgentic) {
@@ -1115,7 +1226,7 @@ export function calculateAndWriteScores(options) {
                 ? LiteracyVariant.OBSERVED
                 : LiteracyVariant.STANDARD;
     }
-    const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
+    const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary);
     // Persist
     const outDir = join(ROOT, "results", "latest");
     mkdirSync(outDir, { recursive: true });
@@ -1269,7 +1380,7 @@ function printPerModelReport(perModel, log) {
 // ---------------------------------------------------------------------------
 // Main
 // ---------------------------------------------------------------------------
-function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
+function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary) {
     const _log = log ?? new ConsoleLogger();
     _log.info("\n" + "=".repeat(80));
     _log.info("                    SANITY AI LITERACY SCORE REPORT");
@@ -1428,6 +1539,8 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
     if (perModel) {
         printPerModelReport(perModel, _log);
     }
+    // W0198 — symbol preflight resolver-health summary
+    printPreflightSummary(preflightSummary, _log);
     // URL References
     printUrlReport(urlRefs, _log);
     // Agent Behavior (only present when run with instrumented provider)
@@ -1557,6 +1670,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
             }
             : undefined,
         ...(perModel && { perModel }),
+        ...(preflightSummary && { preflight: preflightSummary }),
         ...(sourceIsolation && { sourceIsolation }),
         ...(sourceVerification && { sourceVerification }),
         timestamp: new Date().toISOString(),

package/dist/pipeline/compiler/index.d.ts CHANGED Viewed

@@ -21,6 +21,6 @@ export { checkBudget, classifyToolCall, classifyToolCalls, collectTrace, compute
 export { registerSanityLiteracyPreset, sanityLiteracyPreset, } from "./presets/index.js";
 export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from "./ignore-fields.js";
 export { simpleHash } from "./hash.js";
-export { scoreTestGroup, type BridgedScoreResult } from "./scoring-bridge.js";
+export { preflightToScore, scoreTestGroup, type BridgedScoreResult, type ScoreTestGroupOptions, } from "./scoring-bridge.js";
 export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
 export type { ConfigLoadResult } from "./config-loader.js";

package/dist/pipeline/compiler/index.js CHANGED Viewed

@@ -37,6 +37,6 @@ export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from
 // Hash utility
 export { simpleHash } from "./hash.js";
 // Scoring bridge — 4-tier engine integration
-export { scoreTestGroup } from "./scoring-bridge.js";
+export { preflightToScore, scoreTestGroup, } from "./scoring-bridge.js";
 // Unified config loader
 export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";

package/dist/pipeline/compiler/literacy-bridge.d.ts CHANGED Viewed

@@ -20,6 +20,7 @@
  */
 import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
 import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
+import type { PreflightRubricContext } from "./rubric-resolution.js";
 import { type LiteracyEvalSubMode } from "../normalize-mode.js";
 /** Options for compiling all literacy tasks via the new compiler */
 export interface LiteracyBridgeOptions {
@@ -35,6 +36,14 @@ export interface LiteracyBridgeOptions {
         label: string;
         config?: Record<string, unknown>;
     }[];
+    /** Grader context policy passed through to `compileLiteracyTask`. */
+    graderContext?: "rubric-only" | "with-docs";
+    /**
+     * W0198 Phase 6 — preflight context passed through to every task's
+     * `code-correctness` rubric so the grader treats the deterministic
+     * lane's existence verdicts as ground truth.
+     */
+    preflightContext?: PreflightRubricContext;
 }
 /** Result of compiling all literacy tasks */
 export interface LiteracyBridgeResult {

package/dist/pipeline/compiler/literacy-bridge.js CHANGED Viewed

@@ -73,6 +73,8 @@ export function compileLiteracyTasks(tasks, options) {
         evalMode: options.evalMode,
         models: options.models,
         rubricConfig,
+        graderContext: options.graderContext,
+        preflightContext: options.preflightContext,
     };
     for (const node of orderedNodes) {
         const task = taskMap.get(node.taskId);

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts CHANGED Viewed

@@ -7,7 +7,7 @@
 import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooAssertion } from "../../assertion-mapper.js";
 import type { LiteracyCompileOptions } from "./types.js";
-export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[];
+export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
 /**
  * Build baseline assertions matching the legacy expand-tasks behavior.
  *