npm - @sanity/ailf - Versions diffs - 4.1.0 → 4.3.0 - Mend

@sanity/ailf 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126)

package/dist/commands/validate-tasks.js CHANGED Viewed

@@ -17,7 +17,7 @@ import { existsSync, readdirSync, readFileSync } from "fs";
 import { resolve, relative, basename } from "path";
 import { Command } from "commander";
 import { load } from "js-yaml";
-import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
+import { detectLegacyFieldNames, migratePromptShape, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
 import { validateCanonicalTasks, formatRepoValidationResult, } from "../adapters/task-sources/repo-validation.js";
 import { discoverTsTaskFiles, loadTsTaskFile, } from "../adapters/task-sources/task-file-loader.js";
 export function createValidateTasksCommand() {
@@ -133,8 +133,14 @@ function validateTaskArray(entries, file, accumulator) {
         console.error();
         return false;
     }
+    // W0193: pre-migrate legacy prompt.vars.{task,docs,__featureArea} shape
+    // and surface deprecation warnings (non-fatal — the file still validates).
+    const { migrated, warnings: deprecationWarnings } = migratePromptShape(entries, file);
+    for (const warning of deprecationWarnings) {
+        console.warn(`  ${warning}`);
+    }
     try {
-        const tasks = parseCanonicalTaskFile(entries, file);
+        const tasks = parseCanonicalTaskFile(migrated, file);
         console.log(`  ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
         accumulator.push(...tasks);
         return true;

package/dist/composition-root.js CHANGED Viewed

@@ -29,6 +29,7 @@ import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js"
 import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
 import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
 import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
+import { DtsPackageSurface } from "./adapters/package-surface/index.js";
 import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
 import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
 import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
@@ -65,6 +66,13 @@ export function createAppContext(config) {
     const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
     // Eval runner — Promptfoo subprocess
     const evalRunner = new PromptfooEvalAdapter(config.rootDir);
+    // Package-surface resolver for the W0198 symbol-resolution preflight.
+    // Reads installed `.d.ts` from the eval package's node_modules chain
+    // (anchored at `config.rootDir`). Phase 1 wires the adapter; later
+    // phases consume it.
+    const packageSurfaceResolver = new DtsPackageSurface({
+        resolveFromDir: config.rootDir,
+    });
     // Report store — Sanity Content Lake (for publish + auto-compare)
     const reportStore = createReportStore(config);
     // Sinks — loaded from config/sinks
@@ -90,6 +98,7 @@ export function createAppContext(config) {
         docFetcher,
         evalRunner,
         logger,
+        packageSurfaceResolver,
         progress,
         registry,
         reportStore,

package/dist/config/package-surface.ts ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * package-surface.ts — Framework-level package-surface manifest for the
+ * W0198 symbol-resolution preflight.
+ *
+ * The manifest pins each in-scope package to a single semver-major range.
+ * The preflight resolver answers "does symbol X export from package Y" by
+ * reading the installed package's `.d.ts` against this pin. Tasks
+ * reference packages by name; they do not carry per-package version
+ * metadata (per-task overrides remain a future extension point).
+ *
+ * Bumping a major is an editorial event — one PR that updates the pin,
+ * regenerates cached surfaces, and re-runs the historical comparison set.
+ * Patch and minor releases within a pinned major flow silently because
+ * semver disallows the export removals that would change a deduction
+ * outcome.
+ *
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
+ */
+import { definePackageSurface } from "../_vendor/ailf-core/index.js"
+export default definePackageSurface({
+  packages: [
+    // Sanity App SDK — drives App SDK literacy tasks (e.g. DOC-2117).
+    { pkg: "@sanity/sdk-react", semverPin: "^2.0.0" },
+    // Sanity Studio runtime — drives Studio-side literacy tasks.
+    // Pinned to the installed major in this repo (^5.x). The earlier
+    // major (v3) is an explicit per-task override candidate when a task
+    // exercises legacy Studio behavior.
+    { pkg: "sanity", semverPin: "^5.0.0" },
+    // @sanity/client — drives direct-client literacy tasks. Pinned to
+    // the installed major in this repo (^7.x).
+    { pkg: "@sanity/client", semverPin: "^7.0.0" },
+  ],
+})

package/dist/config/preflight-scoring.ts ADDED Viewed

@@ -0,0 +1,26 @@
+/**
+ * preflight-scoring.ts — How heavily the W0198 deterministic preflight
+ * contributes to the `code-correctness` dimension.
+ *
+ * The preflight (`SymbolPreflightReport`) and the LLM rubric both feed
+ * into `code-correctness` per D0010's weighted dimension aggregation.
+ * `codeCorrectnessWeight` sets the relative share between them — `0.4`
+ * means preflight is 40% of the dimension, rubric 60%.
+ *
+ * Bumping this is an editorial decision: a higher weight ties more of the
+ * `code-correctness` score to the deterministic existence-check (less
+ * grader noise on the symbol-existence question, but also less elasticity
+ * for the rubric to penalize stylistic/correctness issues the preflight
+ * cannot see). A lower weight cedes more authority back to the rubric.
+ *
+ * The default of `0.4` is a starting balance; revisit once the
+ * `unresolved` rate stabilizes in CI.
+ *
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
+ */
+import { definePreflightScoring } from "../_vendor/ailf-core/index.js"
+export default definePreflightScoring({
+  codeCorrectnessWeight: 0.4,
+})

package/dist/index.d.ts CHANGED Viewed

@@ -33,8 +33,8 @@
  * })
  * ```
  */
-export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
-export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core/index.d.ts";
+export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
+export type { PackageSurfaceConfig, PackageSurfaceEntry, PreflightScoringConfig, PricingEntry, PromptEntry, SourceEntry, } from "./_vendor/ailf-core/index.d.ts";
 export { env } from "./_vendor/ailf-core/index.d.ts";
 export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
 export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";

package/dist/index.js CHANGED Viewed

@@ -36,7 +36,7 @@
 // ---------------------------------------------------------------------------
 // Configuration helpers (define* identity functions for typed authoring)
 // ---------------------------------------------------------------------------
-export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
+export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
 // ---------------------------------------------------------------------------
 // Environment helper
 // ---------------------------------------------------------------------------

package/dist/orchestration/build-app-context.js CHANGED Viewed

@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
         noCache: opts.noCache,
         noRemoteCache: opts.noRemoteCache,
         graderReplications: opts.graderReplications,
+        graderContext: opts.graderContext,
         outputDir: opts.outputDir,
         outputPath: opts.outputPath,
         urls: opts.urlArgs.length > 0 ? opts.urlArgs : undefined,

package/dist/orchestration/pipeline-orchestrator.d.ts CHANGED Viewed

@@ -11,7 +11,7 @@
  * each step completes. This enables the GET /v1/jobs/:jobId polling
  * endpoint to show real-time progress.
  */
-import { type AppContext, type PipelineResult, type PipelineStep } from "../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineResult, type PipelineStep, type StepResult } from "../_vendor/ailf-core/index.d.ts";
 /**
  * Run a sequence of pipeline steps, short-circuiting on required step failure.
  *
@@ -22,3 +22,21 @@ import { type AppContext, type PipelineResult, type PipelineStep } from "../_ven
  * Lake after each step completes.
  */
 export declare function orchestratePipeline(ctx: AppContext, steps: PipelineStep[]): Promise<PipelineResult>;
+/**
+ * Find the first optional pipeline step that returned `status: "failed"`
+ * in step-array order. Returns the diagnostic shape the API job document
+ * already accepts (`{ message, step }`), or null when no optional step
+ * failed.
+ *
+ * Required-step failures don't reach this code path — the orchestrator
+ * aborts before completion when a required step fails. This helper is
+ * the bridge between "step ran and failed" and the wire signal that
+ * external `--remote` consumers use to distinguish a clean completion
+ * from a degraded one.
+ *
+ * @see docs/design-docs/optional-step-failure-surfacing.md
+ */
+export declare function getFirstOptionalFailure(steps: readonly PipelineStep[], results: Record<string, StepResult>): {
+    message: string;
+    step: string;
+} | null;

package/dist/orchestration/pipeline-orchestrator.js CHANGED Viewed

@@ -236,6 +236,12 @@ export async function orchestratePipeline(ctx, steps) {
                     process.env.SANITY_API_TOKEN ??
                     undefined,
             });
+            // DOC-2121 RC-3 — surface the first configured-but-failed optional
+            // step on the job document so external --remote consumers can detect
+            // partial-completion outcomes. The pipeline still ran end to end
+            // (P5 / local-first) and `success: true` is preserved; the `error`
+            // field is the wire signal that a configured optional step failed.
+            const firstOptionalFailure = getFirstOptionalFailure(steps, results);
             await store.updateJob(ctx.config.jobId, {
                 status: "completed",
                 completedAt: new Date().toISOString(),
@@ -245,6 +251,7 @@ export async function orchestratePipeline(ctx, steps) {
                     totalSteps: steps.length,
                 },
                 ...(state.reportId ? { reportId: state.reportId } : {}),
+                ...(firstOptionalFailure ? { error: firstOptionalFailure } : {}),
             });
         }
         catch {
@@ -275,6 +282,37 @@ export async function orchestratePipeline(ctx, steps) {
     };
 }
 // ---------------------------------------------------------------------------
+// Optional-step failure surfacing (DOC-2121 RC-3)
+// ---------------------------------------------------------------------------
+/**
+ * Find the first optional pipeline step that returned `status: "failed"`
+ * in step-array order. Returns the diagnostic shape the API job document
+ * already accepts (`{ message, step }`), or null when no optional step
+ * failed.
+ *
+ * Required-step failures don't reach this code path — the orchestrator
+ * aborts before completion when a required step fails. This helper is
+ * the bridge between "step ran and failed" and the wire signal that
+ * external `--remote` consumers use to distinguish a clean completion
+ * from a degraded one.
+ *
+ * @see docs/design-docs/optional-step-failure-surfacing.md
+ */
+export function getFirstOptionalFailure(steps, results) {
+    for (const step of steps) {
+        if (step.optional !== true)
+            continue;
+        const result = results[step.name];
+        if (result?.status === "failed") {
+            return {
+                message: result.error ?? `${step.name} failed`,
+                step: step.name,
+            };
+        }
+    }
+    return null;
+}
+// ---------------------------------------------------------------------------
 // Artifact export phase gate (W0053)
 // ---------------------------------------------------------------------------
 /**

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -14,6 +14,7 @@ import { buildCacheContext } from "../cache-context.js";
 import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
 import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
 import { resultsFileForMode } from "../../pipeline/eval-constants.js";
+import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
 import { loadSource } from "../../sources.js";
 import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
 import { configToSourceOverrides } from "../config-to-source-overrides.js";
@@ -77,6 +78,13 @@ export class CalculateScoresStep {
         catch {
             // Non-fatal — proceed without source metadata
         }
+        // W0198 — load preflight scoring config (silent fall-through to the
+        // default weight when absent). Lazy: ignored when no preflight reports
+        // were emitted upstream.
+        const preflightScoring = await loadPreflightScoring(ctx.config.rootDir).catch((err) => {
+            ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
+            return undefined;
+        });
         let belowCritical = [];
         try {
             const result = calculateAndWriteScores({
@@ -87,11 +95,14 @@ export class CalculateScoresStep {
                 mode: ctx.config.mode === "literacy"
                     ? (ctx.config.variant ?? LiteracyVariant.STANDARD)
                     : ctx.config.mode,
+                preflightReports: state.preflightReports,
+                preflightWeight: preflightScoring?.codeCorrectnessWeight,
                 resolvedSource,
                 resultsPath: primaryResultsRun !== LiteracyVariant.STANDARD
                     ? join(ctx.config.rootDir, resultsFile)
                     : undefined,
                 rootDir: ctx.config.rootDir,
+                runId: ctx.runId,
                 searchMode: ctx.config.searchMode,
                 source: ctx.config.source,
             });

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -18,6 +18,7 @@ import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
 import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
 import { validateModelsYaml } from "../../pipeline/validate.js";
+import { loadPreflightContext } from "../../pipeline/preflight/load-preflight-context.js";
 import { loadSource } from "../../sources.js";
 import { configToSourceOverrides } from "../config-to-source-overrides.js";
 export class GenerateConfigsStep {
@@ -110,12 +111,21 @@ export class GenerateConfigsStep {
         catch {
             ctx.logger.warn("  ⚠ Could not load rubric config — templates will not resolve");
         }
-        // Compile for each variant
+        // Compile for each variant. `graderContext` defaults to "rubric-only" here
+        // so handlers see a definite value rather than implicit-undefined.
+        const graderContext = ctx.config.graderContext ?? "rubric-only";
+        // W0198 Phase 6 — when the package-surface manifest is authored, pass
+        // the in-scope package list down so the literacy mode handler can
+        // prefix the `code-correctness` rubric with the deterministic-lane
+        // system instruction. Silent fall-through when absent.
+        const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
         const baselineResults = this.compileAll(handler, tasks, {
             rootDir: ctx.config.rootDir,
             graderProvider: models.grader.id,
             models: baselineModels,
             rubricConfig,
+            graderContext,
+            preflightContext,
             evalMode: LiteracyVariant.STANDARD,
         });
         const agenticResults = this.compileAll(handler, tasks, {
@@ -123,6 +133,8 @@ export class GenerateConfigsStep {
             graderProvider: models.grader.id,
             models: agenticModels,
             rubricConfig,
+            graderContext,
+            preflightContext,
             evalMode: LiteracyVariant.AGENTIC,
         });
         // Log warnings
@@ -174,11 +186,14 @@ export class GenerateConfigsStep {
         catch {
             ctx.logger.warn("  ⚠ Could not load rubric config — templates will not resolve");
         }
+        const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
         const merged = this.compileAll(handler, tasks, {
             rootDir: ctx.config.rootDir,
             graderProvider: models.grader.id,
             models: modeModels,
             rubricConfig,
+            graderContext: ctx.config.graderContext ?? "rubric-only",
+            preflightContext,
         });
         for (const w of merged.warnings) {
             ctx.logger.warn(`  ⚠ ${w}`);

package/dist/orchestration/steps/run-eval-step.js CHANGED Viewed

@@ -8,6 +8,8 @@
 import { existsSync, mkdirSync, writeFileSync } from "fs";
 import { resolve } from "path";
 import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
+import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
+import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
 import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
@@ -90,6 +92,7 @@ export class RunEvalStep {
                     graderModel: loadGraderModel(rootDir).id,
                     mode: this.mode,
                     rootDir,
+                    graderContext: ctx.config.graderContext,
                 });
                 // Share fingerprint with downstream steps (PublishReportStep)
                 state.evalFingerprint = evalFingerprint;
@@ -224,6 +227,30 @@ export class RunEvalStep {
         const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
         if (existsSync(resultsPath)) {
             await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
+            // W0198 Phase 4 — deterministic-lane reports per (task, model).
+            // Loaded lazily so test contexts that don't wire the manifest /
+            // resolver pay nothing; the helper is itself a no-op when its
+            // inputs are missing.
+            const packageSurface = await loadPackageSurface(rootDir).catch((err) => {
+                console.warn(`  ⚠️  W0198 preflight: failed to load package-surface manifest — ${err instanceof Error ? err.message : String(err)}`);
+                return undefined;
+            });
+            const preflight = await emitSymbolPreflight({
+                writer: ctx.artifactWriter,
+                ctx,
+                mode: this.mode,
+                resultsPath,
+                packageSurface,
+                resolver: ctx.packageSurfaceResolver,
+            });
+            if (preflight.reports.size > 0) {
+                if (!state.preflightReports) {
+                    state.preflightReports = new Map();
+                }
+                for (const [k, v] of preflight.reports) {
+                    state.preflightReports.set(k, v);
+                }
+            }
         }
         // Extract Promptfoo share URL from eval results (Step 3b)
         if (ctx.evalRunner.extractShareUrl) {

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
-import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
 import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
+import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
@@ -60,6 +61,34 @@ export interface RawTestResult {
     };
     vars: Record<string, string>;
 }
+/**
+ * Aggregate counts across every preflight report emitted by the run.
+ *
+ * `unresolvedRate` is `unresolved / totalFindings` in `[0, 1]`, set only
+ * when `totalFindings > 0`. The vacuous case (`totalFindings === 0` —
+ * reports exist but every candidate cited zero in-scope bindings) is
+ * deliberately distinguished from "every binding resolved cleanly"
+ * (`unresolvedRate === 0`) so a CI threshold like `unresolvedRate > 0.1`
+ * doesn't fire green on a run that had nothing to resolve.
+ */
+export interface PreflightSummary {
+    /** Number of per-test preflight reports the run emitted. */
+    reportCount: number;
+    /** Total findings across all reports. */
+    totalFindings: number;
+    /** Findings classified `exists`. */
+    exists: number;
+    /** Findings classified `missing` — the deterministic-deduction lane. */
+    missing: number;
+    /** Findings classified `unresolved` — the resolver-couldn't-answer lane. */
+    unresolved: number;
+    /**
+     * `unresolved / totalFindings` in `[0, 1]`. Absent when `totalFindings`
+     * is zero — distinguishes "nothing to resolve" from "all resolutions
+     * succeeded" so CI thresholds aren't vacuously green.
+     */
+    unresolvedRate?: number;
+}
 /**
  * Calculate scores grouped by model. Each model gets its own FeatureScore[]
  * and model-level aggregates.
@@ -70,7 +99,7 @@ export interface RawTestResult {
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
+export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): null | PerModelEntry[];
 /**
  * Extract grader judgments (reason text + scores) from evaluation results.
  *
@@ -91,6 +120,19 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
 export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
+/**
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
+ * resolver-health summary. Returns `undefined` when the run had no
+ * preflight reports (manifest disabled, resolver missing, or every
+ * candidate output cited zero in-scope packages) so the consumer can
+ * cleanly omit the field from the score summary instead of writing a
+ * vacuous block of zeros.
+ *
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
+ * production calls go through `calculateAndWriteScores`, which threads
+ * the result into the `EvalScoreSummary.preflight` field.
+ */
+export declare function summarizePreflight(reports: Map<string, SymbolPreflightReport> | undefined): PreflightSummary | undefined;
 /**
  * Score knowledge-probe evaluation results.
  *
@@ -105,7 +147,7 @@ export declare function extractStoredTestResults(resultsPath: string): StoredTes
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
  */
-export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
+export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): FeatureScore[];
 /**
  * Score agentic evaluation results. In agentic mode, all test entries are
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -113,7 +155,7 @@ export declare function scoreKnowledgeProbeResults(results: TestResult[], profil
  *
  * Returns a record keyed by feature area with the composite actual score.
  */
-export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
+export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, ActualScoreEntry>;
 /**
  * Score agentic results broken down by model.
  *
@@ -121,7 +163,7 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
+export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
 /** Options for the calculate-scores main() function. */
 export interface CalculateScoresOptions {
     /** Allowed origins for source isolation reporting */
@@ -130,12 +172,31 @@ export interface CalculateScoresOptions {
     logger?: Logger;
     /** Evaluation mode (controls which result files are read) */
     mode?: string;
+    /**
+     * W0198 — symbol-preflight reports keyed by `${runId}/${mode}/${task}/${model}`,
+     * populated by `RunEvalStep` via `emitSymbolPreflight`. When provided, the
+     * scoring engine merges deterministic preflight findings into the
+     * `code-correctness` dimension. Absence (or empty map) collapses cleanly
+     * to the pre-W0198 rubric-only path.
+     */
+    preflightReports?: Map<string, SymbolPreflightReport>;
+    /**
+     * W0198 — preflight's share of `code-correctness` in `[0, 1]`. Defaults
+     * to `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` when omitted.
+     */
+    preflightWeight?: number;
     /** Pre-resolved source config (skips loadSource() call) */
     resolvedSource?: ResolvedSourceConfig;
     /** Path to baseline results file (default: results/latest/eval-results.json) */
     resultsPath?: string;
     /** Root directory of the eval package (required) */
     rootDir: string;
+    /**
+     * W0198 — runId axis used to look up preflight reports. Required when
+     * `preflightReports` is provided; otherwise the lookup callback can't
+     * reconstruct the right key.
+     */
+    runId?: string;
     /** Search mode for source verification metadata */
     searchMode?: string;
     /** Documentation source name */