npm - @sanity/ailf - Versions diffs - 4.2.0 → 4.3.1 - Mend

@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

package/config/package-surface.ts +37 -0
package/config/preflight-scoring.ts +26 -0
package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
package/dist/_vendor/ailf-core/config-helpers.js +67 -0
package/dist/_vendor/ailf-core/index.d.ts +1 -1
package/dist/_vendor/ailf-core/index.js +1 -1
package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
package/dist/_vendor/ailf-core/ports/index.js +1 -0
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/index.js +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
package/dist/_vendor/ailf-core/types/index.js +1 -0
package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
package/dist/adapters/api-client/build-request.d.ts +1 -0
package/dist/adapters/api-client/build-request.js +3 -0
package/dist/adapters/config-sources/file-config-adapter.js +1 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
package/dist/adapters/index.d.ts +1 -0
package/dist/adapters/index.js +1 -0
package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
package/dist/adapters/package-surface/dts-package-surface.js +173 -0
package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
package/dist/adapters/package-surface/index.d.ts +9 -0
package/dist/adapters/package-surface/index.js +8 -0
package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
package/dist/adapters/task-sources/repo-schemas.js +15 -0
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +12 -0
package/dist/commands/remote-pipeline.js +10 -2
package/dist/commands/remote-results.d.ts +12 -1
package/dist/commands/remote-results.js +25 -5
package/dist/composition-root.js +9 -0
package/dist/config/package-surface.ts +37 -0
package/dist/config/preflight-scoring.ts +26 -0
package/dist/index.d.ts +2 -2
package/dist/index.js +1 -1
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
package/dist/orchestration/pipeline-orchestrator.js +38 -0
package/dist/orchestration/steps/calculate-scores-step.js +11 -0
package/dist/orchestration/steps/generate-configs-step.js +16 -1
package/dist/orchestration/steps/run-eval-step.js +27 -0
package/dist/pipeline/calculate-scores.d.ts +66 -5
package/dist/pipeline/calculate-scores.js +141 -27
package/dist/pipeline/compiler/index.d.ts +1 -1
package/dist/pipeline/compiler/index.js +1 -1
package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
package/dist/pipeline/compiler/literacy-bridge.js +2 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
package/dist/pipeline/compiler/rubric-resolution.js +78 -2
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
package/dist/pipeline/compiler/scoring-bridge.js +104 -10
package/dist/pipeline/eval-fingerprint.d.ts +9 -0
package/dist/pipeline/eval-fingerprint.js +7 -1
package/dist/pipeline/map-request-to-config.js +1 -0
package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
package/dist/pipeline/preflight/compute-preflight.js +118 -0
package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
package/dist/pipeline/preflight/load-package-surface.js +19 -0
package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
package/dist/pipeline/preflight/load-preflight-context.js +25 -0
package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
package/dist/pipeline/preflight/parse-imports.js +125 -0
package/dist/report-store.d.ts +8 -0
package/dist/report-store.js +55 -6
package/dist/sanity/document-renderers.d.ts +45 -7
package/dist/sanity/document-renderers.js +99 -13
package/dist/sanity/queries.d.ts +11 -11
package/dist/sanity/queries.js +7 -0
package/dist/sanity/symbol-index.d.ts +98 -0
package/dist/sanity/symbol-index.js +615 -0
package/package.json +2 -1

package/dist/pipeline/compiler/scoring-bridge.js CHANGED Viewed

@@ -16,11 +16,19 @@
  * engine works in [0, 1]; this module handles the conversion at
  * boundaries.
  *
+ * W0198 Phase 5 — when a `preflightForTest` callback is provided and
+ * returns a `SymbolPreflightReport`, the bridge synthesizes one extra
+ * `AssertionScore` per test in the `code-correctness` dimension. The
+ * deterministic preflight and the LLM rubric merge through D0010's
+ * weighted dimension aggregation; the relative share is set by
+ * `preflightWeight` in `[0, 1]`.
+ *
  * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
  * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
  */
-import { aggregateDimensions, computeTaskScore, normalizeScore, } from "../../_vendor/ailf-core/index.js";
+import { aggregateDimensions, computeTaskScore, DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, normalizeScore, } from "../../_vendor/ailf-core/index.js";
 import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.js";
 // ---------------------------------------------------------------------------
 // Public API
@@ -36,28 +44,53 @@ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.
  * @param profile  Weight profile mapping kebab-case dimension names to weights
  *                 (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
  * @param taskId   Optional task identifier for traceability in TaskScore output
+ * @param options  Optional W0198 Phase 5 preflight integration
  * @returns        Dimensions (0–100) and composite (0–100), matching legacy output format
  */
-export function scoreTestGroup(tests, profile, taskId) {
+export function scoreTestGroup(tests, profile, taskId, options) {
     let totalCost = 0;
+    const preflightForTest = options?.preflightForTest;
+    const preflightWeight = clampWeight(options?.preflightWeight ?? DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT);
+    const preflightActive = typeof preflightForTest === "function" && preflightWeight > 0;
     // Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
     //
-    // Two assertion types contribute to scoring:
+    // Three assertion sources contribute to scoring:
     //   - llm-rubric: dimension from metadata, score from grader (0–100 → [0,1])
     //   - javascript: mapped to "assertion-pass-rate" dimension (pass=1, fail=0)
+    //   - preflight (W0198): synthesized per test from SymbolPreflightReport,
+    //     dimension "code-correctness", weight = preflightWeight.
+    //
+    // Rubric weight reduction is per-test, not global: a test's
+    // `code-correctness` rubric assertion only drops to `1 - preflightWeight`
+    // when preflight actually contributes a paired finding for that test.
+    // Without this gate, tests with no preflight coverage would have their
+    // rubric authority silently downweighted with nothing to compensate, so
+    // partial-coverage runs would systematically bias the dimension toward
+    // tests that DO have preflight data.
     //
     // Other types (cost, trajectory, contains, etc.) are metadata or guards —
     // they don't produce dimension scores.
     const assertionScores = [];
     for (const test of tests) {
         totalCost += test.cost;
+        const report = preflightActive ? preflightForTest(test) : undefined;
+        const ccRubricWeight = report ? 1 - preflightWeight : 1;
         for (const comp of test.gradingResult.componentResults) {
-            const converted = componentToScore(comp);
+            const converted = componentToScore(comp, ccRubricWeight);
             if (converted)
                 assertionScores.push(converted);
         }
+        if (report) {
+            assertionScores.push(preflightToScore(report, preflightWeight));
+        }
     }
-    // Step 2: Aggregate into DimensionScores (0–1 scale)
+    // Step 2: Aggregate into DimensionScores (0–1 scale).
+    //
+    // Use `weighted-mean` so the W0198 preflight / rubric weights inside
+    // `code-correctness` are honored. With everything at weight=1.0 the
+    // result is identical to plain `mean`, so behavior outside the
+    // code-correctness merge is unchanged. See `aggregateScores` in
+    // `packages/core/src/services/scoring-engine.ts` for the equivalence.
     const dimensionLabels = {
         "assertion-pass-rate": "Assertion Pass Rate",
         "code-correctness": "Code Correctness",
@@ -65,7 +98,7 @@ export function scoreTestGroup(tests, profile, taskId) {
         "task-completion": "Task Completion",
     };
     const rawDimensions = aggregateDimensions(assertionScores, {
-        defaultAggregation: "mean",
+        defaultAggregation: "weighted-mean",
         dimensionLabels,
     });
     // Step 3: Compute weighted composite via TaskScore (0–1 scale)
@@ -102,10 +135,10 @@ export function scoreTestGroup(tests, profile, taskId) {
  * This replaces the previous llm-rubric-only filter that caused agent-harness
  * javascript assertions to be invisible to the scoring engine (DOC-2029).
  */
-function componentToScore(comp) {
+function componentToScore(comp, rubricCodeCorrectnessWeight) {
     const type = comp.assertion?.type;
     if (type === "llm-rubric") {
-        return llmRubricToScore(comp);
+        return llmRubricToScore(comp, rubricCodeCorrectnessWeight);
     }
     if (type === "javascript") {
         return javascriptAssertionToScore(comp);
@@ -118,14 +151,19 @@ function componentToScore(comp) {
  *
  * The dimension comes from metadata (set during rubric template resolution).
  * Returns null if the component doesn't map to any dimension.
+ *
+ * For the `code-correctness` dimension specifically, the assertion's
+ * weight is reduced when W0198's deterministic preflight is also feeding
+ * the same dimension; the complementary share belongs to the preflight.
  */
-function llmRubricToScore(comp) {
+function llmRubricToScore(comp, rubricCodeCorrectnessWeight) {
     const dim = classifyRubric(comp);
     if (!dim)
         return null;
     // Parse the raw score (0–100 from the grader) and normalize to [0, 1]
     const rawScore = parseRubricScore(comp);
     const normalized = normalizeScore(rawScore, "llm-rubric");
+    const weight = dim === "code-correctness" ? rubricCodeCorrectnessWeight : 1.0;
     return {
         assertionType: comp.assertion?.type ?? "llm-rubric",
         dimension: dim,
@@ -133,7 +171,7 @@ function llmRubricToScore(comp) {
         pass: comp.pass,
         reason: comp.reason ?? "",
         score: normalized,
-        weight: 1.0,
+        weight,
     };
 }
 /**
@@ -160,6 +198,62 @@ function javascriptAssertionToScore(comp) {
         weight: 1.0,
     };
 }
+/**
+ * Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
+ * preflight report.
+ *
+ * The score is `1 - min(1, total / cap)`, confined to `[0, 1]`. With the
+ * default `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0,
+ * 1 missing → 0.667, 2 missing → 0.333, ≥3 missing → 0.0.
+ *
+ * Edge cases for the deduction config:
+ * - `cap === 0` (measurement-only config): score is 1.0 — divide-by-zero
+ *   would NaN otherwise.
+ * - `cap < 0` or `cap` is NaN (misconfigured): score is 1.0 and the merge
+ *   silently collapses to rubric-only on this dimension.
+ * - `total` is negative or NaN (e.g. a negative `perMissing`): score is
+ *   1.0. Without this guard the score would exceed 1 or become NaN and
+ *   leak into the weighted-mean aggregation of the dimension.
+ *
+ * `unresolved` findings never deduct (the preflight's never-deduct rule):
+ * they're not part of `total`, so they fall through to the LLM rubric.
+ *
+ * @param report  Preflight report; reads `report.deduction` and `report.findings`
+ * @param weight  Weight assigned to the synthesized assertion
+ * @returns       An assertion score in the `code-correctness` dimension;
+ *                `pass` is true only when `total` is exactly 0
+ */
+export function preflightToScore(report, weight) {
+    const { perMissing, cap, total } = report.deduction;
+    // A non-positive cap means "no deduction"; dividing by it is never valid.
+    const ratio = cap > 0 ? total / cap : 0;
+    // `ratio > 0` is false for NaN and for negatives, so both keep the full score.
+    const score = ratio > 0 ? 1 - Math.min(1, ratio) : 1;
+    const counts = countLanes(report);
+    return {
+        assertionType: "preflight",
+        dimension: "code-correctness",
+        latencyMs: 0,
+        pass: total === 0,
+        reason: `preflight: ${counts.exists} exists, ${counts.missing} missing, ${counts.unresolved} unresolved (deduction ${total}/${cap}, ${perMissing} per missing)`,
+        score,
+        weight,
+    };
+}
+/**
+ * Tally the report's findings per lane. Any `result` other than
+ * `"exists"` or `"missing"` is counted as unresolved.
+ */
+function countLanes(report) {
+    let exists = 0;
+    let missing = 0;
+    let unresolved = 0;
+    for (const f of report.findings) {
+        if (f.result === "exists")
+            exists++;
+        else if (f.result === "missing")
+            missing++;
+        else
+            unresolved++;
+    }
+    return { exists, missing, unresolved };
+}
+/**
+ * Confine a weight to `[0, 1]`. Non-finite input (NaN, ±Infinity) and
+ * negative values map to 0; values above 1 map to 1.
+ */
+function clampWeight(w) {
+    if (!Number.isFinite(w) || w < 0)
+        return 0;
+    return w > 1 ? 1 : w;
+}
 /** Convert kebab-case dimension key to camelCase (e.g., "task-completion" → "taskCompletion") */
 function kebabToCamel(kebab) {
     return kebab.replace(/-([a-z])/g, (_, c) => c.toUpperCase());

package/dist/pipeline/eval-fingerprint.d.ts CHANGED Viewed

@@ -41,6 +41,15 @@ export interface FingerprintInput {
     mode: EvalMode;
     /** Path to the packages/eval root directory */
     rootDir: string;
+    /**
+     * Grader context policy. Distinct values produce distinct rubricPrompt
+     * content, so the cache must treat them as different evaluations even
+     * when tasks + docs + grader model match.
+     *
+     * Defaults to "rubric-only" inside the hash when undefined, matching
+     * the EvalConfig boundary default.
+     */
+    graderContext?: "rubric-only" | "with-docs";
 }
 /**
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.

package/dist/pipeline/eval-fingerprint.js CHANGED Viewed

@@ -38,8 +38,12 @@ import { join, relative, resolve } from "path";
  * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
  * files), file paths normalized to rootDir-relative, grader passed
  * through verbatim instead of the literal string "default".
+ *
+ * v3 (2026-05-06): grader-context policy ("rubric-only" vs "with-docs")
+ * affects rubricPrompt content and therefore eval output, so it must be
+ * hashed. Bumping invalidates v2 fingerprints.
  */
-const FINGERPRINT_VERSION = "eval-fingerprint-v2";
+const FINGERPRINT_VERSION = "eval-fingerprint-v3";
 /**
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
  *
@@ -52,10 +56,12 @@ const FINGERPRINT_VERSION = "eval-fingerprint-v2";
  */
 export function computeEvalFingerprint(input) {
     const { graderModel, mode, rootDir, tasks } = input;
+    const graderContext = input.graderContext ?? "rubric-only";
     const hash = createHash("sha256");
     hash.update(`version:${FINGERPRINT_VERSION}\n`);
     hash.update(`mode:${mode}\n`);
     hash.update(`grader:${graderModel}\n`);
+    hash.update(`graderContext:${graderContext}\n`);
     hash.update(`tasks:${hashTaskSet(tasks)}\n`);
     // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
     // so a CI runner at /home/runner/... and a laptop at /Users/... produce

package/dist/pipeline/map-request-to-config.js CHANGED Viewed

@@ -52,6 +52,7 @@ export function mapRequestToConfig(request, rootDir) {
         noAutoScope: request.noAutoScope ?? false,
         noCache: request.noCache ?? false,
         noRemoteCache: request.noRemoteCache ?? false,
+        graderContext: request.graderContext,
         graderReplications: request.graderReplications,
         urls: request.urls,
         headers: request.headers,

package/dist/pipeline/preflight/compute-preflight.d.ts ADDED Viewed

@@ -0,0 +1,67 @@
+/**
+ * compute-preflight — pure function that turns a candidate's code +
+ * a `PackageSurfaceResolver` + the framework-level package-surface
+ * manifest into a `SymbolPreflightReport`.
+ *
+ * Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
+ * from package Y" entirely out of LLM judgment. The LLM rubric runs
+ * after this and is told the preflight's findings as ground truth.
+ *
+ * The function is pure (no I/O beyond the resolver's): it parses the
+ * candidate's imports, asks the resolver about each in-scope package,
+ * and emits one finding per imported binding.
+ *
+ * Per-binding decision tree:
+ *
+ *   1. Drop the binding if it isn't a `named` import. Default,
+ *      namespace, and side-effect imports are intentionally not
+ *      checked — the package surface only includes named exports
+ *      (per the design's "named bindings only" rule), so default /
+ *      namespace imports cannot be answered against it without
+ *      false-deducting legitimate code.
+ *
+ *   2. Drop the binding if its `source` package is not in the
+ *      framework-level manifest. Out-of-scope packages don't get
+ *      findings — they are silently passed through to the LLM rubric.
+ *
+ *   3. Resolve the package surface. If the resolver throws a typed
+ *      `PackageSurfaceResolverError`, every binding from that package
+ *      becomes `unresolved` with the matching reason. **Never deduct.**
+ *
+ *   4. If the binding is in the surface, emit `exists` (no deduction).
+ *
+ *   5. Otherwise, emit `missing` (deterministic deduction).
+ *
+ * Deduction is `total = min(missing_count * perMissing, cap)`. The
+ * scoring bridge (Phase 5) computes the per-dimension score from this
+ * report; this function stays a pure data factory.
+ */
+import { type PackageSurfaceConfig, type PackageSurfaceResolver, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
+/** Default deduction config — `−20 per missing, capped at 60`. */
+export declare const DEFAULT_DEDUCTION: {
+    readonly perMissing: 20;
+    readonly cap: 60;
+};
+export interface ComputePreflightInput {
+    /** Raw candidate output. Typically the contents of a single TS/TSX/JS code block. */
+    readonly code: string;
+    /** Identity for this candidate, recorded in the report's `candidate` field. */
+    readonly candidate: {
+        readonly taskId: string;
+        readonly testIndex: number;
+    };
+    /** Framework-level package-surface manifest (Phase 0 / `definePackageSurface`). */
+    readonly packageSurface: PackageSurfaceConfig;
+    /** Resolver used to fetch each in-scope package's surface (Phase 1). */
+    readonly resolver: PackageSurfaceResolver;
+    /**
+     * Deduction config. Defaults to `DEFAULT_DEDUCTION`. Pass
+     * `{ perMissing: 0, cap: 0 }` to compute findings without deduction
+     * (e.g. for measurement-only runs).
+     */
+    readonly deduction?: {
+        readonly perMissing?: number;
+        readonly cap?: number;
+    };
+}
+export declare function computePreflight(input: ComputePreflightInput): Promise<SymbolPreflightReport>;

package/dist/pipeline/preflight/compute-preflight.js ADDED Viewed

@@ -0,0 +1,118 @@
+/**
+ * compute-preflight — pure function that turns a candidate's code +
+ * a `PackageSurfaceResolver` + the framework-level package-surface
+ * manifest into a `SymbolPreflightReport`.
+ *
+ * Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
+ * from package Y" entirely out of LLM judgment. The LLM rubric runs
+ * after this and is told the preflight's findings as ground truth.
+ *
+ * The function is pure (no I/O beyond the resolver's): it parses the
+ * candidate's imports, asks the resolver about each in-scope package,
+ * and emits one finding per imported binding.
+ *
+ * Per-binding decision tree:
+ *
+ *   1. Drop the binding if it isn't a `named` import. Default,
+ *      namespace, and side-effect imports are intentionally not
+ *      checked — the package surface only includes named exports
+ *      (per the design's "named bindings only" rule), so default /
+ *      namespace imports cannot be answered against it without
+ *      false-deducting legitimate code.
+ *
+ *   2. Drop the binding if its `source` package is not in the
+ *      framework-level manifest. Out-of-scope packages don't get
+ *      findings — they are silently passed through to the LLM rubric.
+ *
+ *   3. Resolve the package surface. If the resolver throws a typed
+ *      `PackageSurfaceResolverError`, every binding from that package
+ *      becomes `unresolved` with the matching reason. **Never deduct.**
+ *
+ *   4. If the binding is in the surface, emit `exists` (no deduction).
+ *
+ *   5. Otherwise, emit `missing` (deterministic deduction).
+ *
+ * Deduction is `total = min(missing_count * perMissing, cap)`. The
+ * scoring bridge (Phase 5) computes the per-dimension score from this
+ * report; this function stays a pure data factory.
+ */
+import { PackageSurfaceResolverError, } from "../../_vendor/ailf-core/index.js";
+import { parseImports } from "./parse-imports.js";
+/**
+ * Default deduction config — `−20 per missing, capped at 60`.
+ *
+ * `computePreflight` falls back to these values field by field when
+ * `input.deduction` omits `perMissing` or `cap`.
+ */
+export const DEFAULT_DEDUCTION = {
+    perMissing: 20,
+    cap: 60,
+};
+/**
+ * Build a `SymbolPreflightReport` for one candidate.
+ *
+ * Parses the imports in `input.code`, keeps only `named` bindings whose
+ * source package is listed in `input.packageSurface.packages`, resolves
+ * each such package once through `input.resolver.resolveExports`, and
+ * emits one finding per distinct (package, binding) pair.
+ *
+ * @param input  Candidate code, candidate identity, package-surface
+ *               manifest, resolver, and optional deduction overrides
+ *               (missing fields fall back to `DEFAULT_DEDUCTION`)
+ * @returns      Report with a copy of `candidate`, the findings, and
+ *               `deduction.total = min(missingCount * perMissing, cap)`
+ */
+export async function computePreflight(input) {
+    const perMissing = input.deduction?.perMissing ?? DEFAULT_DEDUCTION.perMissing;
+    const cap = input.deduction?.cap ?? DEFAULT_DEDUCTION.cap;
+    const inScope = new Set(input.packageSurface.packages.map((p) => p.pkg));
+    // Bucket named imports by package so we resolve each surface at most
+    // once. A Set per package drops repeated bindings while keeping
+    // first-seen order.
+    const namedBySource = new Map();
+    for (const binding of parseImports(input.code)) {
+        if (binding.kind !== "named" || !inScope.has(binding.source))
+            continue;
+        let bindings = namedBySource.get(binding.source);
+        if (!bindings) {
+            bindings = new Set();
+            namedBySource.set(binding.source, bindings);
+        }
+        bindings.add(binding.imported);
+    }
+    const findings = [];
+    let missingCount = 0;
+    for (const [pkg, bindings] of namedBySource) {
+        let surface;
+        try {
+            surface = await input.resolver.resolveExports(pkg);
+        }
+        catch (err) {
+            // Resolution failed: every binding from this package is
+            // `unresolved`, which never contributes to the deduction.
+            const reason = unresolvedReasonFor(err);
+            for (const binding of bindings) {
+                findings.push({ result: "unresolved", pkg, binding, reason });
+            }
+            continue;
+        }
+        const surfaceSources = new Map();
+        for (const sym of surface.symbols)
+            surfaceSources.set(sym.name, sym.source);
+        for (const binding of bindings) {
+            // Membership decides exists/missing. Testing the stored
+            // `source` for truthiness would report an exported symbol with
+            // an empty or absent `source` as missing and deduct for it.
+            if (surfaceSources.has(binding)) {
+                findings.push({
+                    result: "exists",
+                    pkg,
+                    version: surface.version,
+                    binding,
+                    source: surfaceSources.get(binding),
+                });
+            }
+            else {
+                missingCount++;
+                findings.push({
+                    result: "missing",
+                    pkg,
+                    version: surface.version,
+                    binding,
+                });
+            }
+        }
+    }
+    const total = Math.min(missingCount * perMissing, cap);
+    return {
+        candidate: { ...input.candidate },
+        findings,
+        deduction: { perMissing, cap, total },
+    };
+}
+/**
+ * Map a resolver failure to the `reason` recorded on `unresolved`
+ * findings: a `PackageSurfaceResolverError` supplies its own `reason`;
+ * any other thrown value is reported as `"parse-failed"`.
+ */
+function unresolvedReasonFor(err) {
+    return err instanceof PackageSurfaceResolverError ? err.reason : "parse-failed";
+}

package/dist/pipeline/preflight/emit-symbol-preflight.d.ts ADDED Viewed

@@ -0,0 +1,51 @@
+/**
+ * emit-symbol-preflight — turns a Promptfoo results file into per-test
+ * `symbolPreflight` artifact emissions, one per (run, mode, task, model).
+ *
+ * Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
+ * of `RunEvalStep`. For every test row in the results file we:
+ *
+ *   1. Pull the candidate's response text out of `result.response.output`.
+ *   2. Run `computePreflight` against the framework-level package-surface
+ *      manifest using the wired `PackageSurfaceResolver` from the
+ *      `AppContext`.
+ *   3. Emit the report through the artifact writer at axes
+ *      `(run, mode, task, model)` — same axes as `rawResults`.
+ *   4. Attach the report onto `state.preflightReports` keyed by the
+ *      same axes so the scoring step (Phase 5) can read it without a
+ *      second filesystem hop.
+ *
+ * Non-blocking: a missing resolver, a missing or empty manifest, or a
+ * row with no response text is skipped silently; an unreadable results
+ * file or a per-row `computePreflight` exception logs a warning and
+ * continues. The deterministic lane is additive; if any of its inputs
+ * are missing the LLM rubric still scores the candidate normally.
+ */
+import { type ArtifactRef, type ArtifactWriter, type PackageSurfaceConfig, type PackageSurfaceResolver, type RunId, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
+/**
+ * Per-row preflight key. Mirrors the axis set the writer uses for
+ * `symbolPreflight` so the scoring step can look up reports without
+ * re-deriving them from disk.
+ */
+export interface SymbolPreflightKey {
+    run: RunId;
+    mode: string;
+    task: string;
+    model: string;
+}
+/** Map a per-row preflight key to a stable string for in-memory lookup. */
+export declare function preflightKey(key: SymbolPreflightKey): string;
+/** Inputs to `emitSymbolPreflight`. */
+export interface EmitSymbolPreflightInput {
+    /** Artifact writer; `emit("symbolPreflight", …)` is called once per computed report. */
+    writer: ArtifactWriter;
+    /** Run context; `runId` becomes the `run` axis of every preflight key. */
+    ctx: {
+        runId: RunId;
+    };
+    /** Base mode, passed with each row's task id to `resolveVariantMode` to derive the `mode` and `task` axes. */
+    mode: string;
+    /** Path of the results file; read as UTF-8 and parsed as JSON. */
+    resultsPath: string;
+    /** Package-surface manifest. When undefined or listing no packages, nothing is computed or emitted. */
+    packageSurface: PackageSurfaceConfig | undefined;
+    /** Resolver handed to `computePreflight`. When undefined, nothing is computed or emitted. */
+    resolver: PackageSurfaceResolver | undefined;
+}
+/** Result of `emitSymbolPreflight`. */
+export interface EmitSymbolPreflightOutput {
+    /** Computed reports, keyed by the string `preflightKey` builds from (run, mode, task, model). */
+    reports: Map<string, SymbolPreflightReport>;
+    /** What each `writer.emit` call resolved to, in emission order. */
+    refs: readonly (ArtifactRef | null)[];
+}
+export declare function emitSymbolPreflight(input: EmitSymbolPreflightInput): Promise<EmitSymbolPreflightOutput>;

package/dist/pipeline/preflight/emit-symbol-preflight.js ADDED Viewed

@@ -0,0 +1,102 @@
+/**
+ * emit-symbol-preflight — turns a Promptfoo results file into per-test
+ * `symbolPreflight` artifact emissions, one per (run, mode, task, model).
+ *
+ * Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
+ * of `RunEvalStep`. For every test row in the results file we:
+ *
+ *   1. Pull the candidate's response text out of `result.response.output`.
+ *   2. Run `computePreflight` against the framework-level package-surface
+ *      manifest using the wired `PackageSurfaceResolver` from the
+ *      `AppContext`.
+ *   3. Emit the report through the artifact writer at axes
+ *      `(run, mode, task, model)` — same axes as `rawResults`.
+ *   4. Attach the report onto `state.preflightReports` keyed by the
+ *      same axes so the scoring step (Phase 5) can read it without a
+ *      second filesystem hop.
+ *
+ * Non-blocking: a missing resolver, a missing or empty manifest, or a
+ * row with no response text is skipped silently; an unreadable results
+ * file or a per-row `computePreflight` exception logs a warning and
+ * continues. The deterministic lane is additive; if any of its inputs
+ * are missing the LLM rubric still scores the candidate normally.
+ */
+import { readFileSync } from "node:fs";
+import { resolveVariantMode, } from "../../_vendor/ailf-core/index.js";
+import { computePreflight } from "./compute-preflight.js";
+/**
+ * Build the in-memory lookup string for a preflight report: the four
+ * axes joined with `/` in the order run, mode, task, model.
+ */
+export function preflightKey(key) {
+    const { run, mode, task, model } = key;
+    return `${run}/${mode}/${task}/${model}`;
+}
+/**
+ * Compute and emit one symbol-preflight report per distinct
+ * (run, mode, task, model) found in a results file.
+ *
+ * Never rejects because of bad inputs: absent resolver / manifest, an
+ * empty manifest, and rows without response text are skipped silently;
+ * an unreadable or malformed results file, a `computePreflight`
+ * exception, or a failing `writer.emit` is logged with `console.warn`
+ * and skipped.
+ *
+ * @param input  Writer, run context, mode, results path, manifest, resolver
+ * @returns      `reports` keyed by `preflightKey`, and `refs` holding what
+ *               each emit resolved to (`null` for an emit that failed)
+ */
+export async function emitSymbolPreflight(input) {
+    const reports = new Map();
+    const refs = [];
+    if (!input.packageSurface || !input.resolver) {
+        // The deterministic lane is additive — when its inputs aren't wired
+        // (test contexts, opt-out, partial rollouts) the LLM rubric still
+        // grades the candidate. Stay silent.
+        return { reports, refs };
+    }
+    if (input.packageSurface.packages.length === 0) {
+        return { reports, refs };
+    }
+    let raw;
+    try {
+        raw = JSON.parse(readFileSync(input.resultsPath, "utf-8"));
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        console.warn(`  ⚠️  emitSymbolPreflight: failed to read ${input.resultsPath} — ${message}`);
+        return { reports, refs };
+    }
+    // Valid JSON can still be `null` or a primitive; reading `.results`
+    // off that would throw outside the try above.
+    if (raw === null || typeof raw !== "object") {
+        console.warn(`  ⚠️  emitSymbolPreflight: unexpected results shape in ${input.resultsPath}`);
+        return { reports, refs };
+    }
+    // Two accepted layouts: `{ results: { results: [...] } }` and
+    // `{ results: [...] }`.
+    const outer = raw.results;
+    const candidateRows = outer !== null && typeof outer === "object" && "results" in outer
+        ? outer.results
+        : outer;
+    const rows = Array.isArray(candidateRows) ? candidateRows : [];
+    if (rows.length === 0)
+        return { reports, refs };
+    // Track per-(task, model) to dedupe — Promptfoo emits multiple rows
+    // for the same candidate when there are multiple assertions, but
+    // the preflight only depends on the candidate's text, not the
+    // assertion outcome. One report per (task, model) suffices.
+    const seen = new Set();
+    const emits = [];
+    for (let i = 0; i < rows.length; i++) {
+        const row = rows[i];
+        // A null / non-object entry cannot carry a candidate; skip it
+        // instead of throwing on property access.
+        if (row === null || typeof row !== "object")
+            continue;
+        const rawTaskId = row.testCase?.description ?? "unknown-task";
+        const modelId = row.provider?.id ?? row.provider?.label ?? "unknown-model";
+        const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, input.mode);
+        const baseAssoc = {
+            run: input.ctx.runId,
+            mode: axisMode,
+            task: axisTask,
+            model: modelId,
+        };
+        const key = preflightKey(baseAssoc);
+        if (seen.has(key))
+            continue;
+        seen.add(key);
+        const output = row.response?.output;
+        if (typeof output !== "string" || output.length === 0)
+            continue;
+        let report;
+        try {
+            report = await computePreflight({
+                code: output,
+                candidate: { taskId: axisTask, testIndex: i },
+                packageSurface: input.packageSurface,
+                resolver: input.resolver,
+            });
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            console.warn(`  ⚠️  emitSymbolPreflight: computePreflight threw for ${key} — ${message}`);
+            continue;
+        }
+        reports.set(key, report);
+        // The emit starts now but is only awaited after the loop. Its
+        // failure handler must be attached immediately, otherwise a
+        // rejection that lands while a later row is being awaited is an
+        // unhandled rejection.
+        emits.push(emitPreflightSafely(input.writer, baseAssoc, report, key));
+    }
+    const settled = await Promise.all(emits);
+    for (const ref of settled)
+        refs.push(ref);
+    return { reports, refs };
+}
+/** Emit one report; a writer failure is logged and yields a `null` ref instead of rejecting. */
+async function emitPreflightSafely(writer, assoc, report, key) {
+    try {
+        return await writer.emit("symbolPreflight", assoc, report);
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        console.warn(`  ⚠️  emitSymbolPreflight: writer.emit failed for ${key} — ${message}`);
+        return null;
+    }
+}

package/dist/pipeline/preflight/load-package-surface.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+/**
+ * load-package-surface — read the framework-level package-surface
+ * manifest (`config/package-surface.ts`) authored via
+ * `definePackageSurface()`.
+ *
+ * Returns `undefined` when the file is absent so the W0198 preflight
+ * step can no-op cleanly during the staged rollout. The eval package
+ * itself ships a manifest under `config/package-surface.ts` (Phase 0),
+ * so the live pipeline always finds one; the optional return path
+ * exists for downstream / external callers that may not have authored
+ * one yet.
+ */
+import type { PackageSurfaceConfig } from "../../_vendor/ailf-core/index.d.ts";
+export declare function loadPackageSurface(rootDir: string): Promise<PackageSurfaceConfig | undefined>;

package/dist/pipeline/preflight/load-package-surface.js ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * load-package-surface — read the framework-level package-surface
+ * manifest (`config/package-surface.ts`) authored via
+ * `definePackageSurface()`.
+ *
+ * Returns `undefined` when the file is absent so the W0198 preflight
+ * step can no-op cleanly during the staged rollout. The eval package
+ * itself ships a manifest under `config/package-surface.ts` (Phase 0),
+ * so the live pipeline always finds one; the optional return path
+ * exists for downstream / external callers that may not have authored
+ * one yet.
+ */
+import { tryLoadConfigFile } from "../compiler/config-loader.js";
+export async function loadPackageSurface(rootDir) {
+    const result = tryLoadConfigFile("package-surface", rootDir);
+    if (!result)
+        return undefined;
+    return result.data;
+}

package/dist/pipeline/preflight/load-preflight-context.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * load-preflight-context — read the framework-level package-surface
+ * manifest and project it down to the rubric-side context shape.
+ *
+ * Returns `undefined` when the manifest is absent or empty so callers
+ * collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
+ * `loadPackageSurface` and `loadPreflightScoring` — one loader per
+ * lazily-read W0198 input, all in `pipeline/preflight/`.
+ *
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
+ */
+import type { Logger, PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
+export declare function loadPreflightContext(rootDir: string, logger?: Pick<Logger, "warn">): Promise<PreflightRubricContext | undefined>;

package/dist/pipeline/preflight/load-preflight-context.js ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * load-preflight-context — read the framework-level package-surface
+ * manifest and project it down to the rubric-side context shape.
+ *
+ * Returns `undefined` when the manifest is absent or empty so callers
+ * collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
+ * `loadPackageSurface` and `loadPreflightScoring` — one loader per
+ * lazily-read W0198 input, all in `pipeline/preflight/`.
+ *
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
+ */
+import { loadPackageSurface } from "./load-package-surface.js";
+export async function loadPreflightContext(rootDir, logger) {
+    try {
+        const manifest = await loadPackageSurface(rootDir);
+        if (!manifest || manifest.packages.length === 0)
+            return undefined;
+        return { packages: manifest.packages.map((p) => p.pkg) };
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        logger?.warn(`[warn] W0198 preflight: failed to load package-surface manifest — ${message}`);
+        return undefined;
+    }
+}