npm - @sanity/ailf - Versions diffs - 4.1.0 → 4.3.0 - Mend

@sanity/ailf 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126) hide show

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -29,7 +29,7 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join } from "path";
-import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
+import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
 import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
@@ -38,7 +38,7 @@ import { loadRubricTemplates } from "./rubric-loader.js";
 import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
 import { LiteracyVariant } from "./normalize-mode.js";
-import { scoreTestGroup } from "./compiler/scoring-bridge.js";
+import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
 // Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -52,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
+export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group results by provider
     const byModel = {};
@@ -72,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
     }
     const perModel = [];
     for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
-        const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
+        const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId, preflightOptions);
         const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
         const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
         const avgScore = scores.length > 0
@@ -408,9 +408,111 @@ function buildSourceVerification(root, source, verificationCtx) {
  * Calculate overall scores (all models combined).
  * This is the original scoring path — backward compatible.
  */
-function calculateScores(resultsPath, goldProfile, baselineProfile) {
+function calculateScores(resultsPath, goldProfile, baselineProfile, preflightOptions) { // preflightOptions may be undefined; it is forwarded untouched to scoreResults
     const results = readAndNormalizeResults(resultsPath);
-    return scoreResults(results, goldProfile, baselineProfile);
+    return scoreResults(results, goldProfile, baselineProfile, undefined, preflightOptions); // 4th argument (modelId) stays undefined on this all-models-combined path
+}
+/**
+ * W0198 — build a `ScoreTestGroupOptions` that the scoring bridge can
+ * use to look up a `SymbolPreflightReport` for any given `TestResult`.
+ *
+ * Mirrors the keying scheme `emitSymbolPreflight` uses:
+ * `${runId}/${mode}/${task}/${model}` where `(mode, task)` come from
+ * `resolveVariantMode(test.description, defaultMode)`.
+ *
+ * Returns `undefined` (effectively a no-op) when reports are absent,
+ * empty, or the runId hasn't been provided — those collapse cleanly
+ * to the pre-W0198 path. The runId branch logs a warning when reports
+ * exist but the caller forgot to wire `runId` so the silent
+ * preflight-disabled state doesn't go unobserved.
+ *
+ * @param reports     Keyed report collection, read here via `.size` and `.get(key)`.
+ * @param runId       Run identifier used as the first key segment; any falsy value disables the merge.
+ * @param defaultMode Fallback mode handed to `resolveVariantMode`.
+ * @param weight      Passed through unchanged as `preflightWeight`.
+ * @param logger      Optional; only `warn` is called, and only on the missing-runId branch.
+ * @returns `undefined`, or `{ preflightWeight, preflightForTest }` where
+ *          `preflightForTest(test)` returns whatever `reports.get` yields for the
+ *          test's key. The model segment of the key is `test.providerId`, else
+ *          `test.providerLabel`, else the literal "unknown-model".
+ */
+function makePreflightOptions(reports, runId, defaultMode, weight, logger) {
+    if (!reports || reports.size === 0)
+        return undefined;
+    if (!runId) {
+        logger?.warn(`[warn] W0198 preflight: ${reports.size} preflight report(s) provided but no runId — skipping merge into code-correctness`);
+        return undefined;
+    }
+    return {
+        preflightWeight: weight,
+        preflightForTest: (test) => {
+            const modelId = test.providerId ?? test.providerLabel ?? "unknown-model";
+            const { mode: axisMode, task } = resolveVariantMode(test.description, defaultMode);
+            const key = `${runId}/${axisMode}/${task}/${modelId}`;
+            return reports.get(key);
+        },
+    };
+}
+/**
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
+ * resolver-health summary. Returns `undefined` when the run had no
+ * preflight reports (manifest disabled, resolver missing, or every
+ * candidate output cited zero in-scope packages) so the consumer can
+ * cleanly omit the field from the score summary instead of writing a
+ * vacuous block of zeros.
+ *
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
+ * production calls go through `calculateAndWriteScores`, which threads
+ * the result into the `EvalScoreSummary.preflight` field.
+ *
+ * @param reports Report collection read via `.size` and `.values()`; each
+ *                report's `findings` is iterated and each finding's `result`
+ *                is compared against "exists" / "missing" / "unresolved".
+ * @returns `undefined` when `reports` is falsy or has size 0; otherwise
+ *          `{ reportCount, totalFindings, exists, missing, unresolved }` plus
+ *          `unresolvedRate` (unresolved / totalFindings) only when at least
+ *          one finding was counted.
+ */
+export function summarizePreflight(reports) {
+    if (!reports || reports.size === 0)
+        return undefined;
+    let totalFindings = 0;
+    let exists = 0;
+    let missing = 0;
+    let unresolved = 0;
+    for (const report of reports.values()) {
+        for (const finding of report.findings) {
+            totalFindings++;
+            if (finding.result === "exists") {
+                exists++;
+            }
+            else if (finding.result === "missing") {
+                missing++;
+            }
+            else if (finding.result === "unresolved") {
+                unresolved++;
+            }
+            else {
+                // Exhaustiveness guard: a future fourth `result` variant lands
+                // here and surfaces as a build error rather than silently
+                // counting into `unresolved`. NOTE(review): in this emitted JS the two statements below do nothing at runtime, so such a finding only increments `totalFindings` and no bucket.
+                const _exhaustive = finding;
+                void _exhaustive;
+            }
+        }
+    }
+    return {
+        reportCount: reports.size,
+        totalFindings,
+        exists,
+        missing,
+        unresolved,
+        ...(totalFindings > 0 && { unresolvedRate: unresolved / totalFindings }), // spreading false adds no key, so unresolvedRate is absent when nothing was counted
+    };
+}
+/**
+ * Print the preflight summary to the run log. Format mirrors the other
+ * single-line health signals (URL fetch, agent isolation) so CI grep can
+ * extract `unresolvedRate` directly from the log when score-summary.json
+ * isn't already in scope.
+ *
+ * @param summary Object with `reportCount`, `totalFindings`, `exists`,
+ *                `missing`, `unresolved` and an optional `unresolvedRate`;
+ *                nothing is logged when it is falsy.
+ * @param log     Logger; only `info` is called (six lines per summary).
+ */
+function printPreflightSummary(summary, log) {
+    if (!summary)
+        return;
+    // `unresolvedRate` is absent when the run produced reports but no
+    // findings — distinguish vacuous-green from all-resolved so CI doesn't
+    // misread the threshold.
+    const rateLabel = summary.unresolvedRate === undefined
+        ? "n/a (no findings)"
+        : `${(summary.unresolvedRate * 100).toFixed(1)}%`;
+    log.info("-".repeat(80));
+    log.info("SYMBOL PREFLIGHT (W0198)");
+    log.info("-".repeat(80));
+    log.info(`  ${summary.reportCount} report(s), ${summary.totalFindings} finding(s): ${summary.exists} exists / ${summary.missing} missing / ${summary.unresolved} unresolved`);
+    log.info(`  unresolvedRate: ${rateLabel}  (resolver-health signal — not a candidate score factor)`);
+    log.info("");
 }
 /**
  * Extracts agent behavior summary from a test result's metadata.
@@ -644,7 +746,7 @@ function readAndNormalizeResults(resultsPath, log) {
  * @param baselineProfile Weight profile for baseline (without-docs) entries
  * @param modelId         Optional model identifier to tag each FeatureScore
  */
-function scoreResults(results, goldProfile, baselineProfile, modelId) {
+function scoreResults(results, goldProfile, baselineProfile, modelId, preflightOptions) {
     // Group by feature + docs/no-docs
     const byFeature = {};
     for (const result of results) {
@@ -663,12 +765,12 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
     const scores = [];
     for (const [feature, data] of Object.entries(byFeature)) {
         // --- With docs (gold / ceiling) — scored via 4-tier engine ---
-        const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
+        const gold = scoreTestGroup(data.withDocs, goldProfile, feature, preflightOptions);
         // --- Without docs (baseline / floor) ---
         // Uses the baseline profile (e.g. "output-only") which may exclude
         // dimensions like doc-coverage that are undefined without docs.
         // See docs/design-docs/named-scoring-profiles.md.
-        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
+        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature, preflightOptions);
         const featureCost = gold.totalCost + baseline.totalCost;
         const ceilingScore = gold.composite;
         const floorScore = baseline.composite;
@@ -709,7 +811,7 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
  * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
  * are set to 0 for backward compatibility with downstream consumers.
  */
-function scoreAgentHarnessResults(results, profile) {
+function scoreAgentHarnessResults(results, profile, preflightOptions) {
     // Group by task ID (extracted from description: "task-id — Title")
     const byTask = {};
     for (const result of results) {
@@ -721,7 +823,7 @@ function scoreAgentHarnessResults(results, profile) {
     }
     const scores = [];
     for (const [taskId, taskResults] of Object.entries(byTask)) {
-        const scored = scoreTestGroup(taskResults, profile, taskId);
+        const scored = scoreTestGroup(taskResults, profile, taskId, preflightOptions);
         const totalCost = scored.totalCost;
         // Detect feature area for backward compat (used by report grouping)
         const feature = taskResults[0]?.vars.__featureArea ??
@@ -774,7 +876,7 @@ function extractTaskId(description) {
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
  */
-export function scoreKnowledgeProbeResults(results, profile) {
+export function scoreKnowledgeProbeResults(results, profile, preflightOptions) {
     const byFeature = {};
     for (const result of results) {
         const feature = result.vars.__featureArea || detectFeatureArea(result.description);
@@ -785,7 +887,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
     }
     const scores = [];
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        const scored = scoreTestGroup(featureResults, profile, feature);
+        const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
         scores.push({
             assertionPassRate: scored.dimensions.assertionPassRate,
             ceilingScore: 0,
@@ -817,7 +919,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
  * Returns a record keyed by feature area with the composite actual score.
  */
 // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
-export function scoreAgenticResults(resultsPath, profile) {
+export function scoreAgenticResults(resultsPath, profile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group by feature area
     const byFeature = {};
@@ -830,7 +932,7 @@ export function scoreAgenticResults(resultsPath, profile) {
     }
     const entries = {};
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        const scored = scoreTestGroup(featureResults, profile, feature);
+        const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
         entries[feature] = {
             actualScore: scored.composite,
             codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -849,7 +951,7 @@ export function scoreAgenticResults(resultsPath, profile) {
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export function scoreAgenticResultsPerModel(resultsPath, profile) {
+export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptions) {
     const results = readAndNormalizeResults(resultsPath);
     // Group by model, then feature
     const byModel = {};
@@ -866,7 +968,7 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
     for (const [modelId, features] of Object.entries(byModel)) {
         perModel[modelId] = {};
         for (const [feature, featureResults] of Object.entries(features)) {
-            const scored = scoreTestGroup(featureResults, profile, feature);
+            const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
             perModel[modelId][feature] = {
                 actualScore: scored.composite,
                 codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -912,6 +1014,15 @@ export function calculateAndWriteScores(options) {
     }
     // Determine mode — controls which result files are read
     const mode = options.mode ?? LiteracyVariant.STANDARD;
+    // W0198 — assemble preflight options once. The helper returns
+    // `undefined` when reports / runId are missing, so all downstream
+    // callers handle the no-preflight case uniformly.
+    const preflightOptions = makePreflightOptions(options.preflightReports, options.runId, mode, options.preflightWeight, log);
+    // W0198 — resolver-health summary. Independent of `preflightOptions`
+    // (which gates the score merge): when reports exist but the runId is
+    // missing, scoring stays on the rubric-only path while telemetry still
+    // surfaces, so the resolver's drift remains visible.
+    const preflightSummary = summarizePreflight(options.preflightReports);
     const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
     // Agentic results path (only used in full mode)
     const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -940,7 +1051,7 @@ export function calculateAndWriteScores(options) {
         const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
         log.debug("Agent-harness scoring profile", agentProfile);
         const results = readAndNormalizeResults(baselineResultsPath);
-        const scores = scoreAgentHarnessResults(results, agentProfile);
+        const scores = scoreAgentHarnessResults(results, agentProfile, preflightOptions);
         log.debug("Agent-harness scores calculated", {
             taskCount: scores.length,
             tasks: scores.map((s) => ({
@@ -960,7 +1071,7 @@ export function calculateAndWriteScores(options) {
         const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
         graderCost, null, // no per-model breakdown
         null, // no source isolation
-        sourceVerification, "agent-harness", log);
+        sourceVerification, "agent-harness", log, preflightSummary);
         // Persist
         const outDir = join(ROOT, "results", "latest");
         mkdirSync(outDir, { recursive: true });
@@ -992,7 +1103,7 @@ export function calculateAndWriteScores(options) {
         const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
         log.debug("Knowledge-probe scoring profile", probeProfile);
         const results = readAndNormalizeResults(baselineResultsPath);
-        const scores = scoreKnowledgeProbeResults(results, probeProfile);
+        const scores = scoreKnowledgeProbeResults(results, probeProfile, preflightOptions);
         log.debug("Knowledge-probe scores calculated", {
             featureCount: scores.length,
             features: scores.map((s) => ({
@@ -1012,7 +1123,7 @@ export function calculateAndWriteScores(options) {
         const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
         graderCost, null, // no per-model breakdown for now
         null, // no source isolation — KP doesn't fetch sources
-        sourceVerification, "knowledge-probe", log);
+        sourceVerification, "knowledge-probe", log, preflightSummary);
         // Persist
         const outDir = join(ROOT, "results", "latest");
         mkdirSync(outDir, { recursive: true });
@@ -1041,7 +1152,7 @@ export function calculateAndWriteScores(options) {
         gold: goldProfile,
         baseline: baselineProfileWeights,
     });
-    const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
+    const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     log.debug("Baseline scores calculated", {
         featureCount: baselineScores.length,
         features: baselineScores.map((s) => ({
@@ -1051,7 +1162,7 @@ export function calculateAndWriteScores(options) {
             docLift: s.docLift,
         })),
     });
-    const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
+    const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     const urlRefs = aggregateUrlReferences(baselineResultsPath);
     const sourceVerification = buildSourceVerification(ROOT, source, {
         allowedOrigins: options.allowedOrigins,
@@ -1067,7 +1178,7 @@ export function calculateAndWriteScores(options) {
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
         const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
-        const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
+        const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
             features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -1080,7 +1191,7 @@ export function calculateAndWriteScores(options) {
         evaluationMode = LiteracyVariant.FULL;
         // Merge agentic actual scores into the per-model breakdown
         if (perModel) {
-            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
+            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile, preflightOptions);
             for (const entry of perModel) {
                 const modelAgentic = agenticPerModel[entry.modelId];
                 if (modelAgentic) {
@@ -1115,7 +1226,7 @@ export function calculateAndWriteScores(options) {
                 ? LiteracyVariant.OBSERVED
                 : LiteracyVariant.STANDARD;
     }
-    const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
+    const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary);
     // Persist
     const outDir = join(ROOT, "results", "latest");
     mkdirSync(outDir, { recursive: true });
@@ -1269,7 +1380,7 @@ function printPerModelReport(perModel, log) {
 // ---------------------------------------------------------------------------
 // Main
 // ---------------------------------------------------------------------------
-function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
+function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary) {
     const _log = log ?? new ConsoleLogger();
     _log.info("\n" + "=".repeat(80));
     _log.info("                    SANITY AI LITERACY SCORE REPORT");
@@ -1428,6 +1539,8 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
     if (perModel) {
         printPerModelReport(perModel, _log);
     }
+    // W0198 — symbol preflight resolver-health summary
+    printPreflightSummary(preflightSummary, _log);
     // URL References
     printUrlReport(urlRefs, _log);
     // Agent Behavior (only present when run with instrumented provider)
@@ -1557,6 +1670,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
             }
             : undefined,
         ...(perModel && { perModel }),
+        ...(preflightSummary && { preflight: preflightSummary }),
         ...(sourceIsolation && { sourceIsolation }),
         ...(sourceVerification && { sourceVerification }),
         timestamp: new Date().toISOString(),

package/dist/pipeline/compiler/index.d.ts CHANGED Viewed

@@ -21,6 +21,6 @@ export { checkBudget, classifyToolCall, classifyToolCalls, collectTrace, compute
 export { registerSanityLiteracyPreset, sanityLiteracyPreset, } from "./presets/index.js";
 export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from "./ignore-fields.js";
 export { simpleHash } from "./hash.js";
-export { scoreTestGroup, type BridgedScoreResult } from "./scoring-bridge.js";
+export { preflightToScore, scoreTestGroup, type BridgedScoreResult, type ScoreTestGroupOptions, } from "./scoring-bridge.js";
 export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
 export type { ConfigLoadResult } from "./config-loader.js";

package/dist/pipeline/compiler/index.js CHANGED Viewed

@@ -37,6 +37,6 @@ export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from
 // Hash utility
 export { simpleHash } from "./hash.js";
 // Scoring bridge — 4-tier engine integration
-export { scoreTestGroup } from "./scoring-bridge.js";
+export { preflightToScore, scoreTestGroup, } from "./scoring-bridge.js";
 // Unified config loader
 export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";

package/dist/pipeline/compiler/literacy-bridge.d.ts CHANGED Viewed

@@ -20,6 +20,7 @@
  */
 import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
 import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
+import type { PreflightRubricContext } from "./rubric-resolution.js";
 import { type LiteracyEvalSubMode } from "../normalize-mode.js";
 /** Options for compiling all literacy tasks via the new compiler */
 export interface LiteracyBridgeOptions {
@@ -35,6 +36,14 @@ export interface LiteracyBridgeOptions {
         label: string;
         config?: Record<string, unknown>;
     }[];
+    /** Grader context policy passed through to `compileLiteracyTask`. */
+    graderContext?: "rubric-only" | "with-docs";
+    /**
+     * W0198 Phase 6 — preflight context passed through to every task's
+     * `code-correctness` rubric so the grader treats the deterministic
+     * lane's existence verdicts as ground truth.
+     */
+    preflightContext?: PreflightRubricContext;
 }
 /** Result of compiling all literacy tasks */
 export interface LiteracyBridgeResult {

package/dist/pipeline/compiler/literacy-bridge.js CHANGED Viewed

@@ -73,6 +73,8 @@ export function compileLiteracyTasks(tasks, options) {
         evalMode: options.evalMode,
         models: options.models,
         rubricConfig,
+        graderContext: options.graderContext,
+        preflightContext: options.preflightContext,
     };
     for (const node of orderedNodes) {
         const task = taskMap.get(node.taskId);

package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js CHANGED Viewed

@@ -30,10 +30,6 @@ export const scaffoldProjectTask = {
             "2. Configure sanity.config.ts with project ID 'test-project' and dataset 'production'\n" +
             "3. Create a 'post' schema type with title, slug, body, and author fields\n" +
             "4. Ensure the project builds without errors",
-        vars: {
-            task: "Scaffold a Sanity Studio project with a post schema type. " +
-                "The project should build cleanly.",
-        },
     },
     assertions: [
         { type: "file-exists", value: "sanity.config.ts" },
@@ -70,10 +66,6 @@ export const modifyCodeTask = {
         text: "In the existing Sanity Studio project, add a custom document action " +
             "that logs a message before publishing. Follow the Sanity docs for " +
             "custom document actions.",
-        vars: {
-            task: "Add a custom document action that wraps the default publish action " +
-                "and logs 'Publishing document: <title>' before executing.",
-        },
     },
     assertions: [
         { type: "file-exists", value: "actions/logPublishAction.ts" },
@@ -127,10 +119,6 @@ export const multiFileRefactorTask = {
             "3. Query method calls (fetch → client.fetch with new signature)\n" +
             "4. Mutation helpers (create/patch/delete API changes)\n" +
             "Ensure the project compiles after migration.",
-        vars: {
-            task: "Migrate the codebase from @sanity/client v5 to v6, " +
-                "updating all files. Project must compile cleanly after migration.",
-        },
     },
     assertions: [
         {

package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js CHANGED Viewed

@@ -38,10 +38,6 @@ export const groqProjectionTask = {
             "5. Array slicing with `[0..5]` and `[0...5]`\n" +
             "6. Conditional projections using `select()`\n\n" +
             "Provide working code examples for each.",
-        vars: {
-            task: "Explain GROQ projection syntax with working code examples " +
-                "covering projections, spread, dereference, slicing, and select().",
-        },
     },
     assertions: [
         { type: "contains", value: "->" },
@@ -89,10 +85,6 @@ export const defineTypeApiTask = {
             "3. Why were these typed helpers introduced? What did they replace?\n" +
             "4. Show a complete example of a document schema with various field types\n" +
             "5. How do you add validation rules using the typed API?",
-        vars: {
-            task: "Explain Sanity's defineType/defineField schema API with examples, " +
-                "motivation, and validation rules.",
-        },
     },
     assertions: [
         { type: "contains", value: "defineType" },
@@ -142,10 +134,6 @@ export const ecosystemComparisonTask = {
             "4. Developer experience and customization\n" +
             "5. Pricing models\n" +
             "6. When would you choose one over the other?",
-        vars: {
-            task: "Compare Sanity and Contentful across architecture, content modeling, " +
-                "querying, DX, pricing, and use case fit.",
-        },
     },
     assertions: [
         { type: "contains-any", value: ["GROQ", "groq"] },

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts CHANGED Viewed

@@ -7,7 +7,7 @@
 import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooAssertion } from "../../assertion-mapper.js";
 import type { LiteracyCompileOptions } from "./types.js";
-export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[];
+export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
 /**
  * Build baseline assertions matching the legacy expand-tasks behavior.
  *

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js CHANGED Viewed

@@ -8,11 +8,11 @@ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
 // ---------------------------------------------------------------------------
 // Assertion resolution
 // ---------------------------------------------------------------------------
-export function resolveAssertions(task, options, warnings) {
+export function resolveAssertions(task, options, warnings, canonicalReference) {
     const assertions = [];
     for (const a of task.assertions ?? []) {
         if (a.type === "llm-rubric" && "template" in a) {
-            const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings);
+            const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings, canonicalReference, options?.preflightContext);
             if (resolved)
                 assertions.push(resolved);
         }
@@ -31,7 +31,7 @@ export function resolveAssertions(task, options, warnings) {
     }
     // Doc-coverage auto-generation
     if (task.docCoverage) {
-        const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider);
+        const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider, canonicalReference);
         if (docCoverageAssertion)
             assertions.push(docCoverageAssertion);
     }
@@ -40,22 +40,49 @@ export function resolveAssertions(task, options, warnings) {
 // ---------------------------------------------------------------------------
 // Doc-coverage assertion
 // ---------------------------------------------------------------------------
-function buildDocCoverageAssertion(rubricConfig, graderProvider) {
+function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalReference) { // canonicalReference may be omitted; returns null when no "doc-coverage" template is configured
     if (!rubricConfig?.templates["doc-coverage"])
         return null;
     const template = rubricConfig.templates["doc-coverage"];
     const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
     const rubricValue = `${template.header}\n${scaleText}\n\n` +
         `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
+    // doc-coverage benefits from the same authoritative reference — the grader
+    // needs the doc content to judge whether the candidate actually used what
+    // was documented. A falsy reference leaves rubricPrompt undefined.
+    const rubricPrompt = canonicalReference
+        ? buildDocCoverageRubricPrompt(rubricValue, canonicalReference)
+        : undefined;
     return {
         type: "llm-rubric",
         value: rubricValue,
+        ...(rubricPrompt ? { rubricPrompt } : {}), // key is present only when a reference was supplied
         ...(graderProvider ? { provider: graderProvider } : {}),
         ...(template.dimension
             ? { metadata: { dimension: template.dimension, maxScore: 100 } }
             : {}),
     };
 }
+function buildDocCoverageRubricPrompt(rubric, reference) { // returns one newline-joined prompt string: instructions, delimited reference, rubric, then the candidate slot
+    return [
+        "You are grading documentation coverage of a candidate response.",
+        "",
+        "AUTHORITATIVE REFERENCE — this is what the candidate had access to.",
+        "Score how well the candidate used what was documented here. Do not",
+        "penalize the candidate for missing information that is absent from",
+        "the reference.",
+        "",
+        "--- BEGIN REFERENCE ---",
+        reference, // NOTE(review): inserted verbatim; if the consumer templates this prompt (see the {{output}} slot below), any {{ }} inside the reference may be interpreted too — confirm
+        "--- END REFERENCE ---",
+        "",
+        "RUBRIC:",
+        rubric, // the rubric text is embedded here directly rather than through a placeholder
+        "",
+        "CANDIDATE RESPONSE:",
+        "{{output}}", // left as a literal placeholder; presumably substituted by the grader at evaluation time — verify
+    ].join("\n");
+}
 // ---------------------------------------------------------------------------
 // Baseline assertion filtering
 // ---------------------------------------------------------------------------