npm - @sanity/ailf - Versions diffs - 3.8.1 → 4.0.0 - Mend

@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

package/config/canary-tasks.ts +64 -0
package/config/models.ts +32 -4
package/config/test-budgets.ts +24 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
package/dist/_vendor/ailf-core/config-helpers.js +81 -1
package/dist/_vendor/ailf-core/index.d.ts +1 -1
package/dist/_vendor/ailf-core/index.js +1 -1
package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +2 -0
package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
package/dist/_vendor/ailf-shared/index.d.ts +16 -9
package/dist/_vendor/ailf-shared/index.js +13 -9
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/canary-tasks.ts +64 -0
package/dist/config/models.ts +32 -4
package/dist/config/test-budgets.ts +24 -0
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/calculate-scores.d.ts +17 -2
package/dist/pipeline/calculate-scores.js +139 -1
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
package/package.json +2 -1
package/tasks/knowledge-probe/groq-projections.task.ts +29 -11

package/dist/agent-observer/types.d.ts CHANGED Viewed

@@ -101,19 +101,37 @@ export interface ExternalRequest {
     url: string;
 }
 export interface ObservedRequest {
-    /** Request body (for POST searches, etc.), truncated to maxBodyBytes */
+    /** Request body (for POST searches, etc.), truncated to maxBodyBytes.
+     *  Always omitted for `capture: "status-only"` entries. */
     body?: string;
-    /** Content-Type of the response */
+    /**
+     * Capture mode discriminator (W0132).
+     *
+     * - `"full"` — URL matched `includePatterns`; body, headers, contentType,
+     *   responseSize, and responsePreview are all captured.
+     * - `"status-only"` — URL did not match `includePatterns` but
+     *   `statusOnlyForUnmatched` is true. Only url/method/statusCode/
+     *   latencyMs/timestamp/seq are recorded; body/headers/contentType/
+     *   responsePreview are intentionally omitted to avoid capturing
+     *   prompts, completions, or API keys for third-party endpoints.
+     *
+     * Defaults to `"full"` on legacy records that pre-date W0132.
+     */
+    capture?: "full" | "status-only";
+    /** Content-Type of the response. Always omitted for status-only entries. */
     contentType?: string;
-    /** Relevant request headers (e.g., Accept, User-Agent) */
+    /** Relevant request headers (e.g., Accept, User-Agent).
+     *  Always empty for status-only entries (no header capture at all). */
     headers: Record<string, string>;
     /** Time from request start to response complete, in ms */
     latencyMs: number;
     /** HTTP method */
     method: string;
-    /** Response body preview (first N chars), useful for seeing what the agent actually read */
+    /** Response body preview (first N chars), useful for seeing what the agent
+     *  actually read. Always omitted for status-only entries. */
     responsePreview?: string;
-    /** Response body size in bytes */
+    /** Response body size in bytes. 0 for status-only entries (we never read
+     *  the body). */
     responseSize: number;
     /** Monotonic sequence number within the test run */
     seq: number;

package/dist/cli-program.js CHANGED Viewed

@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
         .option("-q, --quiet", "Suppress non-error output")
         .option("--dotenv <path>", "Override default .env file path")
         .option("--explain", "Show execution plan without running")
-        .option("--format <fmt>", "Output format for --explain (console, json)", "console")
+        .option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
         .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
     configureProgram(program);
     // Global --explain hook — intercepts any command before execution

package/dist/commands/baseline.d.ts CHANGED Viewed

@@ -3,7 +3,9 @@
  *
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
  * Commander subcommand interface: `baseline save`, `baseline compare`,
- * `baseline history`.
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
+ * tree (not the eval package's installed location); use `--baselines-dir`
+ * or `AILF_BASELINES_DIR` to override (W0098).
  */
 import { Command } from "commander";
 export declare function createBaselineCommand(): Command;

package/dist/commands/baseline.js CHANGED Viewed

@@ -3,17 +3,34 @@
  *
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
  * Commander subcommand interface: `baseline save`, `baseline compare`,
- * `baseline history`.
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
+ * tree (not the eval package's installed location); use `--baselines-dir`
+ * or `AILF_BASELINES_DIR` to override (W0098).
  */
-import { dirname, resolve } from "path";
-import { fileURLToPath } from "url";
+import { join, resolve } from "path";
 import { Command } from "commander";
 import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const ROOT = resolve(__dirname, "../..");
+import { getCallerCwd } from "./shared/resolve-output-dir.js";
 // CLI command name — kept as a constant to centralize the string literal.
 // "baseline" here refers to score baseline snapshots, not the legacy eval mode.
 const CMD_NAME = "baseline";
+/**
+ * Resolve the directory that holds baseline `*.json` snapshots.
+ * Precedence: explicit flag > `AILF_BASELINES_DIR` env var > caller cwd default.
+ */
+function resolveBaselinesDir(flag) {
+    if (flag)
+        return resolve(getCallerCwd(), flag);
+    if (process.env.AILF_BASELINES_DIR)
+        return resolve(getCallerCwd(), process.env.AILF_BASELINES_DIR);
+    return join(getCallerCwd(), ".ailf", "results", "baselines");
+}
+function resolveBaselineDirs(flag) {
+    return {
+        baselinesDir: resolveBaselinesDir(flag),
+        scoreSummaryPath: join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json"),
+    };
+}
 export function createBaselineCommand() {
     const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
     // -----------------------------------------------------------------------
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
         .command("save")
         .description("Save current scores as a baseline snapshot")
         .option("-t, --tag <tag>", "Descriptive tag for the baseline")
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
         .action(async (opts) => {
         console.log("=== Saving baseline snapshot ===\n");
-        const result = saveBaseline(ROOT, opts.tag);
+        const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
         if (result.success) {
             console.log(`  ✅ ${result.message}`);
         }
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
         .command("compare")
         .description("Compare current scores against a saved baseline")
         .option("-f, --file <path>", "Specific baseline file to compare against")
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
         .action(async (opts) => {
         console.log("=== Baseline Comparison ===\n");
-        const result = compareBaseline(ROOT, opts.file);
+        const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
         if (!result.success) {
             console.error(`  ❌ ${result.message}`);
             process.exit(1);
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
     cmd
         .command("history")
         .description("List all saved baselines")
-        .action(async () => {
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
+        .action(async (opts) => {
         console.log("=== Baseline History ===\n");
-        const baselines = listBaselines(ROOT);
+        const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
         if (baselines.length === 0) {
             console.log("  No baselines saved yet.");
             return;

package/dist/commands/cache.d.ts CHANGED Viewed

@@ -2,9 +2,13 @@
  * cache command — manage the local pipeline cache.
  *
  * Subcommands:
- *   cache clear   Delete all local cache manifests (results/cache/).
+ *   cache clear   Delete all local cache manifests (.ailf/results/cache/).
  *   cache status  Show current cache entries and their ages.
  *
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
+ * override (W0098).
+ *
  * Note: This only affects the local file-system cache used to skip unchanged
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.

package/dist/commands/cache.js CHANGED Viewed

@@ -2,20 +2,32 @@
  * cache command — manage the local pipeline cache.
  *
  * Subcommands:
- *   cache clear   Delete all local cache manifests (results/cache/).
+ *   cache clear   Delete all local cache manifests (.ailf/results/cache/).
  *   cache status  Show current cache entries and their ages.
  *
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
+ * override (W0098).
+ *
  * Note: This only affects the local file-system cache used to skip unchanged
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
  */
 import { Command } from "commander";
 import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
-import { dirname, join, resolve } from "path";
-import { fileURLToPath } from "url";
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const ROOT = resolve(__dirname, "..", "..");
-const CACHE_DIR = resolve(ROOT, "results", "cache");
+import { join, resolve } from "path";
+import { getCallerCwd } from "./shared/resolve-output-dir.js";
+/**
+ * Resolve the local pipeline cache directory.
+ * Precedence: explicit flag > `AILF_CACHE_DIR` env var > caller cwd default.
+ */
+function resolveCacheDir(flag) {
+    if (flag)
+        return resolve(getCallerCwd(), flag);
+    if (process.env.AILF_CACHE_DIR)
+        return resolve(getCallerCwd(), process.env.AILF_CACHE_DIR);
+    return join(getCallerCwd(), ".ailf", "results", "cache");
+}
 export function createCacheCommand() {
     const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
     // -----------------------------------------------------------------------
@@ -24,17 +36,19 @@ export function createCacheCommand() {
     cmd
         .command("clear")
         .description("Delete all local cache manifests so every pipeline step re-executes")
-        .action(() => {
-        if (!existsSync(CACHE_DIR)) {
+        .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
+        .action((opts) => {
+        const cacheDir = resolveCacheDir(opts.cacheDir);
+        if (!existsSync(cacheDir)) {
             console.log("  ℹ️  No local cache directory found — nothing to clear.");
             return;
         }
-        const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
+        const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
         if (files.length === 0) {
             console.log("  ℹ️  Local cache directory is empty — nothing to clear.");
             return;
         }
-        rmSync(CACHE_DIR, { recursive: true, force: true });
+        rmSync(cacheDir, { recursive: true, force: true });
         console.log(`  🗑️  Cleared ${files.length} local cache manifest(s).`);
         console.log("  ℹ️  Next pipeline run will re-execute all steps from scratch.");
         console.log("\n  Note: The remote Content Lake cache is unaffected.");
@@ -46,12 +60,14 @@ export function createCacheCommand() {
     cmd
         .command("status")
         .description("Show current local cache entries and their ages")
-        .action(() => {
-        if (!existsSync(CACHE_DIR)) {
+        .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
+        .action((opts) => {
+        const cacheDir = resolveCacheDir(opts.cacheDir);
+        if (!existsSync(cacheDir)) {
             console.log("  ℹ️  No local cache directory found.");
             return;
         }
-        const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
+        const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
         if (files.length === 0) {
             console.log("  ℹ️  Local cache directory is empty.");
             return;
@@ -64,7 +80,7 @@ export function createCacheCommand() {
             "Outputs");
         console.log("  " + "-".repeat(65));
         for (const file of files.sort()) {
-            const filePath = join(CACHE_DIR, file);
+            const filePath = join(cacheDir, file);
             try {
                 const raw = readFileSync(filePath, "utf-8");
                 const manifest = JSON.parse(raw);
@@ -88,7 +104,7 @@ export function createCacheCommand() {
         }
         const totalSize = files.reduce((sum, f) => {
             try {
-                return sum + statSync(join(CACHE_DIR, f)).size;
+                return sum + statSync(join(cacheDir, f)).size;
             }
             catch {
                 return sum;

package/dist/commands/compare.js CHANGED Viewed

@@ -4,7 +4,7 @@
  * Wraps the existing compare pipeline logic and formatting utilities
  * in a Commander.js command for consistent CLI integration.
  */
-import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
+import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
 import { dirname, join, resolve } from "path";
 import { fileURLToPath } from "url";
 import { Command } from "commander";
@@ -79,7 +79,7 @@ export function createCompareCommand() {
         if (opts.format === "json") {
             const json = JSON.stringify(report, null, 2);
             if (opts.output) {
-                writeFileSync(opts.output, json);
+                writeReport(opts.output, json);
                 console.log(`  ✅ Comparison report written to ${opts.output}`);
             }
             else {
@@ -91,13 +91,13 @@ export function createCompareCommand() {
             console.log(table);
             if (opts.output) {
                 const json = JSON.stringify(report, null, 2);
-                writeFileSync(opts.output, json);
+                writeReport(opts.output, json);
                 console.log(`  ✅ Comparison report also written to ${opts.output}`);
             }
         }
         // Write comparison report to output dir for other steps to consume
         const latestComparisonPath = join(outputDir, "comparison-report.json");
-        writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
+        writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
     });
     addOutputDirOption(cmd);
     return cmd;
@@ -122,3 +122,10 @@ function loadSummary(path) {
     const raw = readFileSync(path, "utf-8");
     return JSON.parse(raw);
 }
+// W0097: every write path creates its parent dir so a fresh project (no
+// `.ailf/results/latest/`) or a user-supplied `--output` pointing at a
+// not-yet-existing directory both succeed instead of crashing with ENOENT.
+function writeReport(path, contents) {
+    mkdirSync(dirname(path), { recursive: true });
+    writeFileSync(path, contents);
+}

package/dist/commands/explain-handler.js CHANGED Viewed

@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
             rootDir,
         });
     }
-    // --format is a global option on the root program (actionCommand.parent)
+    // --explain-format is a global option on the root program (actionCommand.parent)
     const globalParentOpts = actionCommand.parent?.opts();
-    const formatOpt = globalParentOpts?.format ?? "console";
+    const formatOpt = globalParentOpts?.explainFormat ?? "console";
     if (formatOpt === "json") {
         console.log(formatPlanJson(plan));
     }

package/dist/config/canary-tasks.ts ADDED Viewed

@@ -0,0 +1,64 @@
+/**
+ * canary-tasks.ts — The Tier 3 canary set.
+ *
+ * Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
+ * Composition follows the design doc's "weighted toward modes/areas with
+ * the most production usage and the highest historical regression rates"
+ * recommendation: GROQ and Content Lake (foundational consumer surfaces),
+ * Portable Text (historically drift-prone), Studio schema authoring (the
+ * second-most-used surface after queries), and a knowledge-probe pairing
+ * for cross-mode coverage.
+ *
+ * Each entry's `rationale` is the canary's load-bearing field — without it,
+ * future maintainers can't reason about whether a regression is meaningful
+ * or whether the slot has lost value. Update the rationale when you swap a
+ * canary entry; never silently replace one.
+ *
+ * Validated against the live task inventory by `scripts/check-canary-tasks.ts`
+ * (`pnpm check`). Dangling task IDs fail the build.
+ *
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
+ * @see .github/workflows/tier-3-nightly.yml — consumer
+ */
+import { defineCanaryTasks } from "../_vendor/ailf-core/index.js"
+export default defineCanaryTasks({
+  tasks: [
+    {
+      taskId: "groq-blog-queries",
+      mode: "literacy",
+      rationale:
+        "Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
+    },
+    {
+      taskId: "content-lake-mutations",
+      mode: "literacy",
+      rationale:
+        "Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
+    },
+    {
+      taskId: "portable-text-rendering",
+      mode: "literacy",
+      rationale:
+        "Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
+    },
+    {
+      taskId: "studio-blog-schema",
+      mode: "literacy",
+      rationale:
+        "Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
+    },
+    {
+      taskId: "kp-groq-projections",
+      mode: "knowledge-probe",
+      rationale:
+        "Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
+    },
+    // mcp-server canary slot — add a third mode here when a committed
+    // mcp-server task lands under packages/eval/tasks/mcp-server/. Today
+    // there are no production mcp-server tasks (only fixtures); the trigger
+    // is upstream and adding a placeholder slot would dangle. Surfaced at
+    // Phase 5 close (2026-04-27) — see W0116 retrospective.
+  ],
+})

package/dist/config/models.ts CHANGED Viewed

@@ -35,16 +35,23 @@ export default defineModels({
     // ── OpenAI ─────────────────────────────────────────────────
     {
+      // gpt-5.2 routes through chat completions (and through the in-house
+      // agentic provider for naive/optimized variants). `verbosity` is a
+      // Responses-API-only field — it would be silently dropped here, so
+      // it isn't configured. See W0131.
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
       config: {
         max_completion_tokens: 8192,
-        verbosity: "medium",
       },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
     {
+      // GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
+      // native handling of `openai:responses:` honors reasoning / verbosity /
+      // summary; the in-house agentic provider does not (W0131). MCP-server
+      // and knowledge-probe routes go through Promptfoo native too.
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
@@ -55,7 +62,9 @@ export default defineModels({
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
       modes: ["literacy", "mcp-server", "knowledge-probe"],
-      // All literacy variants included by default
+      variants: {
+        literacy: ["baseline"],
+      },
     },
     // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
   defaults: {
     temperature: 0.2,
     max_tokens: 4096,
-    maxToolRounds: 5, // for agentic modes
+    // Global default round budget for agentic modes. Per-mode overrides
+    // below give naive more headroom (W0134) since it spends rounds on
+    // retries when fetches fail. Per-model `config.maxToolRounds` still
+    // wins over both values.
+    maxToolRounds: 5,
+    modeMaxToolRounds: {
+      "agentic-naive": 8,
+      "agentic-optimized": 5,
+    },
     observerOptions: {
-      maxPreviewBytes: 2048,
+      // Per-class preview caps (W0133): default 4 KB, but search responses
+      // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
+      // which result the model actually saw.
+      maxPreviewBytes: 4096,
+      previewLimits: {
+        default: 4096,
+        llmsTxt: 131072,
+        search: 16384,
+      },
       captureResponsePreview: true,
       includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
       sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
+      // statusOnlyForUnmatched defaults to true (W0132) — model-side
+      // traffic to api.openai.com / api.anthropic.com / googleapis.com
+      // surfaces in run artifacts as slim status-only entries.
     },
   },
 })

package/dist/config/test-budgets.ts ADDED Viewed

@@ -0,0 +1,24 @@
+/**
+ * test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
+ *
+ * Each cap is the maximum cost a single Tier 3 nightly run may incur for
+ * that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
+ * fails loudly if any provider's actual spend exceeds its cap.
+ *
+ * The design doc names a $30–60/day envelope across all providers. Caps
+ * here divide that envelope per-provider; tighten as baseline canary spend
+ * becomes measurable.
+ *
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
+ * @see scripts/tier-3-budget-check.mjs — enforcement
+ */
+import { defineTestBudgets } from "../_vendor/ailf-core/index.js"
+export default defineTestBudgets({
+  perProviderDaily: {
+    anthropic: 30,
+    openai: 30,
+  },
+  warnFraction: 0.8,
+})

package/dist/pipeline/baseline.d.ts CHANGED Viewed

@@ -29,9 +29,20 @@ export interface ScoreComparison {
     delta: number;
     feature: string;
 }
-export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
-export declare function listBaselines(rootDir: string): BaselineMetadata[];
-export declare function saveBaseline(rootDir: string, tag?: string): {
+/**
+ * Paths the baseline pipeline functions read and write. Callers compose this
+ * from caller-relative paths so the functions stay agnostic of where the
+ * eval package itself lives on disk (W0098).
+ */
+export interface BaselineDirs {
+    /** Directory that contains baseline `*.json` snapshots. */
+    baselinesDir: string;
+    /** Absolute path to the current run's `score-summary.json`. */
+    scoreSummaryPath: string;
+}
+export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
+export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
+export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
     success: boolean;
     message: string;
 };

package/dist/pipeline/baseline.js CHANGED Viewed

@@ -7,12 +7,8 @@
  */
 import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
 import { join } from "path";
-// ---------------------------------------------------------------------------
-// Compare
-// ---------------------------------------------------------------------------
-export function compareBaseline(rootDir, baselineFile) {
-    const baselinesDir = join(rootDir, "results", "baselines");
-    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
+export function compareBaseline(dirs, baselineFile) {
+    const { baselinesDir, scoreSummaryPath } = dirs;
     if (!existsSync(scoreSummaryPath)) {
         return {
             message: "No current score-summary.json found.",
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
         };
     }
     // Find baseline to compare against
-    const baselines = listBaselines(rootDir);
+    const baselines = listBaselines(baselinesDir);
     if (baselines.length === 0) {
         return {
             message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
 // ---------------------------------------------------------------------------
 // List
 // ---------------------------------------------------------------------------
-export function listBaselines(rootDir) {
-    const baselinesDir = join(rootDir, "results", "baselines");
+export function listBaselines(baselinesDir) {
     if (!existsSync(baselinesDir)) {
         return [];
     }
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
 // ---------------------------------------------------------------------------
 // Save
 // ---------------------------------------------------------------------------
-export function saveBaseline(rootDir, tag) {
-    const baselinesDir = join(rootDir, "results", "baselines");
-    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
+export function saveBaseline(dirs, tag) {
+    const { baselinesDir, scoreSummaryPath } = dirs;
     if (!existsSync(scoreSummaryPath)) {
         return {
             message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
     };
     writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
     return {
-        message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
+        message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
         success: true,
     };
 }

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
-import type { GraderJudgment, PerModelEntry } from "./types.js";
+import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
@@ -91,6 +91,21 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
 export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
+/**
+ * Score knowledge-probe evaluation results.
+ *
+ * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
+ * var and answers from training-data knowledge alone. The compiler explicitly
+ * deletes `vars.docs`, so every result lands in the without-docs bucket of
+ * the literacy scoring path — collapsing testCount and ceilingScore to zero.
+ *
+ * This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
+ * feature area (KP results carry `__featureArea` from the compiler), and
+ * uses the `knowledge-probe` profile (factual-correctness / completeness /
+ * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
+ * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
+ */
+export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
 /**
  * Score agentic evaluation results. In agentic mode, all test entries are
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).