npm - @sanity/ailf - Versions diffs - 3.9.0 → 4.0.0 - Mend

@sanity/ailf 3.9.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/config/models.ts +32 -4
package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
package/dist/_vendor/ailf-core/config-helpers.js +54 -1
package/dist/_vendor/ailf-shared/index.d.ts +16 -10
package/dist/_vendor/ailf-shared/index.js +13 -10
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/models.ts +32 -4
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/calculate-scores.js +40 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/package.json +2 -2

package/dist/agent-observer/proxy.js CHANGED Viewed

@@ -21,8 +21,49 @@
  *
  *   const log = recorder.stop()
  *   // → AgentBehaviorLog with all requests classified
+ *
+ * W0133 — per-class preview byte caps
+ *
+ * `responsePreview` is capped at `previewLimits.default` (4 KB) for most
+ * responses, with per-class overrides for two payloads whose contents are
+ * the ground truth for trace audits:
+ *
+ *   - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
+ *     bing.com/search, duckduckgo.com, google.com/search responses. Captures
+ *     the full result list (typical 8–10 KB) so trace audits can resolve
+ *     which result the model fetched next.
+ *   - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
+ *     index is ~110 KB. Capturing the full body lets trace audits
+ *     distinguish "model fetched a path that wasn't in the index" from
+ *     "model fetched a path that was in the index but the page is missing".
+ *
+ * The slim Content Lake report (W0051) does not inline previews — they
+ * live in the GCS `traces` NDJSON artifact only, so bumping these caps
+ * has no effect on the 10 MB Sanity document budget.
  */
 import { classifyRequests } from "./classifier.js";
+/** Per-class preview-byte defaults (W0133). */
+const DEFAULT_PREVIEW_LIMITS = {
+    default: 4096,
+    llmsTxt: 131072, // ~128 KB — covers Sanity's ~110 KB llms.txt
+    search: 16384, // ~16 KB — Jina/Google CSE/duckduckgo result lists
+};
+/**
+ * URL patterns for the `search` response class (W0133). These cover the
+ * search providers the agentic loop actually hits; new providers can be
+ * added here without changing the recorder API surface.
+ */
+const SEARCH_URL_PATTERNS = [
+    /r\.jina\.ai\/https?:\/\/(www\.)?duckduckgo\.com/i,
+    /r\.jina\.ai\/https?:\/\/(www\.)?google\.com\/search/i,
+    /r\.jina\.ai\/https?:\/\/(www\.)?bing\.com\/search/i,
+    /^https?:\/\/(www\.)?googleapis\.com\/customsearch/i,
+    /^https?:\/\/(www\.)?google\.com\/search/i,
+    /^https?:\/\/(www\.)?bing\.com\/search/i,
+    /^https?:\/\/(www\.)?duckduckgo\.com/i,
+];
+/** URL pattern for the `llmsTxt` response class (W0133). */
+const LLMS_TXT_PATTERN = /\/llms\.txt(\?|$|\/)/i;
 const DEFAULT_OPTIONS = {
     captureHeaders: [
         "accept",
@@ -40,7 +81,9 @@ const DEFAULT_OPTIONS = {
     ],
     includePatterns: [],
     maxBodyBytes: 4096,
-    maxPreviewBytes: 2048,
+    maxPreviewBytes: DEFAULT_PREVIEW_LIMITS.default,
+    previewLimits: { ...DEFAULT_PREVIEW_LIMITS },
+    statusOnlyForUnmatched: true,
 };
 // ---------------------------------------------------------------------------
 // RequestRecorder
@@ -63,6 +106,19 @@ export class RequestRecorder {
         if (merged.excludePatterns) {
             merged.excludePatterns = merged.excludePatterns.map(toRegExp);
         }
+        // Resolve per-class preview caps. `previewLimits.default` wins over
+        // `maxPreviewBytes`; missing entries fall through to module defaults
+        // (W0133).
+        const userLimits = options?.previewLimits ?? {};
+        const resolvedDefault = userLimits.default ??
+            options?.maxPreviewBytes ??
+            DEFAULT_PREVIEW_LIMITS.default;
+        merged.previewLimits = {
+            default: resolvedDefault,
+            llmsTxt: userLimits.llmsTxt ?? DEFAULT_PREVIEW_LIMITS.llmsTxt,
+            search: userLimits.search ?? DEFAULT_PREVIEW_LIMITS.search,
+        };
+        merged.maxPreviewBytes = resolvedDefault;
         this.options = merged;
     }
     /**
@@ -83,6 +139,7 @@ export class RequestRecorder {
                 ? input.method
                 : "GET") ??
             "GET";
+        const captureMode = this.classifyCaptureMode(url);
         let response;
         let error = null;
         try {
@@ -90,31 +147,64 @@ export class RequestRecorder {
         }
         catch (err) {
             error = err;
-            // Record the failed request
+            if (captureMode === "drop")
+                throw error;
+            // Record the failed request — status-only captures skip body/headers
+            // entirely (W0132).
+            this.record(captureMode === "full"
+                ? {
+                    body: await this.extractBody(init?.body),
+                    capture: "full",
+                    contentType: undefined,
+                    headers: this.extractHeaders(init?.headers),
+                    latencyMs: Date.now() - reqStart,
+                    method: method.toUpperCase(),
+                    responsePreview: `Error: ${error.message}`,
+                    responseSize: 0,
+                    statusCode: 0,
+                    timestamp: new Date(reqStart).toISOString(),
+                    url,
+                }
+                : {
+                    capture: "status-only",
+                    headers: {},
+                    latencyMs: Date.now() - reqStart,
+                    method: method.toUpperCase(),
+                    responseSize: 0,
+                    statusCode: 0,
+                    timestamp: new Date(reqStart).toISOString(),
+                    url,
+                });
+            throw error;
+        }
+        const latencyMs = Date.now() - reqStart;
+        if (captureMode === "drop")
+            return response;
+        if (captureMode === "status-only") {
+            // No body read, no header capture, no preview — only the metadata
+            // needed to know the call happened (W0132).
             this.record({
-                body: await this.extractBody(init?.body),
-                contentType: undefined,
-                headers: this.extractHeaders(init?.headers),
-                latencyMs: Date.now() - reqStart,
+                capture: "status-only",
+                headers: {},
+                latencyMs,
                 method: method.toUpperCase(),
-                responsePreview: `Error: ${error.message}`,
                 responseSize: 0,
-                statusCode: 0,
+                statusCode: response.status,
                 timestamp: new Date(reqStart).toISOString(),
                 url,
             });
-            throw error;
+            return response;
         }
-        const latencyMs = Date.now() - reqStart;
         // Clone the response so we can read the body without consuming it
         const clone = response.clone();
         let responseSize = 0;
         let responsePreview;
         if (this.options.captureResponsePreview) {
+            const previewBytes = this.resolvePreviewBytes(url);
             try {
                 const text = await clone.text();
                 responseSize = new TextEncoder().encode(text).length;
-                responsePreview = text.slice(0, this.options.maxPreviewBytes);
+                responsePreview = text.slice(0, previewBytes);
             }
             catch {
                 // Body might not be text — that's fine
@@ -123,6 +213,7 @@ export class RequestRecorder {
         }
         this.record({
             body: await this.extractBody(init?.body),
+            capture: "full",
             contentType: response.headers.get("content-type") ?? undefined,
             headers: this.extractHeaders(init?.headers),
             latencyMs,
@@ -152,26 +243,93 @@ export class RequestRecorder {
      *
      * Use this when you can't wrap `fetch` directly but can observe traffic
      * (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
+     *
+     * Filter behavior (W0132):
+     * - `excludePatterns` always drops the observation entirely.
+     * - `includePatterns` mismatch produces a slim `capture: "status-only"`
+     *   record when `statusOnlyForUnmatched` is true (default), or drops it
+     *   when false.
+     * - The discriminator on the input is honored: callers that already
+     *   know they're emitting a slim record (e.g., the fetch wrapper) can
+     *   set `capture: "status-only"` themselves.
      */
     record(observation) {
         if (!this.running)
             return;
         const url = observation.url;
-        // Apply filters
+        if (this.options.excludePatterns.some((p) => p.test(url)))
+            return;
+        let capture = observation.capture ?? "full";
         if (this.options.includePatterns.length > 0) {
-            if (!this.options.includePatterns.some((p) => p.test(url)))
-                return;
+            const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
+            if (!matchesIncludes) {
+                if (!this.options.statusOnlyForUnmatched)
+                    return;
+                capture = "status-only";
+            }
         }
-        if (this.options.excludePatterns.some((p) => p.test(url)))
+        if (capture === "status-only") {
+            // Slim shape — strip body/headers/contentType/responsePreview so a
+            // caller that passed full data still produces a sanitized record.
+            this.observations.push({
+                capture: "status-only",
+                headers: {},
+                latencyMs: observation.latencyMs,
+                method: observation.method,
+                responseSize: 0,
+                seq: this.seq++,
+                statusCode: observation.statusCode,
+                timestamp: observation.timestamp,
+                url,
+            });
             return;
+        }
+        const previewBytes = this.resolvePreviewBytes(url);
         this.observations.push({
             ...observation,
+            capture: "full",
             // Truncate body if needed
             body: observation.body?.slice(0, this.options.maxBodyBytes),
-            responsePreview: observation.responsePreview?.slice(0, this.options.maxPreviewBytes),
+            responsePreview: observation.responsePreview?.slice(0, previewBytes),
             seq: this.seq++,
         });
     }
+    /**
+     * Resolve the preview byte cap for a given URL using per-class overrides
+     * (W0133). Order of preference:
+     *   1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
+     *   2. `previewLimits.search` for known search providers.
+     *   3. `previewLimits.default`.
+     */
+    resolvePreviewBytes(url) {
+        if (LLMS_TXT_PATTERN.test(url))
+            return this.options.previewLimits.llmsTxt;
+        if (SEARCH_URL_PATTERNS.some((p) => p.test(url))) {
+            return this.options.previewLimits.search;
+        }
+        return this.options.previewLimits.default;
+    }
+    /**
+     * Decide how to record a URL given the current filter configuration.
+     *
+     * - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
+     *   and `statusOnlyForUnmatched` is false.
+     * - `"status-only"` — `includePatterns` failed but
+     *   `statusOnlyForUnmatched` is true (default). Skip body/headers.
+     * - `"full"` — record everything.
+     *
+     * See W0132.
+     */
+    classifyCaptureMode(url) {
+        if (this.options.excludePatterns.some((p) => p.test(url)))
+            return "drop";
+        if (this.options.includePatterns.length === 0)
+            return "full";
+        const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
+        if (matchesIncludes)
+            return "full";
+        return this.options.statusOnlyForUnmatched ? "status-only" : "drop";
+    }
     /**
      * Reset the recorder for reuse without creating a new instance.
      */

package/dist/agent-observer/types.d.ts CHANGED Viewed

@@ -101,19 +101,37 @@ export interface ExternalRequest {
     url: string;
 }
 export interface ObservedRequest {
-    /** Request body (for POST searches, etc.), truncated to maxBodyBytes */
+    /** Request body (for POST searches, etc.), truncated to maxBodyBytes.
+     *  Always omitted for `capture: "status-only"` entries. */
     body?: string;
-    /** Content-Type of the response */
+    /**
+     * Capture mode discriminator (W0132).
+     *
+     * - `"full"` — URL matched `includePatterns`; body, headers, contentType,
+     *   responseSize, and responsePreview are all captured.
+     * - `"status-only"` — URL did not match `includePatterns` but
+     *   `statusOnlyForUnmatched` is true. Only url/method/statusCode/
+     *   latencyMs/timestamp/seq are recorded; body/headers/contentType/
+     *   responsePreview are intentionally omitted to avoid capturing
+     *   prompts, completions, or API keys for third-party endpoints.
+     *
+     * Defaults to `"full"` on legacy records that pre-date W0132.
+     */
+    capture?: "full" | "status-only";
+    /** Content-Type of the response. Always omitted for status-only entries. */
     contentType?: string;
-    /** Relevant request headers (e.g., Accept, User-Agent) */
+    /** Relevant request headers (e.g., Accept, User-Agent).
+     *  Always empty for status-only entries (no header capture at all). */
     headers: Record<string, string>;
     /** Time from request start to response complete, in ms */
     latencyMs: number;
     /** HTTP method */
     method: string;
-    /** Response body preview (first N chars), useful for seeing what the agent actually read */
+    /** Response body preview (first N chars), useful for seeing what the agent
+     *  actually read. Always omitted for status-only entries. */
     responsePreview?: string;
-    /** Response body size in bytes */
+    /** Response body size in bytes. 0 for status-only entries (we never read
+     *  the body). */
     responseSize: number;
     /** Monotonic sequence number within the test run */
     seq: number;

package/dist/cli-program.js CHANGED Viewed

@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
         .option("-q, --quiet", "Suppress non-error output")
         .option("--dotenv <path>", "Override default .env file path")
         .option("--explain", "Show execution plan without running")
-        .option("--format <fmt>", "Output format for --explain (console, json)", "console")
+        .option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
         .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
     configureProgram(program);
     // Global --explain hook — intercepts any command before execution

package/dist/commands/baseline.d.ts CHANGED Viewed

@@ -3,7 +3,9 @@
  *
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
  * Commander subcommand interface: `baseline save`, `baseline compare`,
- * `baseline history`.
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
+ * tree (not the eval package's installed location); use `--baselines-dir`
+ * or `AILF_BASELINES_DIR` to override (W0098).
  */
 import { Command } from "commander";
 export declare function createBaselineCommand(): Command;

package/dist/commands/baseline.js CHANGED Viewed

@@ -3,17 +3,34 @@
  *
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
  * Commander subcommand interface: `baseline save`, `baseline compare`,
- * `baseline history`.
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
+ * tree (not the eval package's installed location); use `--baselines-dir`
+ * or `AILF_BASELINES_DIR` to override (W0098).
  */
-import { dirname, resolve } from "path";
-import { fileURLToPath } from "url";
+import { join, resolve } from "path";
 import { Command } from "commander";
 import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const ROOT = resolve(__dirname, "../..");
+import { getCallerCwd } from "./shared/resolve-output-dir.js";
 // CLI command name — kept as a constant to centralize the string literal.
 // "baseline" here refers to score baseline snapshots, not the legacy eval mode.
 const CMD_NAME = "baseline";
+/**
+ * Resolve the directory that holds baseline `*.json` snapshots.
+ * Precedence: explicit flag > `AILF_BASELINES_DIR` env var > caller cwd default.
+ */
+function resolveBaselinesDir(flag) {
+    if (flag)
+        return resolve(getCallerCwd(), flag);
+    if (process.env.AILF_BASELINES_DIR)
+        return resolve(getCallerCwd(), process.env.AILF_BASELINES_DIR);
+    return join(getCallerCwd(), ".ailf", "results", "baselines");
+}
+function resolveBaselineDirs(flag) {
+    return {
+        baselinesDir: resolveBaselinesDir(flag),
+        scoreSummaryPath: join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json"),
+    };
+}
 export function createBaselineCommand() {
     const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
     // -----------------------------------------------------------------------
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
         .command("save")
         .description("Save current scores as a baseline snapshot")
         .option("-t, --tag <tag>", "Descriptive tag for the baseline")
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
         .action(async (opts) => {
         console.log("=== Saving baseline snapshot ===\n");
-        const result = saveBaseline(ROOT, opts.tag);
+        const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
         if (result.success) {
             console.log(`  ✅ ${result.message}`);
         }
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
         .command("compare")
         .description("Compare current scores against a saved baseline")
         .option("-f, --file <path>", "Specific baseline file to compare against")
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
         .action(async (opts) => {
         console.log("=== Baseline Comparison ===\n");
-        const result = compareBaseline(ROOT, opts.file);
+        const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
         if (!result.success) {
             console.error(`  ❌ ${result.message}`);
             process.exit(1);
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
     cmd
         .command("history")
         .description("List all saved baselines")
-        .action(async () => {
+        .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
+        .action(async (opts) => {
         console.log("=== Baseline History ===\n");
-        const baselines = listBaselines(ROOT);
+        const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
         if (baselines.length === 0) {
             console.log("  No baselines saved yet.");
             return;

package/dist/commands/cache.d.ts CHANGED Viewed

@@ -2,9 +2,13 @@
  * cache command — manage the local pipeline cache.
  *
  * Subcommands:
- *   cache clear   Delete all local cache manifests (results/cache/).
+ *   cache clear   Delete all local cache manifests (.ailf/results/cache/).
  *   cache status  Show current cache entries and their ages.
  *
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
+ * override (W0098).
+ *
  * Note: This only affects the local file-system cache used to skip unchanged
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.

package/dist/commands/cache.js CHANGED Viewed

@@ -2,20 +2,32 @@
  * cache command — manage the local pipeline cache.
  *
  * Subcommands:
- *   cache clear   Delete all local cache manifests (results/cache/).
+ *   cache clear   Delete all local cache manifests (.ailf/results/cache/).
  *   cache status  Show current cache entries and their ages.
  *
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
+ * override (W0098).
+ *
  * Note: This only affects the local file-system cache used to skip unchanged
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
  */
 import { Command } from "commander";
 import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
-import { dirname, join, resolve } from "path";
-import { fileURLToPath } from "url";
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const ROOT = resolve(__dirname, "..", "..");
-const CACHE_DIR = resolve(ROOT, "results", "cache");
+import { join, resolve } from "path";
+import { getCallerCwd } from "./shared/resolve-output-dir.js";
+/**
+ * Resolve the local pipeline cache directory.
+ * Precedence: explicit flag > `AILF_CACHE_DIR` env var > caller cwd default.
+ */
+function resolveCacheDir(flag) {
+    if (flag)
+        return resolve(getCallerCwd(), flag);
+    if (process.env.AILF_CACHE_DIR)
+        return resolve(getCallerCwd(), process.env.AILF_CACHE_DIR);
+    return join(getCallerCwd(), ".ailf", "results", "cache");
+}
 export function createCacheCommand() {
     const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
     // -----------------------------------------------------------------------
@@ -24,17 +36,19 @@ export function createCacheCommand() {
     cmd
         .command("clear")
         .description("Delete all local cache manifests so every pipeline step re-executes")
-        .action(() => {
-        if (!existsSync(CACHE_DIR)) {
+        .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
+        .action((opts) => {
+        const cacheDir = resolveCacheDir(opts.cacheDir);
+        if (!existsSync(cacheDir)) {
             console.log("  ℹ️  No local cache directory found — nothing to clear.");
             return;
         }
-        const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
+        const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
         if (files.length === 0) {
             console.log("  ℹ️  Local cache directory is empty — nothing to clear.");
             return;
         }
-        rmSync(CACHE_DIR, { recursive: true, force: true });
+        rmSync(cacheDir, { recursive: true, force: true });
         console.log(`  🗑️  Cleared ${files.length} local cache manifest(s).`);
         console.log("  ℹ️  Next pipeline run will re-execute all steps from scratch.");
         console.log("\n  Note: The remote Content Lake cache is unaffected.");
@@ -46,12 +60,14 @@ export function createCacheCommand() {
     cmd
         .command("status")
         .description("Show current local cache entries and their ages")
-        .action(() => {
-        if (!existsSync(CACHE_DIR)) {
+        .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
+        .action((opts) => {
+        const cacheDir = resolveCacheDir(opts.cacheDir);
+        if (!existsSync(cacheDir)) {
             console.log("  ℹ️  No local cache directory found.");
             return;
         }
-        const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
+        const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
         if (files.length === 0) {
             console.log("  ℹ️  Local cache directory is empty.");
             return;
@@ -64,7 +80,7 @@ export function createCacheCommand() {
             "Outputs");
         console.log("  " + "-".repeat(65));
         for (const file of files.sort()) {
-            const filePath = join(CACHE_DIR, file);
+            const filePath = join(cacheDir, file);
             try {
                 const raw = readFileSync(filePath, "utf-8");
                 const manifest = JSON.parse(raw);
@@ -88,7 +104,7 @@ export function createCacheCommand() {
         }
         const totalSize = files.reduce((sum, f) => {
             try {
-                return sum + statSync(join(CACHE_DIR, f)).size;
+                return sum + statSync(join(cacheDir, f)).size;
             }
             catch {
                 return sum;

package/dist/commands/compare.js CHANGED Viewed

@@ -4,7 +4,7 @@
  * Wraps the existing compare pipeline logic and formatting utilities
  * in a Commander.js command for consistent CLI integration.
  */
-import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
+import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
 import { dirname, join, resolve } from "path";
 import { fileURLToPath } from "url";
 import { Command } from "commander";
@@ -79,7 +79,7 @@ export function createCompareCommand() {
         if (opts.format === "json") {
             const json = JSON.stringify(report, null, 2);
             if (opts.output) {
-                writeFileSync(opts.output, json);
+                writeReport(opts.output, json);
                 console.log(`  ✅ Comparison report written to ${opts.output}`);
             }
             else {
@@ -91,13 +91,13 @@ export function createCompareCommand() {
             console.log(table);
             if (opts.output) {
                 const json = JSON.stringify(report, null, 2);
-                writeFileSync(opts.output, json);
+                writeReport(opts.output, json);
                 console.log(`  ✅ Comparison report also written to ${opts.output}`);
             }
         }
         // Write comparison report to output dir for other steps to consume
         const latestComparisonPath = join(outputDir, "comparison-report.json");
-        writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
+        writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
     });
     addOutputDirOption(cmd);
     return cmd;
@@ -122,3 +122,10 @@ function loadSummary(path) {
     const raw = readFileSync(path, "utf-8");
     return JSON.parse(raw);
 }
+// W0097: every write path creates its parent dir so a fresh project (no
+// `.ailf/results/latest/`) or a user-supplied `--output` pointing at a
+// not-yet-existing directory both succeed instead of crashing with ENOENT.
+function writeReport(path, contents) {
+    mkdirSync(dirname(path), { recursive: true });
+    writeFileSync(path, contents);
+}

package/dist/commands/explain-handler.js CHANGED Viewed

@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
             rootDir,
         });
     }
-    // --format is a global option on the root program (actionCommand.parent)
+    // --explain-format is a global option on the root program (actionCommand.parent)
     const globalParentOpts = actionCommand.parent?.opts();
-    const formatOpt = globalParentOpts?.format ?? "console";
+    const formatOpt = globalParentOpts?.explainFormat ?? "console";
     if (formatOpt === "json") {
         console.log(formatPlanJson(plan));
     }

package/dist/config/models.ts CHANGED Viewed

@@ -35,16 +35,23 @@ export default defineModels({
     // ── OpenAI ─────────────────────────────────────────────────
     {
+      // gpt-5.2 routes through chat completions (and through the in-house
+      // agentic provider for naive/optimized variants). `verbosity` is a
+      // Responses-API-only field — it would be silently dropped here, so
+      // it isn't configured. See W0131.
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
       config: {
         max_completion_tokens: 8192,
-        verbosity: "medium",
       },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
     {
+      // GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
+      // native handling of `openai:responses:` honors reasoning / verbosity /
+      // summary; the in-house agentic provider does not (W0131). MCP-server
+      // and knowledge-probe routes go through Promptfoo native too.
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
@@ -55,7 +62,9 @@ export default defineModels({
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
       modes: ["literacy", "mcp-server", "knowledge-probe"],
-      // All literacy variants included by default
+      variants: {
+        literacy: ["baseline"],
+      },
     },
     // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
   defaults: {
     temperature: 0.2,
     max_tokens: 4096,
-    maxToolRounds: 5, // for agentic modes
+    // Global default round budget for agentic modes. Per-mode overrides
+    // below give naive more headroom (W0134) since it spends rounds on
+    // retries when fetches fail. Per-model `config.maxToolRounds` still
+    // wins over both values.
+    maxToolRounds: 5,
+    modeMaxToolRounds: {
+      "agentic-naive": 8,
+      "agentic-optimized": 5,
+    },
     observerOptions: {
-      maxPreviewBytes: 2048,
+      // Per-class preview caps (W0133): default 4 KB, but search responses
+      // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
+      // which result the model actually saw.
+      maxPreviewBytes: 4096,
+      previewLimits: {
+        default: 4096,
+        llmsTxt: 131072,
+        search: 16384,
+      },
       captureResponsePreview: true,
       includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
       sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
+      // statusOnlyForUnmatched defaults to true (W0132) — model-side
+      // traffic to api.openai.com / api.anthropic.com / googleapis.com
+      // surfaces in run artifacts as slim status-only entries.
     },
   },
 })