@sanity/ailf 3.9.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,8 +21,49 @@
21
21
  *
22
22
  * const log = recorder.stop()
23
23
  * // → AgentBehaviorLog with all requests classified
24
+ *
25
+ * W0133 — per-class preview byte caps
26
+ *
27
+ * `responsePreview` is capped at `previewLimits.default` (4 KB) for most
28
+ * responses, with per-class overrides for two payloads whose contents are
29
+ * the ground truth for trace audits:
30
+ *
31
+ * - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
32
+ * bing.com/search, duckduckgo.com, google.com/search responses. Captures
33
+ * the full result list (typical 8–10 KB) so trace audits can resolve
34
+ * which result the model fetched next.
35
+ * - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
36
+ * index is ~110 KB. Capturing the full body lets trace audits
37
+ * distinguish "model fetched a path that wasn't in the index" from
38
+ * "model fetched a path that was in the index but the page is missing".
39
+ *
40
+ * The slim Content Lake report (W0051) does not inline previews — they
41
+ * live in the GCS `traces` NDJSON artifact only, so bumping these caps
42
+ * has no effect on the 10 MB Sanity document budget.
24
43
  */
25
44
  import { classifyRequests } from "./classifier.js";
45
+ /** Per-class preview-byte defaults (W0133). */
46
+ const DEFAULT_PREVIEW_LIMITS = {
47
+ default: 4096,
48
+ llmsTxt: 131072, // ~128 KB — covers Sanity's ~110 KB llms.txt
49
+ search: 16384, // ~16 KB — Jina/Google CSE/duckduckgo result lists
50
+ };
51
+ /**
52
+ * URL patterns for the `search` response class (W0133). These cover the
53
+ * search providers the agentic loop actually hits; new providers can be
54
+ * added here without changing the recorder API surface.
55
+ */
56
+ const SEARCH_URL_PATTERNS = [
57
+ /r\.jina\.ai\/https?:\/\/(www\.)?duckduckgo\.com/i,
58
+ /r\.jina\.ai\/https?:\/\/(www\.)?google\.com\/search/i,
59
+ /r\.jina\.ai\/https?:\/\/(www\.)?bing\.com\/search/i,
60
+ /^https?:\/\/(www\.)?googleapis\.com\/customsearch/i,
61
+ /^https?:\/\/(www\.)?google\.com\/search/i,
62
+ /^https?:\/\/(www\.)?bing\.com\/search/i,
63
+ /^https?:\/\/(www\.)?duckduckgo\.com/i,
64
+ ];
65
+ /** URL pattern for the `llmsTxt` response class (W0133). */
66
+ const LLMS_TXT_PATTERN = /\/llms\.txt(\?|$|\/)/i;
26
67
  const DEFAULT_OPTIONS = {
27
68
  captureHeaders: [
28
69
  "accept",
@@ -40,7 +81,9 @@ const DEFAULT_OPTIONS = {
40
81
  ],
41
82
  includePatterns: [],
42
83
  maxBodyBytes: 4096,
43
- maxPreviewBytes: 2048,
84
+ maxPreviewBytes: DEFAULT_PREVIEW_LIMITS.default,
85
+ previewLimits: { ...DEFAULT_PREVIEW_LIMITS },
86
+ statusOnlyForUnmatched: true,
44
87
  };
45
88
  // ---------------------------------------------------------------------------
46
89
  // RequestRecorder
@@ -63,6 +106,19 @@ export class RequestRecorder {
63
106
  if (merged.excludePatterns) {
64
107
  merged.excludePatterns = merged.excludePatterns.map(toRegExp);
65
108
  }
109
+ // Resolve per-class preview caps. `previewLimits.default` wins over
110
+ // `maxPreviewBytes`; missing entries fall through to module defaults
111
+ // (W0133).
112
+ const userLimits = options?.previewLimits ?? {};
113
+ const resolvedDefault = userLimits.default ??
114
+ options?.maxPreviewBytes ??
115
+ DEFAULT_PREVIEW_LIMITS.default;
116
+ merged.previewLimits = {
117
+ default: resolvedDefault,
118
+ llmsTxt: userLimits.llmsTxt ?? DEFAULT_PREVIEW_LIMITS.llmsTxt,
119
+ search: userLimits.search ?? DEFAULT_PREVIEW_LIMITS.search,
120
+ };
121
+ merged.maxPreviewBytes = resolvedDefault;
66
122
  this.options = merged;
67
123
  }
68
124
  /**
@@ -83,6 +139,7 @@ export class RequestRecorder {
83
139
  ? input.method
84
140
  : "GET") ??
85
141
  "GET";
142
+ const captureMode = this.classifyCaptureMode(url);
86
143
  let response;
87
144
  let error = null;
88
145
  try {
@@ -90,31 +147,64 @@ export class RequestRecorder {
90
147
  }
91
148
  catch (err) {
92
149
  error = err;
93
- // Record the failed request
150
+ if (captureMode === "drop")
151
+ throw error;
152
+ // Record the failed request — status-only captures skip body/headers
153
+ // entirely (W0132).
154
+ this.record(captureMode === "full"
155
+ ? {
156
+ body: await this.extractBody(init?.body),
157
+ capture: "full",
158
+ contentType: undefined,
159
+ headers: this.extractHeaders(init?.headers),
160
+ latencyMs: Date.now() - reqStart,
161
+ method: method.toUpperCase(),
162
+ responsePreview: `Error: ${error.message}`,
163
+ responseSize: 0,
164
+ statusCode: 0,
165
+ timestamp: new Date(reqStart).toISOString(),
166
+ url,
167
+ }
168
+ : {
169
+ capture: "status-only",
170
+ headers: {},
171
+ latencyMs: Date.now() - reqStart,
172
+ method: method.toUpperCase(),
173
+ responseSize: 0,
174
+ statusCode: 0,
175
+ timestamp: new Date(reqStart).toISOString(),
176
+ url,
177
+ });
178
+ throw error;
179
+ }
180
+ const latencyMs = Date.now() - reqStart;
181
+ if (captureMode === "drop")
182
+ return response;
183
+ if (captureMode === "status-only") {
184
+ // No body read, no header capture, no preview — only the metadata
185
+ // needed to know the call happened (W0132).
94
186
  this.record({
95
- body: await this.extractBody(init?.body),
96
- contentType: undefined,
97
- headers: this.extractHeaders(init?.headers),
98
- latencyMs: Date.now() - reqStart,
187
+ capture: "status-only",
188
+ headers: {},
189
+ latencyMs,
99
190
  method: method.toUpperCase(),
100
- responsePreview: `Error: ${error.message}`,
101
191
  responseSize: 0,
102
- statusCode: 0,
192
+ statusCode: response.status,
103
193
  timestamp: new Date(reqStart).toISOString(),
104
194
  url,
105
195
  });
106
- throw error;
196
+ return response;
107
197
  }
108
- const latencyMs = Date.now() - reqStart;
109
198
  // Clone the response so we can read the body without consuming it
110
199
  const clone = response.clone();
111
200
  let responseSize = 0;
112
201
  let responsePreview;
113
202
  if (this.options.captureResponsePreview) {
203
+ const previewBytes = this.resolvePreviewBytes(url);
114
204
  try {
115
205
  const text = await clone.text();
116
206
  responseSize = new TextEncoder().encode(text).length;
117
- responsePreview = text.slice(0, this.options.maxPreviewBytes);
207
+ responsePreview = text.slice(0, previewBytes);
118
208
  }
119
209
  catch {
120
210
  // Body might not be text — that's fine
@@ -123,6 +213,7 @@ export class RequestRecorder {
123
213
  }
124
214
  this.record({
125
215
  body: await this.extractBody(init?.body),
216
+ capture: "full",
126
217
  contentType: response.headers.get("content-type") ?? undefined,
127
218
  headers: this.extractHeaders(init?.headers),
128
219
  latencyMs,
@@ -152,26 +243,93 @@ export class RequestRecorder {
152
243
  *
153
244
  * Use this when you can't wrap `fetch` directly but can observe traffic
154
245
  * (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
246
+ *
247
+ * Filter behavior (W0132):
248
+ * - `excludePatterns` always drops the observation entirely.
249
+ * - `includePatterns` mismatch produces a slim `capture: "status-only"`
250
+ * record when `statusOnlyForUnmatched` is true (default), or drops it
251
+ * when false.
252
+ * - The discriminator on the input is honored: callers that already
253
+ * know they're emitting a slim record (e.g., the fetch wrapper) can
254
+ * set `capture: "status-only"` themselves.
155
255
  */
156
256
  record(observation) {
157
257
  if (!this.running)
158
258
  return;
159
259
  const url = observation.url;
160
- // Apply filters
260
+ if (this.options.excludePatterns.some((p) => p.test(url)))
261
+ return;
262
+ let capture = observation.capture ?? "full";
161
263
  if (this.options.includePatterns.length > 0) {
162
- if (!this.options.includePatterns.some((p) => p.test(url)))
163
- return;
264
+ const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
265
+ if (!matchesIncludes) {
266
+ if (!this.options.statusOnlyForUnmatched)
267
+ return;
268
+ capture = "status-only";
269
+ }
164
270
  }
165
- if (this.options.excludePatterns.some((p) => p.test(url)))
271
+ if (capture === "status-only") {
272
+ // Slim shape — strip body/headers/contentType/responsePreview so a
273
+ // caller that passed full data still produces a sanitized record.
274
+ this.observations.push({
275
+ capture: "status-only",
276
+ headers: {},
277
+ latencyMs: observation.latencyMs,
278
+ method: observation.method,
279
+ responseSize: 0,
280
+ seq: this.seq++,
281
+ statusCode: observation.statusCode,
282
+ timestamp: observation.timestamp,
283
+ url,
284
+ });
166
285
  return;
286
+ }
287
+ const previewBytes = this.resolvePreviewBytes(url);
167
288
  this.observations.push({
168
289
  ...observation,
290
+ capture: "full",
169
291
  // Truncate body if needed
170
292
  body: observation.body?.slice(0, this.options.maxBodyBytes),
171
- responsePreview: observation.responsePreview?.slice(0, this.options.maxPreviewBytes),
293
+ responsePreview: observation.responsePreview?.slice(0, previewBytes),
172
294
  seq: this.seq++,
173
295
  });
174
296
  }
297
+ /**
298
+ * Resolve the preview cap for a given URL using per-class overrides
299
+ * (W0133). Callers apply the cap with `String.prototype.slice`, so it counts UTF-16 code units, not encoded bytes. Order of preference:
300
+ * 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
301
+ * 2. `previewLimits.search` for known search providers.
302
+ * 3. `previewLimits.default`.
303
+ */
304
+ resolvePreviewBytes(url) {
305
+ if (LLMS_TXT_PATTERN.test(url))
306
+ return this.options.previewLimits.llmsTxt;
307
+ if (SEARCH_URL_PATTERNS.some((p) => p.test(url))) {
308
+ return this.options.previewLimits.search;
309
+ }
310
+ return this.options.previewLimits.default;
311
+ }
312
+ /**
313
+ * Decide how to record a URL given the current filter configuration.
314
+ *
315
+ * - `"drop"` — `excludePatterns` matched, or `includePatterns` is non-empty,
316
+ * none of its patterns matched, and `statusOnlyForUnmatched` is false.
317
+ * - `"status-only"` — `includePatterns` is non-empty and none matched, but
318
+ * `statusOnlyForUnmatched` is true (default). Skip body/headers.
319
+ * - `"full"` — `includePatterns` is empty or one of its patterns matched; record everything.
320
+ *
321
+ * See W0132.
322
+ */
323
+ classifyCaptureMode(url) {
324
+ if (this.options.excludePatterns.some((p) => p.test(url)))
325
+ return "drop";
326
+ if (this.options.includePatterns.length === 0)
327
+ return "full";
328
+ const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
329
+ if (matchesIncludes)
330
+ return "full";
331
+ return this.options.statusOnlyForUnmatched ? "status-only" : "drop";
332
+ }
175
333
  /**
176
334
  * Reset the recorder for reuse without creating a new instance.
177
335
  */
@@ -101,19 +101,37 @@ export interface ExternalRequest {
101
101
  url: string;
102
102
  }
103
103
  export interface ObservedRequest {
104
- /** Request body (for POST searches, etc.), truncated to maxBodyBytes */
104
+ /** Request body (for POST searches, etc.), truncated to maxBodyBytes.
105
+ * Always omitted for `capture: "status-only"` entries. */
105
106
  body?: string;
106
- /** Content-Type of the response */
107
+ /**
108
+ * Capture mode discriminator (W0132).
109
+ *
110
+ * - `"full"` — URL matched `includePatterns` (or no `includePatterns` are
111
+ * configured); body, headers, contentType, responseSize, and responsePreview are all captured.
112
+ * - `"status-only"` — URL did not match `includePatterns` but
113
+ * `statusOnlyForUnmatched` is true. Only url/method/statusCode/
114
+ * latencyMs/timestamp/seq are recorded; body/headers/contentType/
115
+ * responsePreview are intentionally omitted to avoid capturing
116
+ * prompts, completions, or API keys for third-party endpoints.
117
+ *
118
+ * Defaults to `"full"` on legacy records that pre-date W0132.
119
+ */
120
+ capture?: "full" | "status-only";
121
+ /** Content-Type of the response. Always omitted for status-only entries. */
107
122
  contentType?: string;
108
- /** Relevant request headers (e.g., Accept, User-Agent) */
123
+ /** Relevant request headers (e.g., Accept, User-Agent).
124
+ * Always empty for status-only entries (no header capture at all). */
109
125
  headers: Record<string, string>;
110
126
  /** Time from request start to response complete, in ms */
111
127
  latencyMs: number;
112
128
  /** HTTP method */
113
129
  method: string;
114
- /** Response body preview (first N chars), useful for seeing what the agent actually read */
130
+ /** Response body preview (first N chars), useful for seeing what the agent
131
+ * actually read. Always omitted for status-only entries. */
115
132
  responsePreview?: string;
116
- /** Response body size in bytes */
133
+ /** Response body size in bytes. 0 for status-only entries (we never read
134
+ * the body). */
117
135
  responseSize: number;
118
136
  /** Monotonic sequence number within the test run */
119
137
  seq: number;
@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
67
67
  .option("-q, --quiet", "Suppress non-error output")
68
68
  .option("--dotenv <path>", "Override default .env file path")
69
69
  .option("--explain", "Show execution plan without running")
70
- .option("--format <fmt>", "Output format for --explain (console, json)", "console")
70
+ .option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
71
71
  .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
72
72
  configureProgram(program);
73
73
  // Global --explain hook — intercepts any command before execution
@@ -3,7 +3,9 @@
3
3
  *
4
4
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
5
5
  * Commander subcommand interface: `baseline save`, `baseline compare`,
6
- * `baseline history`.
6
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
7
+ * tree (not the eval package's installed location); use `--baselines-dir`
8
+ * or `AILF_BASELINES_DIR` to override (W0098).
7
9
  */
8
10
  import { Command } from "commander";
9
11
  export declare function createBaselineCommand(): Command;
@@ -3,17 +3,34 @@
3
3
  *
4
4
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
5
5
  * Commander subcommand interface: `baseline save`, `baseline compare`,
6
- * `baseline history`.
6
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
7
+ * tree (not the eval package's installed location); use `--baselines-dir`
8
+ * or `AILF_BASELINES_DIR` to override (W0098).
7
9
  */
8
- import { dirname, resolve } from "path";
9
- import { fileURLToPath } from "url";
10
+ import { join, resolve } from "path";
10
11
  import { Command } from "commander";
11
12
  import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
12
- const __dirname = dirname(fileURLToPath(import.meta.url));
13
- const ROOT = resolve(__dirname, "../..");
13
+ import { getCallerCwd } from "./shared/resolve-output-dir.js";
14
14
  // CLI command name — kept as a constant to centralize the string literal.
15
15
  // "baseline" here refers to score baseline snapshots, not the legacy eval mode.
16
16
  const CMD_NAME = "baseline";
17
+ /**
18
+ * Resolve the directory that holds baseline `*.json` snapshots.
19
+ * Precedence: explicit flag > `AILF_BASELINES_DIR` env var > caller cwd default.
20
+ */
21
+ function resolveBaselinesDir(flag) {
22
+ if (flag)
23
+ return resolve(getCallerCwd(), flag);
24
+ if (process.env.AILF_BASELINES_DIR)
25
+ return resolve(getCallerCwd(), process.env.AILF_BASELINES_DIR);
26
+ return join(getCallerCwd(), ".ailf", "results", "baselines");
27
+ }
28
+ function resolveBaselineDirs(flag) {
29
+ return {
30
+ baselinesDir: resolveBaselinesDir(flag),
31
+ scoreSummaryPath: join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json"),
32
+ };
33
+ }
17
34
  export function createBaselineCommand() {
18
35
  const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
19
36
  // -----------------------------------------------------------------------
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
23
40
  .command("save")
24
41
  .description("Save current scores as a baseline snapshot")
25
42
  .option("-t, --tag <tag>", "Descriptive tag for the baseline")
43
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
26
44
  .action(async (opts) => {
27
45
  console.log("=== Saving baseline snapshot ===\n");
28
- const result = saveBaseline(ROOT, opts.tag);
46
+ const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
29
47
  if (result.success) {
30
48
  console.log(` ✅ ${result.message}`);
31
49
  }
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
41
59
  .command("compare")
42
60
  .description("Compare current scores against a saved baseline")
43
61
  .option("-f, --file <path>", "Specific baseline file to compare against")
62
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
44
63
  .action(async (opts) => {
45
64
  console.log("=== Baseline Comparison ===\n");
46
- const result = compareBaseline(ROOT, opts.file);
65
+ const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
47
66
  if (!result.success) {
48
67
  console.error(` ❌ ${result.message}`);
49
68
  process.exit(1);
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
110
129
  cmd
111
130
  .command("history")
112
131
  .description("List all saved baselines")
113
- .action(async () => {
132
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
133
+ .action(async (opts) => {
114
134
  console.log("=== Baseline History ===\n");
115
- const baselines = listBaselines(ROOT);
135
+ const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
116
136
  if (baselines.length === 0) {
117
137
  console.log(" No baselines saved yet.");
118
138
  return;
@@ -2,9 +2,13 @@
2
2
  * cache command — manage the local pipeline cache.
3
3
  *
4
4
  * Subcommands:
5
- * cache clear Delete all local cache manifests (results/cache/).
5
+ * cache clear Remove the local cache directory (.ailf/results/cache/ by default), including every manifest in it.
6
6
  * cache status Show current cache entries and their ages.
7
7
  *
8
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
9
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
10
+ * override (W0098).
11
+ *
8
12
  * Note: This only affects the local file-system cache used to skip unchanged
9
13
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
10
14
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
@@ -2,20 +2,32 @@
2
2
  * cache command — manage the local pipeline cache.
3
3
  *
4
4
  * Subcommands:
5
- * cache clear Delete all local cache manifests (results/cache/).
5
+ * cache clear Remove the local cache directory (.ailf/results/cache/ by default), including every manifest in it.
6
6
  * cache status Show current cache entries and their ages.
7
7
  *
8
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
9
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
10
+ * override (W0098).
11
+ *
8
12
  * Note: This only affects the local file-system cache used to skip unchanged
9
13
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
10
14
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
11
15
  */
12
16
  import { Command } from "commander";
13
17
  import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
14
- import { dirname, join, resolve } from "path";
15
- import { fileURLToPath } from "url";
16
- const __dirname = dirname(fileURLToPath(import.meta.url));
17
- const ROOT = resolve(__dirname, "..", "..");
18
- const CACHE_DIR = resolve(ROOT, "results", "cache");
18
+ import { join, resolve } from "path";
19
+ import { getCallerCwd } from "./shared/resolve-output-dir.js";
20
+ /**
21
+ * Resolve the local pipeline cache directory.
22
+ * Precedence: explicit flag > `AILF_CACHE_DIR` env var > caller cwd default.
23
+ */
24
+ function resolveCacheDir(flag) {
25
+ if (flag)
26
+ return resolve(getCallerCwd(), flag);
27
+ if (process.env.AILF_CACHE_DIR)
28
+ return resolve(getCallerCwd(), process.env.AILF_CACHE_DIR);
29
+ return join(getCallerCwd(), ".ailf", "results", "cache");
30
+ }
19
31
  export function createCacheCommand() {
20
32
  const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
21
33
  // -----------------------------------------------------------------------
@@ -24,17 +36,19 @@ export function createCacheCommand() {
24
36
  cmd
25
37
  .command("clear")
26
38
  .description("Delete all local cache manifests so every pipeline step re-executes")
27
- .action(() => {
28
- if (!existsSync(CACHE_DIR)) {
39
+ .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
40
+ .action((opts) => {
41
+ const cacheDir = resolveCacheDir(opts.cacheDir);
42
+ if (!existsSync(cacheDir)) {
29
43
  console.log(" ℹ️ No local cache directory found — nothing to clear.");
30
44
  return;
31
45
  }
32
- const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
46
+ const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
33
47
  if (files.length === 0) {
34
48
  console.log(" ℹ️ Local cache directory is empty — nothing to clear.");
35
49
  return;
36
50
  }
37
- rmSync(CACHE_DIR, { recursive: true, force: true });
51
+ rmSync(cacheDir, { recursive: true, force: true });
38
52
  console.log(` 🗑️ Cleared ${files.length} local cache manifest(s).`);
39
53
  console.log(" ℹ️ Next pipeline run will re-execute all steps from scratch.");
40
54
  console.log("\n Note: The remote Content Lake cache is unaffected.");
@@ -46,12 +60,14 @@ export function createCacheCommand() {
46
60
  cmd
47
61
  .command("status")
48
62
  .description("Show current local cache entries and their ages")
49
- .action(() => {
50
- if (!existsSync(CACHE_DIR)) {
63
+ .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
64
+ .action((opts) => {
65
+ const cacheDir = resolveCacheDir(opts.cacheDir);
66
+ if (!existsSync(cacheDir)) {
51
67
  console.log(" ℹ️ No local cache directory found.");
52
68
  return;
53
69
  }
54
- const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
70
+ const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
55
71
  if (files.length === 0) {
56
72
  console.log(" ℹ️ Local cache directory is empty.");
57
73
  return;
@@ -64,7 +80,7 @@ export function createCacheCommand() {
64
80
  "Outputs");
65
81
  console.log(" " + "-".repeat(65));
66
82
  for (const file of files.sort()) {
67
- const filePath = join(CACHE_DIR, file);
83
+ const filePath = join(cacheDir, file);
68
84
  try {
69
85
  const raw = readFileSync(filePath, "utf-8");
70
86
  const manifest = JSON.parse(raw);
@@ -88,7 +104,7 @@ export function createCacheCommand() {
88
104
  }
89
105
  const totalSize = files.reduce((sum, f) => {
90
106
  try {
91
- return sum + statSync(join(CACHE_DIR, f)).size;
107
+ return sum + statSync(join(cacheDir, f)).size;
92
108
  }
93
109
  catch {
94
110
  return sum;
@@ -4,7 +4,7 @@
4
4
  * Wraps the existing compare pipeline logic and formatting utilities
5
5
  * in a Commander.js command for consistent CLI integration.
6
6
  */
7
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
7
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
8
8
  import { dirname, join, resolve } from "path";
9
9
  import { fileURLToPath } from "url";
10
10
  import { Command } from "commander";
@@ -79,7 +79,7 @@ export function createCompareCommand() {
79
79
  if (opts.format === "json") {
80
80
  const json = JSON.stringify(report, null, 2);
81
81
  if (opts.output) {
82
- writeFileSync(opts.output, json);
82
+ writeReport(opts.output, json);
83
83
  console.log(` ✅ Comparison report written to ${opts.output}`);
84
84
  }
85
85
  else {
@@ -91,13 +91,13 @@ export function createCompareCommand() {
91
91
  console.log(table);
92
92
  if (opts.output) {
93
93
  const json = JSON.stringify(report, null, 2);
94
- writeFileSync(opts.output, json);
94
+ writeReport(opts.output, json);
95
95
  console.log(` ✅ Comparison report also written to ${opts.output}`);
96
96
  }
97
97
  }
98
98
  // Write comparison report to output dir for other steps to consume
99
99
  const latestComparisonPath = join(outputDir, "comparison-report.json");
100
- writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
100
+ writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
101
101
  });
102
102
  addOutputDirOption(cmd);
103
103
  return cmd;
@@ -122,3 +122,10 @@ function loadSummary(path) {
122
122
  const raw = readFileSync(path, "utf-8");
123
123
  return JSON.parse(raw);
124
124
  }
125
+ // W0097: every write path creates its parent dir so a fresh project (no
126
+ // `.ailf/results/latest/`) or a user-supplied `--output` pointing at a
127
+ // not-yet-existing directory both succeed instead of crashing with ENOENT.
128
+ function writeReport(path, contents) {
129
+ mkdirSync(dirname(path), { recursive: true });
130
+ writeFileSync(path, contents);
131
+ }
@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
541
541
  rootDir,
542
542
  });
543
543
  }
544
- // --format is a global option on the root program (actionCommand.parent)
544
+ // --explain-format is a global option on the root program (actionCommand.parent)
545
545
  const globalParentOpts = actionCommand.parent?.opts();
546
- const formatOpt = globalParentOpts?.format ?? "console";
546
+ const formatOpt = globalParentOpts?.explainFormat ?? "console";
547
547
  if (formatOpt === "json") {
548
548
  console.log(formatPlanJson(plan));
549
549
  }
@@ -35,16 +35,23 @@ export default defineModels({
35
35
 
36
36
  // ── OpenAI ─────────────────────────────────────────────────
37
37
  {
38
+ // gpt-5.2 routes through chat completions (and through the in-house
39
+ // agentic provider for naive/optimized variants). `verbosity` is a
40
+ // Responses-API-only field — it would be silently dropped here, so
41
+ // it isn't configured. See W0131.
38
42
  id: "openai:chat:gpt-5.2",
39
43
  label: "GPT 5.2",
40
44
  config: {
41
45
  max_completion_tokens: 8192,
42
- verbosity: "medium",
43
46
  },
44
47
  modes: ["literacy", "knowledge-probe"],
45
48
  // All literacy variants included by default
46
49
  },
47
50
  {
51
+ // GPT 5.4 is evaluated only on the baseline literacy variant. Promptfoo's
52
+ // native handling of `openai:responses:` honors reasoning / verbosity /
53
+ // summary; the in-house agentic provider does not (W0131). The MCP-server
54
+ // and knowledge-probe routes also go through Promptfoo's native handling.
48
55
  id: "openai:responses:gpt-5.4",
49
56
  label: "GPT 5.4",
50
57
  config: {
@@ -55,7 +62,9 @@ export default defineModels({
55
62
  },
56
63
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
57
64
  modes: ["literacy", "mcp-server", "knowledge-probe"],
58
- // All literacy variants included by default
65
+ variants: {
66
+ literacy: ["baseline"],
67
+ },
59
68
  },
60
69
 
61
70
  // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
93
102
  defaults: {
94
103
  temperature: 0.2,
95
104
  max_tokens: 4096,
96
- maxToolRounds: 5, // for agentic modes
105
+ // Global default round budget for agentic modes. Per-mode overrides
106
+ // below give naive more headroom (W0134) since it spends rounds on
107
+ // retries when fetches fail. Per-model `config.maxToolRounds` still
108
+ // wins over both values.
109
+ maxToolRounds: 5,
110
+ modeMaxToolRounds: {
111
+ "agentic-naive": 8,
112
+ "agentic-optimized": 5,
113
+ },
97
114
  observerOptions: {
98
- maxPreviewBytes: 2048,
115
+ // Per-class preview caps (W0133): default 4 KB, but search responses
116
+ // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
117
+ // which result the model actually saw.
118
+ maxPreviewBytes: 4096,
119
+ previewLimits: {
120
+ default: 4096,
121
+ llmsTxt: 131072,
122
+ search: 16384,
123
+ },
99
124
  captureResponsePreview: true,
100
125
  includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
101
126
  sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
127
+ // statusOnlyForUnmatched defaults to true (W0132) — model-side
128
+ // traffic to api.openai.com / api.anthropic.com / googleapis.com
129
+ // surfaces in run artifacts as slim status-only entries.
102
130
  },
103
131
  },
104
132
  })