@sanity/ailf 3.9.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.ts +32 -4
- package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
- package/dist/_vendor/ailf-core/config-helpers.js +54 -1
- package/dist/_vendor/ailf-shared/index.d.ts +16 -10
- package/dist/_vendor/ailf-shared/index.js +13 -10
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/models.ts +32 -4
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/calculate-scores.js +40 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/package.json +2 -2
|
@@ -21,8 +21,49 @@
|
|
|
21
21
|
*
|
|
22
22
|
* const log = recorder.stop()
|
|
23
23
|
* // → AgentBehaviorLog with all requests classified
|
|
24
|
+
*
|
|
25
|
+
* W0133 — per-class preview byte caps
|
|
26
|
+
*
|
|
27
|
+
* `responsePreview` is capped at `previewLimits.default` (4 KB) for most
|
|
28
|
+
* responses, with per-class overrides for two payloads whose contents are
|
|
29
|
+
* the ground truth for trace audits:
|
|
30
|
+
*
|
|
31
|
+
* - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
|
|
32
|
+
* bing.com/search, duckduckgo.com, google.com/search responses. Captures
|
|
33
|
+
* the full result list (typical 8–10 KB) so trace audits can resolve
|
|
34
|
+
* which result the model fetched next.
|
|
35
|
+
* - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
|
|
36
|
+
* index is ~110 KB. Capturing the full body lets trace audits
|
|
37
|
+
* distinguish "model fetched a path that wasn't in the index" from
|
|
38
|
+
* "model fetched a path that was in the index but the page is missing".
|
|
39
|
+
*
|
|
40
|
+
* The slim Content Lake report (W0051) does not inline previews — they
|
|
41
|
+
* live in the GCS `traces` NDJSON artifact only, so bumping these caps
|
|
42
|
+
* has no effect on the 10 MB Sanity document budget.
|
|
24
43
|
*/
|
|
25
44
|
import { classifyRequests } from "./classifier.js";
|
|
45
|
+
/**
 * Per-class preview caps (W0133), keyed by response class.
 *
 * NOTE: the recorder applies these with `String#slice`, so each value bounds
 * the preview's string length rather than its encoded byte size.
 */
const DEFAULT_PREVIEW_LIMITS = {
    default: 4096,
    llmsTxt: 131072, // ~128 KB — covers Sanity's ~110 KB llms.txt
    search: 16384, // ~16 KB — Jina/Google CSE/duckduckgo result lists
};
/**
 * URL patterns for the `search` response class (W0133). A URL belongs to the
 * class when any pattern matches; new providers can be added here without
 * changing the recorder API surface.
 *
 * - The `r.jina.ai/...` patterns are unanchored and match a search URL that
 *   is embedded after the Jina reader prefix.
 * - The `^https?://...` patterns match direct requests to the provider.
 * - DuckDuckGo is matched on any subdomain (`html.`, `lite.`, `www.`, ...)
 *   and must be followed by a host boundary (`/`, `?`, `#`, `:` or end of
 *   string), so look-alike hosts such as `duckduckgo.community` are not
 *   classified as search.
 * - Google Custom Search is matched on both `www.googleapis.com/customsearch`
 *   and the `customsearch.googleapis.com/customsearch` service endpoint.
 */
const SEARCH_URL_PATTERNS = [
    /r\.jina\.ai\/https?:\/\/(?:[\w-]+\.)*duckduckgo\.com(?=[\/?#:]|$)/i,
    /r\.jina\.ai\/https?:\/\/(www\.)?google\.com\/search/i,
    /r\.jina\.ai\/https?:\/\/(www\.)?bing\.com\/search/i,
    /^https?:\/\/(?:www\.|customsearch\.)?googleapis\.com\/customsearch/i,
    /^https?:\/\/(www\.)?google\.com\/search/i,
    /^https?:\/\/(www\.)?bing\.com\/search/i,
    /^https?:\/\/(?:[\w-]+\.)*duckduckgo\.com(?=[\/?#:]|$)/i,
];
/**
 * URL pattern for the `llmsTxt` response class (W0133).
 *
 * Matches a `/llms.txt` path segment that ends the URL or is followed by a
 * query string (`?`), a fragment (`#`), or a further `/`.
 */
const LLMS_TXT_PATTERN = /\/llms\.txt(?:[?#\/]|$)/i;
|
|
26
67
|
const DEFAULT_OPTIONS = {
|
|
27
68
|
captureHeaders: [
|
|
28
69
|
"accept",
|
|
@@ -40,7 +81,9 @@ const DEFAULT_OPTIONS = {
|
|
|
40
81
|
],
|
|
41
82
|
includePatterns: [],
|
|
42
83
|
maxBodyBytes: 4096,
|
|
43
|
-
maxPreviewBytes:
|
|
84
|
+
maxPreviewBytes: DEFAULT_PREVIEW_LIMITS.default,
|
|
85
|
+
previewLimits: { ...DEFAULT_PREVIEW_LIMITS },
|
|
86
|
+
statusOnlyForUnmatched: true,
|
|
44
87
|
};
|
|
45
88
|
// ---------------------------------------------------------------------------
|
|
46
89
|
// RequestRecorder
|
|
@@ -63,6 +106,19 @@ export class RequestRecorder {
|
|
|
63
106
|
if (merged.excludePatterns) {
|
|
64
107
|
merged.excludePatterns = merged.excludePatterns.map(toRegExp);
|
|
65
108
|
}
|
|
109
|
+
// Resolve per-class preview caps. `previewLimits.default` wins over
|
|
110
|
+
// `maxPreviewBytes`; missing entries fall through to module defaults
|
|
111
|
+
// (W0133).
|
|
112
|
+
const userLimits = options?.previewLimits ?? {};
|
|
113
|
+
const resolvedDefault = userLimits.default ??
|
|
114
|
+
options?.maxPreviewBytes ??
|
|
115
|
+
DEFAULT_PREVIEW_LIMITS.default;
|
|
116
|
+
merged.previewLimits = {
|
|
117
|
+
default: resolvedDefault,
|
|
118
|
+
llmsTxt: userLimits.llmsTxt ?? DEFAULT_PREVIEW_LIMITS.llmsTxt,
|
|
119
|
+
search: userLimits.search ?? DEFAULT_PREVIEW_LIMITS.search,
|
|
120
|
+
};
|
|
121
|
+
merged.maxPreviewBytes = resolvedDefault;
|
|
66
122
|
this.options = merged;
|
|
67
123
|
}
|
|
68
124
|
/**
|
|
@@ -83,6 +139,7 @@ export class RequestRecorder {
|
|
|
83
139
|
? input.method
|
|
84
140
|
: "GET") ??
|
|
85
141
|
"GET";
|
|
142
|
+
const captureMode = this.classifyCaptureMode(url);
|
|
86
143
|
let response;
|
|
87
144
|
let error = null;
|
|
88
145
|
try {
|
|
@@ -90,31 +147,64 @@ export class RequestRecorder {
|
|
|
90
147
|
}
|
|
91
148
|
catch (err) {
|
|
92
149
|
error = err;
|
|
93
|
-
|
|
150
|
+
if (captureMode === "drop")
|
|
151
|
+
throw error;
|
|
152
|
+
// Record the failed request — status-only captures skip body/headers
|
|
153
|
+
// entirely (W0132).
|
|
154
|
+
this.record(captureMode === "full"
|
|
155
|
+
? {
|
|
156
|
+
body: await this.extractBody(init?.body),
|
|
157
|
+
capture: "full",
|
|
158
|
+
contentType: undefined,
|
|
159
|
+
headers: this.extractHeaders(init?.headers),
|
|
160
|
+
latencyMs: Date.now() - reqStart,
|
|
161
|
+
method: method.toUpperCase(),
|
|
162
|
+
responsePreview: `Error: ${error.message}`,
|
|
163
|
+
responseSize: 0,
|
|
164
|
+
statusCode: 0,
|
|
165
|
+
timestamp: new Date(reqStart).toISOString(),
|
|
166
|
+
url,
|
|
167
|
+
}
|
|
168
|
+
: {
|
|
169
|
+
capture: "status-only",
|
|
170
|
+
headers: {},
|
|
171
|
+
latencyMs: Date.now() - reqStart,
|
|
172
|
+
method: method.toUpperCase(),
|
|
173
|
+
responseSize: 0,
|
|
174
|
+
statusCode: 0,
|
|
175
|
+
timestamp: new Date(reqStart).toISOString(),
|
|
176
|
+
url,
|
|
177
|
+
});
|
|
178
|
+
throw error;
|
|
179
|
+
}
|
|
180
|
+
const latencyMs = Date.now() - reqStart;
|
|
181
|
+
if (captureMode === "drop")
|
|
182
|
+
return response;
|
|
183
|
+
if (captureMode === "status-only") {
|
|
184
|
+
// No body read, no header capture, no preview — only the metadata
|
|
185
|
+
// needed to know the call happened (W0132).
|
|
94
186
|
this.record({
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
latencyMs: Date.now() - reqStart,
|
|
187
|
+
capture: "status-only",
|
|
188
|
+
headers: {},
|
|
189
|
+
latencyMs,
|
|
99
190
|
method: method.toUpperCase(),
|
|
100
|
-
responsePreview: `Error: ${error.message}`,
|
|
101
191
|
responseSize: 0,
|
|
102
|
-
statusCode:
|
|
192
|
+
statusCode: response.status,
|
|
103
193
|
timestamp: new Date(reqStart).toISOString(),
|
|
104
194
|
url,
|
|
105
195
|
});
|
|
106
|
-
|
|
196
|
+
return response;
|
|
107
197
|
}
|
|
108
|
-
const latencyMs = Date.now() - reqStart;
|
|
109
198
|
// Clone the response so we can read the body without consuming it
|
|
110
199
|
const clone = response.clone();
|
|
111
200
|
let responseSize = 0;
|
|
112
201
|
let responsePreview;
|
|
113
202
|
if (this.options.captureResponsePreview) {
|
|
203
|
+
const previewBytes = this.resolvePreviewBytes(url);
|
|
114
204
|
try {
|
|
115
205
|
const text = await clone.text();
|
|
116
206
|
responseSize = new TextEncoder().encode(text).length;
|
|
117
|
-
responsePreview = text.slice(0,
|
|
207
|
+
responsePreview = text.slice(0, previewBytes);
|
|
118
208
|
}
|
|
119
209
|
catch {
|
|
120
210
|
// Body might not be text — that's fine
|
|
@@ -123,6 +213,7 @@ export class RequestRecorder {
|
|
|
123
213
|
}
|
|
124
214
|
this.record({
|
|
125
215
|
body: await this.extractBody(init?.body),
|
|
216
|
+
capture: "full",
|
|
126
217
|
contentType: response.headers.get("content-type") ?? undefined,
|
|
127
218
|
headers: this.extractHeaders(init?.headers),
|
|
128
219
|
latencyMs,
|
|
@@ -152,26 +243,93 @@ export class RequestRecorder {
|
|
|
152
243
|
*
|
|
153
244
|
* Use this when you can't wrap `fetch` directly but can observe traffic
|
|
154
245
|
* (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
|
|
246
|
+
*
|
|
247
|
+
* Filter behavior (W0132):
|
|
248
|
+
* - `excludePatterns` always drops the observation entirely.
|
|
249
|
+
* - `includePatterns` mismatch produces a slim `capture: "status-only"`
|
|
250
|
+
* record when `statusOnlyForUnmatched` is true (default), or drops it
|
|
251
|
+
* when false.
|
|
252
|
+
* - The discriminator on the input is honored: callers that already
|
|
253
|
+
* know they're emitting a slim record (e.g., the fetch wrapper) can
|
|
254
|
+
* set `capture: "status-only"` themselves.
|
|
155
255
|
*/
|
|
156
256
|
record(observation) {
|
|
157
257
|
if (!this.running)
|
|
158
258
|
return;
|
|
159
259
|
const url = observation.url;
|
|
160
|
-
|
|
260
|
+
if (this.options.excludePatterns.some((p) => p.test(url)))
|
|
261
|
+
return;
|
|
262
|
+
let capture = observation.capture ?? "full";
|
|
161
263
|
if (this.options.includePatterns.length > 0) {
|
|
162
|
-
|
|
163
|
-
|
|
264
|
+
const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
|
|
265
|
+
if (!matchesIncludes) {
|
|
266
|
+
if (!this.options.statusOnlyForUnmatched)
|
|
267
|
+
return;
|
|
268
|
+
capture = "status-only";
|
|
269
|
+
}
|
|
164
270
|
}
|
|
165
|
-
if (
|
|
271
|
+
if (capture === "status-only") {
|
|
272
|
+
// Slim shape — strip body/headers/contentType/responsePreview so a
|
|
273
|
+
// caller that passed full data still produces a sanitized record.
|
|
274
|
+
this.observations.push({
|
|
275
|
+
capture: "status-only",
|
|
276
|
+
headers: {},
|
|
277
|
+
latencyMs: observation.latencyMs,
|
|
278
|
+
method: observation.method,
|
|
279
|
+
responseSize: 0,
|
|
280
|
+
seq: this.seq++,
|
|
281
|
+
statusCode: observation.statusCode,
|
|
282
|
+
timestamp: observation.timestamp,
|
|
283
|
+
url,
|
|
284
|
+
});
|
|
166
285
|
return;
|
|
286
|
+
}
|
|
287
|
+
const previewBytes = this.resolvePreviewBytes(url);
|
|
167
288
|
this.observations.push({
|
|
168
289
|
...observation,
|
|
290
|
+
capture: "full",
|
|
169
291
|
// Truncate body if needed
|
|
170
292
|
body: observation.body?.slice(0, this.options.maxBodyBytes),
|
|
171
|
-
responsePreview: observation.responsePreview?.slice(0,
|
|
293
|
+
responsePreview: observation.responsePreview?.slice(0, previewBytes),
|
|
172
294
|
seq: this.seq++,
|
|
173
295
|
});
|
|
174
296
|
}
|
|
297
|
+
/**
|
|
298
|
+
* Resolve the preview byte cap for a given URL using per-class overrides
|
|
299
|
+
* (W0133). Order of preference:
|
|
300
|
+
* 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
|
|
301
|
+
* 2. `previewLimits.search` for known search providers.
|
|
302
|
+
* 3. `previewLimits.default`.
|
|
303
|
+
*/
|
|
304
|
+
resolvePreviewBytes(url) {
|
|
305
|
+
if (LLMS_TXT_PATTERN.test(url))
|
|
306
|
+
return this.options.previewLimits.llmsTxt;
|
|
307
|
+
if (SEARCH_URL_PATTERNS.some((p) => p.test(url))) {
|
|
308
|
+
return this.options.previewLimits.search;
|
|
309
|
+
}
|
|
310
|
+
return this.options.previewLimits.default;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Decide how to record a URL given the current filter configuration.
|
|
314
|
+
*
|
|
315
|
+
* - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
|
|
316
|
+
* and `statusOnlyForUnmatched` is false.
|
|
317
|
+
* - `"status-only"` — `includePatterns` failed but
|
|
318
|
+
* `statusOnlyForUnmatched` is true (default). Skip body/headers.
|
|
319
|
+
* - `"full"` — record everything.
|
|
320
|
+
*
|
|
321
|
+
* See W0132.
|
|
322
|
+
*/
|
|
323
|
+
classifyCaptureMode(url) {
|
|
324
|
+
if (this.options.excludePatterns.some((p) => p.test(url)))
|
|
325
|
+
return "drop";
|
|
326
|
+
if (this.options.includePatterns.length === 0)
|
|
327
|
+
return "full";
|
|
328
|
+
const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
|
|
329
|
+
if (matchesIncludes)
|
|
330
|
+
return "full";
|
|
331
|
+
return this.options.statusOnlyForUnmatched ? "status-only" : "drop";
|
|
332
|
+
}
|
|
175
333
|
/**
|
|
176
334
|
* Reset the recorder for reuse without creating a new instance.
|
|
177
335
|
*/
|
|
@@ -101,19 +101,37 @@ export interface ExternalRequest {
|
|
|
101
101
|
url: string;
|
|
102
102
|
}
|
|
103
103
|
export interface ObservedRequest {
|
|
104
|
-
/** Request body (for POST searches, etc.), truncated to maxBodyBytes
|
|
104
|
+
/** Request body (for POST searches, etc.), truncated to maxBodyBytes.
|
|
105
|
+
* Always omitted for `capture: "status-only"` entries. */
|
|
105
106
|
body?: string;
|
|
106
|
-
/**
|
|
107
|
+
/**
|
|
108
|
+
* Capture mode discriminator (W0132).
|
|
109
|
+
*
|
|
110
|
+
* - `"full"` — URL matched `includePatterns`; body, headers, contentType,
|
|
111
|
+
* responseSize, and responsePreview are all captured.
|
|
112
|
+
* - `"status-only"` — URL did not match `includePatterns` but
|
|
113
|
+
* `statusOnlyForUnmatched` is true. Only url/method/statusCode/
|
|
114
|
+
* latencyMs/timestamp/seq are recorded; body/headers/contentType/
|
|
115
|
+
* responsePreview are intentionally omitted to avoid capturing
|
|
116
|
+
* prompts, completions, or API keys for third-party endpoints.
|
|
117
|
+
*
|
|
118
|
+
* Defaults to `"full"` on legacy records that pre-date W0132.
|
|
119
|
+
*/
|
|
120
|
+
capture?: "full" | "status-only";
|
|
121
|
+
/** Content-Type of the response. Always omitted for status-only entries. */
|
|
107
122
|
contentType?: string;
|
|
108
|
-
/** Relevant request headers (e.g., Accept, User-Agent)
|
|
123
|
+
/** Relevant request headers (e.g., Accept, User-Agent).
|
|
124
|
+
* Always empty for status-only entries (no header capture at all). */
|
|
109
125
|
headers: Record<string, string>;
|
|
110
126
|
/** Time from request start to response complete, in ms */
|
|
111
127
|
latencyMs: number;
|
|
112
128
|
/** HTTP method */
|
|
113
129
|
method: string;
|
|
114
|
-
/** Response body preview (first N chars), useful for seeing what the agent
|
|
130
|
+
/** Response body preview (first N chars), useful for seeing what the agent
|
|
131
|
+
* actually read. Always omitted for status-only entries. */
|
|
115
132
|
responsePreview?: string;
|
|
116
|
-
/** Response body size in bytes
|
|
133
|
+
/** Response body size in bytes. 0 for status-only entries (we never read
|
|
134
|
+
* the body). */
|
|
117
135
|
responseSize: number;
|
|
118
136
|
/** Monotonic sequence number within the test run */
|
|
119
137
|
seq: number;
|
package/dist/cli-program.js
CHANGED
|
@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
|
|
|
67
67
|
.option("-q, --quiet", "Suppress non-error output")
|
|
68
68
|
.option("--dotenv <path>", "Override default .env file path")
|
|
69
69
|
.option("--explain", "Show execution plan without running")
|
|
70
|
-
.option("--format <fmt>", "Output format for --explain (console, json)", "console")
|
|
70
|
+
.option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
|
|
71
71
|
.option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
|
|
72
72
|
configureProgram(program);
|
|
73
73
|
// Global --explain hook — intercepts any command before execution
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Wraps the core baseline functions from pipeline/baseline.ts behind a
|
|
5
5
|
* Commander subcommand interface: `baseline save`, `baseline compare`,
|
|
6
|
-
* `baseline history`.
|
|
6
|
+
* `baseline history`. All three operate on the *caller's* `.ailf/results/`
|
|
7
|
+
* tree (not the eval package's installed location); use `--baselines-dir`
|
|
8
|
+
* or `AILF_BASELINES_DIR` to override (W0098).
|
|
7
9
|
*/
|
|
8
10
|
import { Command } from "commander";
|
|
9
11
|
export declare function createBaselineCommand(): Command;
|
|
@@ -3,17 +3,34 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Wraps the core baseline functions from pipeline/baseline.ts behind a
|
|
5
5
|
* Commander subcommand interface: `baseline save`, `baseline compare`,
|
|
6
|
-
* `baseline history`.
|
|
6
|
+
* `baseline history`. All three operate on the *caller's* `.ailf/results/`
|
|
7
|
+
* tree (not the eval package's installed location); use `--baselines-dir`
|
|
8
|
+
* or `AILF_BASELINES_DIR` to override (W0098).
|
|
7
9
|
*/
|
|
8
|
-
import {
|
|
9
|
-
import { fileURLToPath } from "url";
|
|
10
|
+
import { join, resolve } from "path";
|
|
10
11
|
import { Command } from "commander";
|
|
11
12
|
import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
|
|
12
|
-
|
|
13
|
-
const ROOT = resolve(__dirname, "../..");
|
|
13
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
14
14
|
// CLI command name — kept as a constant to centralize the string literal.
|
|
15
15
|
// "baseline" here refers to score baseline snapshots, not the legacy eval mode.
|
|
16
16
|
const CMD_NAME = "baseline";
|
|
17
|
+
/**
 * Work out which directory holds the baseline `*.json` snapshots.
 *
 * The first source that is set wins:
 *   1. the explicit `flag` value,
 *   2. the `AILF_BASELINES_DIR` environment variable,
 *   3. `<caller cwd>/.ailf/results/baselines`.
 *
 * A flag or env value is resolved against the caller's cwd.
 *
 * @param {string} [flag] - Directory given on the command line, if any.
 * @returns {string} The directory path to use.
 */
function resolveBaselinesDir(flag) {
    const callerCwd = getCallerCwd();
    const override = flag || process.env.AILF_BASELINES_DIR;
    return override
        ? resolve(callerCwd, override)
        : join(callerCwd, ".ailf", "results", "baselines");
}
|
|
28
|
+
/**
 * Bundle the two paths the baseline commands pass on: the snapshot directory
 * (honouring `flag` / `AILF_BASELINES_DIR`) and the latest score summary
 * under the caller's `.ailf/results/latest/`.
 *
 * @param {string} [flag] - Directory given on the command line, if any.
 * @returns {{ baselinesDir: string, scoreSummaryPath: string }}
 */
function resolveBaselineDirs(flag) {
    const baselinesDir = resolveBaselinesDir(flag);
    const scoreSummaryPath = join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json");
    return { baselinesDir, scoreSummaryPath };
}
|
|
17
34
|
export function createBaselineCommand() {
|
|
18
35
|
const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
|
|
19
36
|
// -----------------------------------------------------------------------
|
|
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
|
|
|
23
40
|
.command("save")
|
|
24
41
|
.description("Save current scores as a baseline snapshot")
|
|
25
42
|
.option("-t, --tag <tag>", "Descriptive tag for the baseline")
|
|
43
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
26
44
|
.action(async (opts) => {
|
|
27
45
|
console.log("=== Saving baseline snapshot ===\n");
|
|
28
|
-
const result = saveBaseline(
|
|
46
|
+
const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
|
|
29
47
|
if (result.success) {
|
|
30
48
|
console.log(` ✅ ${result.message}`);
|
|
31
49
|
}
|
|
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
|
|
|
41
59
|
.command("compare")
|
|
42
60
|
.description("Compare current scores against a saved baseline")
|
|
43
61
|
.option("-f, --file <path>", "Specific baseline file to compare against")
|
|
62
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
44
63
|
.action(async (opts) => {
|
|
45
64
|
console.log("=== Baseline Comparison ===\n");
|
|
46
|
-
const result = compareBaseline(
|
|
65
|
+
const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
|
|
47
66
|
if (!result.success) {
|
|
48
67
|
console.error(` ❌ ${result.message}`);
|
|
49
68
|
process.exit(1);
|
|
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
|
|
|
110
129
|
cmd
|
|
111
130
|
.command("history")
|
|
112
131
|
.description("List all saved baselines")
|
|
113
|
-
.
|
|
132
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
133
|
+
.action(async (opts) => {
|
|
114
134
|
console.log("=== Baseline History ===\n");
|
|
115
|
-
const baselines = listBaselines(
|
|
135
|
+
const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
|
|
116
136
|
if (baselines.length === 0) {
|
|
117
137
|
console.log(" No baselines saved yet.");
|
|
118
138
|
return;
|
package/dist/commands/cache.d.ts
CHANGED
|
@@ -2,9 +2,13 @@
|
|
|
2
2
|
* cache command — manage the local pipeline cache.
|
|
3
3
|
*
|
|
4
4
|
* Subcommands:
|
|
5
|
-
* cache clear Delete all local cache manifests (results/cache/).
|
|
5
|
+
* cache clear Delete all local cache manifests (.ailf/results/cache/).
|
|
6
6
|
* cache status Show current cache entries and their ages.
|
|
7
7
|
*
|
|
8
|
+
* Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
|
|
9
|
+
* package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
|
|
10
|
+
* override (W0098).
|
|
11
|
+
*
|
|
8
12
|
* Note: This only affects the local file-system cache used to skip unchanged
|
|
9
13
|
* pipeline steps. It does NOT touch the remote Content Lake eval cache.
|
|
10
14
|
* Use --no-remote-cache on pipeline commands to bypass the remote cache.
|
package/dist/commands/cache.js
CHANGED
|
@@ -2,20 +2,32 @@
|
|
|
2
2
|
* cache command — manage the local pipeline cache.
|
|
3
3
|
*
|
|
4
4
|
* Subcommands:
|
|
5
|
-
* cache clear Delete all local cache manifests (results/cache/).
|
|
5
|
+
* cache clear Delete all local cache manifests (.ailf/results/cache/).
|
|
6
6
|
* cache status Show current cache entries and their ages.
|
|
7
7
|
*
|
|
8
|
+
* Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
|
|
9
|
+
* package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
|
|
10
|
+
* override (W0098).
|
|
11
|
+
*
|
|
8
12
|
* Note: This only affects the local file-system cache used to skip unchanged
|
|
9
13
|
* pipeline steps. It does NOT touch the remote Content Lake eval cache.
|
|
10
14
|
* Use --no-remote-cache on pipeline commands to bypass the remote cache.
|
|
11
15
|
*/
|
|
12
16
|
import { Command } from "commander";
|
|
13
17
|
import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
import { join, resolve } from "path";
|
|
19
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
20
|
+
/**
 * Work out which directory holds the local pipeline cache manifests.
 *
 * Candidates are tried in order — the explicit `flag`, then the
 * `AILF_CACHE_DIR` environment variable — and the first one that is set is
 * resolved against the caller's cwd. With neither set, the result is
 * `<caller cwd>/.ailf/results/cache`.
 *
 * @param {string} [flag] - Directory given on the command line, if any.
 * @returns {string} The directory path to use.
 */
function resolveCacheDir(flag) {
    const callerCwd = getCallerCwd();
    for (const candidate of [flag, process.env.AILF_CACHE_DIR]) {
        if (candidate) {
            return resolve(callerCwd, candidate);
        }
    }
    return join(callerCwd, ".ailf", "results", "cache");
}
|
|
19
31
|
export function createCacheCommand() {
|
|
20
32
|
const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
|
|
21
33
|
// -----------------------------------------------------------------------
|
|
@@ -24,17 +36,19 @@ export function createCacheCommand() {
|
|
|
24
36
|
cmd
|
|
25
37
|
.command("clear")
|
|
26
38
|
.description("Delete all local cache manifests so every pipeline step re-executes")
|
|
27
|
-
.
|
|
28
|
-
|
|
39
|
+
.option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
|
|
40
|
+
.action((opts) => {
|
|
41
|
+
const cacheDir = resolveCacheDir(opts.cacheDir);
|
|
42
|
+
if (!existsSync(cacheDir)) {
|
|
29
43
|
console.log(" ℹ️ No local cache directory found — nothing to clear.");
|
|
30
44
|
return;
|
|
31
45
|
}
|
|
32
|
-
const files = readdirSync(
|
|
46
|
+
const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
|
|
33
47
|
if (files.length === 0) {
|
|
34
48
|
console.log(" ℹ️ Local cache directory is empty — nothing to clear.");
|
|
35
49
|
return;
|
|
36
50
|
}
|
|
37
|
-
rmSync(
|
|
51
|
+
rmSync(cacheDir, { recursive: true, force: true });
|
|
38
52
|
console.log(` 🗑️ Cleared ${files.length} local cache manifest(s).`);
|
|
39
53
|
console.log(" ℹ️ Next pipeline run will re-execute all steps from scratch.");
|
|
40
54
|
console.log("\n Note: The remote Content Lake cache is unaffected.");
|
|
@@ -46,12 +60,14 @@ export function createCacheCommand() {
|
|
|
46
60
|
cmd
|
|
47
61
|
.command("status")
|
|
48
62
|
.description("Show current local cache entries and their ages")
|
|
49
|
-
.
|
|
50
|
-
|
|
63
|
+
.option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
|
|
64
|
+
.action((opts) => {
|
|
65
|
+
const cacheDir = resolveCacheDir(opts.cacheDir);
|
|
66
|
+
if (!existsSync(cacheDir)) {
|
|
51
67
|
console.log(" ℹ️ No local cache directory found.");
|
|
52
68
|
return;
|
|
53
69
|
}
|
|
54
|
-
const files = readdirSync(
|
|
70
|
+
const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
|
|
55
71
|
if (files.length === 0) {
|
|
56
72
|
console.log(" ℹ️ Local cache directory is empty.");
|
|
57
73
|
return;
|
|
@@ -64,7 +80,7 @@ export function createCacheCommand() {
|
|
|
64
80
|
"Outputs");
|
|
65
81
|
console.log(" " + "-".repeat(65));
|
|
66
82
|
for (const file of files.sort()) {
|
|
67
|
-
const filePath = join(
|
|
83
|
+
const filePath = join(cacheDir, file);
|
|
68
84
|
try {
|
|
69
85
|
const raw = readFileSync(filePath, "utf-8");
|
|
70
86
|
const manifest = JSON.parse(raw);
|
|
@@ -88,7 +104,7 @@ export function createCacheCommand() {
|
|
|
88
104
|
}
|
|
89
105
|
const totalSize = files.reduce((sum, f) => {
|
|
90
106
|
try {
|
|
91
|
-
return sum + statSync(join(
|
|
107
|
+
return sum + statSync(join(cacheDir, f)).size;
|
|
92
108
|
}
|
|
93
109
|
catch {
|
|
94
110
|
return sum;
|
package/dist/commands/compare.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Wraps the existing compare pipeline logic and formatting utilities
|
|
5
5
|
* in a Commander.js command for consistent CLI integration.
|
|
6
6
|
*/
|
|
7
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
8
8
|
import { dirname, join, resolve } from "path";
|
|
9
9
|
import { fileURLToPath } from "url";
|
|
10
10
|
import { Command } from "commander";
|
|
@@ -79,7 +79,7 @@ export function createCompareCommand() {
|
|
|
79
79
|
if (opts.format === "json") {
|
|
80
80
|
const json = JSON.stringify(report, null, 2);
|
|
81
81
|
if (opts.output) {
|
|
82
|
-
|
|
82
|
+
writeReport(opts.output, json);
|
|
83
83
|
console.log(` ✅ Comparison report written to ${opts.output}`);
|
|
84
84
|
}
|
|
85
85
|
else {
|
|
@@ -91,13 +91,13 @@ export function createCompareCommand() {
|
|
|
91
91
|
console.log(table);
|
|
92
92
|
if (opts.output) {
|
|
93
93
|
const json = JSON.stringify(report, null, 2);
|
|
94
|
-
|
|
94
|
+
writeReport(opts.output, json);
|
|
95
95
|
console.log(` ✅ Comparison report also written to ${opts.output}`);
|
|
96
96
|
}
|
|
97
97
|
}
|
|
98
98
|
// Write comparison report to output dir for other steps to consume
|
|
99
99
|
const latestComparisonPath = join(outputDir, "comparison-report.json");
|
|
100
|
-
|
|
100
|
+
writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
|
|
101
101
|
});
|
|
102
102
|
addOutputDirOption(cmd);
|
|
103
103
|
return cmd;
|
|
@@ -122,3 +122,10 @@ function loadSummary(path) {
|
|
|
122
122
|
const raw = readFileSync(path, "utf-8");
|
|
123
123
|
return JSON.parse(raw);
|
|
124
124
|
}
|
|
125
|
+
// W0097: create the target's parent directory before writing, so a fresh
// project (no `.ailf/results/latest/` yet) or an `--output` path inside a
// directory that does not exist yet succeeds instead of failing with ENOENT.
function writeReport(path, contents) {
    const parentDir = dirname(path);
    // `recursive: true` creates missing ancestors and is a no-op when the
    // directory already exists.
    mkdirSync(parentDir, { recursive: true });
    writeFileSync(path, contents);
}
|
|
@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
|
|
|
541
541
|
rootDir,
|
|
542
542
|
});
|
|
543
543
|
}
|
|
544
|
-
// --format is a global option on the root program (actionCommand.parent)
|
|
544
|
+
// --explain-format is a global option on the root program (actionCommand.parent)
|
|
545
545
|
const globalParentOpts = actionCommand.parent?.opts();
|
|
546
|
-
const formatOpt = globalParentOpts?.
|
|
546
|
+
const formatOpt = globalParentOpts?.explainFormat ?? "console";
|
|
547
547
|
if (formatOpt === "json") {
|
|
548
548
|
console.log(formatPlanJson(plan));
|
|
549
549
|
}
|
package/dist/config/models.ts
CHANGED
|
@@ -35,16 +35,23 @@ export default defineModels({
|
|
|
35
35
|
|
|
36
36
|
// ── OpenAI ─────────────────────────────────────────────────
|
|
37
37
|
{
|
|
38
|
+
// gpt-5.2 routes through chat completions (and through the in-house
|
|
39
|
+
// agentic provider for naive/optimized variants). `verbosity` is a
|
|
40
|
+
// Responses-API-only field — it would be silently dropped here, so
|
|
41
|
+
// it isn't configured. See W0131.
|
|
38
42
|
id: "openai:chat:gpt-5.2",
|
|
39
43
|
label: "GPT 5.2",
|
|
40
44
|
config: {
|
|
41
45
|
max_completion_tokens: 8192,
|
|
42
|
-
verbosity: "medium",
|
|
43
46
|
},
|
|
44
47
|
modes: ["literacy", "knowledge-probe"],
|
|
45
48
|
// All literacy variants included by default
|
|
46
49
|
},
|
|
47
50
|
{
|
|
51
|
+
// GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
|
|
52
|
+
// native handling of `openai:responses:` honors reasoning / verbosity /
|
|
53
|
+
// summary; the in-house agentic provider does not (W0131). MCP-server
|
|
54
|
+
// and knowledge-probe routes go through Promptfoo native too.
|
|
48
55
|
id: "openai:responses:gpt-5.4",
|
|
49
56
|
label: "GPT 5.4",
|
|
50
57
|
config: {
|
|
@@ -55,7 +62,9 @@ export default defineModels({
|
|
|
55
62
|
},
|
|
56
63
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
57
64
|
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
58
|
-
|
|
65
|
+
variants: {
|
|
66
|
+
literacy: ["baseline"],
|
|
67
|
+
},
|
|
59
68
|
},
|
|
60
69
|
|
|
61
70
|
// ── Disabled models (uncomment to enable) ──────────────────
|
|
@@ -93,12 +102,31 @@ export default defineModels({
|
|
|
93
102
|
defaults: {
|
|
94
103
|
temperature: 0.2,
|
|
95
104
|
max_tokens: 4096,
|
|
96
|
-
|
|
105
|
+
// Global default round budget for agentic modes. Per-mode overrides
|
|
106
|
+
// below give naive more headroom (W0134) since it spends rounds on
|
|
107
|
+
// retries when fetches fail. Per-model `config.maxToolRounds` still
|
|
108
|
+
// wins over both values.
|
|
109
|
+
maxToolRounds: 5,
|
|
110
|
+
modeMaxToolRounds: {
|
|
111
|
+
"agentic-naive": 8,
|
|
112
|
+
"agentic-optimized": 5,
|
|
113
|
+
},
|
|
97
114
|
observerOptions: {
|
|
98
|
-
|
|
115
|
+
// Per-class preview caps (W0133): default 4 KB, but search responses
|
|
116
|
+
// get 16 KB and llms.txt gets 128 KB so trace audits can resolve
|
|
117
|
+
// which result the model actually saw.
|
|
118
|
+
maxPreviewBytes: 4096,
|
|
119
|
+
previewLimits: {
|
|
120
|
+
default: 4096,
|
|
121
|
+
llmsTxt: 131072,
|
|
122
|
+
search: 16384,
|
|
123
|
+
},
|
|
99
124
|
captureResponsePreview: true,
|
|
100
125
|
includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
|
|
101
126
|
sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
|
|
127
|
+
// statusOnlyForUnmatched defaults to true (W0132) — model-side
|
|
128
|
+
// traffic to api.openai.com / api.anthropic.com / googleapis.com
|
|
129
|
+
// surfaces in run artifacts as slim status-only entries.
|
|
102
130
|
},
|
|
103
131
|
},
|
|
104
132
|
})
|