@sanity/ailf 3.8.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/canary-tasks.ts +64 -0
- package/config/models.ts +32 -4
- package/config/test-budgets.ts +24 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
- package/dist/_vendor/ailf-core/config-helpers.js +81 -1
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
- package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
- package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
- package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -9
- package/dist/_vendor/ailf-shared/index.js +13 -9
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/canary-tasks.ts +64 -0
- package/dist/config/models.ts +32 -4
- package/dist/config/test-budgets.ts +24 -0
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/calculate-scores.d.ts +17 -2
- package/dist/pipeline/calculate-scores.js +139 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/package.json +2 -1
- package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
|
@@ -101,19 +101,37 @@ export interface ExternalRequest {
|
|
|
101
101
|
url: string;
|
|
102
102
|
}
|
|
103
103
|
export interface ObservedRequest {
|
|
104
|
-
/** Request body (for POST searches, etc.), truncated to maxBodyBytes
|
|
104
|
+
/** Request body (for POST searches, etc.), truncated to maxBodyBytes.
|
|
105
|
+
* Always omitted for `capture: "status-only"` entries. */
|
|
105
106
|
body?: string;
|
|
106
|
-
/**
|
|
107
|
+
/**
|
|
108
|
+
* Capture mode discriminator (W0132).
|
|
109
|
+
*
|
|
110
|
+
* - `"full"` — URL matched `includePatterns`; body, headers, contentType,
|
|
111
|
+
* responseSize, and responsePreview are all captured.
|
|
112
|
+
* - `"status-only"` — URL did not match `includePatterns` but
|
|
113
|
+
* `statusOnlyForUnmatched` is true. Only url/method/statusCode/
|
|
114
|
+
* latencyMs/timestamp/seq are recorded; body/headers/contentType/
|
|
115
|
+
* responsePreview are intentionally omitted to avoid capturing
|
|
116
|
+
* prompts, completions, or API keys for third-party endpoints.
|
|
117
|
+
*
|
|
118
|
+
* Defaults to `"full"` on legacy records that pre-date W0132.
|
|
119
|
+
*/
|
|
120
|
+
capture?: "full" | "status-only";
|
|
121
|
+
/** Content-Type of the response. Always omitted for status-only entries. */
|
|
107
122
|
contentType?: string;
|
|
108
|
-
/** Relevant request headers (e.g., Accept, User-Agent)
|
|
123
|
+
/** Relevant request headers (e.g., Accept, User-Agent).
|
|
124
|
+
* Always empty for status-only entries (no header capture at all). */
|
|
109
125
|
headers: Record<string, string>;
|
|
110
126
|
/** Time from request start to response complete, in ms */
|
|
111
127
|
latencyMs: number;
|
|
112
128
|
/** HTTP method */
|
|
113
129
|
method: string;
|
|
114
|
-
/** Response body preview (first N chars), useful for seeing what the agent
|
|
130
|
+
/** Response body preview (first N chars), useful for seeing what the agent
|
|
131
|
+
* actually read. Always omitted for status-only entries. */
|
|
115
132
|
responsePreview?: string;
|
|
116
|
-
/** Response body size in bytes
|
|
133
|
+
/** Response body size in bytes. 0 for status-only entries (we never read
|
|
134
|
+
* the body). */
|
|
117
135
|
responseSize: number;
|
|
118
136
|
/** Monotonic sequence number within the test run */
|
|
119
137
|
seq: number;
|
package/dist/cli-program.js
CHANGED
|
@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
|
|
|
67
67
|
.option("-q, --quiet", "Suppress non-error output")
|
|
68
68
|
.option("--dotenv <path>", "Override default .env file path")
|
|
69
69
|
.option("--explain", "Show execution plan without running")
|
|
70
|
-
.option("--format <fmt>", "Output format for --explain (console, json)", "console")
|
|
70
|
+
.option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
|
|
71
71
|
.option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
|
|
72
72
|
configureProgram(program);
|
|
73
73
|
// Global --explain hook — intercepts any command before execution
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Wraps the core baseline functions from pipeline/baseline.ts behind a
|
|
5
5
|
* Commander subcommand interface: `baseline save`, `baseline compare`,
|
|
6
|
-
* `baseline history`.
|
|
6
|
+
* `baseline history`. All three operate on the *caller's* `.ailf/results/`
|
|
7
|
+
* tree (not the eval package's installed location); use `--baselines-dir`
|
|
8
|
+
* or `AILF_BASELINES_DIR` to override (W0098).
|
|
7
9
|
*/
|
|
8
10
|
import { Command } from "commander";
|
|
9
11
|
export declare function createBaselineCommand(): Command;
|
|
@@ -3,17 +3,34 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Wraps the core baseline functions from pipeline/baseline.ts behind a
|
|
5
5
|
* Commander subcommand interface: `baseline save`, `baseline compare`,
|
|
6
|
-
* `baseline history`.
|
|
6
|
+
* `baseline history`. All three operate on the *caller's* `.ailf/results/`
|
|
7
|
+
* tree (not the eval package's installed location); use `--baselines-dir`
|
|
8
|
+
* or `AILF_BASELINES_DIR` to override (W0098).
|
|
7
9
|
*/
|
|
8
|
-
import {
|
|
9
|
-
import { fileURLToPath } from "url";
|
|
10
|
+
import { join, resolve } from "path";
|
|
10
11
|
import { Command } from "commander";
|
|
11
12
|
import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
|
|
12
|
-
|
|
13
|
-
const ROOT = resolve(__dirname, "../..");
|
|
13
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
14
14
|
// CLI command name — kept as a constant to centralize the string literal.
|
|
15
15
|
// "baseline" here refers to score baseline snapshots, not the legacy eval mode.
|
|
16
16
|
const CMD_NAME = "baseline";
|
|
17
|
+
/**
|
|
18
|
+
* Resolve the directory that holds baseline `*.json` snapshots.
|
|
19
|
+
* Precedence: explicit flag > `AILF_BASELINES_DIR` env var > caller cwd default.
|
|
20
|
+
*/
|
|
21
|
+
function resolveBaselinesDir(flag) {
|
|
22
|
+
if (flag)
|
|
23
|
+
return resolve(getCallerCwd(), flag);
|
|
24
|
+
if (process.env.AILF_BASELINES_DIR)
|
|
25
|
+
return resolve(getCallerCwd(), process.env.AILF_BASELINES_DIR);
|
|
26
|
+
return join(getCallerCwd(), ".ailf", "results", "baselines");
|
|
27
|
+
}
|
|
28
|
+
function resolveBaselineDirs(flag) {
|
|
29
|
+
return {
|
|
30
|
+
baselinesDir: resolveBaselinesDir(flag),
|
|
31
|
+
scoreSummaryPath: join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json"),
|
|
32
|
+
};
|
|
33
|
+
}
|
|
17
34
|
export function createBaselineCommand() {
|
|
18
35
|
const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
|
|
19
36
|
// -----------------------------------------------------------------------
|
|
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
|
|
|
23
40
|
.command("save")
|
|
24
41
|
.description("Save current scores as a baseline snapshot")
|
|
25
42
|
.option("-t, --tag <tag>", "Descriptive tag for the baseline")
|
|
43
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
26
44
|
.action(async (opts) => {
|
|
27
45
|
console.log("=== Saving baseline snapshot ===\n");
|
|
28
|
-
const result = saveBaseline(
|
|
46
|
+
const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
|
|
29
47
|
if (result.success) {
|
|
30
48
|
console.log(` ✅ ${result.message}`);
|
|
31
49
|
}
|
|
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
|
|
|
41
59
|
.command("compare")
|
|
42
60
|
.description("Compare current scores against a saved baseline")
|
|
43
61
|
.option("-f, --file <path>", "Specific baseline file to compare against")
|
|
62
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
44
63
|
.action(async (opts) => {
|
|
45
64
|
console.log("=== Baseline Comparison ===\n");
|
|
46
|
-
const result = compareBaseline(
|
|
65
|
+
const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
|
|
47
66
|
if (!result.success) {
|
|
48
67
|
console.error(` ❌ ${result.message}`);
|
|
49
68
|
process.exit(1);
|
|
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
|
|
|
110
129
|
cmd
|
|
111
130
|
.command("history")
|
|
112
131
|
.description("List all saved baselines")
|
|
113
|
-
.
|
|
132
|
+
.option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
|
|
133
|
+
.action(async (opts) => {
|
|
114
134
|
console.log("=== Baseline History ===\n");
|
|
115
|
-
const baselines = listBaselines(
|
|
135
|
+
const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
|
|
116
136
|
if (baselines.length === 0) {
|
|
117
137
|
console.log(" No baselines saved yet.");
|
|
118
138
|
return;
|
package/dist/commands/cache.d.ts
CHANGED
|
@@ -2,9 +2,13 @@
|
|
|
2
2
|
* cache command — manage the local pipeline cache.
|
|
3
3
|
*
|
|
4
4
|
* Subcommands:
|
|
5
|
-
* cache clear Delete all local cache manifests (results/cache/).
|
|
5
|
+
* cache clear Delete all local cache manifests (.ailf/results/cache/).
|
|
6
6
|
* cache status Show current cache entries and their ages.
|
|
7
7
|
*
|
|
8
|
+
* Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
|
|
9
|
+
* package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
|
|
10
|
+
* override (W0098).
|
|
11
|
+
*
|
|
8
12
|
* Note: This only affects the local file-system cache used to skip unchanged
|
|
9
13
|
* pipeline steps. It does NOT touch the remote Content Lake eval cache.
|
|
10
14
|
* Use --no-remote-cache on pipeline commands to bypass the remote cache.
|
package/dist/commands/cache.js
CHANGED
|
@@ -2,20 +2,32 @@
|
|
|
2
2
|
* cache command — manage the local pipeline cache.
|
|
3
3
|
*
|
|
4
4
|
* Subcommands:
|
|
5
|
-
* cache clear Delete all local cache manifests (results/cache/).
|
|
5
|
+
* cache clear Delete all local cache manifests (.ailf/results/cache/).
|
|
6
6
|
* cache status Show current cache entries and their ages.
|
|
7
7
|
*
|
|
8
|
+
* Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
|
|
9
|
+
* package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
|
|
10
|
+
* override (W0098).
|
|
11
|
+
*
|
|
8
12
|
* Note: This only affects the local file-system cache used to skip unchanged
|
|
9
13
|
* pipeline steps. It does NOT touch the remote Content Lake eval cache.
|
|
10
14
|
* Use --no-remote-cache on pipeline commands to bypass the remote cache.
|
|
11
15
|
*/
|
|
12
16
|
import { Command } from "commander";
|
|
13
17
|
import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
import { join, resolve } from "path";
|
|
19
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
20
|
+
/**
|
|
21
|
+
* Resolve the local pipeline cache directory.
|
|
22
|
+
* Precedence: explicit flag > `AILF_CACHE_DIR` env var > caller cwd default.
|
|
23
|
+
*/
|
|
24
|
+
function resolveCacheDir(flag) {
|
|
25
|
+
if (flag)
|
|
26
|
+
return resolve(getCallerCwd(), flag);
|
|
27
|
+
if (process.env.AILF_CACHE_DIR)
|
|
28
|
+
return resolve(getCallerCwd(), process.env.AILF_CACHE_DIR);
|
|
29
|
+
return join(getCallerCwd(), ".ailf", "results", "cache");
|
|
30
|
+
}
|
|
19
31
|
export function createCacheCommand() {
|
|
20
32
|
const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
|
|
21
33
|
// -----------------------------------------------------------------------
|
|
@@ -24,17 +36,19 @@ export function createCacheCommand() {
|
|
|
24
36
|
cmd
|
|
25
37
|
.command("clear")
|
|
26
38
|
.description("Delete all local cache manifests so every pipeline step re-executes")
|
|
27
|
-
.
|
|
28
|
-
|
|
39
|
+
.option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
|
|
40
|
+
.action((opts) => {
|
|
41
|
+
const cacheDir = resolveCacheDir(opts.cacheDir);
|
|
42
|
+
if (!existsSync(cacheDir)) {
|
|
29
43
|
console.log(" ℹ️ No local cache directory found — nothing to clear.");
|
|
30
44
|
return;
|
|
31
45
|
}
|
|
32
|
-
const files = readdirSync(
|
|
46
|
+
const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
|
|
33
47
|
if (files.length === 0) {
|
|
34
48
|
console.log(" ℹ️ Local cache directory is empty — nothing to clear.");
|
|
35
49
|
return;
|
|
36
50
|
}
|
|
37
|
-
rmSync(
|
|
51
|
+
rmSync(cacheDir, { recursive: true, force: true });
|
|
38
52
|
console.log(` 🗑️ Cleared ${files.length} local cache manifest(s).`);
|
|
39
53
|
console.log(" ℹ️ Next pipeline run will re-execute all steps from scratch.");
|
|
40
54
|
console.log("\n Note: The remote Content Lake cache is unaffected.");
|
|
@@ -46,12 +60,14 @@ export function createCacheCommand() {
|
|
|
46
60
|
cmd
|
|
47
61
|
.command("status")
|
|
48
62
|
.description("Show current local cache entries and their ages")
|
|
49
|
-
.
|
|
50
|
-
|
|
63
|
+
.option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
|
|
64
|
+
.action((opts) => {
|
|
65
|
+
const cacheDir = resolveCacheDir(opts.cacheDir);
|
|
66
|
+
if (!existsSync(cacheDir)) {
|
|
51
67
|
console.log(" ℹ️ No local cache directory found.");
|
|
52
68
|
return;
|
|
53
69
|
}
|
|
54
|
-
const files = readdirSync(
|
|
70
|
+
const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
|
|
55
71
|
if (files.length === 0) {
|
|
56
72
|
console.log(" ℹ️ Local cache directory is empty.");
|
|
57
73
|
return;
|
|
@@ -64,7 +80,7 @@ export function createCacheCommand() {
|
|
|
64
80
|
"Outputs");
|
|
65
81
|
console.log(" " + "-".repeat(65));
|
|
66
82
|
for (const file of files.sort()) {
|
|
67
|
-
const filePath = join(
|
|
83
|
+
const filePath = join(cacheDir, file);
|
|
68
84
|
try {
|
|
69
85
|
const raw = readFileSync(filePath, "utf-8");
|
|
70
86
|
const manifest = JSON.parse(raw);
|
|
@@ -88,7 +104,7 @@ export function createCacheCommand() {
|
|
|
88
104
|
}
|
|
89
105
|
const totalSize = files.reduce((sum, f) => {
|
|
90
106
|
try {
|
|
91
|
-
return sum + statSync(join(
|
|
107
|
+
return sum + statSync(join(cacheDir, f)).size;
|
|
92
108
|
}
|
|
93
109
|
catch {
|
|
94
110
|
return sum;
|
package/dist/commands/compare.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Wraps the existing compare pipeline logic and formatting utilities
|
|
5
5
|
* in a Commander.js command for consistent CLI integration.
|
|
6
6
|
*/
|
|
7
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
8
8
|
import { dirname, join, resolve } from "path";
|
|
9
9
|
import { fileURLToPath } from "url";
|
|
10
10
|
import { Command } from "commander";
|
|
@@ -79,7 +79,7 @@ export function createCompareCommand() {
|
|
|
79
79
|
if (opts.format === "json") {
|
|
80
80
|
const json = JSON.stringify(report, null, 2);
|
|
81
81
|
if (opts.output) {
|
|
82
|
-
|
|
82
|
+
writeReport(opts.output, json);
|
|
83
83
|
console.log(` ✅ Comparison report written to ${opts.output}`);
|
|
84
84
|
}
|
|
85
85
|
else {
|
|
@@ -91,13 +91,13 @@ export function createCompareCommand() {
|
|
|
91
91
|
console.log(table);
|
|
92
92
|
if (opts.output) {
|
|
93
93
|
const json = JSON.stringify(report, null, 2);
|
|
94
|
-
|
|
94
|
+
writeReport(opts.output, json);
|
|
95
95
|
console.log(` ✅ Comparison report also written to ${opts.output}`);
|
|
96
96
|
}
|
|
97
97
|
}
|
|
98
98
|
// Write comparison report to output dir for other steps to consume
|
|
99
99
|
const latestComparisonPath = join(outputDir, "comparison-report.json");
|
|
100
|
-
|
|
100
|
+
writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
|
|
101
101
|
});
|
|
102
102
|
addOutputDirOption(cmd);
|
|
103
103
|
return cmd;
|
|
@@ -122,3 +122,10 @@ function loadSummary(path) {
|
|
|
122
122
|
const raw = readFileSync(path, "utf-8");
|
|
123
123
|
return JSON.parse(raw);
|
|
124
124
|
}
|
|
125
|
+
// W0097: every write path creates its parent dir so a fresh project (no
|
|
126
|
+
// `.ailf/results/latest/`) or a user-supplied `--output` pointing at a
|
|
127
|
+
// not-yet-existing directory both succeed instead of crashing with ENOENT.
|
|
128
|
+
function writeReport(path, contents) {
|
|
129
|
+
mkdirSync(dirname(path), { recursive: true });
|
|
130
|
+
writeFileSync(path, contents);
|
|
131
|
+
}
|
|
@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
|
|
|
541
541
|
rootDir,
|
|
542
542
|
});
|
|
543
543
|
}
|
|
544
|
-
// --format is a global option on the root program (actionCommand.parent)
|
|
544
|
+
// --explain-format is a global option on the root program (actionCommand.parent)
|
|
545
545
|
const globalParentOpts = actionCommand.parent?.opts();
|
|
546
|
-
const formatOpt = globalParentOpts?.
|
|
546
|
+
const formatOpt = globalParentOpts?.explainFormat ?? "console";
|
|
547
547
|
if (formatOpt === "json") {
|
|
548
548
|
console.log(formatPlanJson(plan));
|
|
549
549
|
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary-tasks.ts — The Tier 3 canary set.
|
|
3
|
+
*
|
|
4
|
+
* Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
|
|
5
|
+
* Composition follows the design doc's "weighted toward modes/areas with
|
|
6
|
+
* the most production usage and the highest historical regression rates"
|
|
7
|
+
* recommendation: GROQ and Content Lake (foundational consumer surfaces),
|
|
8
|
+
* Portable Text (historically drift-prone), Studio schema authoring (the
|
|
9
|
+
* second-most-used surface after queries), and a knowledge-probe pairing
|
|
10
|
+
* for cross-mode coverage.
|
|
11
|
+
*
|
|
12
|
+
* Each entry's `rationale` is the canary's load-bearing field — without it,
|
|
13
|
+
* future maintainers can't reason about whether a regression is meaningful
|
|
14
|
+
* or whether the slot has lost value. Update the rationale when you swap a
|
|
15
|
+
* canary entry; never silently replace one.
|
|
16
|
+
*
|
|
17
|
+
* Validated against the live task inventory by `scripts/check-canary-tasks.ts`
|
|
18
|
+
* (`pnpm check`). Dangling task IDs fail the build.
|
|
19
|
+
*
|
|
20
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
21
|
+
* @see .github/workflows/tier-3-nightly.yml — consumer
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { defineCanaryTasks } from "../_vendor/ailf-core/index.js"
|
|
25
|
+
|
|
26
|
+
export default defineCanaryTasks({
|
|
27
|
+
tasks: [
|
|
28
|
+
{
|
|
29
|
+
taskId: "groq-blog-queries",
|
|
30
|
+
mode: "literacy",
|
|
31
|
+
rationale:
|
|
32
|
+
"Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
taskId: "content-lake-mutations",
|
|
36
|
+
mode: "literacy",
|
|
37
|
+
rationale:
|
|
38
|
+
"Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
taskId: "portable-text-rendering",
|
|
42
|
+
mode: "literacy",
|
|
43
|
+
rationale:
|
|
44
|
+
"Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
taskId: "studio-blog-schema",
|
|
48
|
+
mode: "literacy",
|
|
49
|
+
rationale:
|
|
50
|
+
"Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
taskId: "kp-groq-projections",
|
|
54
|
+
mode: "knowledge-probe",
|
|
55
|
+
rationale:
|
|
56
|
+
"Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
|
|
57
|
+
},
|
|
58
|
+
// mcp-server canary slot — add a third mode here when a committed
|
|
59
|
+
// mcp-server task lands under packages/eval/tasks/mcp-server/. Today
|
|
60
|
+
// there are no production mcp-server tasks (only fixtures); the trigger
|
|
61
|
+
// is upstream and adding a placeholder slot would dangle. Surfaced at
|
|
62
|
+
// Phase 5 close (2026-04-27) — see W0116 retrospective.
|
|
63
|
+
],
|
|
64
|
+
})
|
package/dist/config/models.ts
CHANGED
|
@@ -35,16 +35,23 @@ export default defineModels({
|
|
|
35
35
|
|
|
36
36
|
// ── OpenAI ─────────────────────────────────────────────────
|
|
37
37
|
{
|
|
38
|
+
// gpt-5.2 routes through chat completions (and through the in-house
|
|
39
|
+
// agentic provider for naive/optimized variants). `verbosity` is a
|
|
40
|
+
// Responses-API-only field — it would be silently dropped here, so
|
|
41
|
+
// it isn't configured. See W0131.
|
|
38
42
|
id: "openai:chat:gpt-5.2",
|
|
39
43
|
label: "GPT 5.2",
|
|
40
44
|
config: {
|
|
41
45
|
max_completion_tokens: 8192,
|
|
42
|
-
verbosity: "medium",
|
|
43
46
|
},
|
|
44
47
|
modes: ["literacy", "knowledge-probe"],
|
|
45
48
|
// All literacy variants included by default
|
|
46
49
|
},
|
|
47
50
|
{
|
|
51
|
+
// GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
|
|
52
|
+
// native handling of `openai:responses:` honors reasoning / verbosity /
|
|
53
|
+
// summary; the in-house agentic provider does not (W0131). MCP-server
|
|
54
|
+
// and knowledge-probe routes go through Promptfoo native too.
|
|
48
55
|
id: "openai:responses:gpt-5.4",
|
|
49
56
|
label: "GPT 5.4",
|
|
50
57
|
config: {
|
|
@@ -55,7 +62,9 @@ export default defineModels({
|
|
|
55
62
|
},
|
|
56
63
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
57
64
|
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
58
|
-
|
|
65
|
+
variants: {
|
|
66
|
+
literacy: ["baseline"],
|
|
67
|
+
},
|
|
59
68
|
},
|
|
60
69
|
|
|
61
70
|
// ── Disabled models (uncomment to enable) ──────────────────
|
|
@@ -93,12 +102,31 @@ export default defineModels({
|
|
|
93
102
|
defaults: {
|
|
94
103
|
temperature: 0.2,
|
|
95
104
|
max_tokens: 4096,
|
|
96
|
-
|
|
105
|
+
// Global default round budget for agentic modes. Per-mode overrides
|
|
106
|
+
// below give naive more headroom (W0134) since it spends rounds on
|
|
107
|
+
// retries when fetches fail. Per-model `config.maxToolRounds` still
|
|
108
|
+
// wins over both values.
|
|
109
|
+
maxToolRounds: 5,
|
|
110
|
+
modeMaxToolRounds: {
|
|
111
|
+
"agentic-naive": 8,
|
|
112
|
+
"agentic-optimized": 5,
|
|
113
|
+
},
|
|
97
114
|
observerOptions: {
|
|
98
|
-
|
|
115
|
+
// Per-class preview caps (W0133): default 4 KB, but search responses
|
|
116
|
+
// get 16 KB and llms.txt gets 128 KB so trace audits can resolve
|
|
117
|
+
// which result the model actually saw.
|
|
118
|
+
maxPreviewBytes: 4096,
|
|
119
|
+
previewLimits: {
|
|
120
|
+
default: 4096,
|
|
121
|
+
llmsTxt: 131072,
|
|
122
|
+
search: 16384,
|
|
123
|
+
},
|
|
99
124
|
captureResponsePreview: true,
|
|
100
125
|
includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
|
|
101
126
|
sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
|
|
127
|
+
// statusOnlyForUnmatched defaults to true (W0132) — model-side
|
|
128
|
+
// traffic to api.openai.com / api.anthropic.com / googleapis.com
|
|
129
|
+
// surfaces in run artifacts as slim status-only entries.
|
|
102
130
|
},
|
|
103
131
|
},
|
|
104
132
|
})
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
|
|
3
|
+
*
|
|
4
|
+
* Each cap is the maximum cost a single Tier 3 nightly run may incur for
|
|
5
|
+
* that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
|
|
6
|
+
* fails loudly if any provider's actual spend exceeds its cap.
|
|
7
|
+
*
|
|
8
|
+
* The design doc names a $30–60/day envelope across all providers. Caps
|
|
9
|
+
* here divide that envelope per-provider; tighten as baseline canary spend
|
|
10
|
+
* becomes measurable.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
13
|
+
* @see scripts/tier-3-budget-check.mjs — enforcement
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { defineTestBudgets } from "../_vendor/ailf-core/index.js"
|
|
17
|
+
|
|
18
|
+
export default defineTestBudgets({
|
|
19
|
+
perProviderDaily: {
|
|
20
|
+
anthropic: 30,
|
|
21
|
+
openai: 30,
|
|
22
|
+
},
|
|
23
|
+
warnFraction: 0.8,
|
|
24
|
+
})
|
|
@@ -29,9 +29,20 @@ export interface ScoreComparison {
|
|
|
29
29
|
delta: number;
|
|
30
30
|
feature: string;
|
|
31
31
|
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
/**
|
|
33
|
+
* Paths the baseline pipeline functions read and write. Callers compose this
|
|
34
|
+
* from caller-relative paths so the functions stay agnostic of where the
|
|
35
|
+
* eval package itself lives on disk (W0098).
|
|
36
|
+
*/
|
|
37
|
+
export interface BaselineDirs {
|
|
38
|
+
/** Directory that contains baseline `*.json` snapshots. */
|
|
39
|
+
baselinesDir: string;
|
|
40
|
+
/** Absolute path to the current run's `score-summary.json`. */
|
|
41
|
+
scoreSummaryPath: string;
|
|
42
|
+
}
|
|
43
|
+
export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
|
|
44
|
+
export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
|
|
45
|
+
export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
|
|
35
46
|
success: boolean;
|
|
36
47
|
message: string;
|
|
37
48
|
};
|
|
@@ -7,12 +7,8 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
9
9
|
import { join } from "path";
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
// ---------------------------------------------------------------------------
|
|
13
|
-
export function compareBaseline(rootDir, baselineFile) {
|
|
14
|
-
const baselinesDir = join(rootDir, "results", "baselines");
|
|
15
|
-
const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
|
|
10
|
+
export function compareBaseline(dirs, baselineFile) {
|
|
11
|
+
const { baselinesDir, scoreSummaryPath } = dirs;
|
|
16
12
|
if (!existsSync(scoreSummaryPath)) {
|
|
17
13
|
return {
|
|
18
14
|
message: "No current score-summary.json found.",
|
|
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
|
|
|
20
16
|
};
|
|
21
17
|
}
|
|
22
18
|
// Find baseline to compare against
|
|
23
|
-
const baselines = listBaselines(
|
|
19
|
+
const baselines = listBaselines(baselinesDir);
|
|
24
20
|
if (baselines.length === 0) {
|
|
25
21
|
return {
|
|
26
22
|
message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
|
|
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
|
|
|
76
72
|
// ---------------------------------------------------------------------------
|
|
77
73
|
// List
|
|
78
74
|
// ---------------------------------------------------------------------------
|
|
79
|
-
export function listBaselines(
|
|
80
|
-
const baselinesDir = join(rootDir, "results", "baselines");
|
|
75
|
+
export function listBaselines(baselinesDir) {
|
|
81
76
|
if (!existsSync(baselinesDir)) {
|
|
82
77
|
return [];
|
|
83
78
|
}
|
|
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
|
|
|
102
97
|
// ---------------------------------------------------------------------------
|
|
103
98
|
// Save
|
|
104
99
|
// ---------------------------------------------------------------------------
|
|
105
|
-
export function saveBaseline(
|
|
106
|
-
const baselinesDir
|
|
107
|
-
const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
|
|
100
|
+
export function saveBaseline(dirs, tag) {
|
|
101
|
+
const { baselinesDir, scoreSummaryPath } = dirs;
|
|
108
102
|
if (!existsSync(scoreSummaryPath)) {
|
|
109
103
|
return {
|
|
110
104
|
message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
|
|
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
|
|
|
135
129
|
};
|
|
136
130
|
writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
|
|
137
131
|
return {
|
|
138
|
-
message: `Saved baseline to
|
|
132
|
+
message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
|
|
139
133
|
success: true,
|
|
140
134
|
};
|
|
141
135
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
|
-
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
3
|
+
import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
4
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
5
5
|
export interface PromptfooResultsWrapper {
|
|
6
6
|
results: RawTestResult[];
|
|
@@ -91,6 +91,21 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
|
|
|
91
91
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
92
92
|
*/
|
|
93
93
|
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
94
|
+
/**
|
|
95
|
+
* Score knowledge-probe evaluation results.
|
|
96
|
+
*
|
|
97
|
+
* Knowledge-probe mode evaluates parametric recall: the model has no `docs`
|
|
98
|
+
* var and answers from training-data knowledge alone. The compiler explicitly
|
|
99
|
+
* deletes `vars.docs`, so every result lands in the without-docs bucket of
|
|
100
|
+
* the literacy scoring path — collapsing testCount and ceilingScore to zero.
|
|
101
|
+
*
|
|
102
|
+
* This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
|
|
103
|
+
* feature area (KP results carry `__featureArea` from the compiler), and
|
|
104
|
+
* uses the `knowledge-probe` profile (factual-correctness / completeness /
|
|
105
|
+
* currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
|
|
106
|
+
* docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
|
|
107
|
+
*/
|
|
108
|
+
export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
|
|
94
109
|
/**
|
|
95
110
|
* Score agentic evaluation results. In agentic mode, all test entries are
|
|
96
111
|
* gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
|