@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/models.ts +32 -4
  3. package/config/test-budgets.ts +24 -0
  4. package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
  5. package/dist/_vendor/ailf-core/config-helpers.js +81 -1
  6. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  7. package/dist/_vendor/ailf-core/index.js +1 -1
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  9. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  10. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  11. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  13. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  15. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  16. package/dist/_vendor/ailf-shared/index.d.ts +16 -9
  17. package/dist/_vendor/ailf-shared/index.js +13 -9
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/agent-observer/agentic-provider.js +28 -23
  20. package/dist/agent-observer/classifier.js +7 -2
  21. package/dist/agent-observer/proxy.d.ts +88 -3
  22. package/dist/agent-observer/proxy.js +174 -16
  23. package/dist/agent-observer/types.d.ts +23 -5
  24. package/dist/cli-program.js +1 -1
  25. package/dist/commands/baseline.d.ts +3 -1
  26. package/dist/commands/baseline.js +29 -9
  27. package/dist/commands/cache.d.ts +5 -1
  28. package/dist/commands/cache.js +31 -15
  29. package/dist/commands/compare.js +11 -4
  30. package/dist/commands/explain-handler.js +2 -2
  31. package/dist/config/canary-tasks.ts +64 -0
  32. package/dist/config/models.ts +32 -4
  33. package/dist/config/test-budgets.ts +24 -0
  34. package/dist/pipeline/baseline.d.ts +14 -3
  35. package/dist/pipeline/baseline.js +7 -13
  36. package/dist/pipeline/calculate-scores.d.ts +17 -2
  37. package/dist/pipeline/calculate-scores.js +139 -1
  38. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  39. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  40. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  41. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  42. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  43. package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
  44. package/dist/pipeline/compiler/provider-assembler.js +37 -2
  45. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  46. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  47. package/package.json +2 -1
  48. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
@@ -101,19 +101,37 @@ export interface ExternalRequest {
101
101
  url: string;
102
102
  }
103
103
  export interface ObservedRequest {
104
- /** Request body (for POST searches, etc.), truncated to maxBodyBytes */
104
+ /** Request body (for POST searches, etc.), truncated to maxBodyBytes.
105
+ * Always omitted for `capture: "status-only"` entries. */
105
106
  body?: string;
106
- /** Content-Type of the response */
107
+ /**
108
+ * Capture mode discriminator (W0132).
109
+ *
110
+ * - `"full"` — URL matched `includePatterns`; body, headers, contentType,
111
+ * responseSize, and responsePreview are all captured.
112
+ * - `"status-only"` — URL did not match `includePatterns` but
113
+ * `statusOnlyForUnmatched` is true. Only url/method/statusCode/
114
+ * latencyMs/timestamp/seq are recorded; body/headers/contentType/
115
+ * responsePreview are intentionally omitted to avoid capturing
116
+ * prompts, completions, or API keys for third-party endpoints.
117
+ *
118
+ * Defaults to `"full"` on legacy records that pre-date W0132.
119
+ */
120
+ capture?: "full" | "status-only";
121
+ /** Content-Type of the response. Always omitted for status-only entries. */
107
122
  contentType?: string;
108
- /** Relevant request headers (e.g., Accept, User-Agent) */
123
+ /** Relevant request headers (e.g., Accept, User-Agent).
124
+ * Always empty for status-only entries (no header capture at all). */
109
125
  headers: Record<string, string>;
110
126
  /** Time from request start to response complete, in ms */
111
127
  latencyMs: number;
112
128
  /** HTTP method */
113
129
  method: string;
114
- /** Response body preview (first N chars), useful for seeing what the agent actually read */
130
+ /** Response body preview (first N chars), useful for seeing what the agent
131
+ * actually read. Always omitted for status-only entries. */
115
132
  responsePreview?: string;
116
- /** Response body size in bytes */
133
+ /** Response body size in bytes. 0 for status-only entries (we never read
134
+ * the body). */
117
135
  responseSize: number;
118
136
  /** Monotonic sequence number within the test run */
119
137
  seq: number;
@@ -67,7 +67,7 @@ export function buildCliProgram(opts) {
67
67
  .option("-q, --quiet", "Suppress non-error output")
68
68
  .option("--dotenv <path>", "Override default .env file path")
69
69
  .option("--explain", "Show execution plan without running")
70
- .option("--format <fmt>", "Output format for --explain (console, json)", "console")
70
+ .option("--explain-format <fmt>", "Output format for --explain (console, json)", "console")
71
71
  .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
72
72
  configureProgram(program);
73
73
  // Global --explain hook — intercepts any command before execution
@@ -3,7 +3,9 @@
3
3
  *
4
4
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
5
5
  * Commander subcommand interface: `baseline save`, `baseline compare`,
6
- * `baseline history`.
6
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
7
+ * tree (not the eval package's installed location); use `--baselines-dir`
8
+ * or `AILF_BASELINES_DIR` to override (W0098).
7
9
  */
8
10
  import { Command } from "commander";
9
11
  export declare function createBaselineCommand(): Command;
@@ -3,17 +3,34 @@
3
3
  *
4
4
  * Wraps the core baseline functions from pipeline/baseline.ts behind a
5
5
  * Commander subcommand interface: `baseline save`, `baseline compare`,
6
- * `baseline history`.
6
+ * `baseline history`. All three operate on the *caller's* `.ailf/results/`
7
+ * tree (not the eval package's installed location); use `--baselines-dir`
8
+ * or `AILF_BASELINES_DIR` to override (W0098).
7
9
  */
8
- import { dirname, resolve } from "path";
9
- import { fileURLToPath } from "url";
10
+ import { join, resolve } from "path";
10
11
  import { Command } from "commander";
11
12
  import { compareBaseline, listBaselines, saveBaseline, } from "../pipeline/baseline.js";
12
- const __dirname = dirname(fileURLToPath(import.meta.url));
13
- const ROOT = resolve(__dirname, "../..");
13
+ import { getCallerCwd } from "./shared/resolve-output-dir.js";
14
14
  // CLI command name — kept as a constant to centralize the string literal.
15
15
  // "baseline" here refers to score baseline snapshots, not the legacy eval mode.
16
16
  const CMD_NAME = "baseline";
17
+ /**
18
+ * Resolve the directory that holds baseline `*.json` snapshots.
19
+ * Precedence: explicit flag > `AILF_BASELINES_DIR` env var > caller cwd default.
20
+ */
21
+ function resolveBaselinesDir(flag) {
22
+ if (flag)
23
+ return resolve(getCallerCwd(), flag);
24
+ if (process.env.AILF_BASELINES_DIR)
25
+ return resolve(getCallerCwd(), process.env.AILF_BASELINES_DIR);
26
+ return join(getCallerCwd(), ".ailf", "results", "baselines");
27
+ }
28
+ function resolveBaselineDirs(flag) {
29
+ return {
30
+ baselinesDir: resolveBaselinesDir(flag),
31
+ scoreSummaryPath: join(getCallerCwd(), ".ailf", "results", "latest", "score-summary.json"),
32
+ };
33
+ }
17
34
  export function createBaselineCommand() {
18
35
  const cmd = new Command(CMD_NAME).description("Manage historical baseline snapshots of evaluation scores");
19
36
  // -----------------------------------------------------------------------
@@ -23,9 +40,10 @@ export function createBaselineCommand() {
23
40
  .command("save")
24
41
  .description("Save current scores as a baseline snapshot")
25
42
  .option("-t, --tag <tag>", "Descriptive tag for the baseline")
43
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
26
44
  .action(async (opts) => {
27
45
  console.log("=== Saving baseline snapshot ===\n");
28
- const result = saveBaseline(ROOT, opts.tag);
46
+ const result = saveBaseline(resolveBaselineDirs(opts.baselinesDir), opts.tag);
29
47
  if (result.success) {
30
48
  console.log(` ✅ ${result.message}`);
31
49
  }
@@ -41,9 +59,10 @@ export function createBaselineCommand() {
41
59
  .command("compare")
42
60
  .description("Compare current scores against a saved baseline")
43
61
  .option("-f, --file <path>", "Specific baseline file to compare against")
62
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
44
63
  .action(async (opts) => {
45
64
  console.log("=== Baseline Comparison ===\n");
46
- const result = compareBaseline(ROOT, opts.file);
65
+ const result = compareBaseline(resolveBaselineDirs(opts.baselinesDir), opts.file);
47
66
  if (!result.success) {
48
67
  console.error(` ❌ ${result.message}`);
49
68
  process.exit(1);
@@ -110,9 +129,10 @@ export function createBaselineCommand() {
110
129
  cmd
111
130
  .command("history")
112
131
  .description("List all saved baselines")
113
- .action(async () => {
132
+ .option("--baselines-dir <path>", "Directory holding baseline snapshots (default: <cwd>/.ailf/results/baselines)")
133
+ .action(async (opts) => {
114
134
  console.log("=== Baseline History ===\n");
115
- const baselines = listBaselines(ROOT);
135
+ const baselines = listBaselines(resolveBaselinesDir(opts.baselinesDir));
116
136
  if (baselines.length === 0) {
117
137
  console.log(" No baselines saved yet.");
118
138
  return;
@@ -2,9 +2,13 @@
2
2
  * cache command — manage the local pipeline cache.
3
3
  *
4
4
  * Subcommands:
5
- * cache clear Delete all local cache manifests (results/cache/).
5
+ * cache clear Delete all local cache manifests (.ailf/results/cache/).
6
6
  * cache status Show current cache entries and their ages.
7
7
  *
8
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
9
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
10
+ * override (W0098).
11
+ *
8
12
  * Note: This only affects the local file-system cache used to skip unchanged
9
13
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
10
14
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
@@ -2,20 +2,32 @@
2
2
  * cache command — manage the local pipeline cache.
3
3
  *
4
4
  * Subcommands:
5
- * cache clear Delete all local cache manifests (results/cache/).
5
+ * cache clear Delete all local cache manifests (.ailf/results/cache/).
6
6
  * cache status Show current cache entries and their ages.
7
7
  *
8
+ * Operates on the *caller's* `.ailf/results/cache/` tree (not the eval
9
+ * package's installed location); use `--cache-dir` or `AILF_CACHE_DIR` to
10
+ * override (W0098).
11
+ *
8
12
  * Note: This only affects the local file-system cache used to skip unchanged
9
13
  * pipeline steps. It does NOT touch the remote Content Lake eval cache.
10
14
  * Use --no-remote-cache on pipeline commands to bypass the remote cache.
11
15
  */
12
16
  import { Command } from "commander";
13
17
  import { existsSync, readdirSync, readFileSync, rmSync, statSync } from "fs";
14
- import { dirname, join, resolve } from "path";
15
- import { fileURLToPath } from "url";
16
- const __dirname = dirname(fileURLToPath(import.meta.url));
17
- const ROOT = resolve(__dirname, "..", "..");
18
- const CACHE_DIR = resolve(ROOT, "results", "cache");
18
+ import { join, resolve } from "path";
19
+ import { getCallerCwd } from "./shared/resolve-output-dir.js";
20
+ /**
21
+ * Resolve the local pipeline cache directory.
22
+ * Precedence: explicit flag > `AILF_CACHE_DIR` env var > caller cwd default.
23
+ */
24
+ function resolveCacheDir(flag) {
25
+ if (flag)
26
+ return resolve(getCallerCwd(), flag);
27
+ if (process.env.AILF_CACHE_DIR)
28
+ return resolve(getCallerCwd(), process.env.AILF_CACHE_DIR);
29
+ return join(getCallerCwd(), ".ailf", "results", "cache");
30
+ }
19
31
  export function createCacheCommand() {
20
32
  const cmd = new Command("cache").description("Manage the local pipeline cache (does not affect the remote Content Lake cache)");
21
33
  // -----------------------------------------------------------------------
@@ -24,17 +36,19 @@ export function createCacheCommand() {
24
36
  cmd
25
37
  .command("clear")
26
38
  .description("Delete all local cache manifests so every pipeline step re-executes")
27
- .action(() => {
28
- if (!existsSync(CACHE_DIR)) {
39
+ .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
40
+ .action((opts) => {
41
+ const cacheDir = resolveCacheDir(opts.cacheDir);
42
+ if (!existsSync(cacheDir)) {
29
43
  console.log(" ℹ️ No local cache directory found — nothing to clear.");
30
44
  return;
31
45
  }
32
- const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
46
+ const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
33
47
  if (files.length === 0) {
34
48
  console.log(" ℹ️ Local cache directory is empty — nothing to clear.");
35
49
  return;
36
50
  }
37
- rmSync(CACHE_DIR, { recursive: true, force: true });
51
+ rmSync(cacheDir, { recursive: true, force: true });
38
52
  console.log(` 🗑️ Cleared ${files.length} local cache manifest(s).`);
39
53
  console.log(" ℹ️ Next pipeline run will re-execute all steps from scratch.");
40
54
  console.log("\n Note: The remote Content Lake cache is unaffected.");
@@ -46,12 +60,14 @@ export function createCacheCommand() {
46
60
  cmd
47
61
  .command("status")
48
62
  .description("Show current local cache entries and their ages")
49
- .action(() => {
50
- if (!existsSync(CACHE_DIR)) {
63
+ .option("--cache-dir <path>", "Directory holding cache manifests (default: <cwd>/.ailf/results/cache)")
64
+ .action((opts) => {
65
+ const cacheDir = resolveCacheDir(opts.cacheDir);
66
+ if (!existsSync(cacheDir)) {
51
67
  console.log(" ℹ️ No local cache directory found.");
52
68
  return;
53
69
  }
54
- const files = readdirSync(CACHE_DIR).filter((f) => f.endsWith(".json"));
70
+ const files = readdirSync(cacheDir).filter((f) => f.endsWith(".json"));
55
71
  if (files.length === 0) {
56
72
  console.log(" ℹ️ Local cache directory is empty.");
57
73
  return;
@@ -64,7 +80,7 @@ export function createCacheCommand() {
64
80
  "Outputs");
65
81
  console.log(" " + "-".repeat(65));
66
82
  for (const file of files.sort()) {
67
- const filePath = join(CACHE_DIR, file);
83
+ const filePath = join(cacheDir, file);
68
84
  try {
69
85
  const raw = readFileSync(filePath, "utf-8");
70
86
  const manifest = JSON.parse(raw);
@@ -88,7 +104,7 @@ export function createCacheCommand() {
88
104
  }
89
105
  const totalSize = files.reduce((sum, f) => {
90
106
  try {
91
- return sum + statSync(join(CACHE_DIR, f)).size;
107
+ return sum + statSync(join(cacheDir, f)).size;
92
108
  }
93
109
  catch {
94
110
  return sum;
@@ -4,7 +4,7 @@
4
4
  * Wraps the existing compare pipeline logic and formatting utilities
5
5
  * in a Commander.js command for consistent CLI integration.
6
6
  */
7
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
7
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
8
8
  import { dirname, join, resolve } from "path";
9
9
  import { fileURLToPath } from "url";
10
10
  import { Command } from "commander";
@@ -79,7 +79,7 @@ export function createCompareCommand() {
79
79
  if (opts.format === "json") {
80
80
  const json = JSON.stringify(report, null, 2);
81
81
  if (opts.output) {
82
- writeFileSync(opts.output, json);
82
+ writeReport(opts.output, json);
83
83
  console.log(` ✅ Comparison report written to ${opts.output}`);
84
84
  }
85
85
  else {
@@ -91,13 +91,13 @@ export function createCompareCommand() {
91
91
  console.log(table);
92
92
  if (opts.output) {
93
93
  const json = JSON.stringify(report, null, 2);
94
- writeFileSync(opts.output, json);
94
+ writeReport(opts.output, json);
95
95
  console.log(` ✅ Comparison report also written to ${opts.output}`);
96
96
  }
97
97
  }
98
98
  // Write comparison report to output dir for other steps to consume
99
99
  const latestComparisonPath = join(outputDir, "comparison-report.json");
100
- writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
100
+ writeReport(latestComparisonPath, JSON.stringify(report, null, 2));
101
101
  });
102
102
  addOutputDirOption(cmd);
103
103
  return cmd;
@@ -122,3 +122,10 @@ function loadSummary(path) {
122
122
  const raw = readFileSync(path, "utf-8");
123
123
  return JSON.parse(raw);
124
124
  }
125
+ // W0097: every write path creates its parent dir so a fresh project (no
126
+ // `.ailf/results/latest/`) or a user-supplied `--output` pointing at a
127
+ // not-yet-existing directory both succeed instead of crashing with ENOENT.
128
+ function writeReport(path, contents) {
129
+ mkdirSync(dirname(path), { recursive: true });
130
+ writeFileSync(path, contents);
131
+ }
@@ -541,9 +541,9 @@ export async function handleExplain(actionCommand, confirmExecution, rootDir) {
541
541
  rootDir,
542
542
  });
543
543
  }
544
- // --format is a global option on the root program (actionCommand.parent)
544
+ // --explain-format is a global option on the root program (actionCommand.parent)
545
545
  const globalParentOpts = actionCommand.parent?.opts();
546
- const formatOpt = globalParentOpts?.format ?? "console";
546
+ const formatOpt = globalParentOpts?.explainFormat ?? "console";
547
547
  if (formatOpt === "json") {
548
548
  console.log(formatPlanJson(plan));
549
549
  }
@@ -0,0 +1,64 @@
1
+ /**
2
+ * canary-tasks.ts — The Tier 3 canary set.
3
+ *
4
+ * Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
5
+ * Composition follows the design doc's "weighted toward modes/areas with
6
+ * the most production usage and the highest historical regression rates"
7
+ * recommendation: GROQ and Content Lake (foundational consumer surfaces),
8
+ * Portable Text (historically drift-prone), Studio schema authoring (the
9
+ * second-most-used surface after queries), and a knowledge-probe pairing
10
+ * for cross-mode coverage.
11
+ *
12
+ * Each entry's `rationale` is the canary's load-bearing field — without it,
13
+ * future maintainers can't reason about whether a regression is meaningful
14
+ * or whether the slot has lost value. Update the rationale when you swap a
15
+ * canary entry; never silently replace one.
16
+ *
17
+ * Validated against the live task inventory by `scripts/check-canary-tasks.ts`
18
+ * (`pnpm check`). Dangling task IDs fail the build.
19
+ *
20
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
21
+ * @see .github/workflows/tier-3-nightly.yml — consumer
22
+ */
23
+
24
+ import { defineCanaryTasks } from "../_vendor/ailf-core/index.js"
25
+
26
+ export default defineCanaryTasks({
27
+ tasks: [
28
+ {
29
+ taskId: "groq-blog-queries",
30
+ mode: "literacy",
31
+ rationale:
32
+ "Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
33
+ },
34
+ {
35
+ taskId: "content-lake-mutations",
36
+ mode: "literacy",
37
+ rationale:
38
+ "Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
39
+ },
40
+ {
41
+ taskId: "portable-text-rendering",
42
+ mode: "literacy",
43
+ rationale:
44
+ "Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
45
+ },
46
+ {
47
+ taskId: "studio-blog-schema",
48
+ mode: "literacy",
49
+ rationale:
50
+ "Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
51
+ },
52
+ {
53
+ taskId: "kp-groq-projections",
54
+ mode: "knowledge-probe",
55
+ rationale:
56
+ "Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
57
+ },
58
+ // mcp-server canary slot — add a third mode here when a committed
59
+ // mcp-server task lands under packages/eval/tasks/mcp-server/. Today
60
+ // there are no production mcp-server tasks (only fixtures); the trigger
61
+ // is upstream and adding a placeholder slot would dangle. Surfaced at
62
+ // Phase 5 close (2026-04-27) — see W0116 retrospective.
63
+ ],
64
+ })
@@ -35,16 +35,23 @@ export default defineModels({
35
35
 
36
36
  // ── OpenAI ─────────────────────────────────────────────────
37
37
  {
38
+ // gpt-5.2 routes through chat completions (and through the in-house
39
+ // agentic provider for naive/optimized variants). `verbosity` is a
40
+ // Responses-API-only field — it would be silently dropped here, so
41
+ // it isn't configured. See W0131.
38
42
  id: "openai:chat:gpt-5.2",
39
43
  label: "GPT 5.2",
40
44
  config: {
41
45
  max_completion_tokens: 8192,
42
- verbosity: "medium",
43
46
  },
44
47
  modes: ["literacy", "knowledge-probe"],
45
48
  // All literacy variants included by default
46
49
  },
47
50
  {
51
+ // GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
52
+ // native handling of `openai:responses:` honors reasoning / verbosity /
53
+ // summary; the in-house agentic provider does not (W0131). MCP-server
54
+ // and knowledge-probe routes go through Promptfoo native too.
48
55
  id: "openai:responses:gpt-5.4",
49
56
  label: "GPT 5.4",
50
57
  config: {
@@ -55,7 +62,9 @@ export default defineModels({
55
62
  },
56
63
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
57
64
  modes: ["literacy", "mcp-server", "knowledge-probe"],
58
- // All literacy variants included by default
65
+ variants: {
66
+ literacy: ["baseline"],
67
+ },
59
68
  },
60
69
 
61
70
  // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
93
102
  defaults: {
94
103
  temperature: 0.2,
95
104
  max_tokens: 4096,
96
- maxToolRounds: 5, // for agentic modes
105
+ // Global default round budget for agentic modes. Per-mode overrides
106
+ // below give naive more headroom (W0134) since it spends rounds on
107
+ // retries when fetches fail. Per-model `config.maxToolRounds` still
108
+ // wins over both values.
109
+ maxToolRounds: 5,
110
+ modeMaxToolRounds: {
111
+ "agentic-naive": 8,
112
+ "agentic-optimized": 5,
113
+ },
97
114
  observerOptions: {
98
- maxPreviewBytes: 2048,
115
+ // Per-class preview caps (W0133): default 4 KB, but search responses
116
+ // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
117
+ // which result the model actually saw.
118
+ maxPreviewBytes: 4096,
119
+ previewLimits: {
120
+ default: 4096,
121
+ llmsTxt: 131072,
122
+ search: 16384,
123
+ },
99
124
  captureResponsePreview: true,
100
125
  includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
101
126
  sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
127
+ // statusOnlyForUnmatched defaults to true (W0132) — model-side
128
+ // traffic to api.openai.com / api.anthropic.com / googleapis.com
129
+ // surfaces in run artifacts as slim status-only entries.
102
130
  },
103
131
  },
104
132
  })
@@ -0,0 +1,24 @@
1
+ /**
2
+ * test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
3
+ *
4
+ * Each cap is the maximum cost a single Tier 3 nightly run may incur for
5
+ * that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
6
+ * fails loudly if any provider's actual spend exceeds its cap.
7
+ *
8
+ * The design doc names a $30–60/day envelope across all providers. Caps
9
+ * here divide that envelope per-provider; tighten as baseline canary spend
10
+ * becomes measurable.
11
+ *
12
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
13
+ * @see scripts/tier-3-budget-check.mjs — enforcement
14
+ */
15
+
16
+ import { defineTestBudgets } from "../_vendor/ailf-core/index.js"
17
+
18
+ export default defineTestBudgets({
19
+ perProviderDaily: {
20
+ anthropic: 30,
21
+ openai: 30,
22
+ },
23
+ warnFraction: 0.8,
24
+ })
@@ -29,9 +29,20 @@ export interface ScoreComparison {
29
29
  delta: number;
30
30
  feature: string;
31
31
  }
32
- export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
33
- export declare function listBaselines(rootDir: string): BaselineMetadata[];
34
- export declare function saveBaseline(rootDir: string, tag?: string): {
32
+ /**
33
+ * Paths the baseline pipeline functions read and write. Callers compose this
34
+ * from caller-relative paths so the functions stay agnostic of where the
35
+ * eval package itself lives on disk (W0098).
36
+ */
37
+ export interface BaselineDirs {
38
+ /** Directory that contains baseline `*.json` snapshots. */
39
+ baselinesDir: string;
40
+ /** Absolute path to the current run's `score-summary.json`. */
41
+ scoreSummaryPath: string;
42
+ }
43
+ export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
44
+ export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
45
+ export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
35
46
  success: boolean;
36
47
  message: string;
37
48
  };
@@ -7,12 +7,8 @@
7
7
  */
8
8
  import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
9
9
  import { join } from "path";
10
- // ---------------------------------------------------------------------------
11
- // Compare
12
- // ---------------------------------------------------------------------------
13
- export function compareBaseline(rootDir, baselineFile) {
14
- const baselinesDir = join(rootDir, "results", "baselines");
15
- const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
10
+ export function compareBaseline(dirs, baselineFile) {
11
+ const { baselinesDir, scoreSummaryPath } = dirs;
16
12
  if (!existsSync(scoreSummaryPath)) {
17
13
  return {
18
14
  message: "No current score-summary.json found.",
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
20
16
  };
21
17
  }
22
18
  // Find baseline to compare against
23
- const baselines = listBaselines(rootDir);
19
+ const baselines = listBaselines(baselinesDir);
24
20
  if (baselines.length === 0) {
25
21
  return {
26
22
  message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
76
72
  // ---------------------------------------------------------------------------
77
73
  // List
78
74
  // ---------------------------------------------------------------------------
79
- export function listBaselines(rootDir) {
80
- const baselinesDir = join(rootDir, "results", "baselines");
75
+ export function listBaselines(baselinesDir) {
81
76
  if (!existsSync(baselinesDir)) {
82
77
  return [];
83
78
  }
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
102
97
  // ---------------------------------------------------------------------------
103
98
  // Save
104
99
  // ---------------------------------------------------------------------------
105
- export function saveBaseline(rootDir, tag) {
106
- const baselinesDir = join(rootDir, "results", "baselines");
107
- const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
100
+ export function saveBaseline(dirs, tag) {
101
+ const { baselinesDir, scoreSummaryPath } = dirs;
108
102
  if (!existsSync(scoreSummaryPath)) {
109
103
  return {
110
104
  message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
135
129
  };
136
130
  writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
137
131
  return {
138
- message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
132
+ message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
139
133
  success: true,
140
134
  };
141
135
  }
@@ -1,6 +1,6 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
- import type { GraderJudgment, PerModelEntry } from "./types.js";
3
+ import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
4
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
5
  export interface PromptfooResultsWrapper {
6
6
  results: RawTestResult[];
@@ -91,6 +91,21 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
91
91
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
92
92
  */
93
93
  export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
94
+ /**
95
+ * Score knowledge-probe evaluation results.
96
+ *
97
+ * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
98
+ * var and answers from training-data knowledge alone. The compiler explicitly
99
+ * deletes `vars.docs`, so every result lands in the without-docs bucket of
100
+ * the literacy scoring path — collapsing testCount and ceilingScore to zero.
101
+ *
102
+ * This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
103
+ * feature area (KP results carry `__featureArea` from the compiler), and
104
+ * uses the `knowledge-probe` profile (factual-correctness / completeness /
105
+ * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
106
+ * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
107
+ */
108
+ export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
94
109
  /**
95
110
  * Score agentic evaluation results. In agentic mode, all test entries are
96
111
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).