@sanity/ailf 3.8.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/test-budgets.ts +24 -0
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +19 -0
  4. package/dist/_vendor/ailf-core/config-helpers.js +27 -0
  5. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  6. package/dist/_vendor/ailf-core/index.js +1 -1
  7. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  9. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  11. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  13. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  15. package/dist/_vendor/ailf-shared/index.d.ts +1 -0
  16. package/dist/_vendor/ailf-shared/index.js +1 -0
  17. package/dist/adapters/config-sources/file-config-adapter.js +4 -5
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/cli-program.d.ts +39 -0
  20. package/dist/cli-program.js +137 -0
  21. package/dist/cli.d.ts +6 -0
  22. package/dist/cli.js +12 -122
  23. package/dist/config/canary-tasks.ts +64 -0
  24. package/dist/config/test-budgets.ts +24 -0
  25. package/dist/pipeline/calculate-scores.d.ts +17 -2
  26. package/dist/pipeline/calculate-scores.js +99 -0
  27. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  28. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  29. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  30. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  31. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  32. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  33. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  34. package/package.json +6 -3
  35. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  36. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +0 -10
  37. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +0 -366
  38. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +0 -9
  39. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +0 -145
  40. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +0 -10
  41. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +0 -314
  42. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +0 -10
  43. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +0 -486
  44. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +0 -10
  45. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +0 -425
  46. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +0 -9
  47. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +0 -332
  48. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +0 -12
  49. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +0 -210
  50. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +0 -7
  51. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +0 -404
  52. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +0 -10
  53. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +0 -184
  54. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +0 -8
  55. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +0 -301
  56. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +0 -9
  57. package/dist/pipeline/compiler/__tests__/telemetry.test.js +0 -503
  58. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +0 -10
  59. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +0 -509
package/dist/cli-program.js ADDED
@@ -0,0 +1,137 @@
1
+ /**
2
+ * cli-program.ts — pure factory for the AILF Commander program.
3
+ *
4
+ * Splits the program construction out of cli.ts so the CLI is testable
5
+ * in-process. cli.ts owns bootstrap side effects (dotenv loading,
6
+ * retired-flag/env/cmd checks, AILF_LOG_LEVEL pre-scan, parseAsync); this
7
+ * module owns command wiring.
8
+ *
9
+ * The W0078 M4 black-box harness imports `buildCliProgram()` directly so
10
+ * tests can construct a fresh program, attach `exitOverride()`, capture
11
+ * stdout/stderr, and parse a synthetic argv — all without spawning a
12
+ * subprocess.
13
+ *
14
+ * @see packages/eval/src/__tests__/cli-harness/run-cli.ts
15
+ */
16
+ import { Command } from "commander";
17
+ import { readFileSync } from "node:fs";
18
+ import { resolve } from "node:path";
19
+ import { CommandGroup, configureProgram } from "./commands/shared/help.js";
20
+ import { createAgentReportCommand } from "./commands/agent-report.js";
21
+ import { createBaselineCommand } from "./commands/baseline.js";
22
+ import { createCacheCommand } from "./commands/cache.js";
23
+ import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
24
+ import { createCheckStalenessCommand } from "./commands/check-staleness.js";
25
+ import { createChronicFailuresCommand } from "./commands/chronic-failures.js";
26
+ import { createCompareCommand } from "./commands/compare.js";
27
+ import { createCompletionCommand } from "./commands/completion.js";
28
+ import { createCoverageAuditCommand } from "./commands/coverage-audit.js";
29
+ import { createDiscoveryReportCommand } from "./commands/discovery-report.js";
30
+ import { createEvalCommand } from "./commands/eval.js";
31
+ import { createFetchDocsCommand } from "./commands/fetch-docs.js";
32
+ import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
33
+ import { createGraderCommand } from "./commands/grader/index.js";
34
+ import { createInitCommand } from "./commands/init.js";
35
+ import { createInteractiveCommand } from "./commands/interactive.js";
36
+ import { createLookupDocCommand } from "./commands/lookup-doc.js";
37
+ import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
38
+ import { createPrCommentCommand } from "./commands/pr-comment.js";
39
+ import { createPublishCommand } from "./commands/publish.js";
40
+ import { createReadinessReportCommand } from "./commands/readiness-report.js";
41
+ import { createRunCommand } from "./commands/run.js";
42
+ import { createRunsCommand } from "./commands/runs.js";
43
+ import { createValidateConfigCommand } from "./commands/validate.js";
44
+ import { createValidateTasksCommand } from "./commands/validate-tasks.js";
45
+ import { createWebhookServerCommand } from "./commands/webhook-server.js";
46
+ import { createWeeklyDigestCommand } from "./commands/weekly-digest.js";
47
+ /**
48
+ * Construct the Commander program with every subcommand registered.
49
+ *
50
+ * Pure factory — construction does no I/O beyond reading package.json for the
51
+ * version, never calls `process.exit()`, and never reads `process.argv`. Tests
52
+ * can attach `program.exitOverride()` before parsing to capture exit codes
53
+ * instead of terminating; the `--explain` hook still calls `process.exit(0)`.
54
+ *
55
+ * Registration order determines group display order in `--help`. Commands
56
+ * within a group appear in the order they're added.
57
+ */
58
+ export function buildCliProgram(opts) {
59
+ const { evalRoot } = opts;
60
+ const pkgPath = resolve(evalRoot, "package.json");
61
+ const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
62
+ const program = new Command()
63
+ .name("ailf")
64
+ .description("AI Literacy Framework — evaluate how well docs enable AI coding tools\n\nMeasure whether AI coding agents can find the right documentation\nand produce correct implementations of your product features.")
65
+ .version(pkg.version)
66
+ .option("-v, --verbose", "Increase log output")
67
+ .option("-q, --quiet", "Suppress non-error output")
68
+ .option("--dotenv <path>", "Override default .env file path")
69
+ .option("--explain", "Show execution plan without running")
70
+ .option("--format <fmt>", "Output format for --explain (console, json)", "console")
71
+ .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
72
+ configureProgram(program);
73
+ // Global --explain hook — intercepts any command before execution
74
+ program.hook("preAction", async (thisCommand, actionCommand) => {
75
+ const globalOpts = thisCommand.opts();
76
+ if (!globalOpts.explain)
77
+ return;
78
+ const { handleExplain } = await import("./commands/explain-handler.js");
79
+ try {
80
+ await handleExplain(actionCommand, globalOpts.yes ?? false, evalRoot);
81
+ process.exit(0);
82
+ }
83
+ catch (err) {
84
+ // Sentinel from --yes confirmation: user wants to proceed
85
+ if (err !== null &&
86
+ typeof err === "object" &&
87
+ "__proceedArgv" in err) {
88
+ const filteredArgv = err.__proceedArgv;
89
+ console.log("\n ▸ Proceeding with execution…\n");
90
+ await program.parseAsync(filteredArgv);
91
+ return;
92
+ }
93
+ throw err;
94
+ }
95
+ });
96
+ // ── Core Workflow ──────────────────────────────────────────────────────
97
+ program.addCommand(createRunCommand().helpGroup(CommandGroup.CoreWorkflow));
98
+ program.addCommand(createCompareCommand().helpGroup(CommandGroup.CoreWorkflow));
99
+ program.addCommand(createBaselineCommand().helpGroup(CommandGroup.CoreWorkflow));
100
+ program.addCommand(createPublishCommand().helpGroup(CommandGroup.CoreWorkflow));
101
+ program.addCommand(createRunsCommand().helpGroup(CommandGroup.CoreWorkflow));
102
+ // ── Analysis & Reports ────────────────────────────────────────────────
103
+ const reportCommand = new Command("report")
104
+ .description("Generate analysis and reporting outputs from evaluation runs")
105
+ .addCommand(createReadinessReportCommand())
106
+ .addCommand(createChronicFailuresCommand())
107
+ .addCommand(createCoverageAuditCommand())
108
+ .addCommand(createDiscoveryReportCommand())
109
+ .addCommand(createAgentReportCommand())
110
+ .addCommand(createWeeklyDigestCommand())
111
+ .addCommand(createCheckStalenessCommand());
112
+ program.addCommand(reportCommand.helpGroup(CommandGroup.AnalysisReports));
113
+ // ── Grader Reliability ────────────────────────────────────────────────
114
+ program.addCommand(createGraderCommand().helpGroup(CommandGroup.GraderReliability));
115
+ // ── Setup & Configuration ─────────────────────────────────────────────
116
+ program.addCommand(createInitCommand().helpGroup(CommandGroup.SetupConfig));
117
+ const validateCommand = new Command("validate")
118
+ .description("Validate AILF configuration and task files")
119
+ .addCommand(createValidateConfigCommand())
120
+ .addCommand(createValidateTasksCommand());
121
+ program.addCommand(validateCommand.helpGroup(CommandGroup.SetupConfig));
122
+ program.addCommand(createFetchDocsCommand().helpGroup(CommandGroup.SetupConfig));
123
+ program.addCommand(createCacheCommand().helpGroup(CommandGroup.SetupConfig));
124
+ // ── Pipeline Internals ────────────────────────────────────────────────
125
+ program.addCommand(createEvalCommand().helpGroup(CommandGroup.PipelineInternals));
126
+ program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
127
+ program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
128
+ program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
129
+ program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
130
+ program.addCommand(createLookupDocCommand().helpGroup(CommandGroup.PipelineInternals));
131
+ program.addCommand(createWebhookServerCommand().helpGroup(CommandGroup.PipelineInternals));
132
+ // ── Developer Tools ───────────────────────────────────────────────────
133
+ program.addCommand(createInteractiveCommand().helpGroup(CommandGroup.DeveloperTools));
134
+ // Shell completion — must be registered last (needs full program tree)
135
+ program.addCommand(createCompletionCommand(program).helpGroup(CommandGroup.DeveloperTools));
136
+ return program;
137
+ }
package/dist/cli.d.ts CHANGED
@@ -25,5 +25,11 @@
25
25
  *
26
26
  * Dev mode (without building):
27
27
  * tsx src/cli.ts run --debug
28
+ *
29
+ * Module split: this file owns *bootstrap side effects* (dotenv,
30
+ * retired-flag/env/cmd checks, AILF_LOG_LEVEL pre-scan, parseAsync).
31
+ * The Commander wiring lives in ./cli-program.ts so the W0078 M4 black-box
32
+ * harness can build the program in-process without firing those side
33
+ * effects.
28
34
  */
29
35
  export {};
package/dist/cli.js CHANGED
@@ -1,6 +1,4 @@
1
1
  #!/usr/bin/env node
2
- /* oxlint-disable import/first -- imports are intentionally interleaved with
3
- command registration for readability and lazy loading */
4
2
  /**
5
3
  * cli.ts — AILF CLI entry point.
6
4
  *
@@ -27,15 +25,20 @@
27
25
  *
28
26
  * Dev mode (without building):
29
27
  * tsx src/cli.ts run --debug
28
+ *
29
+ * Module split: this file owns *bootstrap side effects* (dotenv,
30
+ * retired-flag/env/cmd checks, AILF_LOG_LEVEL pre-scan, parseAsync).
31
+ * The Commander wiring lives in ./cli-program.ts so the W0078 M4 black-box
32
+ * harness can build the program in-process without firing those side
33
+ * effects.
30
34
  */
31
35
  import { config as dotenvConfig } from "dotenv";
32
- import { existsSync, readFileSync } from "fs";
36
+ import { existsSync } from "fs";
33
37
  import { dirname, resolve } from "path";
34
38
  import { fileURLToPath } from "url";
39
+ import { buildCliProgram } from "./cli-program.js";
35
40
  const __dirname = dirname(fileURLToPath(import.meta.url));
36
41
  const ROOT = resolve(__dirname, "..");
37
- /** Path to the eval package root (packages/eval). Used by --explain. */
38
- const EVAL_ROOT = ROOT;
39
42
  // ---------------------------------------------------------------------------
40
43
  // Load .env — must happen before Commander parses so that .env()
41
44
  // fallbacks resolve correctly.
@@ -220,127 +223,14 @@ if (retiredFlag || retiredCommand || retiredEnv) {
220
223
  process.exit(2);
221
224
  }
222
225
  // ---------------------------------------------------------------------------
223
- // Build CLI program
224
- // ---------------------------------------------------------------------------
225
- import { Command } from "commander";
226
- import { CommandGroup, configureProgram } from "./commands/shared/help.js";
227
- // Read version from package.json
228
- const pkgPath = resolve(ROOT, "package.json");
229
- const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
230
- const program = new Command()
231
- .name("ailf")
232
- .description("AI Literacy Framework — evaluate how well docs enable AI coding tools\n\nMeasure whether AI coding agents can find the right documentation\nand produce correct implementations of your product features.")
233
- .version(pkg.version)
234
- .option("-v, --verbose", "Increase log output")
235
- .option("-q, --quiet", "Suppress non-error output")
236
- .option("--dotenv <path>", "Override default .env file path")
237
- .option("--explain", "Show execution plan without running")
238
- .option("--format <fmt>", "Output format for --explain (console, json)", "console")
239
- .option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
240
- configureProgram(program);
241
- // ---------------------------------------------------------------------------
242
- // Global --explain hook — intercepts any command before execution
226
+ // Build CLI program (delegates Commander wiring to ./cli-program.ts)
243
227
  // ---------------------------------------------------------------------------
244
- program.hook("preAction", async (thisCommand, actionCommand) => {
245
- const globalOpts = thisCommand.opts();
246
- if (!globalOpts.explain)
247
- return;
248
- const { handleExplain } = await import("./commands/explain-handler.js");
249
- try {
250
- await handleExplain(actionCommand, globalOpts.yes ?? false, EVAL_ROOT);
251
- process.exit(0);
252
- }
253
- catch (err) {
254
- // Sentinel from --yes confirmation: user wants to proceed
255
- if (err !== null &&
256
- typeof err === "object" &&
257
- "__proceedArgv" in err) {
258
- const filteredArgv = err.__proceedArgv;
259
- console.log("\n ▸ Proceeding with execution…\n");
260
- await program.parseAsync(filteredArgv);
261
- return;
262
- }
263
- throw err;
264
- }
265
- });
228
+ const program = buildCliProgram({ evalRoot: ROOT });
266
229
  // ---------------------------------------------------------------------------
267
- // Register commands
230
+ // Parse and run — default to showing help when no arguments given.
268
231
  //
269
- // Registration order determines group display order in --help.
270
- // Within each group, commands appear in the order they are added.
271
- // ---------------------------------------------------------------------------
272
- // ── Core Workflow ──────────────────────────────────────────────────────
273
- import { createRunCommand } from "./commands/run.js";
274
- program.addCommand(createRunCommand().helpGroup(CommandGroup.CoreWorkflow));
275
- import { createCompareCommand } from "./commands/compare.js";
276
- program.addCommand(createCompareCommand().helpGroup(CommandGroup.CoreWorkflow));
277
- import { createBaselineCommand } from "./commands/baseline.js";
278
- program.addCommand(createBaselineCommand().helpGroup(CommandGroup.CoreWorkflow));
279
- import { createPublishCommand } from "./commands/publish.js";
280
- program.addCommand(createPublishCommand().helpGroup(CommandGroup.CoreWorkflow));
281
- import { createRunsCommand } from "./commands/runs.js";
282
- program.addCommand(createRunsCommand().helpGroup(CommandGroup.CoreWorkflow));
283
- // ── Analysis & Reports ────────────────────────────────────────────────
284
- import { createReadinessReportCommand } from "./commands/readiness-report.js";
285
- import { createChronicFailuresCommand } from "./commands/chronic-failures.js";
286
- import { createCoverageAuditCommand } from "./commands/coverage-audit.js";
287
- import { createDiscoveryReportCommand } from "./commands/discovery-report.js";
288
- import { createAgentReportCommand } from "./commands/agent-report.js";
289
- import { createWeeklyDigestCommand } from "./commands/weekly-digest.js";
290
- import { createCheckStalenessCommand } from "./commands/check-staleness.js";
291
- const reportCommand = new Command("report")
292
- .description("Generate analysis and reporting outputs from evaluation runs")
293
- .addCommand(createReadinessReportCommand())
294
- .addCommand(createChronicFailuresCommand())
295
- .addCommand(createCoverageAuditCommand())
296
- .addCommand(createDiscoveryReportCommand())
297
- .addCommand(createAgentReportCommand())
298
- .addCommand(createWeeklyDigestCommand())
299
- .addCommand(createCheckStalenessCommand());
300
- program.addCommand(reportCommand.helpGroup(CommandGroup.AnalysisReports));
301
- // ── Grader Reliability ────────────────────────────────────────────────
302
- import { createGraderCommand } from "./commands/grader/index.js";
303
- program.addCommand(createGraderCommand().helpGroup(CommandGroup.GraderReliability));
304
- // ── Setup & Configuration ─────────────────────────────────────────────
305
- import { createInitCommand } from "./commands/init.js";
306
- program.addCommand(createInitCommand().helpGroup(CommandGroup.SetupConfig));
307
- import { createValidateConfigCommand } from "./commands/validate.js";
308
- import { createValidateTasksCommand } from "./commands/validate-tasks.js";
309
- const validateCommand = new Command("validate")
310
- .description("Validate AILF configuration and task files")
311
- .addCommand(createValidateConfigCommand())
312
- .addCommand(createValidateTasksCommand());
313
- program.addCommand(validateCommand.helpGroup(CommandGroup.SetupConfig));
314
- import { createFetchDocsCommand } from "./commands/fetch-docs.js";
315
- program.addCommand(createFetchDocsCommand().helpGroup(CommandGroup.SetupConfig));
316
- import { createCacheCommand } from "./commands/cache.js";
317
- program.addCommand(createCacheCommand().helpGroup(CommandGroup.SetupConfig));
318
- // ── Pipeline Internals ────────────────────────────────────────────────
319
- import { createEvalCommand } from "./commands/eval.js";
320
- program.addCommand(createEvalCommand().helpGroup(CommandGroup.PipelineInternals));
321
- import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
322
- program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
323
- import { createPrCommentCommand } from "./commands/pr-comment.js";
324
- program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
325
- import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
326
- program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
327
- import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
328
- program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
329
- import { createLookupDocCommand } from "./commands/lookup-doc.js";
330
- program.addCommand(createLookupDocCommand().helpGroup(CommandGroup.PipelineInternals));
331
- import { createWebhookServerCommand } from "./commands/webhook-server.js";
332
- program.addCommand(createWebhookServerCommand().helpGroup(CommandGroup.PipelineInternals));
333
- // ── Developer Tools ───────────────────────────────────────────────────
334
- import { createInteractiveCommand } from "./commands/interactive.js";
335
- program.addCommand(createInteractiveCommand().helpGroup(CommandGroup.DeveloperTools));
336
- // Shell completion — must be registered last (needs full program tree)
337
- import { createCompletionCommand } from "./commands/completion.js";
338
- program.addCommand(createCompletionCommand(program).helpGroup(CommandGroup.DeveloperTools));
339
- // ---------------------------------------------------------------------------
340
- // Parse and run — default to showing help when no arguments given
341
- // ---------------------------------------------------------------------------
342
- // If no command is specified (just `ailf`), show help.
343
232
  // The interactive wizard is still available via `ailf interactive`.
233
+ // ---------------------------------------------------------------------------
344
234
  if (process.argv.length <= 2) {
345
235
  program.outputHelp();
346
236
  }
package/dist/config/canary-tasks.ts ADDED
@@ -0,0 +1,64 @@
1
+ /**
2
+ * canary-tasks.ts — The Tier 3 canary set.
3
+ *
4
+ * Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
5
+ * Composition follows the design doc's "weighted toward modes/areas with
6
+ * the most production usage and the highest historical regression rates"
7
+ * recommendation: GROQ and Content Lake (foundational consumer surfaces),
8
+ * Portable Text (historically drift-prone), Studio schema authoring (the
9
+ * second-most-used surface after queries), and a knowledge-probe pairing
10
+ * for cross-mode coverage.
11
+ *
12
+ * Each entry's `rationale` is the canary's load-bearing field — without it,
13
+ * future maintainers can't reason about whether a regression is meaningful
14
+ * or whether the slot has lost value. Update the rationale when you swap a
15
+ * canary entry; never silently replace one.
16
+ *
17
+ * Validated against the live task inventory by `scripts/check-canary-tasks.ts`
18
+ * (`pnpm check`). Dangling task IDs fail the build.
19
+ *
20
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
21
+ * @see .github/workflows/tier-3-nightly.yml — consumer
22
+ */
23
+
24
+ import { defineCanaryTasks } from "../_vendor/ailf-core/index.js"
25
+
26
+ export default defineCanaryTasks({
27
+ tasks: [
28
+ {
29
+ taskId: "groq-blog-queries",
30
+ mode: "literacy",
31
+ rationale:
32
+ "Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
33
+ },
34
+ {
35
+ taskId: "content-lake-mutations",
36
+ mode: "literacy",
37
+ rationale:
38
+ "Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
39
+ },
40
+ {
41
+ taskId: "portable-text-rendering",
42
+ mode: "literacy",
43
+ rationale:
44
+ "Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
45
+ },
46
+ {
47
+ taskId: "studio-blog-schema",
48
+ mode: "literacy",
49
+ rationale:
50
+ "Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
51
+ },
52
+ {
53
+ taskId: "kp-groq-projections",
54
+ mode: "knowledge-probe",
55
+ rationale:
56
+ "Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
57
+ },
58
+ // mcp-server canary slot — add a third mode here when a committed
59
+ // mcp-server task lands under packages/eval/tasks/mcp-server/. Today
60
+ // there are no production mcp-server tasks (only fixtures); the trigger
61
+ // is upstream and adding a placeholder slot would dangle. Surfaced at
62
+ // Phase 5 close (2026-04-27) — see W0116 retrospective.
63
+ ],
64
+ })
package/dist/config/test-budgets.ts ADDED
@@ -0,0 +1,24 @@
1
+ /**
2
+ * test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
3
+ *
4
+ * Each cap is the maximum cost a single Tier 3 nightly run may incur for
5
+ * that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
6
+ * fails loudly if any provider's actual spend exceeds its cap.
7
+ *
8
+ * The design doc names a $30–60/day envelope across all providers. Caps
9
+ * here divide that envelope per-provider; tighten as baseline canary spend
10
+ * becomes measurable.
11
+ *
12
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
13
+ * @see scripts/tier-3-budget-check.mjs — enforcement
14
+ */
15
+
16
+ import { defineTestBudgets } from "../_vendor/ailf-core/index.js"
17
+
18
+ export default defineTestBudgets({
19
+ perProviderDaily: {
20
+ anthropic: 30,
21
+ openai: 30,
22
+ },
23
+ warnFraction: 0.8,
24
+ })
package/dist/pipeline/calculate-scores.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
- import type { GraderJudgment, PerModelEntry } from "./types.js";
3
+ import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
4
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
5
  export interface PromptfooResultsWrapper {
6
6
  results: RawTestResult[];
@@ -91,6 +91,21 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
91
91
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
92
92
  */
93
93
  export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
94
+ /**
95
+ * Score knowledge-probe evaluation results.
96
+ *
97
+ * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
98
+ * var and answers from training-data knowledge alone. The compiler explicitly
99
+ * deletes `vars.docs`, so the literacy scoring path would place every result
100
+ * in its without-docs bucket — collapsing testCount and ceilingScore to zero.
101
+ *
102
+ * This function mirrors the shape of `scoreAgentHarnessResults` but groups by
103
+ * feature area (KP results carry `__featureArea` from the compiler), and
104
+ * uses the `knowledge-probe` profile (factual-correctness / completeness /
105
+ * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
106
+ * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
107
+ */
108
+ export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
94
109
  /**
95
110
  * Score agentic evaluation results. In agentic mode, all test entries are
96
111
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
package/dist/pipeline/calculate-scores.js CHANGED
@@ -719,6 +719,55 @@ function extractTaskId(description) {
719
719
  return description.trim() || "unknown";
720
720
  }
721
721
  // ---------------------------------------------------------------------------
722
+ // Knowledge-probe scoring — closed-book recall with no docs context
723
+ // ---------------------------------------------------------------------------
724
+ /**
725
+ * Score knowledge-probe evaluation results.
726
+ *
727
+ * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
728
+ * var and answers from training-data knowledge alone. The compiler explicitly
729
+ * deletes `vars.docs`, so the literacy scoring path would place every result
730
+ * in its without-docs bucket — collapsing testCount and ceilingScore to zero.
731
+ *
732
+ * This function mirrors the shape of `scoreAgentHarnessResults` but groups by
733
+ * feature area (KP results carry `__featureArea` from the compiler), and
734
+ * uses the `knowledge-probe` profile (factual-correctness / completeness /
735
+ * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
736
+ * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
737
+ */
738
+ export function scoreKnowledgeProbeResults(results, profile) {
739
+ const byFeature = {};
740
+ for (const result of results) {
741
+ const feature = result.vars.__featureArea || detectFeatureArea(result.description);
742
+ if (!byFeature[feature]) {
743
+ byFeature[feature] = [];
744
+ }
745
+ byFeature[feature].push(result);
746
+ }
747
+ const scores = [];
748
+ for (const [feature, featureResults] of Object.entries(byFeature)) {
749
+ const scored = scoreTestGroup(featureResults, profile, feature);
750
+ scores.push({
751
+ assertionPassRate: scored.dimensions.assertionPassRate,
752
+ ceilingScore: 0,
753
+ codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
754
+ dimensions: scored.dimensions,
755
+ docCoverage: scored.dimensions.docCoverage ?? 0,
756
+ docLift: 0,
757
+ docQualityGap: 0,
758
+ feature,
759
+ floorScore: 0,
760
+ groupType: "feature",
761
+ negativeDocLift: false,
762
+ taskCompletion: scored.dimensions.taskCompletion ?? 0,
763
+ testCount: featureResults.length,
764
+ totalCost: scored.totalCost,
765
+ totalScore: scored.composite,
766
+ });
767
+ }
768
+ return scores.sort((a, b) => a.feature.localeCompare(b.feature));
769
+ }
770
+ // ---------------------------------------------------------------------------
722
771
  // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
723
772
  // ---------------------------------------------------------------------------
724
773
  /**
@@ -893,6 +942,56 @@ export function calculateAndWriteScores(options) {
893
942
  const testSummary = computeTestSummary(baselineResultsPath);
894
943
  return { belowCritical: summary.belowCritical, testSummary };
895
944
  }
945
+ // ── Knowledge-probe scoring path ────────────────────────────
946
+ // Knowledge-probe mode evaluates parametric recall (no docs context).
947
+ // The KP compiler deletes `vars.docs`, so the literacy path would bucket
948
+ // every result into `withoutDocs` and collapse testCount + dimensions
949
+ // to zero. This branch groups by feature area only and uses the
950
+ // `knowledge-probe` profile (factual-correctness / completeness /
951
+ // currency). See docs/design-docs/mode-agnostic-scoring.md.
952
+ if (mode === "knowledge-probe") {
953
+ const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
954
+ log.debug("Knowledge-probe scoring profile", probeProfile);
955
+ const results = readAndNormalizeResults(baselineResultsPath);
956
+ const scores = scoreKnowledgeProbeResults(results, probeProfile);
957
+ log.debug("Knowledge-probe scores calculated", {
958
+ featureCount: scores.length,
959
+ features: scores.map((s) => ({
960
+ feature: s.feature,
961
+ totalScore: s.totalScore,
962
+ testCount: s.testCount,
963
+ dimensions: s.dimensions,
964
+ })),
965
+ });
966
+ const urlRefs = aggregateUrlReferences(baselineResultsPath);
967
+ const sourceVerification = buildSourceVerification(ROOT, source, {
968
+ allowedOrigins: options.allowedOrigins,
969
+ mode,
970
+ searchMode: options.searchMode,
971
+ });
972
+ const graderCost = extractGraderCost(baselineResultsPath);
973
+ const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
974
+ graderCost, null, // no per-model breakdown for now
975
+ null, // no source isolation — KP doesn't fetch sources
976
+ sourceVerification, "knowledge-probe", log);
977
+ // Persist
978
+ const outDir = join(ROOT, "results", "latest");
979
+ mkdirSync(outDir, { recursive: true });
980
+ writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
981
+ log.info("Score summary written to results/latest/score-summary.json");
982
+ const judgments = extractGraderJudgments(baselineResultsPath);
983
+ if (judgments.length > 0) {
984
+ writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
985
+ log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
986
+ }
987
+ const testResults = extractStoredTestResults(baselineResultsPath);
988
+ if (testResults.length > 0) {
989
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
990
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
991
+ }
992
+ const testSummary = computeTestSummary(baselineResultsPath);
993
+ return { belowCritical: summary.belowCritical, testSummary };
994
+ }
896
995
  // ── Literacy scoring path ───────────────────────────────────
897
996
  // Gold (with-docs) entries use the "default" profile (3 dimensions).
898
997
  // Baseline (without-docs) entries use "output-only" (2 dimensions,
@@ -9,6 +9,11 @@ import type { KnowledgeProbeCompileOptions } from "./types.js";
9
9
  * Tool-use assertions are rejected (knowledge probes don't use tools).
10
10
  * LLM-graded assertions receive the configured grader provider.
11
11
  * All other assertions are passed through.
12
+ *
13
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
14
+ * through the shared rubric resolver so the compiled assertion carries
15
+ * `metadata.dimension` — without this, the scoring engine can't classify
16
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
12
17
  */
13
18
  export declare function mapKnowledgeProbeAssertion(assertion: {
14
19
  type: string;
@@ -1,12 +1,18 @@
1
1
  /**
2
2
  * Assertion mapping for knowledge probe evaluations.
3
3
  */
4
+ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
4
5
  /**
5
6
  * Map a raw knowledge probe assertion to a Promptfoo assertion.
6
7
  *
7
8
  * Tool-use assertions are rejected (knowledge probes don't use tools).
8
9
  * LLM-graded assertions receive the configured grader provider.
9
10
  * All other assertions are passed through.
11
+ *
12
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
13
+ * through the shared rubric resolver so the compiled assertion carries
14
+ * `metadata.dimension` — without this, the scoring engine can't classify
15
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
10
16
  */
11
17
  export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
12
18
  switch (assertion.type) {
@@ -27,9 +33,26 @@ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
27
33
  ? { weight: assertion.weight }
28
34
  : {}),
29
35
  };
30
- // LLM-graded assertions — add grader provider
31
- case "g-eval":
32
36
  case "llm-rubric":
37
+ // Templated form (template + criteria) → resolve to full rubric text
38
+ // with dimension metadata attached.
39
+ if ("template" in assertion && "criteria" in assertion) {
40
+ return resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
41
+ }
42
+ // Inline value form — pass through with grader provider, no metadata.
43
+ // Back-compat for tasks not yet migrated to the templated form.
44
+ return {
45
+ type: "llm-rubric",
46
+ ...("value" in assertion ? { value: assertion.value } : {}),
47
+ ...(typeof assertion.weight === "number"
48
+ ? { weight: assertion.weight }
49
+ : {}),
50
+ ...(options?.graderProvider
51
+ ? { provider: options.graderProvider }
52
+ : {}),
53
+ };
54
+ // Other LLM-graded assertions — add grader provider
55
+ case "g-eval":
33
56
  case "model-graded-closedqa":
34
57
  case "model-graded-factuality":
35
58
  return {
@@ -37,7 +37,11 @@ export const handler = {
37
37
  if (!("mode" in task) || task.mode !== "knowledge-probe") {
38
38
  throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
39
39
  }
40
- const result = compileKnowledgeProbeTask(task, { graderProvider: ctx.graderProvider, models: ctx.models });
40
+ const result = compileKnowledgeProbeTask(task, {
41
+ graderProvider: ctx.graderProvider,
42
+ models: ctx.models,
43
+ rubricConfig: ctx.rubricConfig,
44
+ });
41
45
  return {
42
46
  providers: result.providers,
43
47
  tests: result.tests,
@@ -2,6 +2,7 @@
2
2
  * Public types for the knowledge-probe mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
+ import type { RubricConfig } from "../../rubric-resolution.js";
5
6
  /** Options for compiling a knowledge probe task */
6
7
  export interface KnowledgeProbeCompileOptions {
7
8
  /** Grader provider for LLM-graded assertions */
@@ -12,6 +13,9 @@ export interface KnowledgeProbeCompileOptions {
12
13
  label: string;
13
14
  config?: Record<string, unknown>;
14
15
  }[];
16
+ /** Rubric config (templates, weights, profiles) — needed to resolve
17
+ * templated `llm-rubric` assertions to dimension metadata. */
18
+ rubricConfig?: RubricConfig;
15
19
  }
16
20
  /** Result of compiling a single knowledge probe task */
17
21
  export interface KnowledgeProbeCompileResult {