@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/models.ts +32 -4
  3. package/config/test-budgets.ts +24 -0
  4. package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
  5. package/dist/_vendor/ailf-core/config-helpers.js +81 -1
  6. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  7. package/dist/_vendor/ailf-core/index.js +1 -1
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  9. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  10. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  11. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  13. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  15. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  16. package/dist/_vendor/ailf-shared/index.d.ts +16 -9
  17. package/dist/_vendor/ailf-shared/index.js +13 -9
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/agent-observer/agentic-provider.js +28 -23
  20. package/dist/agent-observer/classifier.js +7 -2
  21. package/dist/agent-observer/proxy.d.ts +88 -3
  22. package/dist/agent-observer/proxy.js +174 -16
  23. package/dist/agent-observer/types.d.ts +23 -5
  24. package/dist/cli-program.js +1 -1
  25. package/dist/commands/baseline.d.ts +3 -1
  26. package/dist/commands/baseline.js +29 -9
  27. package/dist/commands/cache.d.ts +5 -1
  28. package/dist/commands/cache.js +31 -15
  29. package/dist/commands/compare.js +11 -4
  30. package/dist/commands/explain-handler.js +2 -2
  31. package/dist/config/canary-tasks.ts +64 -0
  32. package/dist/config/models.ts +32 -4
  33. package/dist/config/test-budgets.ts +24 -0
  34. package/dist/pipeline/baseline.d.ts +14 -3
  35. package/dist/pipeline/baseline.js +7 -13
  36. package/dist/pipeline/calculate-scores.d.ts +17 -2
  37. package/dist/pipeline/calculate-scores.js +139 -1
  38. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  39. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  40. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  41. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  42. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  43. package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
  44. package/dist/pipeline/compiler/provider-assembler.js +37 -2
  45. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  46. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  47. package/package.json +2 -1
  48. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
@@ -0,0 +1,64 @@
1
+ /**
2
+ * canary-tasks.ts — The Tier 3 canary set.
3
+ *
4
+ * Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
5
+ * Composition follows the design doc's "weighted toward modes/areas with
6
+ * the most production usage and the highest historical regression rates"
7
+ * recommendation: GROQ and Content Lake (foundational consumer surfaces),
8
+ * Portable Text (historically drift-prone), Studio schema authoring (the
9
+ * second-most-used surface after queries), and a knowledge-probe pairing
10
+ * for cross-mode coverage.
11
+ *
12
+ * Each entry's `rationale` is the canary's load-bearing field — without it,
13
+ * future maintainers can't reason about whether a regression is meaningful
14
+ * or whether the slot has lost value. Update the rationale when you swap a
15
+ * canary entry; never silently replace one.
16
+ *
17
+ * Validated against the live task inventory by `scripts/check-canary-tasks.ts`
18
+ * (`pnpm check`). Dangling task IDs fail the build.
19
+ *
20
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
21
+ * @see .github/workflows/tier-3-nightly.yml — consumer
22
+ */
23
+
24
+ import { defineCanaryTasks } from "@sanity/ailf-core"
25
+
26
+ export default defineCanaryTasks({
27
+ tasks: [
28
+ {
29
+ taskId: "groq-blog-queries",
30
+ mode: "literacy",
31
+ rationale:
32
+ "Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
33
+ },
34
+ {
35
+ taskId: "content-lake-mutations",
36
+ mode: "literacy",
37
+ rationale:
38
+ "Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
39
+ },
40
+ {
41
+ taskId: "portable-text-rendering",
42
+ mode: "literacy",
43
+ rationale:
44
+ "Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
45
+ },
46
+ {
47
+ taskId: "studio-blog-schema",
48
+ mode: "literacy",
49
+ rationale:
50
+ "Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
51
+ },
52
+ {
53
+ taskId: "kp-groq-projections",
54
+ mode: "knowledge-probe",
55
+ rationale:
56
+ "Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
57
+ },
58
+ // mcp-server canary slot — add a third mode here when a committed
59
+ // mcp-server task lands under packages/eval/tasks/mcp-server/. Today
60
+ // there are no production mcp-server tasks (only fixtures); the trigger
61
+ // is upstream and adding a placeholder slot would dangle. Surfaced at
62
+ // Phase 5 close (2026-04-27) — see W0116 retrospective.
63
+ ],
64
+ })
package/config/models.ts CHANGED
@@ -35,16 +35,23 @@ export default defineModels({
35
35
 
36
36
  // ── OpenAI ─────────────────────────────────────────────────
37
37
  {
38
+ // gpt-5.2 routes through chat completions (and through the in-house
39
+ // agentic provider for naive/optimized variants). `verbosity` is a
40
+ // Responses-API-only field — it would be silently dropped here, so
41
+ // it isn't configured. See W0131.
38
42
  id: "openai:chat:gpt-5.2",
39
43
  label: "GPT 5.2",
40
44
  config: {
41
45
  max_completion_tokens: 8192,
42
- verbosity: "medium",
43
46
  },
44
47
  modes: ["literacy", "knowledge-probe"],
45
48
  // All literacy variants included by default
46
49
  },
47
50
  {
51
+ // GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
52
+ // native handling of `openai:responses:` honors reasoning / verbosity /
53
+ // summary; the in-house agentic provider does not (W0131). MCP-server
54
+ // and knowledge-probe routes go through Promptfoo native too.
48
55
  id: "openai:responses:gpt-5.4",
49
56
  label: "GPT 5.4",
50
57
  config: {
@@ -55,7 +62,9 @@ export default defineModels({
55
62
  },
56
63
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
57
64
  modes: ["literacy", "mcp-server", "knowledge-probe"],
58
- // All literacy variants included by default
65
+ variants: {
66
+ literacy: ["baseline"],
67
+ },
59
68
  },
60
69
 
61
70
  // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
93
102
  defaults: {
94
103
  temperature: 0.2,
95
104
  max_tokens: 4096,
96
- maxToolRounds: 5, // for agentic modes
105
+ // Global default round budget for agentic modes. Per-mode overrides
106
+ // below give naive more headroom (W0134) since it spends rounds on
107
+ // retries when fetches fail. Per-model `config.maxToolRounds` still
108
+ // wins over both values.
109
+ maxToolRounds: 5,
110
+ modeMaxToolRounds: {
111
+ "agentic-naive": 8,
112
+ "agentic-optimized": 5,
113
+ },
97
114
  observerOptions: {
98
- maxPreviewBytes: 2048,
115
+ // Per-class preview caps (W0133): default 4 KB, but search responses
116
+ // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
117
+ // which result the model actually saw.
118
+ maxPreviewBytes: 4096,
119
+ previewLimits: {
120
+ default: 4096,
121
+ llmsTxt: 131072,
122
+ search: 16384,
123
+ },
99
124
  captureResponsePreview: true,
100
125
  includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
101
126
  sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
127
+ // statusOnlyForUnmatched defaults to true (W0132) — model-side
128
+ // traffic to api.openai.com / api.anthropic.com / googleapis.com
129
+ // surfaces in run artifacts as slim status-only entries.
102
130
  },
103
131
  },
104
132
  })
@@ -0,0 +1,24 @@
1
+ /**
2
+ * test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
3
+ *
4
+ * Each cap is the maximum cost a single Tier 3 nightly run may incur for
5
+ * that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
6
+ * fails loudly if any provider's actual spend exceeds its cap.
7
+ *
8
+ * The design doc names a $30–60/day envelope across all providers. Caps
9
+ * here divide that envelope per-provider; tighten as baseline canary spend
10
+ * becomes measurable.
11
+ *
12
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
13
+ * @see scripts/tier-3-budget-check.mjs — enforcement
14
+ */
15
+
16
+ import { defineTestBudgets } from "@sanity/ailf-core"
17
+
18
+ export default defineTestBudgets({
19
+ perProviderDaily: {
20
+ anthropic: 30,
21
+ openai: 30,
22
+ },
23
+ warnFraction: 0.8,
24
+ })
@@ -26,9 +26,11 @@
26
26
  * @see docs/design-docs/architecture-overhaul/typescript-configuration.md (canonical)
27
27
  */
28
28
  import type { EvalConfig } from "./schemas/eval-config.js";
29
+ import type { CanaryTaskSetConfig } from "./schemas/canary-tasks.js";
29
30
  import type { FeatureRegistry, RubricConfig, ThresholdConfig } from "./schemas/pipeline.js";
30
31
  import type { SchedulesFile } from "./schemas/schedules.js";
31
32
  import type { SinksFile } from "./schemas/sinks.js";
33
+ import type { TestBudgetConfig } from "./schemas/test-budgets.js";
32
34
  import type { ModelsConfig } from "./types/index.js";
33
35
  import type { GeneralizedTaskDefinition } from "./types/generalized-task.js";
34
36
  import type { ModeBase, PresetDefinition } from "./types/plugin-registry.js";
@@ -55,8 +57,14 @@ export declare function defineTask(task: GeneralizedTaskDefinition): Generalized
55
57
  * Validates:
56
58
  * - Every `modes` entry is a canonical eval mode name
57
59
  * - Every `variants` key is a mode the model is enrolled in
60
+ * - `openai:responses:` model ids are not used for agentic literacy variants
61
+ * (the in-house agentic loop dispatches to chat completions only)
62
+ * - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
63
+ * set on a model that routes through the agentic provider — they would be
64
+ * silently dropped.
58
65
  *
59
- * @throws {Error} On invalid mode names or mismatched variant keys
66
+ * @throws {Error} On invalid mode names, mismatched variant keys, or
67
+ * misconfigured OpenAI Responses-API fields.
60
68
  */
61
69
  export declare function defineModels(models: ModelsConfig): ModelsConfig;
62
70
  /**
@@ -89,6 +97,23 @@ export declare function defineSinks(sinks: SinksFile): SinksFile;
89
97
  * Used in `config/schedules.ts` for typed schedule configuration.
90
98
  */
91
99
  export declare function defineSchedules(schedules: SchedulesFile): SchedulesFile;
100
+ /**
101
+ * Define per-provider daily USD spend caps for Tier 3 (live-LLM) CI runs.
102
+ *
103
+ * Used in `config/test-budgets.ts` for typed budget configuration. The compiled implementation returns `budgets` unchanged; the helper exists for typing and performs no runtime validation.
104
+ *
105
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
106
+ */
107
+ export declare function defineTestBudgets(budgets: TestBudgetConfig): TestBudgetConfig;
108
+ /**
109
+ * Define the curated canary task set for the Tier 3 nightly workflow.
110
+ *
111
+ * Used in `config/canary-tasks.ts`. Validation against the live task
112
+ * inventory happens in `scripts/check-canary-tasks.ts`, not here. The compiled implementation returns `canary` unchanged and runs no schema validation of its own.
113
+ *
114
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
115
+ */
116
+ export declare function defineCanaryTasks(canary: CanaryTaskSetConfig): CanaryTaskSetConfig;
92
117
  /**
93
118
  * Source configuration — typed inline until a dedicated schema exists.
94
119
  *
@@ -54,6 +54,33 @@ export function defineTask(task) {
54
54
  // ---------------------------------------------------------------------------
55
55
  // Model registry helpers
56
56
  // ---------------------------------------------------------------------------
57
+ /**
58
+ * OpenAI Responses-API-only fields. The agentic provider's OpenAI loop
59
+ * routes everything through `/v1/chat/completions` and would silently drop
60
+ * these. We surface the misconfiguration at config-load time instead.
61
+ *
62
+ * @see docs/work-items/W0131-honor-openai-responses-provider.json
63
+ */
64
+ const RESPONSES_ONLY_FIELDS = ["reasoning", "summary", "verbosity"];
65
+ /**
66
+ * Whether a model would be assembled into agentic-naive or agentic-optimized
67
+ * literacy variants. These are the variants that route through the in-house
68
+ * agentic provider (which speaks chat completions only); baseline routes
69
+ * through Promptfoo's native handling, which honors `openai:responses:` ids.
70
+ *
71
+ * Note: variant names mirror the literacy mode base in
72
+ * `packages/eval/src/pipeline/compiler/mode-bases/literacy.ts`. Reads only `model.modes` and `model.variants.literacy`; returns `true` when the model is enrolled in literacy and either lists no literacy variants or lists at least one agentic variant, `false` otherwise.
73
+ */
74
+ function participatesInAgenticLiteracy(model) {
75
+ const enrolledInLiteracy = !model.modes || model.modes.includes("literacy"); // a model with no `modes` list counts as enrolled in literacy
76
+ if (!enrolledInLiteracy)
77
+ return false;
78
+ const literacyVariants = model.variants?.literacy;
79
+ if (!literacyVariants) // no literacy variant list is treated as "all variants", agentic ones included
80
+ return true;
81
+ return (literacyVariants.includes("agentic-naive") ||
82
+ literacyVariants.includes("agentic-optimized"));
83
+ }
57
84
  /**
58
85
  * Define the model registry (models to evaluate and grader model).
59
86
  *
@@ -62,8 +89,14 @@ export function defineTask(task) {
62
89
  * Validates:
63
90
  * - Every `modes` entry is a canonical eval mode name
64
91
  * - Every `variants` key is a mode the model is enrolled in
92
+ * - `openai:responses:` model ids are not used for agentic literacy variants
93
+ * (the in-house agentic loop dispatches to chat completions only)
94
+ * - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
95
+ * set on a model that routes through the agentic provider — they would be
96
+ * silently dropped.
65
97
  *
66
- * @throws {Error} On invalid mode names or mismatched variant keys
98
+ * @throws {Error} On invalid mode names, mismatched variant keys, or
99
+ * misconfigured OpenAI Responses-API fields.
67
100
  */
68
101
  export function defineModels(models) {
69
102
  const validModes = new Set(CANONICAL_EVAL_MODES);
@@ -87,6 +120,26 @@ export function defineModels(models) {
87
120
  }
88
121
  }
89
122
  }
123
+ const usesAgentic = participatesInAgenticLiteracy(model);
124
+ if (usesAgentic && model.id.startsWith("openai:responses:")) {
125
+ throw new Error(`Model "${model.label ?? model.id}": the in-house agentic provider ` +
126
+ `does not implement the OpenAI Responses API endpoint — requests would ` +
127
+ `be silently downgraded to chat completions. Either restrict variants to ` +
128
+ `["baseline"] (Promptfoo's native handling honors openai:responses:) or ` +
129
+ `change the id to "openai:chat:..." for agentic evaluation. ` +
130
+ `See W0131 for context.`);
131
+ }
132
+ if (usesAgentic && model.config) {
133
+ const droppedFields = RESPONSES_ONLY_FIELDS.filter((f) => f in model.config);
134
+ if (droppedFields.length > 0) {
135
+ throw new Error(`Model "${model.label ?? model.id}": configured fields ` +
136
+ `${droppedFields.map((f) => `"${f}"`).join(", ")} are only honored ` +
137
+ `by the OpenAI Responses API. The agentic provider's chat-completions ` +
138
+ `path would silently drop them. Either remove these fields or restrict ` +
139
+ `variants to ["baseline"] so the model is evaluated only through ` +
140
+ `Promptfoo's native Responses-API handler. See W0131 for context.`);
141
+ }
142
+ }
90
143
  }
91
144
  return models;
92
145
  }
@@ -145,6 +198,33 @@ export function defineSinks(sinks) {
145
198
  export function defineSchedules(schedules) {
146
199
  return schedules;
147
200
  }
201
+ // ---------------------------------------------------------------------------
202
+ // Test-budget helpers
203
+ // ---------------------------------------------------------------------------
204
+ /**
205
+ * Define per-provider daily USD spend caps for Tier 3 (live-LLM) CI runs.
206
+ *
207
+ * Used in `config/test-budgets.ts` for typed budget configuration. This helper is an identity function: it returns `budgets` unchanged and performs no runtime validation of the caps.
208
+ *
209
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
210
+ */
211
+ export function defineTestBudgets(budgets) {
212
+ return budgets;
213
+ }
214
+ // ---------------------------------------------------------------------------
215
+ // Canary task-set helpers
216
+ // ---------------------------------------------------------------------------
217
+ /**
218
+ * Define the curated canary task set for the Tier 3 nightly workflow.
219
+ *
220
+ * Used in `config/canary-tasks.ts`. Validation against the live task
221
+ * inventory happens in `scripts/check-canary-tasks.ts`, not here. This helper is an identity function: it returns `canary` unchanged and runs no schema validation of its own.
222
+ *
223
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
224
+ */
225
+ export function defineCanaryTasks(canary) {
226
+ return canary;
227
+ }
148
228
  /**
149
229
  * Define documentation source configurations.
150
230
  *
@@ -17,7 +17,7 @@ export * from "./services/index.js";
17
17
  export * from "./examples/index.js";
18
18
  export * from "./artifact-registry.js";
19
19
  export * from "./batch-signing.js";
20
- export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
20
+ export { defineCanaryTasks, defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineTestBudgets, defineThresholds, } from "./config-helpers.js";
21
21
  export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
22
22
  export { env } from "./env-helper.js";
23
23
  export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
@@ -20,7 +20,7 @@ export * from "./batch-signing.js";
20
20
  // ---------------------------------------------------------------------------
21
21
  // Architecture overhaul — Phase 0 helpers
22
22
  // ---------------------------------------------------------------------------
23
- export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
23
+ export { defineCanaryTasks, defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineTestBudgets, defineThresholds, } from "./config-helpers.js";
24
24
  export { env } from "./env-helper.js";
25
25
  export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
26
26
  export { assoc, resolveVariantMode, splitTaskVariant, } from "./artifact-capture/association.js";
@@ -0,0 +1,52 @@
1
+ /**
2
+ * @sanity/ailf-core — Canary task-set schemas.
3
+ *
4
+ * The canary task set is the curated subset of evaluation tasks the Tier 3
5
+ * nightly workflow runs against live LLMs. Each entry pins a `taskId` and
6
+ * `mode` together with a one-paragraph rationale documenting why the task
7
+ * earned a slot — the rationale is the canary set's single most important
8
+ * field; without it, future maintainers can't reason about whether a
9
+ * regression is meaningful or whether the slot has lost value.
10
+ *
11
+ * Validation that canary IDs map to real tasks lives in
12
+ * `scripts/check-canary-tasks.ts` (run by `pnpm check`); it can't live in
13
+ * Zod because the inventory comes from the repo task glob, not the schema.
14
+ *
15
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
16
+ * @see packages/eval/config/canary-tasks.ts — authored config
17
+ */
18
+ import { z } from "zod";
19
+ export declare const CanaryTaskEntrySchema: z.ZodObject<{
20
+ taskId: z.ZodString;
21
+ mode: z.ZodEnum<{
22
+ custom: "custom";
23
+ agentic: "agentic";
24
+ literacy: "literacy";
25
+ "mcp-server": "mcp-server";
26
+ "agent-harness": "agent-harness";
27
+ "knowledge-probe": "knowledge-probe";
28
+ baseline: "baseline";
29
+ observed: "observed";
30
+ full: "full";
31
+ }>;
32
+ rationale: z.ZodString;
33
+ }, z.core.$strip>;
34
+ export type CanaryTaskEntry = z.infer<typeof CanaryTaskEntrySchema>;
35
+ export declare const CanaryTaskSetConfigSchema: z.ZodObject<{
36
+ tasks: z.ZodArray<z.ZodObject<{
37
+ taskId: z.ZodString;
38
+ mode: z.ZodEnum<{
39
+ custom: "custom";
40
+ agentic: "agentic";
41
+ literacy: "literacy";
42
+ "mcp-server": "mcp-server";
43
+ "agent-harness": "agent-harness";
44
+ "knowledge-probe": "knowledge-probe";
45
+ baseline: "baseline";
46
+ observed: "observed";
47
+ full: "full";
48
+ }>;
49
+ rationale: z.ZodString;
50
+ }, z.core.$strip>>;
51
+ }, z.core.$strip>;
52
+ export type CanaryTaskSetConfig = z.infer<typeof CanaryTaskSetConfigSchema>;
@@ -0,0 +1,46 @@
1
+ /**
2
+ * @sanity/ailf-core — Canary task-set schemas.
3
+ *
4
+ * The canary task set is the curated subset of evaluation tasks the Tier 3
5
+ * nightly workflow runs against live LLMs. Each entry pins a `taskId` and
6
+ * `mode` together with a one-paragraph rationale documenting why the task
7
+ * earned a slot — the rationale is the canary set's single most important
8
+ * field; without it, future maintainers can't reason about whether a
9
+ * regression is meaningful or whether the slot has lost value.
10
+ *
11
+ * Validation that canary IDs map to real tasks lives in
12
+ * `scripts/check-canary-tasks.ts` (run by `pnpm check`); it can't live in
13
+ * Zod because the inventory comes from the repo task glob, not the schema.
14
+ *
15
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
16
+ * @see packages/eval/config/canary-tasks.ts — authored config
17
+ */
18
+ import { z } from "zod";
19
+ import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
20
+ // ---------------------------------------------------------------------------
21
+ // Canary entry
22
+ // ---------------------------------------------------------------------------
23
+ export const CanaryTaskEntrySchema = z.object({ // one canary slot: task id, eval mode, and the reason the slot exists
24
+ taskId: z.string().min(1, "canary entries must have a non-empty taskId"),
25
+ mode: z.enum(RAW_EVAL_MODES),
26
+ rationale: z
27
+ .string()
28
+ .min(40, "canary rationale must be at least one informative sentence"), // a 40-character floor stands in for "one informative sentence"
29
+ });
30
+ // ---------------------------------------------------------------------------
31
+ // Canary set config
32
+ // ---------------------------------------------------------------------------
33
+ export const CanaryTaskSetConfigSchema = z
34
+ .object({
35
+ tasks: z.array(CanaryTaskEntrySchema), // NOTE(review): no minimum length, so an empty canary set passes this schema — confirm that is intended
36
+ })
37
+ .refine((config) => { // reject the set when any (mode, taskId) pair appears more than once
38
+ const seen = new Set();
39
+ for (const entry of config.tasks) {
40
+ const key = `${entry.mode}:${entry.taskId}`; // the same taskId may appear under different modes; only the pair must be unique
41
+ if (seen.has(key))
42
+ return false; // the first repeat fails the whole set; the message below does not name the repeated entry
43
+ seen.add(key);
44
+ }
45
+ return true;
46
+ }, { message: "duplicate (mode, taskId) entries in canary set" });
@@ -9,8 +9,10 @@
9
9
  * (Phase 0d). Original files are now re-export barrels.
10
10
  */
11
11
  export * from "./callback-payload.js";
12
+ export * from "./canary-tasks.js";
12
13
  export * from "./eval-config.js";
13
14
  export * from "./pipeline-request.js";
14
15
  export * from "./pipeline.js";
15
16
  export * from "./schedules.js";
16
17
  export * from "./sinks.js";
18
+ export * from "./test-budgets.js";
@@ -9,8 +9,10 @@
9
9
  * (Phase 0d). Original files are now re-export barrels.
10
10
  */
11
11
  export * from "./callback-payload.js";
12
+ export * from "./canary-tasks.js";
12
13
  export * from "./eval-config.js";
13
14
  export * from "./pipeline-request.js";
14
15
  export * from "./pipeline.js";
15
16
  export * from "./schedules.js";
16
17
  export * from "./sinks.js";
18
+ export * from "./test-budgets.js";
@@ -0,0 +1,19 @@
1
+ /**
2
+ * @sanity/ailf-core — Test-budget schemas.
3
+ *
4
+ * Per-provider daily USD spend caps for Tier 3 (live-LLM) CI workflows.
5
+ * The cap is the maximum cost a single Tier 3 nightly run may incur for
6
+ * a given provider; the workflow fails loudly if any provider's actual
7
+ * spend exceeds its cap.
8
+ *
9
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
10
+ * @see packages/eval/config/test-budgets.ts — authored config
11
+ */
12
+ import { z } from "zod";
13
+ export declare const ProviderBudgetCapsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
14
+ export type ProviderBudgetCaps = z.infer<typeof ProviderBudgetCapsSchema>;
15
+ export declare const TestBudgetConfigSchema: z.ZodObject<{
16
+ perProviderDaily: z.ZodRecord<z.ZodString, z.ZodNumber>;
17
+ warnFraction: z.ZodDefault<z.ZodNumber>;
18
+ }, z.core.$strip>;
19
+ export type TestBudgetConfig = z.infer<typeof TestBudgetConfigSchema>;
@@ -0,0 +1,34 @@
1
+ /**
2
+ * @sanity/ailf-core — Test-budget schemas.
3
+ *
4
+ * Per-provider daily USD spend caps for Tier 3 (live-LLM) CI workflows.
5
+ * The cap is the maximum cost a single Tier 3 nightly run may incur for
6
+ * a given provider; the workflow fails loudly if any provider's actual
7
+ * spend exceeds its cap.
8
+ *
9
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
10
+ * @see packages/eval/config/test-budgets.ts — authored config
11
+ */
12
+ import { z } from "zod";
13
+ // ---------------------------------------------------------------------------
14
+ // Provider cap entry
15
+ // ---------------------------------------------------------------------------
16
+ const NonNegativeUsd = z
17
+ .number()
18
+ .nonnegative("budget caps must be non-negative USD amounts"); // zero is accepted as a cap; only negative numbers are rejected
19
+ export const ProviderBudgetCapsSchema = z
20
+ .record(z.string().min(1), NonNegativeUsd) // non-empty provider key → USD cap
21
+ .refine((caps) => Object.keys(caps).length > 0, { // an empty record is rejected
22
+ message: "perProviderDaily must declare at least one provider",
23
+ });
24
+ // ---------------------------------------------------------------------------
25
+ // Test budget config
26
+ // ---------------------------------------------------------------------------
27
+ export const TestBudgetConfigSchema = z.object({
28
+ perProviderDaily: ProviderBudgetCapsSchema,
29
+ warnFraction: z
30
+ .number()
31
+ .gt(0) // NOTE(review): the lower bound has no custom message, so only values above 1 report the "(0, 1]" text below
32
+ .lte(1, "warnFraction must be in (0, 1]")
33
+ .default(0.8), // applied when the config omits warnFraction
34
+ });
@@ -0,0 +1,84 @@
1
+ /**
2
+ * canary/drift.ts — Pure drift-statistic computation for the Tier 3
3
+ * framework-tests-framework loop.
4
+ *
5
+ * Consumes the projection shape returned by Studio's `latestReportsQuery`
6
+ * (we accept a slim subset so the function stays a pure-domain dependency
7
+ * with no Studio-package import). Computes per-area Δscore between the
8
+ * most-recent canary run and the trailing-N median, plus an overall
9
+ * Δscore for the run as a whole. Output classifies each delta as `ok`,
10
+ * `warn`, or `regression` against caller-provided thresholds.
11
+ *
12
+ * The function is total — it never throws. Edge cases (empty trailing
13
+ * window, missing scores) surface as `verdict: "no-baseline"` so the
14
+ * caller can decide whether to treat the missing baseline as a fail.
15
+ *
16
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
17
+ * @see packages/studio/src/queries.ts — `latestReportsQuery`
18
+ */
19
+ /** Slim projection of a canary run report — subset of `ReportListItem`. */
20
+ export interface CanaryReportSlim {
21
+ reportId: string;
22
+ completedAt: string;
23
+ overall: number;
24
+ scores: {
25
+ feature: string;
26
+ totalScore: number;
27
+ }[];
28
+ }
29
+ /** Verdict for a single Δ computation. */
30
+ export type DriftVerdict = "ok" | "warn" | "regression" | "no-baseline";
31
+ /** Δ between the most-recent run and the trailing-N median. */
32
+ export interface DriftEntry {
33
+ /** "overall" for the run-level avg, or the area slug for a per-area Δ. */
34
+ feature: string;
35
+ /** Score in the most-recent run. */
36
+ current: number;
37
+ /** Median of trailing-N runs (excluding the most-recent). Null when no baseline. */
38
+ trailingMedian: number | null;
39
+ /** current − trailingMedian. Null when no baseline. */
40
+ delta: number | null;
41
+ verdict: DriftVerdict;
42
+ }
43
+ /** Tunable thresholds — caller decides what counts as warn vs regression. */
44
+ export interface DriftThresholds {
45
+ /**
46
+ * How many prior runs (excluding the most-recent) form the trailing
47
+ * baseline. Sensible defaults sit between 5 and 10 for a daily canary.
48
+ */
49
+ trailingN: number;
50
+ /** Drop ≥ this magnitude (and < failDelta) → `warn`. */
51
+ warnDelta: number;
52
+ /** Drop ≥ this magnitude → `regression`. */
53
+ failDelta: number;
54
+ /**
55
+ * Minimum trailing-window size required to compute a delta. When the
56
+ * window is smaller, the entry's verdict is `no-baseline`. Defaults to
57
+ * 1 — a single prior run is enough to detect *some* movement.
58
+ */
59
+ minBaselineRuns?: number;
60
+ }
61
+ /** Aggregate result of `computeCanaryDrift`. */
62
+ export interface CanaryDriftReport {
63
+ /** ID + timestamp of the most-recent run. */
64
+ reportId: string;
65
+ completedAt: string;
66
+ /** Run-level Δscore (overall avg). */
67
+ overall: DriftEntry;
68
+ /** Per-area Δscores. One entry per area present in the most-recent run. */
69
+ byArea: DriftEntry[];
70
+ /** True when any verdict is `regression`. */
71
+ hasRegression: boolean;
72
+ /** True when any verdict is `warn` or `regression`. */
73
+ hasMovement: boolean;
74
+ }
75
+ /**
76
+ * Compute per-area + overall drift for a sequence of canary runs.
77
+ *
78
+ * `reports` must be ordered **newest-first** (matching `latestReportsQuery`'s
79
+ * `order(completedAt desc)`). The most-recent run is `reports[0]`; the
80
+ * trailing window is `reports.slice(1, 1 + trailingN)`.
81
+ *
82
+ * @throws never — all error states surface as `no-baseline` verdicts. NOTE(review): the declared return type also admits `null`, and this comment does not say when that happens (presumably for an empty `reports` array) — confirm against the implementation.
83
+ */
84
+ export declare function computeCanaryDrift(reports: CanaryReportSlim[], thresholds: DriftThresholds): CanaryDriftReport | null;