@sanity/ailf 3.8.1 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/canary-tasks.ts +64 -0
- package/config/test-budgets.ts +24 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +19 -0
- package/dist/_vendor/ailf-core/config-helpers.js +27 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
- package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
- package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
- package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
- package/dist/_vendor/ailf-shared/index.d.ts +1 -0
- package/dist/_vendor/ailf-shared/index.js +1 -0
- package/dist/config/canary-tasks.ts +64 -0
- package/dist/config/test-budgets.ts +24 -0
- package/dist/pipeline/calculate-scores.d.ts +17 -2
- package/dist/pipeline/calculate-scores.js +99 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/package.json +2 -1
- package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary-tasks.ts — The Tier 3 canary set.
|
|
3
|
+
*
|
|
4
|
+
* Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
|
|
5
|
+
* Composition follows the design doc's "weighted toward modes/areas with
|
|
6
|
+
* the most production usage and the highest historical regression rates"
|
|
7
|
+
* recommendation: GROQ and Content Lake (foundational consumer surfaces),
|
|
8
|
+
* Portable Text (historically drift-prone), Studio schema authoring (the
|
|
9
|
+
* second-most-used surface after queries), and a knowledge-probe pairing
|
|
10
|
+
* for cross-mode coverage.
|
|
11
|
+
*
|
|
12
|
+
* Each entry's `rationale` is the canary's load-bearing field — without it,
|
|
13
|
+
* future maintainers can't reason about whether a regression is meaningful
|
|
14
|
+
* or whether the slot has lost value. Update the rationale when you swap a
|
|
15
|
+
* canary entry; never silently replace one.
|
|
16
|
+
*
|
|
17
|
+
* Validated against the live task inventory by `scripts/check-canary-tasks.ts`
|
|
18
|
+
* (`pnpm check`). Dangling task IDs fail the build.
|
|
19
|
+
*
|
|
20
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
21
|
+
* @see .github/workflows/tier-3-nightly.yml — consumer
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { defineCanaryTasks } from "@sanity/ailf-core"
|
|
25
|
+
|
|
26
|
+
export default defineCanaryTasks({
|
|
27
|
+
tasks: [
|
|
28
|
+
{
|
|
29
|
+
taskId: "groq-blog-queries",
|
|
30
|
+
mode: "literacy",
|
|
31
|
+
rationale:
|
|
32
|
+
"Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
taskId: "content-lake-mutations",
|
|
36
|
+
mode: "literacy",
|
|
37
|
+
rationale:
|
|
38
|
+
"Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
taskId: "portable-text-rendering",
|
|
42
|
+
mode: "literacy",
|
|
43
|
+
rationale:
|
|
44
|
+
"Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
taskId: "studio-blog-schema",
|
|
48
|
+
mode: "literacy",
|
|
49
|
+
rationale:
|
|
50
|
+
"Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
taskId: "kp-groq-projections",
|
|
54
|
+
mode: "knowledge-probe",
|
|
55
|
+
rationale:
|
|
56
|
+
"Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
|
|
57
|
+
},
|
|
58
|
+
// mcp-server canary slot — add a third mode here when a committed
|
|
59
|
+
// mcp-server task lands under packages/eval/tasks/mcp-server/. Today
|
|
60
|
+
// there are no production mcp-server tasks (only fixtures); the trigger
|
|
61
|
+
// is upstream and adding a placeholder slot would dangle. Surfaced at
|
|
62
|
+
// Phase 5 close (2026-04-27) — see W0116 retrospective.
|
|
63
|
+
],
|
|
64
|
+
})
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
|
|
3
|
+
*
|
|
4
|
+
* Each cap is the maximum cost a single Tier 3 nightly run may incur for
|
|
5
|
+
* that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
|
|
6
|
+
* fails loudly if any provider's actual spend exceeds its cap.
|
|
7
|
+
*
|
|
8
|
+
* The design doc names a $30–60/day envelope across all providers. Caps
|
|
9
|
+
* here divide that envelope per-provider; tighten as baseline canary spend
|
|
10
|
+
* becomes measurable.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
13
|
+
* @see scripts/tier-3-budget-check.mjs — enforcement
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { defineTestBudgets } from "@sanity/ailf-core"
|
|
17
|
+
|
|
18
|
+
export default defineTestBudgets({
|
|
19
|
+
perProviderDaily: {
|
|
20
|
+
anthropic: 30,
|
|
21
|
+
openai: 30,
|
|
22
|
+
},
|
|
23
|
+
warnFraction: 0.8,
|
|
24
|
+
})
|
|
@@ -26,9 +26,11 @@
|
|
|
26
26
|
* @see docs/design-docs/architecture-overhaul/typescript-configuration.md (canonical)
|
|
27
27
|
*/
|
|
28
28
|
import type { EvalConfig } from "./schemas/eval-config.js";
|
|
29
|
+
import type { CanaryTaskSetConfig } from "./schemas/canary-tasks.js";
|
|
29
30
|
import type { FeatureRegistry, RubricConfig, ThresholdConfig } from "./schemas/pipeline.js";
|
|
30
31
|
import type { SchedulesFile } from "./schemas/schedules.js";
|
|
31
32
|
import type { SinksFile } from "./schemas/sinks.js";
|
|
33
|
+
import type { TestBudgetConfig } from "./schemas/test-budgets.js";
|
|
32
34
|
import type { ModelsConfig } from "./types/index.js";
|
|
33
35
|
import type { GeneralizedTaskDefinition } from "./types/generalized-task.js";
|
|
34
36
|
import type { ModeBase, PresetDefinition } from "./types/plugin-registry.js";
|
|
@@ -89,6 +91,23 @@ export declare function defineSinks(sinks: SinksFile): SinksFile;
|
|
|
89
91
|
* Used in `config/schedules.ts` for typed schedule configuration.
|
|
90
92
|
*/
|
|
91
93
|
export declare function defineSchedules(schedules: SchedulesFile): SchedulesFile;
|
|
94
|
+
/**
|
|
95
|
+
* Define per-provider daily USD spend caps for Tier 3 (live-LLM) CI runs.
|
|
96
|
+
*
|
|
97
|
+
* Used in `config/test-budgets.ts` for typed budget configuration.
|
|
98
|
+
*
|
|
99
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
100
|
+
*/
|
|
101
|
+
export declare function defineTestBudgets(budgets: TestBudgetConfig): TestBudgetConfig;
|
|
102
|
+
/**
|
|
103
|
+
* Define the curated canary task set for the Tier 3 nightly workflow.
|
|
104
|
+
*
|
|
105
|
+
* Used in `config/canary-tasks.ts`. Validation against the live task
|
|
106
|
+
* inventory happens in `scripts/check-canary-tasks.ts`, not here.
|
|
107
|
+
*
|
|
108
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
109
|
+
*/
|
|
110
|
+
export declare function defineCanaryTasks(canary: CanaryTaskSetConfig): CanaryTaskSetConfig;
|
|
92
111
|
/**
|
|
93
112
|
* Source configuration — typed inline until a dedicated schema exists.
|
|
94
113
|
*
|
|
@@ -145,6 +145,33 @@ export function defineSinks(sinks) {
|
|
|
145
145
|
export function defineSchedules(schedules) {
|
|
146
146
|
return schedules;
|
|
147
147
|
}
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Test-budget helpers
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
/**
|
|
152
|
+
* Define per-provider daily USD spend caps for Tier 3 (live-LLM) CI runs.
|
|
153
|
+
*
|
|
154
|
+
* Used in `config/test-budgets.ts` for typed budget configuration.
|
|
155
|
+
*
|
|
156
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
157
|
+
*
* Identity helper: returns the `budgets` argument unchanged and performs no
* validation of its own.
*
* @param budgets - Budget configuration authored in the config file.
* @returns The same `budgets` object that was passed in.
*/
|
|
158
|
+
export function defineTestBudgets(budgets) {
|
|
159
|
+
return budgets;
|
|
160
|
+
}
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
// Canary task-set helpers
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
/**
|
|
165
|
+
* Define the curated canary task set for the Tier 3 nightly workflow.
|
|
166
|
+
*
|
|
167
|
+
* Used in `config/canary-tasks.ts`. Validation against the live task
|
|
168
|
+
* inventory happens in `scripts/check-canary-tasks.ts`, not here.
|
|
169
|
+
*
|
|
170
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
171
|
+
*
* Identity helper: returns the `canary` argument unchanged; no schema
* parsing or duplicate checking is performed by this function.
*
* @param canary - Canary task-set configuration authored in the config file.
* @returns The same `canary` object that was passed in.
*/
|
|
172
|
+
export function defineCanaryTasks(canary) {
|
|
173
|
+
return canary;
|
|
174
|
+
}
|
|
148
175
|
/**
|
|
149
176
|
* Define documentation source configurations.
|
|
150
177
|
*
|
|
@@ -17,7 +17,7 @@ export * from "./services/index.js";
|
|
|
17
17
|
export * from "./examples/index.js";
|
|
18
18
|
export * from "./artifact-registry.js";
|
|
19
19
|
export * from "./batch-signing.js";
|
|
20
|
-
export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
|
|
20
|
+
export { defineCanaryTasks, defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineTestBudgets, defineThresholds, } from "./config-helpers.js";
|
|
21
21
|
export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
|
|
22
22
|
export { env } from "./env-helper.js";
|
|
23
23
|
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
@@ -20,7 +20,7 @@ export * from "./batch-signing.js";
|
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
21
|
// Architecture overhaul — Phase 0 helpers
|
|
22
22
|
// ---------------------------------------------------------------------------
|
|
23
|
-
export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
|
|
23
|
+
export { defineCanaryTasks, defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineTestBudgets, defineThresholds, } from "./config-helpers.js";
|
|
24
24
|
export { env } from "./env-helper.js";
|
|
25
25
|
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
26
26
|
export { assoc, resolveVariantMode, splitTaskVariant, } from "./artifact-capture/association.js";
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Canary task-set schemas.
|
|
3
|
+
*
|
|
4
|
+
* The canary task set is the curated subset of evaluation tasks the Tier 3
|
|
5
|
+
* nightly workflow runs against live LLMs. Each entry pins a `taskId` and
|
|
6
|
+
* `mode` together with a one-paragraph rationale documenting why the task
|
|
7
|
+
* earned a slot — the rationale is the canary set's single most important
|
|
8
|
+
* field; without it, future maintainers can't reason about whether a
|
|
9
|
+
* regression is meaningful or whether the slot has lost value.
|
|
10
|
+
*
|
|
11
|
+
* Validation that canary IDs map to real tasks lives in
|
|
12
|
+
* `scripts/check-canary-tasks.ts` (run by `pnpm check`); it can't live in
|
|
13
|
+
* Zod because the inventory comes from the repo task glob, not the schema.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
16
|
+
* @see packages/eval/config/canary-tasks.ts — authored config
|
|
17
|
+
*/
|
|
18
|
+
import { z } from "zod";
|
|
19
|
+
export declare const CanaryTaskEntrySchema: z.ZodObject<{
|
|
20
|
+
taskId: z.ZodString;
|
|
21
|
+
mode: z.ZodEnum<{
|
|
22
|
+
custom: "custom";
|
|
23
|
+
agentic: "agentic";
|
|
24
|
+
literacy: "literacy";
|
|
25
|
+
"mcp-server": "mcp-server";
|
|
26
|
+
"agent-harness": "agent-harness";
|
|
27
|
+
"knowledge-probe": "knowledge-probe";
|
|
28
|
+
baseline: "baseline";
|
|
29
|
+
observed: "observed";
|
|
30
|
+
full: "full";
|
|
31
|
+
}>;
|
|
32
|
+
rationale: z.ZodString;
|
|
33
|
+
}, z.core.$strip>;
|
|
34
|
+
export type CanaryTaskEntry = z.infer<typeof CanaryTaskEntrySchema>;
|
|
35
|
+
export declare const CanaryTaskSetConfigSchema: z.ZodObject<{
|
|
36
|
+
tasks: z.ZodArray<z.ZodObject<{
|
|
37
|
+
taskId: z.ZodString;
|
|
38
|
+
mode: z.ZodEnum<{
|
|
39
|
+
custom: "custom";
|
|
40
|
+
agentic: "agentic";
|
|
41
|
+
literacy: "literacy";
|
|
42
|
+
"mcp-server": "mcp-server";
|
|
43
|
+
"agent-harness": "agent-harness";
|
|
44
|
+
"knowledge-probe": "knowledge-probe";
|
|
45
|
+
baseline: "baseline";
|
|
46
|
+
observed: "observed";
|
|
47
|
+
full: "full";
|
|
48
|
+
}>;
|
|
49
|
+
rationale: z.ZodString;
|
|
50
|
+
}, z.core.$strip>>;
|
|
51
|
+
}, z.core.$strip>;
|
|
52
|
+
export type CanaryTaskSetConfig = z.infer<typeof CanaryTaskSetConfigSchema>;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Canary task-set schemas.
|
|
3
|
+
*
|
|
4
|
+
* The canary task set is the curated subset of evaluation tasks the Tier 3
|
|
5
|
+
* nightly workflow runs against live LLMs. Each entry pins a `taskId` and
|
|
6
|
+
* `mode` together with a one-paragraph rationale documenting why the task
|
|
7
|
+
* earned a slot — the rationale is the canary set's single most important
|
|
8
|
+
* field; without it, future maintainers can't reason about whether a
|
|
9
|
+
* regression is meaningful or whether the slot has lost value.
|
|
10
|
+
*
|
|
11
|
+
* Validation that canary IDs map to real tasks lives in
|
|
12
|
+
* `scripts/check-canary-tasks.ts` (run by `pnpm check`); it can't live in
|
|
13
|
+
* Zod because the inventory comes from the repo task glob, not the schema.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
16
|
+
* @see packages/eval/config/canary-tasks.ts — authored config
|
|
17
|
+
*/
|
|
18
|
+
import { z } from "zod";
|
|
19
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Canary entry
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
export const CanaryTaskEntrySchema = z.object({
|
|
24
|
+
taskId: z.string().min(1, "canary entries must have a non-empty taskId"),
|
|
25
|
+
mode: z.enum(RAW_EVAL_MODES),
|
|
26
|
+
rationale: z
|
|
27
|
+
.string()
|
|
28
|
+
.min(40, "canary rationale must be at least one informative sentence"),
|
|
29
|
+
});
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Canary set config
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
/**
 * Schema for the whole canary set: an object holding a `tasks` array of
 * canary entries. The set is rejected when two entries share the same
 * `(mode, taskId)` pair.
 */
export const CanaryTaskSetConfigSchema = z
    .object({ tasks: z.array(CanaryTaskEntrySchema) })
    .refine(({ tasks }) => {
        // One composite key per entry; any collision shrinks the Set.
        const pairKeys = tasks.map(({ mode, taskId }) => `${mode}:${taskId}`);
        return new Set(pairKeys).size === pairKeys.length;
    }, { message: "duplicate (mode, taskId) entries in canary set" });
|
|
@@ -9,8 +9,10 @@
|
|
|
9
9
|
* (Phase 0d). Original files are now re-export barrels.
|
|
10
10
|
*/
|
|
11
11
|
export * from "./callback-payload.js";
|
|
12
|
+
export * from "./canary-tasks.js";
|
|
12
13
|
export * from "./eval-config.js";
|
|
13
14
|
export * from "./pipeline-request.js";
|
|
14
15
|
export * from "./pipeline.js";
|
|
15
16
|
export * from "./schedules.js";
|
|
16
17
|
export * from "./sinks.js";
|
|
18
|
+
export * from "./test-budgets.js";
|
|
@@ -9,8 +9,10 @@
|
|
|
9
9
|
* (Phase 0d). Original files are now re-export barrels.
|
|
10
10
|
*/
|
|
11
11
|
export * from "./callback-payload.js";
|
|
12
|
+
export * from "./canary-tasks.js";
|
|
12
13
|
export * from "./eval-config.js";
|
|
13
14
|
export * from "./pipeline-request.js";
|
|
14
15
|
export * from "./pipeline.js";
|
|
15
16
|
export * from "./schedules.js";
|
|
16
17
|
export * from "./sinks.js";
|
|
18
|
+
export * from "./test-budgets.js";
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Test-budget schemas.
|
|
3
|
+
*
|
|
4
|
+
* Per-provider daily USD spend caps for Tier 3 (live-LLM) CI workflows.
|
|
5
|
+
* The cap is the maximum cost a single Tier 3 nightly run may incur for
|
|
6
|
+
* a given provider; the workflow fails loudly if any provider's actual
|
|
7
|
+
* spend exceeds its cap.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
10
|
+
* @see packages/eval/config/test-budgets.ts — authored config
|
|
11
|
+
*/
|
|
12
|
+
import { z } from "zod";
|
|
13
|
+
export declare const ProviderBudgetCapsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
14
|
+
export type ProviderBudgetCaps = z.infer<typeof ProviderBudgetCapsSchema>;
|
|
15
|
+
export declare const TestBudgetConfigSchema: z.ZodObject<{
|
|
16
|
+
perProviderDaily: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
17
|
+
warnFraction: z.ZodDefault<z.ZodNumber>;
|
|
18
|
+
}, z.core.$strip>;
|
|
19
|
+
export type TestBudgetConfig = z.infer<typeof TestBudgetConfigSchema>;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Test-budget schemas.
|
|
3
|
+
*
|
|
4
|
+
* Per-provider daily USD spend caps for Tier 3 (live-LLM) CI workflows.
|
|
5
|
+
* The cap is the maximum cost a single Tier 3 nightly run may incur for
|
|
6
|
+
* a given provider; the workflow fails loudly if any provider's actual
|
|
7
|
+
* spend exceeds its cap.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
10
|
+
* @see packages/eval/config/test-budgets.ts — authored config
|
|
11
|
+
*/
|
|
12
|
+
import { z } from "zod";
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Provider cap entry
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
const NonNegativeUsd = z
|
|
17
|
+
.number()
|
|
18
|
+
.nonnegative("budget caps must be non-negative USD amounts");
|
|
19
|
+
export const ProviderBudgetCapsSchema = z
|
|
20
|
+
.record(z.string().min(1), NonNegativeUsd)
|
|
21
|
+
.refine((caps) => Object.keys(caps).length > 0, {
|
|
22
|
+
message: "perProviderDaily must declare at least one provider",
|
|
23
|
+
});
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Test budget config
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
/**
 * Schema for the Tier 3 test-budget configuration.
 *
 * - `perProviderDaily`: provider-name → USD cap map, validated by
 *   `ProviderBudgetCapsSchema`.
 * - `warnFraction`: a number in the half-open interval (0, 1]; defaults to
 *   0.8 when omitted. NOTE(review): presumably the fraction of a cap at which
 *   the budget check starts warning — confirm against the enforcement script.
 *
 * Both bounds of `warnFraction` report the same message so an out-of-range
 * value always tells the author the accepted interval.
 */
export const TestBudgetConfigSchema = z.object({
    perProviderDaily: ProviderBudgetCapsSchema,
    warnFraction: z
        .number()
        .gt(0, "warnFraction must be in (0, 1]")
        .lte(1, "warnFraction must be in (0, 1]")
        .default(0.8),
});
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary/drift.ts — Pure drift-statistic computation for the Tier 3
|
|
3
|
+
* framework-tests-framework loop.
|
|
4
|
+
*
|
|
5
|
+
* Consumes the projection shape returned by Studio's `latestReportsQuery`
|
|
6
|
+
* (we accept a slim subset so the function stays a pure-domain dependency
|
|
7
|
+
* with no Studio-package import). Computes per-area Δscore between the
|
|
8
|
+
* most-recent canary run and the trailing-N median, plus an overall
|
|
9
|
+
* Δscore for the run as a whole. Output classifies each delta as `ok`,
|
|
10
|
+
* `warn`, or `regression` against caller-provided thresholds.
|
|
11
|
+
*
|
|
12
|
+
* The function is total — it never throws. Edge cases (empty trailing
|
|
13
|
+
* window, missing scores) surface as `verdict: "no-baseline"` so the
|
|
14
|
+
* caller can decide whether to treat the missing baseline as a fail.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
17
|
+
* @see packages/studio/src/queries.ts — `latestReportsQuery`
|
|
18
|
+
*/
|
|
19
|
+
/** Slim projection of a canary run report — subset of `ReportListItem`. */
|
|
20
|
+
export interface CanaryReportSlim {
|
|
21
|
+
reportId: string;
|
|
22
|
+
completedAt: string;
|
|
23
|
+
overall: number;
|
|
24
|
+
scores: {
|
|
25
|
+
feature: string;
|
|
26
|
+
totalScore: number;
|
|
27
|
+
}[];
|
|
28
|
+
}
|
|
29
|
+
/** Verdict for a single Δ computation. */
|
|
30
|
+
export type DriftVerdict = "ok" | "warn" | "regression" | "no-baseline";
|
|
31
|
+
/** Δ between the most-recent run and the trailing-N median. */
|
|
32
|
+
export interface DriftEntry {
|
|
33
|
+
/** "overall" for the run-level avg, or the area slug for a per-area Δ. */
|
|
34
|
+
feature: string;
|
|
35
|
+
/** Score in the most-recent run. */
|
|
36
|
+
current: number;
|
|
37
|
+
/** Median of trailing-N runs (excluding the most-recent). Null when no baseline. */
|
|
38
|
+
trailingMedian: number | null;
|
|
39
|
+
/** current − trailingMedian. Null when no baseline. */
|
|
40
|
+
delta: number | null;
|
|
41
|
+
verdict: DriftVerdict;
|
|
42
|
+
}
|
|
43
|
+
/** Tunable thresholds — caller decides what counts as warn vs regression. */
|
|
44
|
+
export interface DriftThresholds {
|
|
45
|
+
/**
|
|
46
|
+
* How many prior runs (excluding the most-recent) form the trailing
|
|
47
|
+
* baseline. Sensible defaults sit between 5 and 10 for a daily canary.
|
|
48
|
+
*/
|
|
49
|
+
trailingN: number;
|
|
50
|
+
/** Drop ≥ this magnitude (and < failDelta) → `warn`. */
|
|
51
|
+
warnDelta: number;
|
|
52
|
+
/** Drop ≥ this magnitude → `regression`. */
|
|
53
|
+
failDelta: number;
|
|
54
|
+
/**
|
|
55
|
+
* Minimum trailing-window size required to compute a delta. When the
|
|
56
|
+
* window is smaller, the entry's verdict is `no-baseline`. Defaults to
|
|
57
|
+
* 1 — a single prior run is enough to detect *some* movement.
|
|
58
|
+
*/
|
|
59
|
+
minBaselineRuns?: number;
|
|
60
|
+
}
|
|
61
|
+
/** Aggregate result of `computeCanaryDrift`. */
|
|
62
|
+
export interface CanaryDriftReport {
|
|
63
|
+
/** ID + timestamp of the most-recent run. */
|
|
64
|
+
reportId: string;
|
|
65
|
+
completedAt: string;
|
|
66
|
+
/** Run-level Δscore (overall avg). */
|
|
67
|
+
overall: DriftEntry;
|
|
68
|
+
/** Per-area Δscores. One entry per area present in the most-recent run. */
|
|
69
|
+
byArea: DriftEntry[];
|
|
70
|
+
/** True when any verdict is `regression`. */
|
|
71
|
+
hasRegression: boolean;
|
|
72
|
+
/** True when any verdict is `warn` or `regression`. */
|
|
73
|
+
hasMovement: boolean;
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Compute per-area + overall drift for a sequence of canary runs.
|
|
77
|
+
*
|
|
78
|
+
* `reports` must be ordered **newest-first** (matching `latestReportsQuery`'s
|
|
79
|
+
* `order(completedAt desc)`). The most-recent run is `reports[0]`; the
|
|
80
|
+
* trailing window is `reports.slice(1, 1 + trailingN)`.
|
|
81
|
+
*
|
|
82
|
+
* @throws never — all error states surface as `no-baseline` verdicts. Returns `null` when `reports` is empty.
|
|
83
|
+
*/
|
|
84
|
+
export declare function computeCanaryDrift(reports: CanaryReportSlim[], thresholds: DriftThresholds): CanaryDriftReport | null;
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary/drift.ts — Pure drift-statistic computation for the Tier 3
|
|
3
|
+
* framework-tests-framework loop.
|
|
4
|
+
*
|
|
5
|
+
* Consumes the projection shape returned by Studio's `latestReportsQuery`
|
|
6
|
+
* (we accept a slim subset so the function stays a pure-domain dependency
|
|
7
|
+
* with no Studio-package import). Computes per-area Δscore between the
|
|
8
|
+
* most-recent canary run and the trailing-N median, plus an overall
|
|
9
|
+
* Δscore for the run as a whole. Output classifies each delta as `ok`,
|
|
10
|
+
* `warn`, or `regression` against caller-provided thresholds.
|
|
11
|
+
*
|
|
12
|
+
* The function is total — it never throws. Edge cases (empty trailing
|
|
13
|
+
* window, missing scores) surface as `verdict: "no-baseline"` so the
|
|
14
|
+
* caller can decide whether to treat the missing baseline as a fail.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
17
|
+
* @see packages/studio/src/queries.ts — `latestReportsQuery`
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* Compute per-area + overall drift for a sequence of canary runs.
|
|
21
|
+
*
|
|
22
|
+
* `reports` must be ordered **newest-first** (matching `latestReportsQuery`'s
|
|
23
|
+
* `order(completedAt desc)`). The most-recent run is `reports[0]`; the
|
|
24
|
+
* trailing window is `reports.slice(1, 1 + trailingN)`.
|
|
25
|
+
*
|
|
26
|
+
* @throws never — all error states surface as `no-baseline` verdicts.
|
|
27
|
+
*/
|
|
28
|
+
export function computeCanaryDrift(reports, thresholds) {
|
|
29
|
+
if (reports.length === 0)
|
|
30
|
+
return null;
|
|
31
|
+
const minBaseline = thresholds.minBaselineRuns ?? 1;
|
|
32
|
+
const current = reports[0];
|
|
33
|
+
const trailing = reports.slice(1, 1 + thresholds.trailingN);
|
|
34
|
+
const overall = scoreDrift("overall", current.overall, trailing.map((r) => r.overall), thresholds, minBaseline);
|
|
35
|
+
const byArea = [];
|
|
36
|
+
for (const score of current.scores) {
|
|
37
|
+
const trailingArea = [];
|
|
38
|
+
for (const t of trailing) {
|
|
39
|
+
const match = t.scores.find((s) => s.feature === score.feature);
|
|
40
|
+
if (match)
|
|
41
|
+
trailingArea.push(match.totalScore);
|
|
42
|
+
}
|
|
43
|
+
byArea.push(scoreDrift(score.feature, score.totalScore, trailingArea, thresholds, minBaseline));
|
|
44
|
+
}
|
|
45
|
+
const hasRegression = overall.verdict === "regression" ||
|
|
46
|
+
byArea.some((e) => e.verdict === "regression");
|
|
47
|
+
const hasMovement = hasRegression ||
|
|
48
|
+
overall.verdict === "warn" ||
|
|
49
|
+
byArea.some((e) => e.verdict === "warn");
|
|
50
|
+
return {
|
|
51
|
+
reportId: current.reportId,
|
|
52
|
+
completedAt: current.completedAt,
|
|
53
|
+
overall,
|
|
54
|
+
byArea,
|
|
55
|
+
hasRegression,
|
|
56
|
+
hasMovement,
|
|
57
|
+
};
|
|
58
|
+
}
|
|
59
|
+
function scoreDrift(feature, current, trailing, thresholds, minBaseline) {
|
|
60
|
+
if (trailing.length < minBaseline) {
|
|
61
|
+
return {
|
|
62
|
+
feature,
|
|
63
|
+
current,
|
|
64
|
+
trailingMedian: null,
|
|
65
|
+
delta: null,
|
|
66
|
+
verdict: "no-baseline",
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
const trailingMedian = median(trailing);
|
|
70
|
+
const delta = current - trailingMedian;
|
|
71
|
+
const drop = -delta;
|
|
72
|
+
let verdict = "ok";
|
|
73
|
+
if (drop >= thresholds.failDelta)
|
|
74
|
+
verdict = "regression";
|
|
75
|
+
else if (drop >= thresholds.warnDelta)
|
|
76
|
+
verdict = "warn";
|
|
77
|
+
return { feature, current, trailingMedian, delta, verdict };
|
|
78
|
+
}
|
|
79
|
+
function median(values) {
|
|
80
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
81
|
+
const mid = Math.floor(sorted.length / 2);
|
|
82
|
+
if (sorted.length % 2 === 0) {
|
|
83
|
+
return (sorted[mid - 1] + sorted[mid]) / 2;
|
|
84
|
+
}
|
|
85
|
+
return sorted[mid];
|
|
86
|
+
}
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
11
|
*/
|
|
12
|
+
export * from "./canary-drift.js";
|
|
12
13
|
export * from "./document-ref.js";
|
|
13
14
|
export * from "./feature-flags.js";
|
|
14
15
|
export * from "./score-grades.js";
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
11
|
*/
|
|
12
|
+
export * from "./canary-drift.js";
|
|
12
13
|
export * from "./document-ref.js";
|
|
13
14
|
export * from "./feature-flags.js";
|
|
14
15
|
export * from "./score-grades.js";
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary-tasks.ts — The Tier 3 canary set.
|
|
3
|
+
*
|
|
4
|
+
* Five tasks the Tier 3 nightly workflow runs against live LLMs every day.
|
|
5
|
+
* Composition follows the design doc's "weighted toward modes/areas with
|
|
6
|
+
* the most production usage and the highest historical regression rates"
|
|
7
|
+
* recommendation: GROQ and Content Lake (foundational consumer surfaces),
|
|
8
|
+
* Portable Text (historically drift-prone), Studio schema authoring (the
|
|
9
|
+
* second-most-used surface after queries), and a knowledge-probe pairing
|
|
10
|
+
* for cross-mode coverage.
|
|
11
|
+
*
|
|
12
|
+
* Each entry's `rationale` is the canary's load-bearing field — without it,
|
|
13
|
+
* future maintainers can't reason about whether a regression is meaningful
|
|
14
|
+
* or whether the slot has lost value. Update the rationale when you swap a
|
|
15
|
+
* canary entry; never silently replace one.
|
|
16
|
+
*
|
|
17
|
+
* Validated against the live task inventory by `scripts/check-canary-tasks.ts`
|
|
18
|
+
* (`pnpm check`). Dangling task IDs fail the build.
|
|
19
|
+
*
|
|
20
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
21
|
+
* @see .github/workflows/tier-3-nightly.yml — consumer
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { defineCanaryTasks } from "../_vendor/ailf-core/index.js"
|
|
25
|
+
|
|
26
|
+
export default defineCanaryTasks({
|
|
27
|
+
tasks: [
|
|
28
|
+
{
|
|
29
|
+
taskId: "groq-blog-queries",
|
|
30
|
+
mode: "literacy",
|
|
31
|
+
rationale:
|
|
32
|
+
"Canonical first-use path for Sanity's most-used API. GROQ is the largest doc surface and the highest-leverage canary slot — drift here means drift in the most-consumed documentation. Filtering and pagination together exercise the largest cross-section of GROQ syntax in a single task.",
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
taskId: "content-lake-mutations",
|
|
36
|
+
mode: "literacy",
|
|
37
|
+
rationale:
|
|
38
|
+
"Foundational client API. CRUD is structurally distinct from query reasoning, so this catches regressions in mutation/transaction documentation that GROQ canary slots cannot reach. Every Sanity consumer eventually writes to the Content Lake.",
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
taskId: "portable-text-rendering",
|
|
42
|
+
mode: "literacy",
|
|
43
|
+
rationale:
|
|
44
|
+
"Major doc surface flagged as historically drift-prone in the testing audit. React-rendering of Portable Text mixes documentation, type definitions, and worked examples — a regression on any axis surfaces here first.",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
taskId: "studio-blog-schema",
|
|
48
|
+
mode: "literacy",
|
|
49
|
+
rationale:
|
|
50
|
+
"Schema authoring (`defineType` / `defineField`) is the second-most-used surface after queries. Tests structural Studio docs that change shape across versions; pairs naturally with the GROQ canary because consumers typically author schemas before querying them.",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
taskId: "kp-groq-projections",
|
|
54
|
+
mode: "knowledge-probe",
|
|
55
|
+
rationale:
|
|
56
|
+
"Cross-mode coverage. Pairs with `groq-blog-queries` (literacy) so we catch GROQ drift in both implementation (write code) and recall (explain syntax) modes. Knowledge-probe is the only non-literacy mode in the canary today; expand once mcp-server tasks land in the repo.",
|
|
57
|
+
},
|
|
58
|
+
// mcp-server canary slot — add a third mode here when a committed
|
|
59
|
+
// mcp-server task lands under packages/eval/tasks/mcp-server/. Today
|
|
60
|
+
// there are no production mcp-server tasks (only fixtures); the trigger
|
|
61
|
+
// is upstream and adding a placeholder slot would dangle. Surfaced at
|
|
62
|
+
// Phase 5 close (2026-04-27) — see W0116 retrospective.
|
|
63
|
+
],
|
|
64
|
+
})
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* test-budgets.ts — Per-provider daily USD spend caps for Tier 3 CI runs.
|
|
3
|
+
*
|
|
4
|
+
* Each cap is the maximum cost a single Tier 3 nightly run may incur for
|
|
5
|
+
* that provider. The Tier 3 workflow (`.github/workflows/tier-3-nightly.yml`)
|
|
6
|
+
* fails loudly if any provider's actual spend exceeds its cap.
|
|
7
|
+
*
|
|
8
|
+
* The design doc names a $30–60/day envelope across all providers. Caps
|
|
9
|
+
* here divide that envelope per-provider; tighten as baseline canary spend
|
|
10
|
+
* becomes measurable.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
13
|
+
* @see scripts/tier-3-budget-check.mjs — enforcement
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { defineTestBudgets } from "../_vendor/ailf-core/index.js"
|
|
17
|
+
|
|
18
|
+
export default defineTestBudgets({
|
|
19
|
+
perProviderDaily: {
|
|
20
|
+
anthropic: 30,
|
|
21
|
+
openai: 30,
|
|
22
|
+
},
|
|
23
|
+
warnFraction: 0.8,
|
|
24
|
+
})
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
|
-
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
3
|
+
import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
4
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
5
5
|
export interface PromptfooResultsWrapper {
|
|
6
6
|
results: RawTestResult[];
|
|
@@ -91,6 +91,21 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
|
|
|
91
91
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
92
92
|
*/
|
|
93
93
|
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
94
|
+
/**
|
|
95
|
+
* Score knowledge-probe evaluation results.
|
|
96
|
+
*
|
|
97
|
+
* Knowledge-probe mode evaluates parametric recall: the model has no `docs`
|
|
98
|
+
* var and answers from training-data knowledge alone. The compiler explicitly
|
|
99
|
+
* deletes `vars.docs`, so every result lands in the without-docs bucket of
|
|
100
|
+
* the literacy scoring path — collapsing testCount and ceilingScore to zero.
|
|
101
|
+
*
|
|
102
|
+
* This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
|
|
103
|
+
* feature area (KP results carry `__featureArea` from the compiler), and
|
|
104
|
+
* uses the `knowledge-probe` profile (factual-correctness / completeness /
|
|
105
|
+
* currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
|
|
106
|
+
* docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
|
|
107
|
+
*/
|
|
108
|
+
export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
|
|
94
109
|
/**
|
|
95
110
|
* Score agentic evaluation results. In agentic mode, all test entries are
|
|
96
111
|
* gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
|
|
@@ -719,6 +719,55 @@ function extractTaskId(description) {
|
|
|
719
719
|
return description.trim() || "unknown";
|
|
720
720
|
}
|
|
721
721
|
// ---------------------------------------------------------------------------
|
|
722
|
+
// Knowledge-probe scoring — closed-book recall with no docs context
|
|
723
|
+
// ---------------------------------------------------------------------------
|
|
724
|
+
/**
|
|
725
|
+
* Score knowledge-probe evaluation results.
|
|
726
|
+
*
|
|
727
|
+
* Knowledge-probe mode evaluates parametric recall: the model has no `docs`
|
|
728
|
+
* var and answers from training-data knowledge alone. The compiler explicitly
|
|
729
|
+
* deletes `vars.docs`, so every result lands in the without-docs bucket of
|
|
730
|
+
* the literacy scoring path — collapsing testCount and ceilingScore to zero.
|
|
731
|
+
*
|
|
732
|
+
* This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
|
|
733
|
+
* feature area (KP results carry `__featureArea` from the compiler), and
|
|
734
|
+
* uses the `knowledge-probe` profile (factual-correctness / completeness /
|
|
735
|
+
* currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
|
|
736
|
+
* docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
|
|
737
|
+
*/
|
|
738
|
+
export function scoreKnowledgeProbeResults(results, profile) {
|
|
739
|
+
const byFeature = {};
|
|
740
|
+
for (const result of results) {
|
|
741
|
+
const feature = result.vars.__featureArea || detectFeatureArea(result.description);
|
|
742
|
+
if (!byFeature[feature]) {
|
|
743
|
+
byFeature[feature] = [];
|
|
744
|
+
}
|
|
745
|
+
byFeature[feature].push(result);
|
|
746
|
+
}
|
|
747
|
+
const scores = [];
|
|
748
|
+
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
749
|
+
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
750
|
+
scores.push({
|
|
751
|
+
assertionPassRate: scored.dimensions.assertionPassRate,
|
|
752
|
+
ceilingScore: 0,
|
|
753
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
754
|
+
dimensions: scored.dimensions,
|
|
755
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
756
|
+
docLift: 0,
|
|
757
|
+
docQualityGap: 0,
|
|
758
|
+
feature,
|
|
759
|
+
floorScore: 0,
|
|
760
|
+
groupType: "feature",
|
|
761
|
+
negativeDocLift: false,
|
|
762
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
763
|
+
testCount: featureResults.length,
|
|
764
|
+
totalCost: scored.totalCost,
|
|
765
|
+
totalScore: scored.composite,
|
|
766
|
+
});
|
|
767
|
+
}
|
|
768
|
+
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
769
|
+
}
|
|
770
|
+
// ---------------------------------------------------------------------------
|
|
722
771
|
// Agentic scoring — all results are "actual" (agent retrieves docs via tools)
|
|
723
772
|
// ---------------------------------------------------------------------------
|
|
724
773
|
/**
|
|
@@ -893,6 +942,56 @@ export function calculateAndWriteScores(options) {
|
|
|
893
942
|
const testSummary = computeTestSummary(baselineResultsPath);
|
|
894
943
|
return { belowCritical: summary.belowCritical, testSummary };
|
|
895
944
|
}
|
|
945
|
+
// ── Knowledge-probe scoring path ────────────────────────────
|
|
946
|
+
// Knowledge-probe mode evaluates parametric recall (no docs context).
|
|
947
|
+
// The KP compiler deletes `vars.docs`, so the literacy path would bucket
|
|
948
|
+
// every result into `withoutDocs` and collapse testCount + dimensions
|
|
949
|
+
// to zero. This branch groups by feature area only and uses the
|
|
950
|
+
// `knowledge-probe` profile (factual-correctness / completeness /
|
|
951
|
+
// currency). See docs/design-docs/mode-agnostic-scoring.md.
|
|
952
|
+
if (mode === "knowledge-probe") {
|
|
953
|
+
const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
|
|
954
|
+
log.debug("Knowledge-probe scoring profile", probeProfile);
|
|
955
|
+
const results = readAndNormalizeResults(baselineResultsPath);
|
|
956
|
+
const scores = scoreKnowledgeProbeResults(results, probeProfile);
|
|
957
|
+
log.debug("Knowledge-probe scores calculated", {
|
|
958
|
+
featureCount: scores.length,
|
|
959
|
+
features: scores.map((s) => ({
|
|
960
|
+
feature: s.feature,
|
|
961
|
+
totalScore: s.totalScore,
|
|
962
|
+
testCount: s.testCount,
|
|
963
|
+
dimensions: s.dimensions,
|
|
964
|
+
})),
|
|
965
|
+
});
|
|
966
|
+
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
967
|
+
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
968
|
+
allowedOrigins: options.allowedOrigins,
|
|
969
|
+
mode,
|
|
970
|
+
searchMode: options.searchMode,
|
|
971
|
+
});
|
|
972
|
+
const graderCost = extractGraderCost(baselineResultsPath);
|
|
973
|
+
const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
|
|
974
|
+
graderCost, null, // no per-model breakdown for now
|
|
975
|
+
null, // no source isolation — KP doesn't fetch sources
|
|
976
|
+
sourceVerification, "knowledge-probe", log);
|
|
977
|
+
// Persist
|
|
978
|
+
const outDir = join(ROOT, "results", "latest");
|
|
979
|
+
mkdirSync(outDir, { recursive: true });
|
|
980
|
+
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
981
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
982
|
+
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
983
|
+
if (judgments.length > 0) {
|
|
984
|
+
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
985
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
986
|
+
}
|
|
987
|
+
const testResults = extractStoredTestResults(baselineResultsPath);
|
|
988
|
+
if (testResults.length > 0) {
|
|
989
|
+
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
990
|
+
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
991
|
+
}
|
|
992
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
993
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
994
|
+
}
|
|
896
995
|
// ── Literacy scoring path ───────────────────────────────────
|
|
897
996
|
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
898
997
|
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
@@ -9,6 +9,11 @@ import type { KnowledgeProbeCompileOptions } from "./types.js";
|
|
|
9
9
|
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
10
10
|
* LLM-graded assertions receive the configured grader provider.
|
|
11
11
|
* All other assertions are passed through.
|
|
12
|
+
*
|
|
13
|
+
* Templated `llm-rubric` assertions (those with `template` + `criteria`) go
|
|
14
|
+
* through the shared rubric resolver so the compiled assertion carries
|
|
15
|
+
* `metadata.dimension` — without this, the scoring engine can't classify
|
|
16
|
+
* KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
|
|
12
17
|
*/
|
|
13
18
|
export declare function mapKnowledgeProbeAssertion(assertion: {
|
|
14
19
|
type: string;
|
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Assertion mapping for knowledge probe evaluations.
|
|
3
3
|
*/
|
|
4
|
+
import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
4
5
|
/**
|
|
5
6
|
* Map a raw knowledge probe assertion to a Promptfoo assertion.
|
|
6
7
|
*
|
|
7
8
|
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
8
9
|
* LLM-graded assertions receive the configured grader provider.
|
|
9
10
|
* All other assertions are passed through.
|
|
11
|
+
*
|
|
12
|
+
* Templated `llm-rubric` assertions (those with `template` + `criteria`) go
|
|
13
|
+
* through the shared rubric resolver so the compiled assertion carries
|
|
14
|
+
* `metadata.dimension` — without this, the scoring engine can't classify
|
|
15
|
+
* KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
|
|
10
16
|
*/
|
|
11
17
|
export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
|
|
12
18
|
switch (assertion.type) {
|
|
@@ -27,9 +33,26 @@ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
|
|
|
27
33
|
? { weight: assertion.weight }
|
|
28
34
|
: {}),
|
|
29
35
|
};
|
|
30
|
-
// LLM-graded assertions — add grader provider
|
|
31
|
-
case "g-eval":
|
|
32
36
|
case "llm-rubric":
|
|
37
|
+
// Templated form (template + criteria) → resolve to full rubric text
|
|
38
|
+
// with dimension metadata attached.
|
|
39
|
+
if ("template" in assertion && "criteria" in assertion) {
|
|
40
|
+
return resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
|
|
41
|
+
}
|
|
42
|
+
// Inline value form — pass through with grader provider, no metadata.
|
|
43
|
+
// Back-compat for tasks not yet migrated to the templated form.
|
|
44
|
+
return {
|
|
45
|
+
type: "llm-rubric",
|
|
46
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
47
|
+
...(typeof assertion.weight === "number"
|
|
48
|
+
? { weight: assertion.weight }
|
|
49
|
+
: {}),
|
|
50
|
+
...(options?.graderProvider
|
|
51
|
+
? { provider: options.graderProvider }
|
|
52
|
+
: {}),
|
|
53
|
+
};
|
|
54
|
+
// Other LLM-graded assertions — add grader provider
|
|
55
|
+
case "g-eval":
|
|
33
56
|
case "model-graded-closedqa":
|
|
34
57
|
case "model-graded-factuality":
|
|
35
58
|
return {
|
|
@@ -37,7 +37,11 @@ export const handler = {
|
|
|
37
37
|
if (!("mode" in task) || task.mode !== "knowledge-probe") {
|
|
38
38
|
throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
|
|
39
39
|
}
|
|
40
|
-
const result = compileKnowledgeProbeTask(task, {
|
|
40
|
+
const result = compileKnowledgeProbeTask(task, {
|
|
41
|
+
graderProvider: ctx.graderProvider,
|
|
42
|
+
models: ctx.models,
|
|
43
|
+
rubricConfig: ctx.rubricConfig,
|
|
44
|
+
});
|
|
41
45
|
return {
|
|
42
46
|
providers: result.providers,
|
|
43
47
|
tests: result.tests,
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Public types for the knowledge-probe mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
6
|
/** Options for compiling a knowledge probe task */
|
|
6
7
|
export interface KnowledgeProbeCompileOptions {
|
|
7
8
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -12,6 +13,9 @@ export interface KnowledgeProbeCompileOptions {
|
|
|
12
13
|
label: string;
|
|
13
14
|
config?: Record<string, unknown>;
|
|
14
15
|
}[];
|
|
16
|
+
/** Rubric config (templates, weights, profiles) — needed to resolve
|
|
17
|
+
* templated `llm-rubric` assertions to dimension metadata. */
|
|
18
|
+
rubricConfig?: RubricConfig;
|
|
15
19
|
}
|
|
16
20
|
/** Result of compiling a single knowledge probe task */
|
|
17
21
|
export interface KnowledgeProbeCompileResult {
|
|
@@ -11,10 +11,20 @@
|
|
|
11
11
|
*
|
|
12
12
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
13
13
|
*/
|
|
14
|
+
import { dirname, resolve as resolvePath } from "node:path";
|
|
15
|
+
import { fileURLToPath } from "node:url";
|
|
14
16
|
import { mapAssertions } from "./assertion-mapper.js";
|
|
15
17
|
import { resolveTaskFixtures } from "./fixture-resolver.js";
|
|
16
18
|
import { LiteracyVariant } from "../normalize-mode.js";
|
|
17
19
|
import { resolveVariables } from "./variable-resolver.js";
|
|
20
|
+
/**
|
|
21
|
+
* Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
|
|
22
|
+
* once at module load relative to this file. Promptfoo's `file://` provider
|
|
23
|
+
* loader requires an absolute path. See buildProviders for the env-var
|
|
24
|
+
* gate that swaps real providers for this mock.
|
|
25
|
+
*/
|
|
26
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
27
|
+
const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
|
|
18
28
|
// ---------------------------------------------------------------------------
|
|
19
29
|
// Public API
|
|
20
30
|
// ---------------------------------------------------------------------------
|
|
@@ -143,6 +153,19 @@ function buildProviders(models, mode) {
|
|
|
143
153
|
},
|
|
144
154
|
});
|
|
145
155
|
}
|
|
156
|
+
// Replay swap — when AILF_REPLAY_LLMS=1 is set, rewrite every provider's
|
|
157
|
+
// `id` to the file-based AILF mock provider so the Promptfoo subprocess
|
|
158
|
+
// never makes a live LLM call. We preserve `label` and stash the
|
|
159
|
+
// original `id` in `config.originalId` so the mock provider can surface
|
|
160
|
+
// model identity in its output and reports remain interpretable.
|
|
161
|
+
// See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
|
|
162
|
+
if (process.env.AILF_REPLAY_LLMS === "1") {
|
|
163
|
+
return providers.map((p) => ({
|
|
164
|
+
id: `file://${MOCK_PROVIDER_ABSPATH}`,
|
|
165
|
+
label: p.label,
|
|
166
|
+
config: { ...p.config, originalId: p.id },
|
|
167
|
+
}));
|
|
168
|
+
}
|
|
146
169
|
return providers;
|
|
147
170
|
}
|
|
148
171
|
/**
|
|
@@ -41,22 +41,40 @@ export default defineTask({
|
|
|
41
41
|
assertions: [
|
|
42
42
|
{ type: "contains", value: "->" },
|
|
43
43
|
{ type: "contains", value: "select(" },
|
|
44
|
+
// Templated rubrics so the compiled assertions carry `metadata.dimension`
|
|
45
|
+
// and the scoring engine can populate per-dimension scores from the KP
|
|
46
|
+
// profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
|
|
44
47
|
{
|
|
45
48
|
type: "llm-rubric",
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
49
|
+
template: "factual-correctness",
|
|
50
|
+
criteria: [
|
|
51
|
+
"The dereference operator `->` is correctly explained for following references",
|
|
52
|
+
"The spread operator `...` is shown in a valid projection example",
|
|
53
|
+
"`select()` is used with valid syntax for conditional projections",
|
|
54
|
+
'Computed field names (e.g., `"label": title`) are demonstrated correctly',
|
|
55
|
+
"Code examples use valid GROQ — no fabricated operators or deprecated syntax",
|
|
56
|
+
],
|
|
52
57
|
},
|
|
53
58
|
{
|
|
54
59
|
type: "llm-rubric",
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
|
|
60
|
+
template: "completeness",
|
|
61
|
+
criteria: [
|
|
62
|
+
"Basic object projection with `{}` is covered",
|
|
63
|
+
"Nested projections and the spread operator are both addressed",
|
|
64
|
+
"Computed/aliased field names are demonstrated",
|
|
65
|
+
"The dereference operator `->` is included with a worked example",
|
|
66
|
+
"Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
|
|
67
|
+
"Conditional projections via `select()` are covered",
|
|
68
|
+
],
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
type: "llm-rubric",
|
|
72
|
+
template: "currency",
|
|
73
|
+
criteria: [
|
|
74
|
+
"Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
|
|
75
|
+
"Recommendations don't reference removed or legacy query forms",
|
|
76
|
+
"Modern projection idioms are used (e.g., spread + override)",
|
|
77
|
+
],
|
|
60
78
|
},
|
|
61
79
|
],
|
|
62
80
|
})
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "3.8.1",
|
|
3
|
+
"version": "3.9.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -77,6 +77,7 @@
|
|
|
77
77
|
"test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
78
78
|
"test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
|
|
79
79
|
"test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
|
|
80
|
+
"test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
|
|
80
81
|
"test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
81
82
|
"pr-comment": "tsx src/cli.ts pr-comment",
|
|
82
83
|
"coverage-audit": "tsx src/cli.ts report coverage",
|
|
@@ -41,22 +41,40 @@ export default defineTask({
|
|
|
41
41
|
assertions: [
|
|
42
42
|
{ type: "contains", value: "->" },
|
|
43
43
|
{ type: "contains", value: "select(" },
|
|
44
|
+
// Templated rubrics so the compiled assertions carry `metadata.dimension`
|
|
45
|
+
// and the scoring engine can populate per-dimension scores from the KP
|
|
46
|
+
// profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
|
|
44
47
|
{
|
|
45
48
|
type: "llm-rubric",
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
49
|
+
template: "factual-correctness",
|
|
50
|
+
criteria: [
|
|
51
|
+
"The dereference operator `->` is correctly explained for following references",
|
|
52
|
+
"The spread operator `...` is shown in a valid projection example",
|
|
53
|
+
"`select()` is used with valid syntax for conditional projections",
|
|
54
|
+
'Computed field names (e.g., `"label": title`) are demonstrated correctly',
|
|
55
|
+
"Code examples use valid GROQ — no fabricated operators or deprecated syntax",
|
|
56
|
+
],
|
|
52
57
|
},
|
|
53
58
|
{
|
|
54
59
|
type: "llm-rubric",
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
|
|
60
|
+
template: "completeness",
|
|
61
|
+
criteria: [
|
|
62
|
+
"Basic object projection with `{}` is covered",
|
|
63
|
+
"Nested projections and the spread operator are both addressed",
|
|
64
|
+
"Computed/aliased field names are demonstrated",
|
|
65
|
+
"The dereference operator `->` is included with a worked example",
|
|
66
|
+
"Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
|
|
67
|
+
"Conditional projections via `select()` are covered",
|
|
68
|
+
],
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
type: "llm-rubric",
|
|
72
|
+
template: "currency",
|
|
73
|
+
criteria: [
|
|
74
|
+
"Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
|
|
75
|
+
"Recommendations don't reference removed or legacy query forms",
|
|
76
|
+
"Modern projection idioms are used (e.g., spread + override)",
|
|
77
|
+
],
|
|
60
78
|
},
|
|
61
79
|
],
|
|
62
80
|
})
|