@sanity/ailf 7.0.0 → 7.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +12 -13
- package/dist/_vendor/ailf-core/examples/index.d.ts +3 -3
- package/dist/_vendor/ailf-core/examples/index.js +3 -3
- package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/report.js +2 -0
- package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/team.js +63 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
- package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
- package/dist/_vendor/ailf-core/types/team.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
- package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
- package/dist/_vendor/ailf-shared/event-types.js +23 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
- package/dist/_vendor/ailf-shared/index.d.ts +4 -2
- package/dist/_vendor/ailf-shared/index.js +4 -2
- package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
- package/dist/_vendor/ailf-shared/member-roles.js +16 -0
- package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
- package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
- package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
- package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
- package/dist/adapters/task-sources/repo-task-source.js +2 -1
- package/dist/commands/pipeline-action.d.ts +4 -3
- package/dist/commands/pipeline-action.js +7 -5
- package/dist/commands/run.js +2 -2
- package/dist/config/rubrics.ts +12 -13
- package/dist/job-store.d.ts +18 -0
- package/dist/job-store.js +34 -0
- package/dist/orchestration/build-app-context.js +8 -1
- package/dist/orchestration/pipeline-orchestrator.js +46 -1
- package/dist/orchestration/steps/compare-step.d.ts +7 -0
- package/dist/orchestration/steps/compare-step.js +59 -23
- package/dist/orchestration/steps/fetch-docs-step.js +3 -0
- package/dist/orchestration/steps/finalize-run-step.js +2 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
- package/dist/orchestration/steps/generate-configs-step.js +47 -13
- package/dist/orchestration/steps/grader-consistency-step.js +11 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -3
- package/dist/pipeline/cache-hit-restore.d.ts +30 -5
- package/dist/pipeline/cache-hit-restore.js +36 -6
- package/dist/pipeline/calculate-scores.js +57 -21
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
- package/dist/pipeline/compiler/provider-assembler.js +16 -3
- package/dist/pipeline/failure-modes.d.ts +20 -10
- package/dist/pipeline/failure-modes.js +84 -15
- package/dist/pipeline/map-request-to-config.js +2 -0
- package/dist/pipeline/normalize-mode.d.ts +1 -1
- package/dist/pipeline/normalize-mode.js +2 -0
- package/dist/pipeline/run-context.d.ts +16 -1
- package/dist/pipeline/run-context.js +12 -1
- package/dist/pipeline/validate.d.ts +8 -4
- package/dist/pipeline/validate.js +8 -18
- package/dist/report-store.d.ts +14 -1
- package/dist/report-store.js +32 -0
- package/dist/sanity/client.js +2 -2
- package/package.json +3 -3
|
@@ -32,7 +32,7 @@ import { join } from "path";
|
|
|
32
32
|
import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
33
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
34
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
|
-
import {
|
|
35
|
+
import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
|
|
36
36
|
import { validateFailureMode } from "./failure-modes.js";
|
|
37
37
|
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
38
38
|
import { checkResultsExist } from "./checks.js";
|
|
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
|
|
|
184
184
|
continue;
|
|
185
185
|
}
|
|
186
186
|
const score = parseRubricScore(comp);
|
|
187
|
-
// Extract the reason text — the grader's reasoning.
|
|
188
|
-
//
|
|
189
|
-
//
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
//
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
//
|
|
187
|
+
// Extract the reason text — the grader's reasoning. W0273 splits
|
|
188
|
+
// the parse boundary into a wire shape (`GraderEmittedJudgmentSchema`
|
|
189
|
+
// — only fields the LLM controls) and a storage shape
|
|
190
|
+
// (`GraderJudgmentSchema` — full strict surface). The pipeline
|
|
191
|
+
// parses against the wire shape, then synthesizes the pipeline-owned
|
|
192
|
+
// fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
|
|
193
|
+
// hallucinationCheckedAgainst) plus the result-context fields
|
|
194
|
+
// (taskId, modelId, dimension) to build the full storage shape.
|
|
195
|
+
//
|
|
196
|
+
// On parse failure we fall to an `unclassified`-shape Phase 1
|
|
197
|
+
// judgment built from the raw reason string — NEVER fall back to
|
|
198
|
+
// the legacy parser (Pitfall 4: strict and legacy schemas are
|
|
199
|
+
// deliberate siblings, not a fall-through chain).
|
|
196
200
|
const reasonRaw = comp.reason ?? "";
|
|
197
201
|
let parsedJudgment = null;
|
|
198
202
|
let reason = reasonRaw;
|
|
199
203
|
if (reasonRaw) {
|
|
200
204
|
try {
|
|
201
205
|
const candidate = JSON.parse(reasonRaw);
|
|
202
|
-
// The
|
|
203
|
-
//
|
|
204
|
-
//
|
|
206
|
+
// The wire schema asserts only the LLM-emit subset. safeParse
|
|
207
|
+
// handles non-object inputs (number, array, etc.) by failing —
|
|
208
|
+
// we don't pre-narrow here.
|
|
205
209
|
const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
|
|
206
|
-
const result =
|
|
207
|
-
...candidateObj,
|
|
208
|
-
taskId,
|
|
209
|
-
modelId,
|
|
210
|
-
dimension: kind,
|
|
211
|
-
});
|
|
210
|
+
const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
|
|
212
211
|
if (result.success) {
|
|
213
|
-
|
|
214
|
-
|
|
212
|
+
const emitted = result.data;
|
|
213
|
+
parsedJudgment = {
|
|
214
|
+
// Result-context fields — pipeline-supplied:
|
|
215
|
+
taskId,
|
|
216
|
+
modelId,
|
|
217
|
+
dimension: kind,
|
|
218
|
+
// Wire-emitted fields — LLM-controlled:
|
|
219
|
+
score: emitted.score,
|
|
220
|
+
reason: emitted.reason,
|
|
221
|
+
failureMode: emitted.failureMode,
|
|
222
|
+
subJudgments: emitted.subJudgments,
|
|
223
|
+
docCitations: emitted.docCitations,
|
|
224
|
+
confidence: emitted.confidence,
|
|
225
|
+
...(emitted.outputFailure && {
|
|
226
|
+
outputFailure: emitted.outputFailure,
|
|
227
|
+
}),
|
|
228
|
+
// Pipeline-owned fields — synthesized:
|
|
229
|
+
judgmentId: generateJudgmentId({
|
|
230
|
+
taskId,
|
|
231
|
+
modelId,
|
|
232
|
+
dimension: kind,
|
|
233
|
+
...(telemetry?.runId ? { runId: telemetry.runId } : {}),
|
|
234
|
+
}),
|
|
235
|
+
// hallucinationCheckedAgainst is filled in later by
|
|
236
|
+
// populateHallucinationFields (gap-analysis-step.ts) — it
|
|
237
|
+
// needs the run.documentManifest union that isn't visible
|
|
238
|
+
// here. Empty array is the documented pre-fill placeholder.
|
|
239
|
+
hallucinationCheckedAgainst: [],
|
|
240
|
+
metadata: {
|
|
241
|
+
// graderModel is threaded via the existing
|
|
242
|
+
// telemetry.reliability channel. When upstream wires the
|
|
243
|
+
// real grader-provider alias into reliability.graderModel,
|
|
244
|
+
// it propagates here automatically; today it's "unknown"
|
|
245
|
+
// (matching the pre-W0273 synthesized-fallback default).
|
|
246
|
+
graderModel: telemetry?.reliability.graderModel ?? "unknown",
|
|
247
|
+
graderJudgmentsVersion,
|
|
248
|
+
},
|
|
249
|
+
};
|
|
250
|
+
reason = emitted.reason;
|
|
215
251
|
}
|
|
216
252
|
else {
|
|
217
253
|
// Parse failure — drop to failureMode='unclassified' below.
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
8
|
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
9
|
-
import type { LiteracyCompileOptions } from "./types.js";
|
|
9
|
+
import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
|
|
10
10
|
export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
|
|
11
11
|
/**
|
|
12
12
|
* Build baseline assertions matching the legacy expand-tasks behavior.
|
|
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
|
|
|
14
14
|
* - "full": all assertions carried over
|
|
15
15
|
* - "abbreviated": only first llm-rubric with shortened prompt
|
|
16
16
|
* - "none": no assertions
|
|
17
|
+
*
|
|
18
|
+
* `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
|
|
19
|
+
* mode's synthetic rubric. Without it the abbreviated emission would fail
|
|
20
|
+
* `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
|
|
21
|
+
* subJudgments, docCitations, confidence).
|
|
17
22
|
*/
|
|
18
|
-
export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
|
|
23
|
+
export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];
|
|
@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
|
|
|
45
45
|
return null;
|
|
46
46
|
const template = rubricConfig.templates["doc-coverage"];
|
|
47
47
|
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
// W0273 — use the centralized wire-shape footer so the grader emission
|
|
49
|
+
// parses against GraderEmittedJudgmentSchema. The pre-W0273 short
|
|
50
|
+
// {score, reason} footer caused 100% parse failures starting 2026-05-11.
|
|
51
|
+
const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
|
|
50
52
|
// doc-coverage benefits from the same authoritative reference — the grader
|
|
51
53
|
// needs the doc content to judge whether the candidate actually used what
|
|
52
54
|
// was documented.
|
|
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
|
|
|
92
94
|
* - "full": all assertions carried over
|
|
93
95
|
* - "abbreviated": only first llm-rubric with shortened prompt
|
|
94
96
|
* - "none": no assertions
|
|
97
|
+
*
|
|
98
|
+
* `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
|
|
99
|
+
* mode's synthetic rubric. Without it the abbreviated emission would fail
|
|
100
|
+
* `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
|
|
101
|
+
* subJudgments, docCitations, confidence).
|
|
95
102
|
*/
|
|
96
|
-
export function buildBaselineAssertions(goldAssertions, rubricMode) {
|
|
103
|
+
export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
|
|
97
104
|
const mode = rubricMode ?? "full";
|
|
98
105
|
if (mode === "none")
|
|
99
106
|
return [];
|
|
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
|
|
|
106
113
|
if (a.type === "llm-rubric") {
|
|
107
114
|
if (!foundFirst) {
|
|
108
115
|
foundFirst = true;
|
|
116
|
+
const footer = rubricConfig?.footer ??
|
|
117
|
+
'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
|
|
109
118
|
abbreviated.push({
|
|
110
119
|
type: "llm-rubric",
|
|
111
120
|
value: "Score task completion from 0 to 100 (same criteria as above).\n" +
|
|
112
|
-
|
|
121
|
+
footer,
|
|
113
122
|
...(a.provider ? { provider: a.provider } : {}),
|
|
114
123
|
});
|
|
115
124
|
}
|
|
@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
|
|
|
134
134
|
if (evalMode !== "agentic") {
|
|
135
135
|
const baselineEnabled = task.baseline?.enabled !== false;
|
|
136
136
|
if (baselineEnabled) {
|
|
137
|
-
const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
|
|
137
|
+
const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
|
|
138
138
|
tests.push({
|
|
139
139
|
description: `${taskTitle} (baseline)`,
|
|
140
140
|
vars: {
|
|
@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
|
|
|
50
50
|
* Returns provider arrays keyed by literacy variant name (baseline,
|
|
51
51
|
* agentic, observed). These are consumed by the YAML writer to produce
|
|
52
52
|
* the per-variant promptfoo config files.
|
|
53
|
+
*
|
|
54
|
+
* `loaded` (optional) lets callers pre-load and pre-filter the
|
|
55
|
+
* `ModelsConfig` so a caller-side filter (e.g. W0281's
|
|
56
|
+
* `filterModelsByRequest`) actually takes effect on the assembled
|
|
57
|
+
* providers — building providers from the unfiltered set would silently
|
|
58
|
+
* defeat the filter, since promptfoo decides which LLMs to call from the
|
|
59
|
+
* providers array, not the returned `models` field.
|
|
53
60
|
*/
|
|
54
|
-
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
|
|
61
|
+
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
|
|
55
62
|
/**
|
|
56
63
|
* Resolve `maxToolRounds` for an agentic variant (W0134).
|
|
57
64
|
*
|
|
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
|
|
|
60
67
|
* > hard fallback (5).
|
|
61
68
|
*/
|
|
62
69
|
export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
|
|
70
|
+
/**
|
|
71
|
+
* Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
|
|
72
|
+
* that need to pre-filter the model set before provider assembly (e.g.
|
|
73
|
+
* `PipelineRequest.models`) can hand the filtered config back to
|
|
74
|
+
* `loadModelsAndProviders` via its optional `loaded` parameter.
|
|
75
|
+
*/
|
|
76
|
+
export declare function loadModelsYaml(rootDir: string): ModelsConfig;
|
|
@@ -64,9 +64,16 @@ function applyReplaySwap(providers) {
|
|
|
64
64
|
* Returns provider arrays keyed by literacy variant name (baseline,
|
|
65
65
|
* agentic, observed). These are consumed by the YAML writer to produce
|
|
66
66
|
* the per-variant promptfoo config files.
|
|
67
|
+
*
|
|
68
|
+
* `loaded` (optional) lets callers pre-load and pre-filter the
|
|
69
|
+
* `ModelsConfig` so a caller-side filter (e.g. W0281's
|
|
70
|
+
* `filterModelsByRequest`) actually takes effect on the assembled
|
|
71
|
+
* providers — building providers from the unfiltered set would silently
|
|
72
|
+
* defeat the filter, since promptfoo decides which LLMs to call from the
|
|
73
|
+
* providers array, not the returned `models` field.
|
|
67
74
|
*/
|
|
68
|
-
export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins) {
|
|
69
|
-
const models = loadModelsYaml(rootDir);
|
|
75
|
+
export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins, loaded) {
|
|
76
|
+
const models = loaded ?? loadModelsYaml(rootDir);
|
|
70
77
|
return {
|
|
71
78
|
models,
|
|
72
79
|
providers: {
|
|
@@ -203,6 +210,12 @@ export function resolveMaxToolRounds(models, model, variant) {
|
|
|
203
210
|
// ---------------------------------------------------------------------------
|
|
204
211
|
// Helpers
|
|
205
212
|
// ---------------------------------------------------------------------------
|
|
206
|
-
|
|
213
|
+
/**
 * Read the "models" config file for `rootDir` through `loadConfigFile`
 * and return its `data` payload (the `ModelsConfig`).
 *
 * Exported so a caller can pre-filter the model set (e.g. by
 * `PipelineRequest.models`) and then pass the filtered config to
 * `loadModelsAndProviders` through its optional `loaded` parameter,
 * which takes precedence over a fresh load.
 *
 * @param rootDir - Directory handed to `loadConfigFile` as the config root.
 * @returns The `data` field of the loaded "models" config file.
 */
export function loadModelsYaml(rootDir) {
    const { data: modelsConfig } = loadConfigFile("models", rootDir);
    return modelsConfig;
}
|
|
@@ -1,24 +1,34 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
* Ceiling-cross-check failure-mode validator + report assembly
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly + keyword
|
|
5
|
+
* fallback classifier.
|
|
5
6
|
*
|
|
6
7
|
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
-
* (Plan 03-02 — `packages/eval/src/grader/`)
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
|
|
9
|
+
* is available to the pipeline. In practice (W0273 discovery), Promptfoo's
|
|
10
|
+
* `llm-rubric` post-processor extracts `score` + `reason` from the grader's
|
|
11
|
+
* JSON envelope and discards the rest of the structured surface — including
|
|
12
|
+
* `failureMode`. The wire-shape footer instructs the LLM correctly but the
|
|
13
|
+
* structured fields never reach `extractGraderJudgments`, so every emission
|
|
14
|
+
* arrives as the synthesized `failureMode: "unclassified"` placeholder.
|
|
12
15
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
16
|
+
* To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
|
|
17
|
+
* a keyword-pattern classifier is run as a FALLBACK when the grader's
|
|
18
|
+
* emitted mode is `"unclassified"` and the score is below the classification
|
|
19
|
+
* threshold. Plan 03-03 deleted this classifier in favor of grader-emission
|
|
20
|
+
* source-of-truth; W0273 reinstates it because the grader-emission path is
|
|
21
|
+
* blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
|
|
22
|
+
* (capturing the grader's full structured response) is tracked separately.
|
|
23
|
+
*
|
|
24
|
+
* `classifyByCeiling` continues to serve as the confidence cross-check.
|
|
17
25
|
*
|
|
18
26
|
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
27
|
* failureMode under the per-dimension taxonomy
|
|
20
28
|
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
29
|
* shape and `ceiling-cross-check` derivation tag
|
|
30
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
|
|
31
|
+
* cause (Promptfoo strips structured fields)
|
|
22
32
|
*/
|
|
23
33
|
import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
|
|
24
34
|
import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
|
|
@@ -1,24 +1,34 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
* Ceiling-cross-check failure-mode validator + report assembly
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly + keyword
|
|
5
|
+
* fallback classifier.
|
|
5
6
|
*
|
|
6
7
|
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
-
* (Plan 03-02 — `packages/eval/src/grader/`)
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
|
|
9
|
+
* is available to the pipeline. In practice (W0273 discovery), Promptfoo's
|
|
10
|
+
* `llm-rubric` post-processor extracts `score` + `reason` from the grader's
|
|
11
|
+
* JSON envelope and discards the rest of the structured surface — including
|
|
12
|
+
* `failureMode`. The wire-shape footer instructs the LLM correctly but the
|
|
13
|
+
* structured fields never reach `extractGraderJudgments`, so every emission
|
|
14
|
+
* arrives as the synthesized `failureMode: "unclassified"` placeholder.
|
|
12
15
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
16
|
+
* To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
|
|
17
|
+
* a keyword-pattern classifier is run as a FALLBACK when the grader's
|
|
18
|
+
* emitted mode is `"unclassified"` and the score is below the classification
|
|
19
|
+
* threshold. Plan 03-03 deleted this classifier in favor of grader-emission
|
|
20
|
+
* source-of-truth; W0273 reinstates it because the grader-emission path is
|
|
21
|
+
* blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
|
|
22
|
+
* (capturing the grader's full structured response) is tracked separately.
|
|
23
|
+
*
|
|
24
|
+
* `classifyByCeiling` continues to serve as the confidence cross-check.
|
|
17
25
|
*
|
|
18
26
|
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
27
|
* failureMode under the per-dimension taxonomy
|
|
20
28
|
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
29
|
* shape and `ceiling-cross-check` derivation tag
|
|
30
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
|
|
31
|
+
* cause (Promptfoo strips structured fields)
|
|
22
32
|
*/
|
|
23
33
|
import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
24
34
|
// ---------------------------------------------------------------------------
|
|
@@ -27,6 +37,20 @@ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/in
|
|
|
27
37
|
/** Only classify judgments with scores below this threshold */
|
|
28
38
|
const CLASSIFICATION_THRESHOLD = 60;
|
|
29
39
|
// ---------------------------------------------------------------------------
|
|
40
|
+
// Keyword patterns (W0273 fallback)
//
// Derived from the pre-Plan-03-03 implementation. Used only when the
// grader's emitted `failureMode` is `"unclassified"` — the grader's
// emission still wins whenever it actually reaches the pipeline.
//
// Two deliberate deviations from the historical patterns, both to stop
// substring false positives in free-form reason prose:
//   - HTTP status codes are anchored with `\b`, so "14290 tokens" is not
//     read as a 429.
//   - "no ... information" requires "no" as a whole word, so "did not use
//     the information" / "know the information" no longer classify as
//     missing-docs.
// ---------------------------------------------------------------------------
/** API error pattern — checked FIRST to prevent timeout errors containing
 *  "deprecated" from being misclassified as outdated-docs. */
const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|\b(?:429|503)\b|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
/** Reason prose indicating the docs describe a stale or superseded API. */
const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
/** Reason prose indicating the needed documentation was absent. */
const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|\bno\b.*information|undocumented|couldn't find|without.*documentation/i;
/** Reason prose indicating the docs stated something factually wrong. */
const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
/** Reason prose indicating the docs exist but are hard to use. */
const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
|
|
53
|
+
// ---------------------------------------------------------------------------
|
|
30
54
|
// Public API
|
|
31
55
|
// ---------------------------------------------------------------------------
|
|
32
56
|
/**
|
|
@@ -69,13 +93,25 @@ export function buildFailureModeReport(judgments, scores) {
|
|
|
69
93
|
// grader's actual taxonomy choice rather than a collapsed
|
|
70
94
|
// `"unclassified"` bucket.
|
|
71
95
|
const emittedMode = readEmittedMode(judgment);
|
|
96
|
+
// W0273 fallback — when the grader's emitted mode is "unclassified"
|
|
97
|
+
// (the synthesized-unparsed-judgment placeholder; in practice this
|
|
98
|
+
// is every judgment today because Promptfoo's llm-rubric strips the
|
|
99
|
+
// grader's structured response), try keyword classification against
|
|
100
|
+
// the reason prose. Gated on score < CLASSIFICATION_THRESHOLD so
|
|
101
|
+
// passing judgments don't get spurious classifications.
|
|
102
|
+
const keywordFallback = emittedMode === "unclassified" &&
|
|
103
|
+
judgment.score < CLASSIFICATION_THRESHOLD
|
|
104
|
+
? classifyByKeyword(judgment.reason)
|
|
105
|
+
: null;
|
|
72
106
|
// Cross-check the grader's emission against ceiling decomposition.
|
|
73
107
|
const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
|
|
74
|
-
const classification =
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
108
|
+
const classification = keywordFallback
|
|
109
|
+
? keywordFallback
|
|
110
|
+
: {
|
|
111
|
+
confidence: stamp.level,
|
|
112
|
+
mode: emittedMode,
|
|
113
|
+
source: "ceiling",
|
|
114
|
+
};
|
|
79
115
|
classifiedJudgments.push({ classification, judgment });
|
|
80
116
|
summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
|
|
81
117
|
// Per-area tracking
|
|
@@ -282,6 +318,39 @@ function readEmittedMode(judgment) {
|
|
|
282
318
|
}
|
|
283
319
|
return emitted;
|
|
284
320
|
}
|
|
321
|
+
/**
 * Keyword-based fallback classifier for a low-scoring grader judgment.
 *
 * The reason prose is lowercased and tested against the module-level
 * keyword patterns in a fixed priority order; the first pattern that
 * matches decides the result. API errors are checked first so timeout
 * messages containing "deprecated" don't get misclassified as
 * outdated-docs.
 *
 * W0273 — reinstated as a fallback when the grader's emitted failureMode
 * is "unclassified". Plan 03-03 deleted this code in favor of grader-
 * emission source-of-truth; the deletion is reversed here because
 * Promptfoo's llm-rubric post-processor strips the grader's structured
 * response (only score + reason survive into `comp.*`), so the
 * grader-emission path produces 0% classification on every run.
 *
 * @param reason - Reason prose of the judgment. A missing, non-string or
 *   empty value is treated as "nothing to match" instead of throwing.
 * @returns `{ confidence, mode, source: "keyword" }` for the first
 *   matching pattern, or `null` when no pattern matches.
 */
function classifyByKeyword(reason) {
    // Judgments without usable reason prose cannot be keyword-classified;
    // returning null lets the caller fall through to its default path.
    if (typeof reason !== "string" || reason.length === 0) {
        return null;
    }
    const lower = reason.toLowerCase();
    // Priority-ordered rules: earlier entries win over later ones.
    const rules = [
        { pattern: API_ERROR_PATTERN, mode: "api-error", confidence: "high" },
        { pattern: OUTDATED_PATTERN, mode: "outdated-docs", confidence: "high" },
        { pattern: MISSING_PATTERN, mode: "missing-docs", confidence: "high" },
        { pattern: INCORRECT_PATTERN, mode: "incorrect-docs", confidence: "medium" },
        { pattern: POOR_STRUCTURE_PATTERN, mode: "poor-structure", confidence: "medium" },
    ];
    for (const rule of rules) {
        if (rule.pattern.test(lower)) {
            return { confidence: rule.confidence, mode: rule.mode, source: "keyword" };
        }
    }
    return null;
}
|
|
285
354
|
/**
|
|
286
355
|
* Classify by ceiling-decomposition structural signals — preserved
|
|
287
356
|
* verbatim from the pre-Plan-03-03 implementation. The function itself
|
|
@@ -37,6 +37,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
37
37
|
mode,
|
|
38
38
|
variant,
|
|
39
39
|
debug: mapDebug(request.debug),
|
|
40
|
+
models: request.models,
|
|
40
41
|
areas: request.areas,
|
|
41
42
|
tasks: request.tasks,
|
|
42
43
|
changedDocs: request.changedDocs,
|
|
@@ -46,6 +47,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
46
47
|
compareEnabled: request.compare ?? false,
|
|
47
48
|
compareThreshold: request.compareThreshold,
|
|
48
49
|
compareBaseline: request.compareBaseline,
|
|
50
|
+
compareBaselineReportId: request.compareBaselineReportId,
|
|
49
51
|
gapAnalysisEnabled: request.gapAnalysis ?? true,
|
|
50
52
|
publishEnabled: request.publish ?? publishDefault,
|
|
51
53
|
publishTag: request.publishTag,
|
|
@@ -35,7 +35,7 @@ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof Literacy
|
|
|
35
35
|
export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
|
|
36
36
|
export interface NormalizedMode {
|
|
37
37
|
mode: EvalMode;
|
|
38
|
-
variant?:
|
|
38
|
+
variant?: LiteracyVariantName;
|
|
39
39
|
}
|
|
40
40
|
/**
|
|
41
41
|
* Normalize a raw CLI mode string to a canonical mode + optional variant.
|
|
@@ -55,6 +55,8 @@ const ALL_ACCEPTED = [
|
|
|
55
55
|
export function normalizeMode(input) {
|
|
56
56
|
if (LEGACY_LITERACY_VARIANTS.has(input)) {
|
|
57
57
|
console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
|
|
58
|
+
// The membership check above guarantees `input` is one of
|
|
59
|
+
// LEGACY_LITERACY_VARIANTS, so it is passed through as the variant as-is (the source-level cast is to that closed type, not a widening).
|
|
58
60
|
return { mode: "literacy", variant: input };
|
|
59
61
|
}
|
|
60
62
|
if (CANONICAL_MODES.has(input)) {
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
|
|
14
14
|
*/
|
|
15
15
|
import { type Logger, type RunContext } from "../_vendor/ailf-core/index.d.ts";
|
|
16
|
-
import { type RunClassification, type RunExecutor, type RunExecutorSurface, type RunHost, type RunLineage, type RunOwner, type RunTool } from "../_vendor/ailf-shared/index.d.ts";
|
|
16
|
+
import { type LiteracyVariant, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunHost, type RunLineage, type RunOwner, type RunTool } from "../_vendor/ailf-shared/index.d.ts";
|
|
17
17
|
import type { ResolvedSourceConfig } from "../sources.js";
|
|
18
18
|
import type { EvalMode } from "./types.js";
|
|
19
19
|
/**
|
|
@@ -74,6 +74,21 @@ export interface RunContextInput {
|
|
|
74
74
|
source: ResolvedSourceConfig;
|
|
75
75
|
/** Specific task IDs evaluated (if scoped) */
|
|
76
76
|
taskIds?: string[];
|
|
77
|
+
/**
|
|
78
|
+
* Literacy mode variant (`baseline | agentic | observed | full`). Only
|
|
79
|
+
* meaningful when `mode === "literacy"`; ignored for other modes. Lands
|
|
80
|
+
* on `RunContext.variant` and `ReportProvenance.variant` so consumers
|
|
81
|
+
* can disambiguate which literacy variant the run executed.
|
|
82
|
+
*/
|
|
83
|
+
variant?: LiteracyVariant;
|
|
84
|
+
/**
|
|
85
|
+
* Model IDs the caller requested via `PipelineRequest.models`. When
|
|
86
|
+
* present, `RunContext.models` is filtered to this subset so the report's
|
|
87
|
+
* `provenance.models` reflects what was actually evaluated. Unknown IDs
|
|
88
|
+
* are silently filtered out — the upstream rejection path (W0281
|
|
89
|
+
* `filterModelsByRequest`) has already failed the run or warned.
|
|
90
|
+
*/
|
|
91
|
+
requestedModelIds?: string[];
|
|
77
92
|
}
|
|
78
93
|
/**
|
|
79
94
|
* Derive `RunContext` from pipeline inputs. The only construction path.
|
|
@@ -68,8 +68,18 @@ export function buildRunContext(input) {
|
|
|
68
68
|
// config/models.ts model matrix — listing those models would be
|
|
69
69
|
// misleading. Only include them for literacy mode where they're the
|
|
70
70
|
// actual eval targets.
|
|
71
|
+
//
|
|
72
|
+
// When `PipelineRequest.models` pinned a subset, filter here too so
|
|
73
|
+
// `provenance.models` matches what actually ran (W0281). Without this
|
|
74
|
+
// the report would advertise the full cohort even though only the
|
|
75
|
+
// requested subset reached the LLMs.
|
|
76
|
+
const requestedSet = input.requestedModelIds?.length
|
|
77
|
+
? new Set(input.requestedModelIds)
|
|
78
|
+
: undefined;
|
|
71
79
|
const evaluatedModels = input.mode === "literacy"
|
|
72
|
-
? models.models
|
|
80
|
+
? models.models
|
|
81
|
+
.filter((m) => !requestedSet || requestedSet.has(m.id))
|
|
82
|
+
.map((m) => ({ id: m.id, label: m.label }))
|
|
73
83
|
: [];
|
|
74
84
|
return {
|
|
75
85
|
areas: input.areas,
|
|
@@ -95,6 +105,7 @@ export function buildRunContext(input) {
|
|
|
95
105
|
taskIds: input.taskIds,
|
|
96
106
|
tool,
|
|
97
107
|
trigger,
|
|
108
|
+
variant: input.mode === "literacy" ? input.variant : undefined,
|
|
98
109
|
};
|
|
99
110
|
}
|
|
100
111
|
// ---------------------------------------------------------------------------
|
|
@@ -14,11 +14,15 @@ import type { ValidationIssue, ValidationResult } from "./types.js";
|
|
|
14
14
|
*/
|
|
15
15
|
export declare function validateConfiguration(rootDir: string): ValidationResult;
|
|
16
16
|
/**
|
|
17
|
-
* Check that canonical
|
|
18
|
-
* gold-retrieval contexts actually referenced by task definitions.
|
|
17
|
+
* Check that the canonical-contexts directory exists.
|
|
19
18
|
*
|
|
20
|
-
* Contexts are
|
|
21
|
-
*
|
|
19
|
+
* Contexts are populated by fetch-docs, which scopes to the tasks
|
|
20
|
+
* actually being evaluated (not every task in the registry). Warning
|
|
21
|
+
* on individual missing files here would fire for every task the user
|
|
22
|
+
* didn't select — pure noise that previously crowded out real errors
|
|
23
|
+
* in the GHA safety-net's tail-of-log capture (W0282). The per-task
|
|
24
|
+
* precondition is enforced by `run-eval-step.ts:checkCanonicalContextsExist`
|
|
25
|
+
* against the filtered task set, where missing files are real errors.
|
|
22
26
|
*/
|
|
23
27
|
export declare function validateContexts(rootDir: string): ValidationIssue[];
|
|
24
28
|
/**
|
|
@@ -34,11 +34,15 @@ export function validateConfiguration(rootDir) {
|
|
|
34
34
|
return { issues, valid };
|
|
35
35
|
}
|
|
36
36
|
/**
|
|
37
|
-
* Check that canonical
|
|
38
|
-
* gold-retrieval contexts actually referenced by task definitions.
|
|
37
|
+
* Check that the canonical-contexts directory exists.
|
|
39
38
|
*
|
|
40
|
-
* Contexts are
|
|
41
|
-
*
|
|
39
|
+
* Contexts are populated by fetch-docs, which scopes to the tasks
|
|
40
|
+
* actually being evaluated (not every task in the registry). Warning
|
|
41
|
+
* on individual missing files here would fire for every task the user
|
|
42
|
+
* didn't select — pure noise that previously crowded out real errors
|
|
43
|
+
* in the GHA safety-net's tail-of-log capture (W0282). The per-task
|
|
44
|
+
* precondition is enforced by `run-eval-step.ts:checkCanonicalContextsExist`
|
|
45
|
+
* against the filtered task set, where missing files are real errors.
|
|
42
46
|
*/
|
|
43
47
|
export function validateContexts(rootDir) {
|
|
44
48
|
const source = "validateContexts";
|
|
@@ -46,20 +50,6 @@ export function validateContexts(rootDir) {
|
|
|
46
50
|
const canonicalDir = path.join(rootDir, "contexts", "canonical");
|
|
47
51
|
if (!fs.existsSync(canonicalDir)) {
|
|
48
52
|
issues.push(warning(source, "contexts/canonical/ directory not found — run 'pnpm fetch-docs' to generate", canonicalDir));
|
|
49
|
-
return issues;
|
|
50
|
-
}
|
|
51
|
-
const mappings = resolveMappings(rootDir);
|
|
52
|
-
for (const [, areaConfig] of Object.entries(mappings.feature_areas)) {
|
|
53
|
-
if (!areaConfig?.tasks)
|
|
54
|
-
continue;
|
|
55
|
-
for (const task of areaConfig.tasks) {
|
|
56
|
-
if (!task.id)
|
|
57
|
-
continue;
|
|
58
|
-
const contextFile = path.join(canonicalDir, `${task.id}.md`);
|
|
59
|
-
if (!fs.existsSync(contextFile)) {
|
|
60
|
-
issues.push(warning(source, `Missing canonical context for task '${task.id}' — run 'pnpm fetch-docs' to generate`, contextFile));
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
53
|
}
|
|
64
54
|
return issues;
|
|
65
55
|
}
|
package/dist/report-store.d.ts
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see docs/design-docs/report-store/domain-model.md
|
|
16
16
|
*/
|
|
17
17
|
import type { SanityClient } from "@sanity/client";
|
|
18
|
-
import type { ArtifactRef, ArtifactType, SynthesisCostTelemetry } from "./_vendor/ailf-core/index.d.ts";
|
|
18
|
+
import type { ArtifactRef, ArtifactType, LoadBaselineResult, SynthesisCostTelemetry } from "./_vendor/ailf-core/index.d.ts";
|
|
19
19
|
import type { ComparisonReport, ISOTimestamp, LineageQuery, Report, ReportId, ReportProvenance, ScoreSummary } from "./pipeline/types.js";
|
|
20
20
|
/**
|
|
21
21
|
* Result of an auto-comparison, bundling the ComparisonReport with the
|
|
@@ -113,6 +113,19 @@ export declare class ReportStore {
|
|
|
113
113
|
* W0191 runtime schema gate. Sanity API failures still return null.
|
|
114
114
|
*/
|
|
115
115
|
read(id: ReportId): Promise<null | Report>;
|
|
116
|
+
/**
|
|
117
|
+
* Load a previously-published report's score summary as a baseline
|
|
118
|
+
* for comparison. Returns a discriminated result so the caller can
|
|
119
|
+
* distinguish a genuine 404 (skip compare with a clear reason) from
|
|
120
|
+
* a transport failure (fail the step — the user pinned a baseline
|
|
121
|
+
* and deserves to know it didn't actually compare).
|
|
122
|
+
*
|
|
123
|
+
* The report's `summary` field is a `ReportSummary` — a superset of
|
|
124
|
+
* `ComparableSummary` — so the projection below carries everything
|
|
125
|
+
* the `compare()` primitive needs (`overall`, `perModel`, `scores`)
|
|
126
|
+
* without re-hydrating the slim prose/array fields.
|
|
127
|
+
*/
|
|
128
|
+
loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
|
|
116
129
|
/**
|
|
117
130
|
* Write a report to the Sanity Content Lake.
|
|
118
131
|
*
|