@sanity/ailf 7.0.1 → 7.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +12 -13
- package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/report.js +2 -0
- package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/team.js +63 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
- package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
- package/dist/_vendor/ailf-core/types/team.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
- package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
- package/dist/_vendor/ailf-shared/event-types.js +23 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
- package/dist/_vendor/ailf-shared/index.d.ts +5 -3
- package/dist/_vendor/ailf-shared/index.js +5 -2
- package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
- package/dist/_vendor/ailf-shared/member-roles.js +16 -0
- package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
- package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
- package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
- package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
- package/dist/adapters/task-sources/repo-task-source.js +2 -1
- package/dist/commands/pipeline-action.d.ts +4 -3
- package/dist/commands/pipeline-action.js +7 -5
- package/dist/commands/run.js +2 -2
- package/dist/config/rubrics.ts +12 -13
- package/dist/job-store.d.ts +18 -0
- package/dist/job-store.js +34 -0
- package/dist/orchestration/build-app-context.js +8 -1
- package/dist/orchestration/pipeline-orchestrator.js +46 -1
- package/dist/orchestration/steps/compare-step.d.ts +7 -0
- package/dist/orchestration/steps/compare-step.js +59 -23
- package/dist/orchestration/steps/fetch-docs-step.js +3 -0
- package/dist/orchestration/steps/finalize-run-step.js +2 -0
- package/dist/orchestration/steps/gap-analysis-step.js +9 -8
- package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
- package/dist/orchestration/steps/generate-configs-step.js +47 -13
- package/dist/orchestration/steps/grader-consistency-step.js +11 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
- package/dist/orchestration/steps/publish-report-step.js +36 -8
- package/dist/pipeline/cache-hit-restore.d.ts +14 -1
- package/dist/pipeline/cache-hit-restore.js +17 -0
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +123 -29
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
- package/dist/pipeline/compiler/provider-assembler.js +16 -3
- package/dist/pipeline/failure-modes.d.ts +20 -10
- package/dist/pipeline/failure-modes.js +84 -15
- package/dist/pipeline/map-request-to-config.js +2 -0
- package/dist/pipeline/normalize-mode.d.ts +1 -1
- package/dist/pipeline/normalize-mode.js +2 -0
- package/dist/pipeline/run-context.d.ts +16 -1
- package/dist/pipeline/run-context.js +12 -1
- package/dist/pipeline/validate.d.ts +8 -4
- package/dist/pipeline/validate.js +8 -18
- package/dist/report-store.d.ts +14 -1
- package/dist/report-store.js +32 -0
- package/dist/sanity/client.js +2 -2
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
|
@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
|
|
|
187
187
|
* @param manifestSlugs - All slugs in the run's document manifest.
|
|
188
188
|
*/
|
|
189
189
|
export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
|
|
190
|
+
/**
|
|
191
|
+
* Per-variant scoring profiles passed to {@link extractStoredTestResults}.
|
|
192
|
+
* Each profile maps dimension id → weight. Variants whose dimensions don't
|
|
193
|
+
* intersect the supplied keys yield `compositeScore: undefined` rather than
|
|
194
|
+
* a misleading 0.
|
|
195
|
+
*/
|
|
196
|
+
export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
|
|
190
197
|
/**
|
|
191
198
|
* Extract per-test results with model output from evaluation results.
|
|
192
199
|
*
|
|
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
|
|
|
194
201
|
* shape including response.output (truncated), latency, and cost.
|
|
195
202
|
* One StoredTestResult per test × model combination.
|
|
196
203
|
*
|
|
204
|
+
* When `profiles` is provided, each entry's `compositeScore` is computed as
|
|
205
|
+
* the weighted mean of its dimension scores using the profile matching its
|
|
206
|
+
* detected `variant`. Without profiles, `compositeScore` is omitted — legacy
|
|
207
|
+
* behavior preserved.
|
|
208
|
+
*
|
|
197
209
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
198
210
|
*/
|
|
199
|
-
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
211
|
+
export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
|
|
200
212
|
/**
|
|
201
213
|
* W0198 — aggregate every per-test `SymbolPreflightReport` into a single
|
|
202
214
|
* resolver-health summary. Returns `undefined` when the run had no
|
|
@@ -32,7 +32,7 @@ import { join } from "path";
|
|
|
32
32
|
import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
33
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
34
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
|
-
import {
|
|
35
|
+
import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
|
|
36
36
|
import { validateFailureMode } from "./failure-modes.js";
|
|
37
37
|
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
38
38
|
import { checkResultsExist } from "./checks.js";
|
|
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
|
|
|
184
184
|
continue;
|
|
185
185
|
}
|
|
186
186
|
const score = parseRubricScore(comp);
|
|
187
|
-
// Extract the reason text — the grader's reasoning.
|
|
188
|
-
//
|
|
189
|
-
//
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
//
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
//
|
|
187
|
+
// Extract the reason text — the grader's reasoning. W0273 splits
|
|
188
|
+
// the parse boundary into a wire shape (`GraderEmittedJudgmentSchema`
|
|
189
|
+
// — only fields the LLM controls) and a storage shape
|
|
190
|
+
// (`GraderJudgmentSchema` — full strict surface). The pipeline
|
|
191
|
+
// parses against the wire shape, then synthesizes the pipeline-owned
|
|
192
|
+
// fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
|
|
193
|
+
// hallucinationCheckedAgainst) plus the result-context fields
|
|
194
|
+
// (taskId, modelId, dimension) to build the full storage shape.
|
|
195
|
+
//
|
|
196
|
+
// On parse failure we fall to an `unclassified`-shape Phase 1
|
|
197
|
+
// judgment built from the raw reason string — NEVER fall back to
|
|
198
|
+
// the legacy parser (Pitfall 4: strict and legacy schemas are
|
|
199
|
+
// deliberate siblings, not a fall-through chain).
|
|
196
200
|
const reasonRaw = comp.reason ?? "";
|
|
197
201
|
let parsedJudgment = null;
|
|
198
202
|
let reason = reasonRaw;
|
|
199
203
|
if (reasonRaw) {
|
|
200
204
|
try {
|
|
201
205
|
const candidate = JSON.parse(reasonRaw);
|
|
202
|
-
// The
|
|
203
|
-
//
|
|
204
|
-
//
|
|
206
|
+
// The wire schema asserts only the LLM-emit subset. safeParse
|
|
207
|
+
// handles non-object inputs (number, array, etc.) by failing —
|
|
208
|
+
// we don't pre-narrow here.
|
|
205
209
|
const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
|
|
206
|
-
const result =
|
|
207
|
-
...candidateObj,
|
|
208
|
-
taskId,
|
|
209
|
-
modelId,
|
|
210
|
-
dimension: kind,
|
|
211
|
-
});
|
|
210
|
+
const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
|
|
212
211
|
if (result.success) {
|
|
213
|
-
|
|
214
|
-
|
|
212
|
+
const emitted = result.data;
|
|
213
|
+
parsedJudgment = {
|
|
214
|
+
// Result-context fields — pipeline-supplied:
|
|
215
|
+
taskId,
|
|
216
|
+
modelId,
|
|
217
|
+
dimension: kind,
|
|
218
|
+
// Wire-emitted fields — LLM-controlled:
|
|
219
|
+
score: emitted.score,
|
|
220
|
+
reason: emitted.reason,
|
|
221
|
+
failureMode: emitted.failureMode,
|
|
222
|
+
subJudgments: emitted.subJudgments,
|
|
223
|
+
docCitations: emitted.docCitations,
|
|
224
|
+
confidence: emitted.confidence,
|
|
225
|
+
...(emitted.outputFailure && {
|
|
226
|
+
outputFailure: emitted.outputFailure,
|
|
227
|
+
}),
|
|
228
|
+
// Pipeline-owned fields — synthesized:
|
|
229
|
+
judgmentId: generateJudgmentId({
|
|
230
|
+
taskId,
|
|
231
|
+
modelId,
|
|
232
|
+
dimension: kind,
|
|
233
|
+
...(telemetry?.runId ? { runId: telemetry.runId } : {}),
|
|
234
|
+
}),
|
|
235
|
+
// hallucinationCheckedAgainst is filled in later by
|
|
236
|
+
// populateHallucinationFields (gap-analysis-step.ts) — it
|
|
237
|
+
// needs the run.documentManifest union that isn't visible
|
|
238
|
+
// here. Empty array is the documented pre-fill placeholder.
|
|
239
|
+
hallucinationCheckedAgainst: [],
|
|
240
|
+
metadata: {
|
|
241
|
+
// graderModel is threaded via the existing
|
|
242
|
+
// telemetry.reliability channel. When upstream wires the
|
|
243
|
+
// real grader-provider alias into reliability.graderModel,
|
|
244
|
+
// it propagates here automatically; today it's "unknown"
|
|
245
|
+
// (matching the pre-W0273 synthesized-fallback default).
|
|
246
|
+
graderModel: telemetry?.reliability.graderModel ?? "unknown",
|
|
247
|
+
graderJudgmentsVersion,
|
|
248
|
+
},
|
|
249
|
+
};
|
|
250
|
+
reason = emitted.reason;
|
|
215
251
|
}
|
|
216
252
|
else {
|
|
217
253
|
// Parse failure — drop to failureMode='unclassified' below.
|
|
@@ -433,6 +469,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
|
|
|
433
469
|
* `responseOutputTruncated` still flips for the extreme tail.
|
|
434
470
|
*/
|
|
435
471
|
const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
|
|
472
|
+
/**
 * Weighted mean of dimension scores, rounded to the nearest integer.
 *
 * Intended to mirror the dashboard's read-side fallback in
 * `apps/dashboard/src/data/projections/test-entries.ts` so writer and reader
 * stay aligned. For finite numeric scores and weights the result is the
 * plain `round(sum(score * weight) / sum(weight))`.
 *
 * @param dimensions - Scored dimensions; each entry is read for its
 *   `dimension` id and its `score`.
 * @param weights - Scoring profile mapping dimension id → weight.
 * @returns The rounded weighted mean, or `undefined` when no dimension
 *   matches the profile (total weight is 0) or when the mean is not a finite
 *   number. The caller decides whether `undefined` signals misconfiguration.
 */
function computeStoredCompositeScore(dimensions, weights) {
    let weighted = 0;
    let totalWeight = 0;
    for (const dim of dimensions) {
        // Own-property lookup only: a dimension id such as "constructor" must
        // not resolve to an inherited Object.prototype member.
        if (!Object.prototype.hasOwnProperty.call(weights, dim.dimension))
            continue;
        const w = weights[dim.dimension];
        // Skip anything that is not a numeric weight (covers `undefined`), so
        // `totalWeight` can never turn into a string via `+=`.
        if (typeof w !== "number")
            continue;
        weighted += dim.score * w;
        totalWeight += w;
    }
    if (totalWeight === 0)
        return undefined;
    const composite = Math.round(weighted / totalWeight);
    // A NaN/Infinity mean (e.g. from a NaN score or weight) would pass a
    // `!== undefined` check and serialize as `null`; report "no score" instead.
    return Number.isFinite(composite) ? composite : undefined;
}
|
|
436
492
|
/**
|
|
437
493
|
* Extract per-test results with model output from evaluation results.
|
|
438
494
|
*
|
|
@@ -440,9 +496,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
|
|
|
440
496
|
* shape including response.output (truncated), latency, and cost.
|
|
441
497
|
* One StoredTestResult per test × model combination.
|
|
442
498
|
*
|
|
499
|
+
* When `profiles` is provided, each entry's `compositeScore` is computed as
|
|
500
|
+
* the weighted mean of its dimension scores using the profile matching its
|
|
501
|
+
* detected `variant`. Without profiles, `compositeScore` is omitted — legacy
|
|
502
|
+
* behavior preserved.
|
|
503
|
+
*
|
|
443
504
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
444
505
|
*/
|
|
445
|
-
export function extractStoredTestResults(resultsPath) {
|
|
506
|
+
export function extractStoredTestResults(resultsPath, profiles) {
|
|
446
507
|
const results = readAndNormalizeResults(resultsPath);
|
|
447
508
|
const testResults = [];
|
|
448
509
|
for (const result of results) {
|
|
@@ -487,8 +548,13 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
487
548
|
dimensions.push({ dimension, reason, score });
|
|
488
549
|
}
|
|
489
550
|
const tokenUsage = result.response?.tokenUsage;
|
|
551
|
+
const profileForVariant = profiles?.[variant];
|
|
552
|
+
const compositeScore = profileForVariant
|
|
553
|
+
? computeStoredCompositeScore(dimensions, profileForVariant)
|
|
554
|
+
: undefined;
|
|
490
555
|
testResults.push({
|
|
491
556
|
area,
|
|
557
|
+
...(compositeScore !== undefined && { compositeScore }),
|
|
492
558
|
cost: result.cost || undefined,
|
|
493
559
|
dimensions,
|
|
494
560
|
latencyMs: result.latencyMs,
|
|
@@ -1441,7 +1507,12 @@ export async function calculateAndWriteScores(options) {
|
|
|
1441
1507
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1442
1508
|
}
|
|
1443
1509
|
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1444
|
-
|
|
1510
|
+
// Agent-harness produces a single profile shared across detected variants
|
|
1511
|
+
// (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
|
|
1512
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1513
|
+
gold: agentProfile,
|
|
1514
|
+
baseline: agentProfile,
|
|
1515
|
+
});
|
|
1445
1516
|
if (testResults.length > 0) {
|
|
1446
1517
|
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
1447
1518
|
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
@@ -1498,7 +1569,13 @@ export async function calculateAndWriteScores(options) {
|
|
|
1498
1569
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1499
1570
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1500
1571
|
}
|
|
1501
|
-
|
|
1572
|
+
// Knowledge-probe deletes vars.docs in the compiler, so every entry's
|
|
1573
|
+
// detected variant is "baseline" — supply the probe profile under both
|
|
1574
|
+
// keys so the composite is populated regardless of detection.
|
|
1575
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1576
|
+
gold: probeProfile,
|
|
1577
|
+
baseline: probeProfile,
|
|
1578
|
+
});
|
|
1502
1579
|
if (testResults.length > 0) {
|
|
1503
1580
|
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
1504
1581
|
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
@@ -1512,9 +1589,15 @@ export async function calculateAndWriteScores(options) {
|
|
|
1512
1589
|
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
1513
1590
|
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
1514
1591
|
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
1592
|
+
// Hoisted so the post-scoring extractStoredTestResults call against the
|
|
1593
|
+
// agentic results file can attach the matching profile (W0291).
|
|
1594
|
+
const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
|
|
1595
|
+
? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
|
|
1596
|
+
: undefined;
|
|
1515
1597
|
log.debug("Loaded scoring profiles", {
|
|
1516
1598
|
gold: goldProfile,
|
|
1517
1599
|
baseline: baselineProfileWeights,
|
|
1600
|
+
...(agenticProfile && { agentic: agenticProfile }),
|
|
1518
1601
|
});
|
|
1519
1602
|
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
|
|
1520
1603
|
log.debug("Baseline scores calculated", {
|
|
@@ -1541,7 +1624,8 @@ export async function calculateAndWriteScores(options) {
|
|
|
1541
1624
|
let evaluationMode;
|
|
1542
1625
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1543
1626
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
1544
|
-
|
|
1627
|
+
// Non-null assertion safe — the outer guard hoisting agenticProfile uses
|
|
1628
|
+
// the same condition; if we entered this block, the profile was resolved.
|
|
1545
1629
|
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
|
|
1546
1630
|
log.debug("Agentic scores calculated", {
|
|
1547
1631
|
featureCount: Object.keys(agenticScores).length,
|
|
@@ -1645,11 +1729,21 @@ export async function calculateAndWriteScores(options) {
|
|
|
1645
1729
|
});
|
|
1646
1730
|
}
|
|
1647
1731
|
}
|
|
1648
|
-
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1649
|
-
|
|
1650
|
-
//
|
|
1732
|
+
// Extract and persist per-test results (D0029: model output + metadata).
|
|
1733
|
+
// Literacy gold (with-docs) entries score against the default profile;
|
|
1734
|
+
// baseline (without-docs) entries score against the output-only profile.
|
|
1735
|
+
const testResults = extractStoredTestResults(baselineResultsPath, {
|
|
1736
|
+
gold: goldProfile,
|
|
1737
|
+
baseline: baselineProfileWeights,
|
|
1738
|
+
});
|
|
1739
|
+
// In full mode, also extract test results from agentic results — the
|
|
1740
|
+
// agentic file's gold entries score against the agentic profile while
|
|
1741
|
+
// baseline entries (if any leak through) still use the literacy baseline.
|
|
1651
1742
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1652
|
-
const agenticTestResults = extractStoredTestResults(agenticResultsPath
|
|
1743
|
+
const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
|
|
1744
|
+
gold: agenticProfile,
|
|
1745
|
+
baseline: baselineProfileWeights,
|
|
1746
|
+
});
|
|
1653
1747
|
testResults.push(...agenticTestResults);
|
|
1654
1748
|
}
|
|
1655
1749
|
if (testResults.length > 0) {
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
8
|
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
9
|
-
import type { LiteracyCompileOptions } from "./types.js";
|
|
9
|
+
import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
|
|
10
10
|
export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
|
|
11
11
|
/**
|
|
12
12
|
* Build baseline assertions matching the legacy expand-tasks behavior.
|
|
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
|
|
|
14
14
|
* - "full": all assertions carried over
|
|
15
15
|
* - "abbreviated": only first llm-rubric with shortened prompt
|
|
16
16
|
* - "none": no assertions
|
|
17
|
+
*
|
|
18
|
+
* `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
|
|
19
|
+
* mode's synthetic rubric. Without it the abbreviated emission would fail
|
|
20
|
+
* `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
|
|
21
|
+
* subJudgments, docCitations, confidence).
|
|
17
22
|
*/
|
|
18
|
-
export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
|
|
23
|
+
export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];
|
|
@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
|
|
|
45
45
|
return null;
|
|
46
46
|
const template = rubricConfig.templates["doc-coverage"];
|
|
47
47
|
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
// W0273 — use the centralized wire-shape footer so the grader emission
|
|
49
|
+
// parses against GraderEmittedJudgmentSchema. The pre-W0273 short
|
|
50
|
+
// {score, reason} footer caused 100% parse failures starting 2026-05-11.
|
|
51
|
+
const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
|
|
50
52
|
// doc-coverage benefits from the same authoritative reference — the grader
|
|
51
53
|
// needs the doc content to judge whether the candidate actually used what
|
|
52
54
|
// was documented.
|
|
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
|
|
|
92
94
|
* - "full": all assertions carried over
|
|
93
95
|
* - "abbreviated": only first llm-rubric with shortened prompt
|
|
94
96
|
* - "none": no assertions
|
|
97
|
+
*
|
|
98
|
+
* `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
|
|
99
|
+
* mode's synthetic rubric. Without it the abbreviated emission would fail
|
|
100
|
+
* `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
|
|
101
|
+
* subJudgments, docCitations, confidence).
|
|
95
102
|
*/
|
|
96
|
-
export function buildBaselineAssertions(goldAssertions, rubricMode) {
|
|
103
|
+
export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
|
|
97
104
|
const mode = rubricMode ?? "full";
|
|
98
105
|
if (mode === "none")
|
|
99
106
|
return [];
|
|
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
|
|
|
106
113
|
if (a.type === "llm-rubric") {
|
|
107
114
|
if (!foundFirst) {
|
|
108
115
|
foundFirst = true;
|
|
116
|
+
const footer = rubricConfig?.footer ??
|
|
117
|
+
'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
|
|
109
118
|
abbreviated.push({
|
|
110
119
|
type: "llm-rubric",
|
|
111
120
|
value: "Score task completion from 0 to 100 (same criteria as above).\n" +
|
|
112
|
-
|
|
121
|
+
footer,
|
|
113
122
|
...(a.provider ? { provider: a.provider } : {}),
|
|
114
123
|
});
|
|
115
124
|
}
|
|
@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
|
|
|
134
134
|
if (evalMode !== "agentic") {
|
|
135
135
|
const baselineEnabled = task.baseline?.enabled !== false;
|
|
136
136
|
if (baselineEnabled) {
|
|
137
|
-
const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
|
|
137
|
+
const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
|
|
138
138
|
tests.push({
|
|
139
139
|
description: `${taskTitle} (baseline)`,
|
|
140
140
|
vars: {
|
|
@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
|
|
|
50
50
|
* Returns provider arrays keyed by literacy variant name (baseline,
|
|
51
51
|
* agentic, observed). These are consumed by the YAML writer to produce
|
|
52
52
|
* the per-variant promptfoo config files.
|
|
53
|
+
*
|
|
54
|
+
* `loaded` (optional) lets callers pre-load and pre-filter the
|
|
55
|
+
* `ModelsConfig` so a caller-side filter (e.g. W0281's
|
|
56
|
+
* `filterModelsByRequest`) actually takes effect on the assembled
|
|
57
|
+
* providers — building providers from the unfiltered set would silently
|
|
58
|
+
* defeat the filter, since promptfoo decides which LLMs to call from the
|
|
59
|
+
* providers array, not the returned `models` field.
|
|
53
60
|
*/
|
|
54
|
-
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
|
|
61
|
+
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
|
|
55
62
|
/**
|
|
56
63
|
* Resolve `maxToolRounds` for an agentic variant (W0134).
|
|
57
64
|
*
|
|
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
|
|
|
60
67
|
* > hard fallback (5).
|
|
61
68
|
*/
|
|
62
69
|
export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
|
|
70
|
+
/**
|
|
71
|
+
* Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
|
|
72
|
+
* that need to pre-filter the model set before provider assembly (e.g.
|
|
73
|
+
* `PipelineRequest.models`) can hand the filtered config back to
|
|
74
|
+
* `loadModelsAndProviders` via its optional `loaded` parameter.
|
|
75
|
+
*/
|
|
76
|
+
export declare function loadModelsYaml(rootDir: string): ModelsConfig;
|
|
@@ -64,9 +64,16 @@ function applyReplaySwap(providers) {
|
|
|
64
64
|
* Returns provider arrays keyed by literacy variant name (baseline,
|
|
65
65
|
* agentic, observed). These are consumed by the YAML writer to produce
|
|
66
66
|
* the per-variant promptfoo config files.
|
|
67
|
+
*
|
|
68
|
+
* `loaded` (optional) lets callers pre-load and pre-filter the
|
|
69
|
+
* `ModelsConfig` so a caller-side filter (e.g. W0281's
|
|
70
|
+
* `filterModelsByRequest`) actually takes effect on the assembled
|
|
71
|
+
* providers — building providers from the unfiltered set would silently
|
|
72
|
+
* defeat the filter, since promptfoo decides which LLMs to call from the
|
|
73
|
+
* providers array, not the returned `models` field.
|
|
67
74
|
*/
|
|
68
|
-
export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins) {
|
|
69
|
-
const models = loadModelsYaml(rootDir);
|
|
75
|
+
export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins, loaded) {
|
|
76
|
+
const models = loaded ?? loadModelsYaml(rootDir);
|
|
70
77
|
return {
|
|
71
78
|
models,
|
|
72
79
|
providers: {
|
|
@@ -203,6 +210,12 @@ export function resolveMaxToolRounds(models, model, variant) {
|
|
|
203
210
|
// ---------------------------------------------------------------------------
|
|
204
211
|
// Helpers
|
|
205
212
|
// ---------------------------------------------------------------------------
|
|
206
|
-
|
|
213
|
+
/**
 * Read the `ModelsConfig` belonging to `rootDir` from disk.
 *
 * This is exported so a caller can load the config, narrow the model set
 * (e.g. by `PipelineRequest.models`), and then pass the narrowed config to
 * `loadModelsAndProviders` through its optional `loaded` parameter.
 *
 * @param rootDir - Directory whose "models" config file is loaded.
 * @returns The `data` payload of the loaded "models" config file.
 */
export function loadModelsYaml(rootDir) {
    const { data } = loadConfigFile("models", rootDir);
    return data;
}
|
|
@@ -1,24 +1,34 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
* Ceiling-cross-check failure-mode validator + report assembly
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly + keyword
|
|
5
|
+
* fallback classifier.
|
|
5
6
|
*
|
|
6
7
|
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
-
* (Plan 03-02 — `packages/eval/src/grader/`)
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
|
|
9
|
+
* is available to the pipeline. In practice (W0273 discovery), Promptfoo's
|
|
10
|
+
* `llm-rubric` post-processor extracts `score` + `reason` from the grader's
|
|
11
|
+
* JSON envelope and discards the rest of the structured surface — including
|
|
12
|
+
* `failureMode`. The wire-shape footer instructs the LLM correctly but the
|
|
13
|
+
* structured fields never reach `extractGraderJudgments`, so every emission
|
|
14
|
+
* arrives as the synthesized `failureMode: "unclassified"` placeholder.
|
|
12
15
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
16
|
+
* To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
|
|
17
|
+
* a keyword-pattern classifier is run as a FALLBACK when the grader's
|
|
18
|
+
* emitted mode is `"unclassified"` and the score is below the classification
|
|
19
|
+
* threshold. Plan 03-03 deleted this classifier in favor of grader-emission
|
|
20
|
+
* source-of-truth; W0273 reinstates it because the grader-emission path is
|
|
21
|
+
* blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
|
|
22
|
+
* (capturing the grader's full structured response) is tracked separately.
|
|
23
|
+
*
|
|
24
|
+
* `classifyByCeiling` continues to serve as the confidence cross-check.
|
|
17
25
|
*
|
|
18
26
|
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
27
|
* failureMode under the per-dimension taxonomy
|
|
20
28
|
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
29
|
* shape and `ceiling-cross-check` derivation tag
|
|
30
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
|
|
31
|
+
* cause (Promptfoo strips structured fields)
|
|
22
32
|
*/
|
|
23
33
|
import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
|
|
24
34
|
import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
|
|
@@ -1,24 +1,34 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
* Ceiling-cross-check failure-mode validator + report assembly
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly + keyword
|
|
5
|
+
* fallback classifier.
|
|
5
6
|
*
|
|
6
7
|
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
-
* (Plan 03-02 — `packages/eval/src/grader/`)
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
|
|
9
|
+
* is available to the pipeline. In practice (W0273 discovery), Promptfoo's
|
|
10
|
+
* `llm-rubric` post-processor extracts `score` + `reason` from the grader's
|
|
11
|
+
* JSON envelope and discards the rest of the structured surface — including
|
|
12
|
+
* `failureMode`. The wire-shape footer instructs the LLM correctly but the
|
|
13
|
+
* structured fields never reach `extractGraderJudgments`, so every emission
|
|
14
|
+
* arrives as the synthesized `failureMode: "unclassified"` placeholder.
|
|
12
15
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
16
|
+
* To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
|
|
17
|
+
* a keyword-pattern classifier is run as a FALLBACK when the grader's
|
|
18
|
+
* emitted mode is `"unclassified"` and the score is below the classification
|
|
19
|
+
* threshold. Plan 03-03 deleted this classifier in favor of grader-emission
|
|
20
|
+
* source-of-truth; W0273 reinstates it because the grader-emission path is
|
|
21
|
+
* blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
|
|
22
|
+
* (capturing the grader's full structured response) is tracked separately.
|
|
23
|
+
*
|
|
24
|
+
* `classifyByCeiling` continues to serve as the confidence cross-check.
|
|
17
25
|
*
|
|
18
26
|
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
27
|
* failureMode under the per-dimension taxonomy
|
|
20
28
|
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
29
|
* shape and `ceiling-cross-check` derivation tag
|
|
30
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
|
|
31
|
+
* cause (Promptfoo strips structured fields)
|
|
22
32
|
*/
|
|
23
33
|
import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
24
34
|
// ---------------------------------------------------------------------------
|
|
@@ -27,6 +37,20 @@ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/in
|
|
|
27
37
|
/** Only classify judgments with scores below this threshold */
|
|
28
38
|
const CLASSIFICATION_THRESHOLD = 60;
|
|
29
39
|
// ---------------------------------------------------------------------------
|
|
40
|
+
// Keyword patterns (W0273 fallback)
|
|
41
|
+
//
|
|
42
|
+
// Verbatim from the pre-Plan-03-03 implementation. Used only when the
|
|
43
|
+
// grader's emitted `failureMode` is `"unclassified"` — the grader's
|
|
44
|
+
// emission still wins whenever it actually reaches the pipeline.
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
/** API error pattern — checked FIRST to prevent timeout errors containing
|
|
47
|
+
* "deprecated" from being misclassified as outdated-docs. */
|
|
48
|
+
const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
|
|
49
|
+
// Matched by classifyByKeyword -> mode "outdated-docs", confidence "high".
const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
|
|
50
|
+
// Matched by classifyByKeyword -> mode "missing-docs", confidence "high".
const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
|
|
51
|
+
// Matched by classifyByKeyword -> mode "incorrect-docs", confidence "medium".
const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
|
|
52
|
+
// Matched by classifyByKeyword -> mode "poor-structure", confidence "medium".
const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
|
|
53
|
+
// ---------------------------------------------------------------------------
|
|
30
54
|
// Public API
|
|
31
55
|
// ---------------------------------------------------------------------------
|
|
32
56
|
/**
|
|
@@ -69,13 +93,25 @@ export function buildFailureModeReport(judgments, scores) {
|
|
|
69
93
|
// grader's actual taxonomy choice rather than a collapsed
|
|
70
94
|
// `"unclassified"` bucket.
|
|
71
95
|
const emittedMode = readEmittedMode(judgment);
|
|
96
|
+
// W0273 fallback — when the grader's emitted mode is "unclassified"
|
|
97
|
+
// (the synthesized-unparsed-judgment placeholder; in practice this
|
|
98
|
+
// is every judgment today because Promptfoo's llm-rubric strips the
|
|
99
|
+
// grader's structured response), try keyword classification against
|
|
100
|
+
// the reason prose. Gated on score < CLASSIFICATION_THRESHOLD so
|
|
101
|
+
// passing judgments don't get spurious classifications.
|
|
102
|
+
const keywordFallback = emittedMode === "unclassified" &&
|
|
103
|
+
judgment.score < CLASSIFICATION_THRESHOLD
|
|
104
|
+
? classifyByKeyword(judgment.reason)
|
|
105
|
+
: null;
|
|
72
106
|
// Cross-check the grader's emission against ceiling decomposition.
|
|
73
107
|
const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
|
|
74
|
-
const classification =
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
108
|
+
const classification = keywordFallback
|
|
109
|
+
? keywordFallback
|
|
110
|
+
: {
|
|
111
|
+
confidence: stamp.level,
|
|
112
|
+
mode: emittedMode,
|
|
113
|
+
source: "ceiling",
|
|
114
|
+
};
|
|
79
115
|
classifiedJudgments.push({ classification, judgment });
|
|
80
116
|
summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
|
|
81
117
|
// Per-area tracking
|
|
@@ -282,6 +318,39 @@ function readEmittedMode(judgment) {
|
|
|
282
318
|
}
|
|
283
319
|
return emitted;
|
|
284
320
|
}
|
|
321
|
+
/**
|
|
322
|
+
* Classify the failure mode of a low-scoring grader judgment by matching
|
|
323
|
+
* keyword patterns against the reason prose. Returns `null` when no
|
|
324
|
+
* pattern matches. Patterns checked in priority order (API errors first
|
|
325
|
+
* so timeout messages containing "deprecated" don't get misclassified
|
|
326
|
+
* as outdated-docs).
|
|
327
|
+
*
|
|
328
|
+
* W0273 — reinstated as a fallback when the grader's emitted failureMode
|
|
329
|
+
* is "unclassified". Plan 03-03 deleted this code in favor of grader-
|
|
330
|
+
* emission source-of-truth; the deletion is reversed here because
|
|
331
|
+
* Promptfoo's llm-rubric post-processor strips the grader's structured
|
|
332
|
+
* response (only score + reason survive into `comp.*`), so the
|
|
333
|
+
* grader-emission path produces 0% classification on every run.
|
|
334
|
+
*/
|
|
335
|
+
function classifyByKeyword(reason) {
|
|
336
|
+
const lower = reason.toLowerCase();
|
|
337
|
+
if (API_ERROR_PATTERN.test(lower)) {
|
|
338
|
+
return { confidence: "high", mode: "api-error", source: "keyword" };
|
|
339
|
+
}
|
|
340
|
+
if (OUTDATED_PATTERN.test(lower)) {
|
|
341
|
+
return { confidence: "high", mode: "outdated-docs", source: "keyword" };
|
|
342
|
+
}
|
|
343
|
+
if (MISSING_PATTERN.test(lower)) {
|
|
344
|
+
return { confidence: "high", mode: "missing-docs", source: "keyword" };
|
|
345
|
+
}
|
|
346
|
+
if (INCORRECT_PATTERN.test(lower)) {
|
|
347
|
+
return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
|
|
348
|
+
}
|
|
349
|
+
if (POOR_STRUCTURE_PATTERN.test(lower)) {
|
|
350
|
+
return { confidence: "medium", mode: "poor-structure", source: "keyword" };
|
|
351
|
+
}
|
|
352
|
+
return null;
|
|
353
|
+
}
|
|
285
354
|
/**
|
|
286
355
|
* Classify by ceiling-decomposition structural signals — preserved
|
|
287
356
|
* verbatim from the pre-Plan-03-03 implementation. The function itself
|
|
@@ -37,6 +37,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
37
37
|
mode,
|
|
38
38
|
variant,
|
|
39
39
|
debug: mapDebug(request.debug),
|
|
40
|
+
models: request.models,
|
|
40
41
|
areas: request.areas,
|
|
41
42
|
tasks: request.tasks,
|
|
42
43
|
changedDocs: request.changedDocs,
|
|
@@ -46,6 +47,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
46
47
|
compareEnabled: request.compare ?? false,
|
|
47
48
|
compareThreshold: request.compareThreshold,
|
|
48
49
|
compareBaseline: request.compareBaseline,
|
|
50
|
+
compareBaselineReportId: request.compareBaselineReportId,
|
|
49
51
|
gapAnalysisEnabled: request.gapAnalysis ?? true,
|
|
50
52
|
publishEnabled: request.publish ?? publishDefault,
|
|
51
53
|
publishTag: request.publishTag,
|
|
@@ -35,7 +35,7 @@ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof Literacy
|
|
|
35
35
|
export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
|
|
36
36
|
export interface NormalizedMode {
|
|
37
37
|
mode: EvalMode;
|
|
38
|
-
variant?:
|
|
38
|
+
variant?: LiteracyVariantName;
|
|
39
39
|
}
|
|
40
40
|
/**
|
|
41
41
|
* Normalize a raw CLI mode string to a canonical mode + optional variant.
|
|
@@ -55,6 +55,8 @@ const ALL_ACCEPTED = [
|
|
|
55
55
|
export function normalizeMode(input) {
|
|
56
56
|
if (LEGACY_LITERACY_VARIANTS.has(input)) {
|
|
57
57
|
console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
|
|
58
|
+
// The membership check above narrows `input` to LITERACY_VARIANTS — the
|
|
59
|
+
// cast is to the closed type, not a widening.
|
|
58
60
|
return { mode: "literacy", variant: input };
|
|
59
61
|
}
|
|
60
62
|
if (CANONICAL_MODES.has(input)) {
|