@sanity/ailf 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +56 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
- package/dist/_vendor/ailf-core/types/index.js +16 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +59 -1
- package/dist/composition-root.js +95 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -29,9 +29,11 @@
|
|
|
29
29
|
*/
|
|
30
30
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
31
31
|
import { join } from "path";
|
|
32
|
-
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
32
|
+
import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
33
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
34
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
|
+
import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
|
|
36
|
+
import { validateFailureMode } from "./failure-modes.js";
|
|
35
37
|
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
36
38
|
import { checkResultsExist } from "./checks.js";
|
|
37
39
|
import { loadRubricTemplates } from "./rubric-loader.js";
|
|
@@ -99,6 +101,54 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
|
|
|
99
101
|
// ---------------------------------------------------------------------------
|
|
100
102
|
// URL extraction from assertion metadata
|
|
101
103
|
// ---------------------------------------------------------------------------
|
|
104
|
+
/**
 * Build the Phase 3 GRAD-05 placeholder judgment used when a grader reason
 * could not be parsed by the strict schema.
 *
 * Caller-supplied fields (`taskId`, `modelId`, `dimension`, `score`,
 * `reason`) are copied through; `outputFailure: true` is added only when
 * the incoming flag is truthy. Everything else is a fixed default: empty
 * `subJudgments` / `docCitations` / `hallucinationCheckedAgainst`,
 * `failureMode: "unclassified"`, a low-confidence sentinel, and metadata
 * with an unknown grader model.
 *
 * The id comes from `generateJudgmentId`, seeded with
 * `(taskId, modelId, dimension)` plus `runId` when one is supplied, so ids
 * from different runs of the same task do not collide.
 *
 * Property insertion order is kept stable on purpose: the resulting objects
 * are serialized with `JSON.stringify` further down the pipeline.
 *
 * @param input - `{ taskId, modelId, dimension, score, reason, outputFailure, runId }`.
 * @returns The synthesized judgment object.
 */
function synthesizeUnparsedJudgment(input) {
    const { taskId, modelId, dimension, score, reason, outputFailure, runId } = input;
    // `runId` only participates in the id seed when it is truthy.
    const idSeed = runId
        ? { taskId, modelId, dimension, runId }
        : { taskId, modelId, dimension };
    // Sentinel derivation tag: marks the confidence as a synthesize-time
    // placeholder. `validateGraderJudgmentsCalibration` replaces
    // `judgment.confidence` wholesale, so this value should only be visible
    // if a judgment bypasses that pass.
    const placeholderConfidence = {
        level: "low",
        signalsPresent: 0,
        derivation: "synthesized-pre-cross-check",
    };
    const failureFlag = outputFailure ? { outputFailure: true } : {};
    return {
        dimension,
        modelId,
        ...failureFlag,
        reason,
        score,
        taskId,
        judgmentId: generateJudgmentId(idSeed),
        subJudgments: [],
        docCitations: [],
        failureMode: "unclassified",
        confidence: placeholderConfidence,
        hallucinationCheckedAgainst: [],
        metadata: {
            graderModel: "unknown",
            graderJudgmentsVersion,
        },
    };
}
|
|
102
152
|
/**
|
|
103
153
|
* Extract grader judgments (reason text + scores) from evaluation results.
|
|
104
154
|
*
|
|
@@ -107,8 +157,15 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
|
|
|
107
157
|
* assertion produces one GraderJudgment entry.
|
|
108
158
|
*
|
|
109
159
|
* Phase 3a prerequisite: structured judgment data for failure mode extraction.
|
|
160
|
+
*
|
|
161
|
+
* @param resultsPath - Path to the Promptfoo results JSON file.
|
|
162
|
+
* @param telemetry - Optional reliability counter (Plan 03-03). When passed,
|
|
163
|
+
* `parseFailures` is incremented on every strict-schema rejection so the
|
|
164
|
+
* live pipeline can surface schema drift over time.
|
|
165
|
+
* `runId` (when supplied) is threaded into synthesized fall-back judgment
|
|
166
|
+
* ids so dedup keys are unique per-run.
|
|
110
167
|
*/
|
|
111
|
-
export function extractGraderJudgments(resultsPath) {
|
|
168
|
+
export function extractGraderJudgments(resultsPath, telemetry) {
|
|
112
169
|
const results = readAndNormalizeResults(resultsPath);
|
|
113
170
|
const judgments = [];
|
|
114
171
|
for (const result of results) {
|
|
@@ -127,36 +184,242 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
127
184
|
continue;
|
|
128
185
|
}
|
|
129
186
|
const score = parseRubricScore(comp);
|
|
130
|
-
// Extract the reason text — the grader's reasoning
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
187
|
+
// Extract the reason text — the grader's reasoning. Plan 03-01
|
|
188
|
+
// (D0045 trust boundary): the inline `JSON.parse + as`-cast at
|
|
189
|
+
// this site is replaced with `GraderJudgmentSchema.safeParse`
|
|
190
|
+
// so that grader output flows through a validated schema before
|
|
191
|
+
// it enters the scoring pipeline. On parse failure we fall to
|
|
192
|
+
// an `unclassified`-shape Phase 1 judgment built from the raw
|
|
193
|
+
// reason string — NEVER fall back to the legacy parser (Pitfall
|
|
194
|
+
// 4: strict and legacy schemas are deliberate siblings, not a
|
|
195
|
+
// fall-through chain).
|
|
196
|
+
const reasonRaw = comp.reason ?? "";
|
|
197
|
+
let parsedJudgment = null;
|
|
198
|
+
let reason = reasonRaw;
|
|
199
|
+
if (reasonRaw) {
|
|
134
200
|
try {
|
|
135
|
-
const
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
201
|
+
const candidate = JSON.parse(reasonRaw);
|
|
202
|
+
// The strict schema asserts the full GraderJudgment surface.
|
|
203
|
+
// safeParse handles non-object inputs (number, array, etc.)
|
|
204
|
+
// by failing — we don't pre-narrow here.
|
|
205
|
+
const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
|
|
206
|
+
const result = GraderJudgmentSchema.safeParse({
|
|
207
|
+
...candidateObj,
|
|
208
|
+
taskId,
|
|
209
|
+
modelId,
|
|
210
|
+
dimension: kind,
|
|
211
|
+
});
|
|
212
|
+
if (result.success) {
|
|
213
|
+
parsedJudgment = result.data;
|
|
214
|
+
reason = result.data.reason;
|
|
215
|
+
}
|
|
216
|
+
else {
|
|
217
|
+
// Parse failure — drop to failureMode='unclassified' below.
|
|
218
|
+
// Plan 03-03: increment graderReliability.parseFailures so the
|
|
219
|
+
// live pipeline surfaces schema drift over time. When no
|
|
220
|
+
// telemetry sink is passed, the increment is a no-op (preserves
|
|
221
|
+
// the pre-Plan-03-03 calling shape for any callers that don't
|
|
222
|
+
// care about reliability counters).
|
|
223
|
+
if (telemetry) {
|
|
224
|
+
telemetry.reliability.parseFailures =
|
|
225
|
+
(telemetry.reliability.parseFailures ?? 0) + 1;
|
|
226
|
+
}
|
|
227
|
+
reason = reasonRaw;
|
|
140
228
|
}
|
|
141
229
|
}
|
|
142
230
|
catch {
|
|
143
|
-
// Not JSON —
|
|
231
|
+
// Not JSON (legacy free-prose) — keep raw reason. The
|
|
232
|
+
// unclassified-fall-back path below covers this case.
|
|
144
233
|
}
|
|
145
234
|
}
|
|
146
235
|
// Also flag synthesized api-error judgments as output failures
|
|
147
236
|
const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
237
|
+
if (parsedJudgment !== null) {
|
|
238
|
+
// Strict-shape parse succeeded — preserve the parsed structured
|
|
239
|
+
// surface (additive GRAD-02 fields if the grader emitted them)
|
|
240
|
+
// and overlay pipeline-side semantics (outputFailure flag, the
|
|
241
|
+
// post-extraction score, and outer-context fields).
|
|
242
|
+
//
|
|
243
|
+
// The grader's emitted `score` from the JSON envelope is
|
|
244
|
+
// intentionally discarded — `parseRubricScore(comp)` (the
|
|
245
|
+
// component-level score Promptfoo computed) wins. Phase 3 keeps
|
|
246
|
+
// this asymmetry intentionally; the GRAD-06 cutover plan
|
|
247
|
+
// revisits whether the grader-emitted score should win or
|
|
248
|
+
// surface a tolerance-mismatch parseFailure variant.
|
|
249
|
+
judgments.push({
|
|
250
|
+
...parsedJudgment,
|
|
251
|
+
dimension: kind,
|
|
252
|
+
modelId,
|
|
253
|
+
taskId,
|
|
254
|
+
score,
|
|
255
|
+
...(outputFailure && { outputFailure: true }),
|
|
256
|
+
reason,
|
|
257
|
+
});
|
|
258
|
+
}
|
|
259
|
+
else {
|
|
260
|
+
// Phase 3 GRAD-05 fallback — strict-schema parse failed. The
|
|
261
|
+
// GRAD-02 surface is now required, so we synthesize sensible
|
|
262
|
+
// defaults that mark this judgment as unparsed:
|
|
263
|
+
// - failureMode: "unclassified" (per per-dimension taxonomy in
|
|
264
|
+
// packages/eval/src/grader/common.ts).
|
|
265
|
+
// - confidence: low / 0 signals / "synthesized-pre-cross-check"
|
|
266
|
+
// (D0049 — sentinel placeholder; overwritten by
|
|
267
|
+
// validateGraderJudgmentsCalibration with "ceiling-cross-check"
|
|
268
|
+
// along the live pipeline path. The placeholder is a distinct
|
|
269
|
+
// tag so the unparsed state is obvious if it leaks downstream).
|
|
270
|
+
// - hallucinationCheckedAgainst, subJudgments, docCitations:
|
|
271
|
+
// empty arrays.
|
|
272
|
+
// - judgmentId: deterministic taskId::modelId::dimension stamp
|
|
273
|
+
// so consumers can dedupe on it.
|
|
274
|
+
judgments.push(synthesizeUnparsedJudgment({
|
|
275
|
+
taskId,
|
|
276
|
+
modelId,
|
|
277
|
+
dimension: kind,
|
|
278
|
+
score,
|
|
279
|
+
reason,
|
|
280
|
+
outputFailure,
|
|
281
|
+
...(telemetry?.runId ? { runId: telemetry.runId } : {}),
|
|
282
|
+
}));
|
|
283
|
+
}
|
|
156
284
|
}
|
|
157
285
|
}
|
|
158
286
|
return judgments;
|
|
159
287
|
}
|
|
288
|
+
/**
 * Overlay every grader judgment with the confidence stamp returned by
 * `validateFailureMode` and count calibration misses.
 *
 * For each judgment the function:
 *  1. resolves a feature area (first via `detectFeatureArea(taskId)`, then
 *     by the first score key that is a prefix of `taskId`);
 *  2. looks up that area's `ceilingScore` / `floorScore`, defaulting to
 *     100 / 0 when the area or the field is missing;
 *  3. replaces `judgment.confidence` with the stamp from
 *     `validateFailureMode(judgment, ceilingScore, floorScore)`;
 *  4. increments `reliability.failureModeCalibration` when the stamp is
 *     `level === "medium"` with `signalsPresent === 2`, unless the judgment
 *     was a synthesized parse-fallback.
 *
 * `judgments` and `reliability` are mutated in place; nothing is returned.
 * `hallucinationCheckedAgainst` is not touched here.
 *
 * @param judgments - Grader judgments produced by `extractGraderJudgments`.
 * @param scores - Per-area feature scores, keyed here by their `feature`.
 * @param reliability - Sink whose `failureModeCalibration` counter is
 *   incremented on each counted disagreement.
 *
 * @see docs/decisions/D0005-grader-model-separation.md
 * @see docs/decisions/D0049-shared-confidence-contract.md
 */
export function validateGraderJudgmentsCalibration(judgments, scores, reliability) {
    const SYNTHESIZED_DERIVATION = "synthesized-pre-cross-check";
    const scoresByFeature = new Map();
    for (const entry of scores) {
        scoresByFeature.set(entry.feature, entry);
    }
    // Area resolution is kept local to this function rather than shared
    // with the failure-modes module.
    const resolveArea = (taskId) => {
        const detected = detectFeatureArea(taskId);
        if (detected !== "other" && scoresByFeature.has(detected)) {
            return detected;
        }
        for (const feature of scoresByFeature.keys()) {
            if (taskId.startsWith(feature)) {
                return feature;
            }
        }
        return undefined;
    };
    for (const judgment of judgments) {
        const area = resolveArea(judgment.taskId);
        const areaScore = area ? scoresByFeature.get(area) : undefined;
        const ceilingScore = areaScore?.ceilingScore ?? 100;
        const floorScore = areaScore?.floorScore ?? 0;
        // Read the derivation tag BEFORE the confidence is replaced below:
        // once overwritten, nothing else distinguishes a parse-fallback
        // judgment from one the grader genuinely emitted.
        const cameFromParseFallback = judgment.confidence?.derivation === SYNTHESIZED_DERIVATION;
        const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
        judgment.confidence = stamp;
        // A miss needs both signals present and disagreeing. A single
        // signal (signalsPresent === 1) leaves nothing to cross-check.
        const bothSignalsDisagree = stamp.level === "medium" && stamp.signalsPresent === 2;
        if (!bothSignalsDisagree) {
            continue;
        }
        // Parse fallbacks carry a hard-coded "unclassified" mode; counting
        // them would conflate parse failures with calibration drift. The
        // derivation sentinel is used instead of a `failureMode` check so a
        // grader that genuinely emits "unclassified" is still counted.
        if (cameFromParseFallback) {
            continue;
        }
        reliability.failureModeCalibration =
            (reliability.failureModeCalibration ?? 0) + 1;
    }
}
|
|
379
|
+
/**
|
|
380
|
+
* Populate Pitfall #11 hallucination cross-check fields on grader
|
|
381
|
+
* judgments (Plan 03-04 GRAD-05).
|
|
382
|
+
*
|
|
383
|
+
* For each judgment, sets `hallucinationCheckedAgainst` to the union of
|
|
384
|
+
* (a) the slugs of docs the task declared in `context.docs` and (b) the
|
|
385
|
+
* run's full document manifest. For each entry in `judgment.docCitations`
|
|
386
|
+
* that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
|
|
387
|
+
* that does not appear in either set is a fabrication, not a real
|
|
388
|
+
* resolvable doc.
|
|
389
|
+
*
|
|
390
|
+
* Mutates `judgments` in place. Slug-less citations are left with
|
|
391
|
+
* `hallucinated` undefined since the lookup key is the slug per the
|
|
392
|
+
* GraderJudgment domain doc-comment ("slug does not resolve against the
|
|
393
|
+
* task's contextDocs set").
|
|
394
|
+
*
|
|
395
|
+
* Called from `gap-analysis-step` (literacy mode only) after
|
|
396
|
+
* `descToDocRefs` and `documentManifest` are built — both inputs are
|
|
397
|
+
* unavailable at extract time. Non-literacy modes skip this step
|
|
398
|
+
* entirely (the runtime contract has no canonical doc set to check
|
|
399
|
+
* against).
|
|
400
|
+
*
|
|
401
|
+
* @param judgments - Grader judgments to enrich. Mutated in place.
|
|
402
|
+
* @param taskDocSlugs - Map from base task description (the form judgment
|
|
403
|
+
* `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
|
|
404
|
+
* to the slugs declared in that task's `context.docs`.
|
|
405
|
+
* @param manifestSlugs - All slugs in the run's document manifest.
|
|
406
|
+
*/
|
|
407
|
+
export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlugs) {
|
|
408
|
+
const manifestSet = new Set(manifestSlugs);
|
|
409
|
+
for (const judgment of judgments) {
|
|
410
|
+
const baseDesc = judgment.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
411
|
+
const taskSlugs = taskDocSlugs.get(baseDesc) ?? [];
|
|
412
|
+
const union = new Set(manifestSet);
|
|
413
|
+
for (const slug of taskSlugs)
|
|
414
|
+
union.add(slug);
|
|
415
|
+
judgment.hallucinationCheckedAgainst = [...union];
|
|
416
|
+
for (const citation of judgment.docCitations) {
|
|
417
|
+
if (typeof citation.slug === "string" && citation.slug.length > 0) {
|
|
418
|
+
citation.hallucinated = !union.has(citation.slug);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
}
|
|
160
423
|
/**
|
|
161
424
|
* Maximum characters (JS string length, not bytes) to store for model
|
|
162
425
|
* response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
|
|
@@ -998,7 +1261,56 @@ export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptio
|
|
|
998
1261
|
*/
|
|
999
1262
|
// mergeScores — imported from @sanity/ailf-core above
|
|
1000
1263
|
const CRITICAL_THRESHOLD = 40;
|
|
1001
|
-
|
|
1264
|
+
/**
|
|
1265
|
+
* Build a regrade callback for the borderline runner from the original
|
|
1266
|
+
* Promptfoo results file. Resolves each judgment back to its
|
|
1267
|
+
* `(taskId, modelId, dimension)` component result, then calls the
|
|
1268
|
+
* supplied `regradeOnce` to score the response against the rubric
|
|
1269
|
+
* again. Returns the original judgment score when the lookup or
|
|
1270
|
+
* regrade call fails — the runner treats that as a degenerate replica
|
|
1271
|
+
* (the consensus median absorbs it).
|
|
1272
|
+
*/
|
|
1273
|
+
function buildBorderlineRegrader(input) {
|
|
1274
|
+
// Lazily build the lookup map on first call so we don't pay the
|
|
1275
|
+
// file-read cost when the runner short-circuits on no borderline
|
|
1276
|
+
// judgments. Merges componentResults across every supplied results
|
|
1277
|
+
// file so full-mode (baseline + agentic) judgments resolve to their
|
|
1278
|
+
// original (responseText, rubricText) pair regardless of source file.
|
|
1279
|
+
let lookup = null;
|
|
1280
|
+
return async (judgment) => {
|
|
1281
|
+
if (lookup === null) {
|
|
1282
|
+
lookup = new Map();
|
|
1283
|
+
for (const path of input.resultsPaths) {
|
|
1284
|
+
const results = readAndNormalizeResults(path);
|
|
1285
|
+
for (const result of results) {
|
|
1286
|
+
const taskId = result.description;
|
|
1287
|
+
const modelId = result.providerId ?? result.providerLabel ?? "unknown";
|
|
1288
|
+
for (const comp of result.gradingResult.componentResults) {
|
|
1289
|
+
if (comp.assertion?.type !== "llm-rubric")
|
|
1290
|
+
continue;
|
|
1291
|
+
const dimension = classifyRubric(comp);
|
|
1292
|
+
if (!dimension)
|
|
1293
|
+
continue;
|
|
1294
|
+
const rubricText = typeof comp.assertion.value === "string"
|
|
1295
|
+
? comp.assertion.value
|
|
1296
|
+
: "";
|
|
1297
|
+
const responseText = result.response?.output ?? "";
|
|
1298
|
+
lookup.set(`${taskId}::${modelId}::${dimension}`, {
|
|
1299
|
+
responseText,
|
|
1300
|
+
rubricText,
|
|
1301
|
+
});
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1305
|
+
}
|
|
1306
|
+
const ctx = lookup.get(`${judgment.taskId}::${judgment.modelId}::${judgment.dimension}`);
|
|
1307
|
+
if (!ctx)
|
|
1308
|
+
return judgment.score;
|
|
1309
|
+
const replica = await input.regradeOnce(ctx.responseText, ctx.rubricText);
|
|
1310
|
+
return replica ?? judgment.score;
|
|
1311
|
+
};
|
|
1312
|
+
}
|
|
1313
|
+
export async function calculateAndWriteScores(options) {
|
|
1002
1314
|
const ROOT = options.rootDir;
|
|
1003
1315
|
const log = options.logger ?? new ConsoleLogger();
|
|
1004
1316
|
const sourceName = options.source;
|
|
@@ -1026,6 +1338,44 @@ export function calculateAndWriteScores(options) {
|
|
|
1026
1338
|
const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
|
|
1027
1339
|
// Agentic results path (only used in full mode)
|
|
1028
1340
|
const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
|
|
1341
|
+
// Borderline-consensus pass — shared across all four scoring modes
|
|
1342
|
+
// (literacy, agent-harness, knowledge-probe, mcp-server). Each mode's
|
|
1343
|
+
// persist site invokes this AFTER `extractGraderJudgments` and BEFORE
|
|
1344
|
+
// `writeFileSync(grader-judgments.json)` so the persisted file carries
|
|
1345
|
+
// the consensus median rather than single-replica scores. Returns the
|
|
1346
|
+
// per-judgment consistency map (or null) so the literacy branch can
|
|
1347
|
+
// also persist `borderline-consistency.json`.
|
|
1348
|
+
const runBorderlinePass = async (judgments, resultsPaths) => {
|
|
1349
|
+
if (judgments.length === 0 ||
|
|
1350
|
+
!options.borderlineConsensusRunner ||
|
|
1351
|
+
!options.borderlineRegradeOnce) {
|
|
1352
|
+
return null;
|
|
1353
|
+
}
|
|
1354
|
+
const regrade = buildBorderlineRegrader({
|
|
1355
|
+
resultsPaths,
|
|
1356
|
+
regradeOnce: options.borderlineRegradeOnce,
|
|
1357
|
+
logger: log,
|
|
1358
|
+
});
|
|
1359
|
+
try {
|
|
1360
|
+
const { judgments: regraded, consistencyByJudgment } = await options.borderlineConsensusRunner({
|
|
1361
|
+
judgments,
|
|
1362
|
+
regrade,
|
|
1363
|
+
logger: log,
|
|
1364
|
+
});
|
|
1365
|
+
// Mutate-in-place so subsequent steps (validateGraderJudgmentsCalibration,
|
|
1366
|
+
// persist) see the consensus-merged scores.
|
|
1367
|
+
judgments.length = 0;
|
|
1368
|
+
judgments.push(...regraded);
|
|
1369
|
+
if (consistencyByJudgment.size > 0) {
|
|
1370
|
+
log.info(`Borderline consensus merged ${consistencyByJudgment.size} judgment(s)`);
|
|
1371
|
+
}
|
|
1372
|
+
return consistencyByJudgment;
|
|
1373
|
+
}
|
|
1374
|
+
catch (err) {
|
|
1375
|
+
log.warn(`Borderline consensus pass failed — falling back to single-replica scores: ${err instanceof Error ? err.message : String(err)}`);
|
|
1376
|
+
return null;
|
|
1377
|
+
}
|
|
1378
|
+
};
|
|
1029
1379
|
// Validate baseline results file
|
|
1030
1380
|
const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
|
|
1031
1381
|
const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
|
|
@@ -1079,7 +1429,14 @@ export function calculateAndWriteScores(options) {
|
|
|
1079
1429
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1080
1430
|
// Extract and persist grader judgments
|
|
1081
1431
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
1432
|
+
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1433
|
+
baselineResultsPath,
|
|
1434
|
+
]);
|
|
1082
1435
|
if (judgments.length > 0) {
|
|
1436
|
+
if (borderlineConsistency && borderlineConsistency.size > 0) {
|
|
1437
|
+
writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
|
|
1438
|
+
log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
|
|
1439
|
+
}
|
|
1083
1440
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1084
1441
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1085
1442
|
}
|
|
@@ -1130,7 +1487,14 @@ export function calculateAndWriteScores(options) {
|
|
|
1130
1487
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1131
1488
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1132
1489
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
1490
|
+
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1491
|
+
baselineResultsPath,
|
|
1492
|
+
]);
|
|
1133
1493
|
if (judgments.length > 0) {
|
|
1494
|
+
if (borderlineConsistency && borderlineConsistency.size > 0) {
|
|
1495
|
+
writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
|
|
1496
|
+
log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
|
|
1497
|
+
}
|
|
1134
1498
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1135
1499
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1136
1500
|
}
|
|
@@ -1232,16 +1596,54 @@ export function calculateAndWriteScores(options) {
|
|
|
1232
1596
|
mkdirSync(outDir, { recursive: true });
|
|
1233
1597
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1234
1598
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1235
|
-
// Extract and persist grader judgments (Phase 3a: failure mode extraction)
|
|
1236
|
-
|
|
1599
|
+
// Extract and persist grader judgments (Phase 3a: failure mode extraction).
|
|
1600
|
+
//
|
|
1601
|
+
// Plan 03-03 — wire calibration telemetry. The strict-schema parse
|
|
1602
|
+
// failure counter (`parseFailures`) is incremented during extraction;
|
|
1603
|
+
// the ceiling-cross-check disagreement counter (`failureModeCalibration`)
|
|
1604
|
+
// is incremented during the post-extraction validation pass below.
|
|
1605
|
+
const reliability = { graderModel: "unknown" };
|
|
1606
|
+
const judgments = extractGraderJudgments(baselineResultsPath, {
|
|
1607
|
+
reliability,
|
|
1608
|
+
...(options.runId ? { runId: options.runId } : {}),
|
|
1609
|
+
});
|
|
1237
1610
|
// In full mode, also extract judgments from agentic results
|
|
1238
1611
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1239
|
-
const agenticJudgments = extractGraderJudgments(agenticResultsPath
|
|
1612
|
+
const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
|
|
1613
|
+
reliability,
|
|
1614
|
+
...(options.runId ? { runId: options.runId } : {}),
|
|
1615
|
+
});
|
|
1240
1616
|
judgments.push(...agenticJudgments);
|
|
1241
1617
|
}
|
|
1618
|
+
// Borderline-consensus pass — re-grade the ±5 borderline subset N times
|
|
1619
|
+
// and merge medians back into the canonical judgments BEFORE
|
|
1620
|
+
// `validateGraderJudgmentsCalibration` runs, so the calibration counter
|
|
1621
|
+
// sees the consensus-merged scores rather than single-replica noise.
|
|
1622
|
+
// In full mode, the merged `judgments` array includes entries sourced
|
|
1623
|
+
// from both baseline and agentic result files — pass both paths so the
|
|
1624
|
+
// regrader's lookup map can resolve agentic-sourced judgments back to
|
|
1625
|
+
// their (responseText, rubricText) pair instead of falling through to
|
|
1626
|
+
// the unchanged-score branch.
|
|
1627
|
+
const borderlineConsistency = await runBorderlinePass(judgments, mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
|
|
1628
|
+
? [baselineResultsPath, agenticResultsPath]
|
|
1629
|
+
: [baselineResultsPath]);
|
|
1242
1630
|
if (judgments.length > 0) {
|
|
1631
|
+
// Stamp each judgment with the D0049 ceiling-cross-check confidence
|
|
1632
|
+
// triple and accumulate `failureModeCalibration` disagreement counts.
|
|
1633
|
+
validateGraderJudgmentsCalibration(judgments, scores, reliability);
|
|
1634
|
+
if (borderlineConsistency && borderlineConsistency.size > 0) {
|
|
1635
|
+
writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
|
|
1636
|
+
log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
|
|
1637
|
+
}
|
|
1243
1638
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1244
1639
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1640
|
+
if (reliability.parseFailures !== undefined ||
|
|
1641
|
+
reliability.failureModeCalibration !== undefined) {
|
|
1642
|
+
log.debug("Grader reliability telemetry", {
|
|
1643
|
+
parseFailures: reliability.parseFailures ?? 0,
|
|
1644
|
+
failureModeCalibration: reliability.failureModeCalibration ?? 0,
|
|
1645
|
+
});
|
|
1646
|
+
}
|
|
1245
1647
|
}
|
|
1246
1648
|
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1247
1649
|
const testResults = extractStoredTestResults(baselineResultsPath);
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
*
|
|
19
19
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
20
20
|
*/
|
|
21
|
-
import type
|
|
21
|
+
import { type LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
22
22
|
import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
|
|
23
23
|
import type { PreflightRubricContext } from "./rubric-resolution.js";
|
|
24
24
|
import { type LiteracyEvalSubMode } from "../normalize-mode.js";
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
*
|
|
19
19
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
20
20
|
*/
|
|
21
|
+
import { RubricConfigSchema, formatZodErrors, } from "../../_vendor/ailf-core/index.js";
|
|
21
22
|
import { compileLiteracyTask, } from "./mode-handlers/literacy/index.js";
|
|
22
23
|
import { tryLoadConfigFile } from "./config-loader.js";
|
|
23
24
|
import { buildTaskGraph } from "./task-graph-builder.js";
|
|
@@ -152,23 +153,40 @@ function loadRubricResolutionInput(rootDir) {
|
|
|
152
153
|
const result = tryLoadConfigFile("rubrics", rootDir);
|
|
153
154
|
if (!result)
|
|
154
155
|
return undefined;
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
}
|
|
169
|
-
return { templates };
|
|
156
|
+
// Run the canonical RubricConfigSchema (Plan 03-01) at the trust
|
|
157
|
+
// boundary. Hand-rolled .ailf/rubrics.{yaml,json} configs and test
|
|
158
|
+
// fixtures previously bypassed validation: tryLoadConfigFile only
|
|
159
|
+
// ran the format-specific parser, then per-field unsafe casts read
|
|
160
|
+
// the result. With the strict GraderJudgmentSchema from Plan 03-04,
|
|
161
|
+
// a missing footer silently became "" and every grader emission
|
|
162
|
+
// fell through to the synthesized 'unclassified' fallback (CR-03).
|
|
163
|
+
// Fail loudly with a Zod-formatted message instead so config drift
|
|
164
|
+
// is caught at load time, not at every grader emission.
|
|
165
|
+
const parseResult = RubricConfigSchema.safeParse(result.data);
|
|
166
|
+
if (!parseResult.success) {
|
|
167
|
+
const lines = formatZodErrors(parseResult.error);
|
|
168
|
+
throw new Error(`Invalid rubric config at ${result.filePath}:\n${lines.join("\n")}`);
|
|
170
169
|
}
|
|
171
|
-
|
|
172
|
-
|
|
170
|
+
const parsed = parseResult.data;
|
|
171
|
+
const templates = {};
|
|
172
|
+
for (const [key, t] of Object.entries(parsed.templates)) {
|
|
173
|
+
// Plan 03-02 — thread per-dimension failureModes through to the
|
|
174
|
+
// runtime prompt assembler when the config stamped the field via
|
|
175
|
+
// `failureModesForDimension(dimension)`.
|
|
176
|
+
templates[key] = {
|
|
177
|
+
header: t.header,
|
|
178
|
+
scale: t.scale,
|
|
179
|
+
...(t.dimension !== undefined ? { dimension: t.dimension } : {}),
|
|
180
|
+
...(t.criteria_label !== undefined && t.criteria_label !== null
|
|
181
|
+
? { criteria_label: t.criteria_label }
|
|
182
|
+
: {}),
|
|
183
|
+
...(t.failureModes && t.failureModes.length > 0
|
|
184
|
+
? { failureModes: t.failureModes }
|
|
185
|
+
: {}),
|
|
186
|
+
};
|
|
173
187
|
}
|
|
188
|
+
// Plan 03-01 — footer is min(1) per RubricConfigSchema, so safeParse
|
|
189
|
+
// above guarantees a non-empty string here. No defensive empty-string
|
|
190
|
+
// fall-back needed.
|
|
191
|
+
return { templates, footer: parsed.footer };
|
|
174
192
|
}
|
|
@@ -23,7 +23,22 @@ export interface RubricResolutionInput {
|
|
|
23
23
|
dimension?: string;
|
|
24
24
|
header: string;
|
|
25
25
|
scale: string[];
|
|
26
|
+
/**
|
|
27
|
+
* Plan 03-02 — per-dimension legal failure-mode list emitted into the
|
|
28
|
+
* rubric prompt body before `${rubricConfig.footer}`. Sourced from
|
|
29
|
+
* `failureModesForDimension(dimension)` in
|
|
30
|
+
* `packages/eval/src/grader/index.ts` so the grader is told which modes
|
|
31
|
+
* are legal for this dimension family.
|
|
32
|
+
*/
|
|
33
|
+
failureModes?: readonly string[];
|
|
26
34
|
}>;
|
|
35
|
+
/**
|
|
36
|
+
* Plan 03-01 — formerly hard-coded literal at L98-102; now sourced
|
|
37
|
+
* from RubricConfig.footer. The footer documents the target wire
|
|
38
|
+
* format the grader emits (structured GraderJudgment shape sketch in
|
|
39
|
+
* Phase 3+; legacy `{score, reason}` JSON pre-Phase-3).
|
|
40
|
+
*/
|
|
41
|
+
footer: string;
|
|
27
42
|
}
|
|
28
43
|
/**
|
|
29
44
|
* Resolve a templated LLM-rubric assertion into a fully assembled
|
|
@@ -54,10 +54,18 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
|
|
|
54
54
|
const preflightSection = preflightContext && template.dimension === "code-correctness"
|
|
55
55
|
? buildPreflightSection(preflightContext)
|
|
56
56
|
: "";
|
|
57
|
+
// Plan 03-02 — when the template carries a per-dimension legal-mode
|
|
58
|
+
// list, announce it to the grader before the structured-shape footer
|
|
59
|
+
// (Plan 03-01). Empty list => no announcement (preserves prior wording
|
|
60
|
+
// for templates that haven't yet been wired to a taxonomy family).
|
|
61
|
+
const failureModesLine = template.failureModes && template.failureModes.length > 0
|
|
62
|
+
? `\nThe "failureMode" must be one of: ${template.failureModes.join(", ")}.\n\n`
|
|
63
|
+
: "";
|
|
57
64
|
const rubricValue = preflightSection +
|
|
58
65
|
`${template.header}\n${scaleText}\n\n` +
|
|
59
66
|
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
60
|
-
|
|
67
|
+
failureModesLine +
|
|
68
|
+
`${rubricConfig.footer}`;
|
|
61
69
|
const rubricPrompt = canonicalReference
|
|
62
70
|
? buildRubricPromptWithReference(rubricValue, canonicalReference)
|
|
63
71
|
: undefined;
|