@sanity/ailf 4.2.0 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/package-surface.ts +37 -0
- package/config/preflight-scoring.ts +26 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
- package/dist/_vendor/ailf-core/config-helpers.js +67 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
- package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
- package/dist/adapters/index.d.ts +1 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
- package/dist/adapters/package-surface/dts-package-surface.js +173 -0
- package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
- package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
- package/dist/adapters/package-surface/index.d.ts +9 -0
- package/dist/adapters/package-surface/index.js +8 -0
- package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
- package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
- package/dist/adapters/task-sources/repo-schemas.js +15 -0
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +12 -0
- package/dist/commands/remote-pipeline.js +10 -2
- package/dist/commands/remote-results.d.ts +12 -1
- package/dist/commands/remote-results.js +25 -5
- package/dist/composition-root.js +9 -0
- package/dist/config/package-surface.ts +37 -0
- package/dist/config/preflight-scoring.ts +26 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
- package/dist/orchestration/pipeline-orchestrator.js +38 -0
- package/dist/orchestration/steps/calculate-scores-step.js +11 -0
- package/dist/orchestration/steps/generate-configs-step.js +16 -1
- package/dist/orchestration/steps/run-eval-step.js +27 -0
- package/dist/pipeline/calculate-scores.d.ts +66 -5
- package/dist/pipeline/calculate-scores.js +141 -27
- package/dist/pipeline/compiler/index.d.ts +1 -1
- package/dist/pipeline/compiler/index.js +1 -1
- package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
- package/dist/pipeline/compiler/literacy-bridge.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
- package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
- package/dist/pipeline/compiler/rubric-resolution.js +78 -2
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
- package/dist/pipeline/compiler/scoring-bridge.js +104 -10
- package/dist/pipeline/eval-fingerprint.d.ts +9 -0
- package/dist/pipeline/eval-fingerprint.js +7 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
- package/dist/pipeline/preflight/compute-preflight.js +118 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
- package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
- package/dist/pipeline/preflight/load-package-surface.js +19 -0
- package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
- package/dist/pipeline/preflight/load-preflight-context.js +25 -0
- package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
- package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
- package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
- package/dist/pipeline/preflight/parse-imports.js +125 -0
- package/dist/report-store.d.ts +8 -0
- package/dist/report-store.js +55 -6
- package/dist/sanity/document-renderers.d.ts +45 -7
- package/dist/sanity/document-renderers.js +99 -13
- package/dist/sanity/queries.d.ts +11 -11
- package/dist/sanity/queries.js +7 -0
- package/dist/sanity/symbol-index.d.ts +98 -0
- package/dist/sanity/symbol-index.js +615 -0
- package/package.json +2 -1
|
@@ -8,11 +8,11 @@ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
9
9
|
// Assertion resolution
|
|
10
10
|
// ---------------------------------------------------------------------------
|
|
11
|
-
export function resolveAssertions(task, options, warnings) {
|
|
11
|
+
export function resolveAssertions(task, options, warnings, canonicalReference) {
|
|
12
12
|
const assertions = [];
|
|
13
13
|
for (const a of task.assertions ?? []) {
|
|
14
14
|
if (a.type === "llm-rubric" && "template" in a) {
|
|
15
|
-
const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings);
|
|
15
|
+
const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings, canonicalReference, options?.preflightContext);
|
|
16
16
|
if (resolved)
|
|
17
17
|
assertions.push(resolved);
|
|
18
18
|
}
|
|
@@ -31,7 +31,7 @@ export function resolveAssertions(task, options, warnings) {
|
|
|
31
31
|
}
|
|
32
32
|
// Doc-coverage auto-generation
|
|
33
33
|
if (task.docCoverage) {
|
|
34
|
-
const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider);
|
|
34
|
+
const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider, canonicalReference);
|
|
35
35
|
if (docCoverageAssertion)
|
|
36
36
|
assertions.push(docCoverageAssertion);
|
|
37
37
|
}
|
|
@@ -40,22 +40,49 @@ export function resolveAssertions(task, options, warnings) {
|
|
|
40
40
|
// ---------------------------------------------------------------------------
|
|
41
41
|
// Doc-coverage assertion
|
|
42
42
|
// ---------------------------------------------------------------------------
|
|
43
|
-
function buildDocCoverageAssertion(rubricConfig, graderProvider) {
|
|
43
|
+
function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalReference) {
|
|
44
44
|
if (!rubricConfig?.templates["doc-coverage"])
|
|
45
45
|
return null;
|
|
46
46
|
const template = rubricConfig.templates["doc-coverage"];
|
|
47
47
|
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
48
48
|
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
49
49
|
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
50
|
+
// doc-coverage benefits from the same authoritative reference — the grader
|
|
51
|
+
// needs the doc content to judge whether the candidate actually used what
|
|
52
|
+
// was documented.
|
|
53
|
+
const rubricPrompt = canonicalReference
|
|
54
|
+
? buildDocCoverageRubricPrompt(rubricValue, canonicalReference)
|
|
55
|
+
: undefined;
|
|
50
56
|
return {
|
|
51
57
|
type: "llm-rubric",
|
|
52
58
|
value: rubricValue,
|
|
59
|
+
...(rubricPrompt ? { rubricPrompt } : {}),
|
|
53
60
|
...(graderProvider ? { provider: graderProvider } : {}),
|
|
54
61
|
...(template.dimension
|
|
55
62
|
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
56
63
|
: {}),
|
|
57
64
|
};
|
|
58
65
|
}
|
|
66
|
+
function buildDocCoverageRubricPrompt(rubric, reference) {
|
|
67
|
+
return [
|
|
68
|
+
"You are grading documentation coverage of a candidate response.",
|
|
69
|
+
"",
|
|
70
|
+
"AUTHORITATIVE REFERENCE — this is what the candidate had access to.",
|
|
71
|
+
"Score how well the candidate used what was documented here. Do not",
|
|
72
|
+
"penalize the candidate for missing information that is absent from",
|
|
73
|
+
"the reference.",
|
|
74
|
+
"",
|
|
75
|
+
"--- BEGIN REFERENCE ---",
|
|
76
|
+
reference,
|
|
77
|
+
"--- END REFERENCE ---",
|
|
78
|
+
"",
|
|
79
|
+
"RUBRIC:",
|
|
80
|
+
rubric,
|
|
81
|
+
"",
|
|
82
|
+
"CANDIDATE RESPONSE:",
|
|
83
|
+
"{{output}}",
|
|
84
|
+
].join("\n");
|
|
85
|
+
}
|
|
59
86
|
// ---------------------------------------------------------------------------
|
|
60
87
|
// Baseline assertion filtering
|
|
61
88
|
// ---------------------------------------------------------------------------
|
|
@@ -6,6 +6,8 @@
|
|
|
6
6
|
* - Baseline entry with without-docs prompt and empty docs
|
|
7
7
|
* - Rubric assertions with structured dimension metadata
|
|
8
8
|
*/
|
|
9
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
10
|
+
import { resolve } from "node:path";
|
|
9
11
|
import { LiteracyVariant, } from "../../../normalize-mode.js";
|
|
10
12
|
import { buildBaselineAssertions, resolveAssertions } from "./assertions.js";
|
|
11
13
|
import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
@@ -103,7 +105,15 @@ function buildTestCases(task, evalMode, options, warnings) {
|
|
|
103
105
|
}
|
|
104
106
|
const hasDocs = contextDocs.length > 0;
|
|
105
107
|
const docsVar = hasDocs ? `file://contexts/canonical/${task.id}.md` : "";
|
|
106
|
-
|
|
108
|
+
// When `graderContext` is "with-docs" and the task has canonical docs
|
|
109
|
+
// declared, read the resolved doc file so we can inject it into the
|
|
110
|
+
// assertion rubricPrompt as authoritative ground truth. The file is
|
|
111
|
+
// produced by `FetchDocsStep` (which runs before `GenerateConfigsStep`),
|
|
112
|
+
// so it should exist on disk by the time we're compiling.
|
|
113
|
+
const canonicalReference = options?.graderContext === "with-docs" && hasDocs
|
|
114
|
+
? readGraderReference(task.id, options?.rootDir, warnings)
|
|
115
|
+
: undefined;
|
|
116
|
+
const assertions = resolveAssertions(task, options, warnings, canonicalReference);
|
|
107
117
|
// Gold entry — canonical docs injected. Spread freeform extras first so
|
|
108
118
|
// canonical keys (task / docs / __featureArea) cannot be overridden.
|
|
109
119
|
const goldVars = {
|
|
@@ -142,3 +152,138 @@ function buildTestCases(task, evalMode, options, warnings) {
|
|
|
142
152
|
}
|
|
143
153
|
return tests;
|
|
144
154
|
}
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
// Canonical doc resolution
|
|
157
|
+
// ---------------------------------------------------------------------------
|
|
158
|
+
/**
|
|
159
|
+
* Read the grader-context reference for a task, preferring the W0197
|
|
160
|
+
* symbol-reference index when one was emitted by the doc fetcher and
|
|
161
|
+
* falling back to the full canonical doc otherwise.
|
|
162
|
+
*
|
|
163
|
+
* Resolution is driven by the per-task manifest at
|
|
164
|
+
* `<rootDir>/contexts/canonical-symbols/manifest.json` (W0197). The
|
|
165
|
+
* manifest distinguishes three states:
|
|
166
|
+
* 1. Manifest missing → fetcher hadn't run with W0197. Silent fallback
|
|
167
|
+
* to the full canonical doc; this is the backwards-compat path.
|
|
168
|
+
* 2. Manifest present, entry has `symbolCount > 0` → use the per-task
|
|
169
|
+
* symbol-index file as the grader reference (W0197 path).
|
|
170
|
+
* 3. Manifest present, entry has `symbolCount === 0` → extraction ran
|
|
171
|
+
* and produced nothing. Observable fallback (warning surfaced)
|
|
172
|
+
* to the full canonical doc.
|
|
173
|
+
*
|
|
174
|
+
* Returns undefined and pushes a warning when the canonical doc itself
|
|
175
|
+
* is missing or unreadable, when rootDir is unset, or when a resolved
|
|
176
|
+
* path escapes rootDir (path-traversal guard mirroring
|
|
177
|
+
* `fixture-resolver.ts:resolveFileRef`).
|
|
178
|
+
*/
|
|
179
|
+
function readGraderReference(taskId, rootDir, warnings) {
|
|
180
|
+
if (!rootDir) {
|
|
181
|
+
warnings.push(`graderContext "with-docs" requires rootDir — canonical reference for "${taskId}" not injected`);
|
|
182
|
+
return undefined;
|
|
183
|
+
}
|
|
184
|
+
const manifestEntry = readSymbolIndexManifestEntry(taskId, rootDir, warnings);
|
|
185
|
+
if (manifestEntry !== "missing-manifest") {
|
|
186
|
+
if (manifestEntry === "path-traversal")
|
|
187
|
+
return undefined;
|
|
188
|
+
if (manifestEntry === null) {
|
|
189
|
+
// Manifest exists but task isn't listed — shouldn't happen for tasks
|
|
190
|
+
// the fetcher processed. Treat as silent fallback (matches "missing
|
|
191
|
+
// manifest" semantics for safety) but log a warning so an unexpected
|
|
192
|
+
// mismatch is observable.
|
|
193
|
+
warnings.push(`graderContext: task "${taskId}" not present in symbol-index manifest — falling back to full canonical doc injection`);
|
|
194
|
+
}
|
|
195
|
+
else if (manifestEntry.symbolCount === 0) {
|
|
196
|
+
warnings.push(`graderContext: symbol index empty for "${taskId}" — falling back to full canonical doc injection`);
|
|
197
|
+
}
|
|
198
|
+
else {
|
|
199
|
+
// Symbol-index path: read the per-task .md file.
|
|
200
|
+
const symbolsRelative = manifestEntry.path;
|
|
201
|
+
const symbolsAbsolute = resolveWithinRoot(rootDir, symbolsRelative);
|
|
202
|
+
if (symbolsAbsolute === null) {
|
|
203
|
+
warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
|
|
204
|
+
return undefined;
|
|
205
|
+
}
|
|
206
|
+
if (existsSync(symbolsAbsolute)) {
|
|
207
|
+
const content = readFileSafe(symbolsAbsolute, symbolsRelative, warnings, taskId);
|
|
208
|
+
if (content && content.trim().length > 0)
|
|
209
|
+
return content;
|
|
210
|
+
}
|
|
211
|
+
// Manifest claimed symbols but the file is missing/unreadable —
|
|
212
|
+
// surface as a warning and continue to full-doc fallback.
|
|
213
|
+
warnings.push(`graderContext: symbol index file at ${symbolsRelative} is missing or empty despite manifest entry — falling back to full canonical doc for "${taskId}"`);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
// Fall back to the full canonical doc (W0196 path).
|
|
217
|
+
const canonicalRelative = `contexts/canonical/${taskId}.md`;
|
|
218
|
+
const canonicalAbsolute = resolveWithinRoot(rootDir, canonicalRelative);
|
|
219
|
+
if (canonicalAbsolute === null) {
|
|
220
|
+
warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
|
|
221
|
+
return undefined;
|
|
222
|
+
}
|
|
223
|
+
if (!existsSync(canonicalAbsolute)) {
|
|
224
|
+
warnings.push(`graderContext "with-docs": canonical doc not found at ${canonicalRelative} — run fetch-docs first; reference not injected for "${taskId}"`);
|
|
225
|
+
return undefined;
|
|
226
|
+
}
|
|
227
|
+
return (readFileSafe(canonicalAbsolute, canonicalRelative, warnings, taskId) ??
|
|
228
|
+
undefined);
|
|
229
|
+
}
|
|
230
|
+
function readSymbolIndexManifestEntry(taskId, rootDir, warnings) {
|
|
231
|
+
const manifestRelative = "contexts/canonical-symbols/manifest.json";
|
|
232
|
+
const manifestAbsolute = resolveWithinRoot(rootDir, manifestRelative);
|
|
233
|
+
if (manifestAbsolute === null)
|
|
234
|
+
return "path-traversal";
|
|
235
|
+
if (!existsSync(manifestAbsolute))
|
|
236
|
+
return "missing-manifest";
|
|
237
|
+
const body = readFileSafe(manifestAbsolute, manifestRelative, warnings, taskId);
|
|
238
|
+
if (body === null)
|
|
239
|
+
return "missing-manifest";
|
|
240
|
+
let parsed;
|
|
241
|
+
try {
|
|
242
|
+
parsed = JSON.parse(body);
|
|
243
|
+
}
|
|
244
|
+
catch (err) {
|
|
245
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
246
|
+
warnings.push(`graderContext: symbol-index manifest at ${manifestRelative} is unparseable (${msg}) — falling back to full canonical doc for "${taskId}"`);
|
|
247
|
+
return "missing-manifest";
|
|
248
|
+
}
|
|
249
|
+
if (!parsed || typeof parsed !== "object")
|
|
250
|
+
return null;
|
|
251
|
+
const entries = parsed.entries;
|
|
252
|
+
if (!Array.isArray(entries))
|
|
253
|
+
return null;
|
|
254
|
+
for (const e of entries) {
|
|
255
|
+
if (!e || typeof e !== "object")
|
|
256
|
+
continue;
|
|
257
|
+
const entry = e;
|
|
258
|
+
if (typeof entry.taskId === "string" &&
|
|
259
|
+
typeof entry.path === "string" &&
|
|
260
|
+
typeof entry.symbolCount === "number" &&
|
|
261
|
+
entry.taskId === taskId) {
|
|
262
|
+
return {
|
|
263
|
+
taskId: entry.taskId,
|
|
264
|
+
path: entry.path,
|
|
265
|
+
symbolCount: entry.symbolCount,
|
|
266
|
+
};
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
return null;
|
|
270
|
+
}
|
|
271
|
+
function resolveWithinRoot(rootDir, relativePath) {
|
|
272
|
+
const absolutePath = resolve(rootDir, relativePath);
|
|
273
|
+
const normalizedBase = resolve(rootDir) + "/";
|
|
274
|
+
if (!absolutePath.startsWith(normalizedBase) &&
|
|
275
|
+
absolutePath !== resolve(rootDir)) {
|
|
276
|
+
return null;
|
|
277
|
+
}
|
|
278
|
+
return absolutePath;
|
|
279
|
+
}
|
|
280
|
+
function readFileSafe(absolutePath, relativePath, warnings, taskId) {
|
|
281
|
+
try {
|
|
282
|
+
return readFileSync(absolutePath, "utf-8");
|
|
283
|
+
}
|
|
284
|
+
catch (err) {
|
|
285
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
286
|
+
warnings.push(`graderContext: failed to read ${relativePath}: ${msg} — reference not injected for "${taskId}"`);
|
|
287
|
+
return null;
|
|
288
|
+
}
|
|
289
|
+
}
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
* Shared types for the literacy mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
-
export type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
6
|
-
import type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
5
|
+
export type { PreflightRubricContext, RubricResolutionInput, } from "../../rubric-resolution.js";
|
|
6
|
+
import type { PreflightRubricContext, RubricResolutionInput } from "../../rubric-resolution.js";
|
|
7
7
|
/** Options for compiling a literacy task */
|
|
8
8
|
export interface LiteracyCompileOptions {
|
|
9
9
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -20,6 +20,21 @@ export interface LiteracyCompileOptions {
|
|
|
20
20
|
}[];
|
|
21
21
|
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
22
22
|
rubricConfig?: RubricResolutionInput;
|
|
23
|
+
/**
|
|
24
|
+
* Grader context policy. When `"with-docs"` and the task declares
|
|
25
|
+
* `context.docs`, the canonical doc content is read from
|
|
26
|
+
* `<rootDir>/contexts/canonical/<task.id>.md` and injected into the
|
|
27
|
+
* Promptfoo `rubricPrompt`.
|
|
28
|
+
*/
|
|
29
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
30
|
+
/**
|
|
31
|
+
* W0198 Phase 6 — when supplied, the `code-correctness` rubric is
|
|
32
|
+
* prefixed with a "DETERMINISTIC PREFLIGHT" instruction telling the
|
|
33
|
+
* grader to treat the deterministic lane's existence verdicts as
|
|
34
|
+
* ground truth. Sourced from the package-surface manifest at compile
|
|
35
|
+
* time; absence collapses cleanly to the pre-W0198 rubric.
|
|
36
|
+
*/
|
|
37
|
+
preflightContext?: PreflightRubricContext;
|
|
23
38
|
}
|
|
24
39
|
/** Result of compiling a single literacy task */
|
|
25
40
|
export interface LiteracyCompileResult {
|
|
@@ -10,9 +10,12 @@
|
|
|
10
10
|
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
11
|
*
|
|
12
12
|
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
|
|
13
14
|
* @see config/rubrics.ts — template definitions
|
|
14
15
|
*/
|
|
16
|
+
import type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
|
|
15
17
|
import type { PromptfooAssertion } from "./assertion-mapper.js";
|
|
18
|
+
export type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
|
|
16
19
|
/** Minimal rubric config needed for template resolution */
|
|
17
20
|
export interface RubricResolutionInput {
|
|
18
21
|
templates: Record<string, {
|
|
@@ -31,10 +34,23 @@ export interface RubricResolutionInput {
|
|
|
31
34
|
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
32
35
|
* to create the final rubric prompt.
|
|
33
36
|
*
|
|
37
|
+
* When `canonicalReference` is supplied, it is emitted on the assertion's
|
|
38
|
+
* `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
|
|
39
|
+
* wrapped in a ground-truth framing so the grader treats the supplied
|
|
40
|
+
* content as authoritative. The rubric `value` itself is unchanged. Without
|
|
41
|
+
* it, the grader falls back on training priors and may hallucinate against
|
|
42
|
+
* neighboring API surfaces.
|
|
43
|
+
*
|
|
34
44
|
* Returns null (with a warning) if the template can't be resolved.
|
|
35
45
|
*/
|
|
36
46
|
export declare function resolveTemplatedAssertion(assertion: {
|
|
37
47
|
criteria: string[];
|
|
38
48
|
template: string;
|
|
39
49
|
type: string;
|
|
40
|
-
}, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
50
|
+
}, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[], canonicalReference?: string, preflightContext?: PreflightRubricContext): PromptfooAssertion | null;
|
|
51
|
+
/**
|
|
52
|
+
* Build the W0198 Phase 6 preflight preface for a `code-correctness`
|
|
53
|
+
* rubric. Returned with a trailing newline so it composes cleanly with
|
|
54
|
+
* the existing rubric body.
|
|
55
|
+
*/
|
|
56
|
+
export declare function buildPreflightSection(context: PreflightRubricContext): string;
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
11
|
*
|
|
12
12
|
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
|
|
13
14
|
* @see config/rubrics.ts — template definitions
|
|
14
15
|
*/
|
|
15
16
|
// ---------------------------------------------------------------------------
|
|
@@ -24,9 +25,16 @@
|
|
|
24
25
|
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
25
26
|
* to create the final rubric prompt.
|
|
26
27
|
*
|
|
28
|
+
* When `canonicalReference` is supplied, it is emitted on the assertion's
|
|
29
|
+
* `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
|
|
30
|
+
* wrapped in a ground-truth framing so the grader treats the supplied
|
|
31
|
+
* content as authoritative. The rubric `value` itself is unchanged. Without
|
|
32
|
+
* it, the grader falls back on training priors and may hallucinate against
|
|
33
|
+
* neighboring API surfaces.
|
|
34
|
+
*
|
|
27
35
|
* Returns null (with a warning) if the template can't be resolved.
|
|
28
36
|
*/
|
|
29
|
-
export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings) {
|
|
37
|
+
export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings, canonicalReference, preflightContext) {
|
|
30
38
|
if (!rubricConfig) {
|
|
31
39
|
warnings.push(`No rubric config — template "${assertion.template}" cannot be resolved`);
|
|
32
40
|
return null;
|
|
@@ -38,15 +46,83 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
|
|
|
38
46
|
}
|
|
39
47
|
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
40
48
|
const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
|
|
41
|
-
|
|
49
|
+
// W0198 Phase 6 — when the deterministic preflight lane is wired and this
|
|
50
|
+
// rubric scores `code-correctness`, prefix a system instruction so the
|
|
51
|
+
// grader does not re-judge symbol existence. The lane separation is the
|
|
52
|
+
// whole reason W0198 exists; if both lanes weigh existence, the rubric's
|
|
53
|
+
// hallucinations sneak back into a dimension we want deterministic.
|
|
54
|
+
const preflightSection = preflightContext && template.dimension === "code-correctness"
|
|
55
|
+
? buildPreflightSection(preflightContext)
|
|
56
|
+
: "";
|
|
57
|
+
const rubricValue = preflightSection +
|
|
58
|
+
`${template.header}\n${scaleText}\n\n` +
|
|
42
59
|
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
43
60
|
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
61
|
+
const rubricPrompt = canonicalReference
|
|
62
|
+
? buildRubricPromptWithReference(rubricValue, canonicalReference)
|
|
63
|
+
: undefined;
|
|
44
64
|
return {
|
|
45
65
|
type: "llm-rubric",
|
|
46
66
|
value: rubricValue,
|
|
67
|
+
...(rubricPrompt ? { rubricPrompt } : {}),
|
|
47
68
|
...(graderProvider ? { provider: graderProvider } : {}),
|
|
48
69
|
...(template.dimension
|
|
49
70
|
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
50
71
|
: {}),
|
|
51
72
|
};
|
|
52
73
|
}
|
|
74
|
+
/**
|
|
75
|
+
* Build the W0198 Phase 6 preflight preface for a `code-correctness`
|
|
76
|
+
* rubric. Returned with a trailing newline so it composes cleanly with
|
|
77
|
+
* the existing rubric body.
|
|
78
|
+
*/
|
|
79
|
+
export function buildPreflightSection(context) {
|
|
80
|
+
const packageList = context.packages.length > 0
|
|
81
|
+
? context.packages.join(", ")
|
|
82
|
+
: "(no packages in manifest)";
|
|
83
|
+
return [
|
|
84
|
+
"DETERMINISTIC PREFLIGHT — A separate lane checks whether each named",
|
|
85
|
+
"binding the candidate imports (e.g. `import { foo } from 'pkg'`) actually",
|
|
86
|
+
"exports from its source package, against the actual `.d.ts` of the",
|
|
87
|
+
"installed version. Default and namespace imports (`import x from 'pkg'`,",
|
|
88
|
+
"`import * as x from 'pkg'`) are NOT preflight-checked and fall through to",
|
|
89
|
+
"you. Treat any existence verdict the preflight returns as ground truth —",
|
|
90
|
+
"do NOT include symbol-existence concerns in your `code-correctness`",
|
|
91
|
+
"judgment. Confine your code-correctness score to idiomatic usage, code",
|
|
92
|
+
"organization, type safety, and completeness against the task.",
|
|
93
|
+
"",
|
|
94
|
+
`In-scope packages (preflight rules on named bindings imported from`,
|
|
95
|
+
`these): ${packageList}.`,
|
|
96
|
+
"",
|
|
97
|
+
"",
|
|
98
|
+
].join("\n");
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Build the full rubric-prompt string sent to the grader when canonical
|
|
102
|
+
* reference content is available. The framing explicitly tells the grader
|
|
103
|
+
* to treat the reference as authoritative ground truth so it cannot fall
|
|
104
|
+
* back on training priors when checking symbol existence or API shape.
|
|
105
|
+
*
|
|
106
|
+
* Promptfoo interpolates `{{output}}` and `{{rubric}}` at grade time —
|
|
107
|
+
* `{{output}}` is the candidate response; `{{rubric}}` is the assertion's
|
|
108
|
+
* `value` field (the rubric text built above).
|
|
109
|
+
*/
|
|
110
|
+
function buildRubricPromptWithReference(rubric, reference) {
|
|
111
|
+
return [
|
|
112
|
+
"You are grading a candidate response against a rubric.",
|
|
113
|
+
"",
|
|
114
|
+
"AUTHORITATIVE REFERENCE — treat this as ground truth. If a symbol, function,",
|
|
115
|
+
"API, or pattern appears here, it exists; do not contradict it from prior",
|
|
116
|
+
"knowledge. If something is absent here, do not assume it exists.",
|
|
117
|
+
"",
|
|
118
|
+
"--- BEGIN REFERENCE ---",
|
|
119
|
+
reference,
|
|
120
|
+
"--- END REFERENCE ---",
|
|
121
|
+
"",
|
|
122
|
+
"RUBRIC:",
|
|
123
|
+
rubric,
|
|
124
|
+
"",
|
|
125
|
+
"CANDIDATE RESPONSE:",
|
|
126
|
+
"{{output}}",
|
|
127
|
+
].join("\n");
|
|
128
|
+
}
|
|
@@ -16,11 +16,19 @@
|
|
|
16
16
|
* engine works in [0, 1]; this module handles the conversion at
|
|
17
17
|
* boundaries.
|
|
18
18
|
*
|
|
19
|
+
* W0198 Phase 5 — when a `preflightForTest` callback is provided and
|
|
20
|
+
* returns a `SymbolPreflightReport`, the bridge synthesizes one extra
|
|
21
|
+
* `AssertionScore` per test in the `code-correctness` dimension. The
|
|
22
|
+
* deterministic preflight and the LLM rubric merge through D0010's
|
|
23
|
+
* weighted dimension aggregation; the relative share is set by
|
|
24
|
+
* `preflightWeight` in `[0, 1]`.
|
|
25
|
+
*
|
|
19
26
|
* @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
|
|
20
27
|
* @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
|
|
21
28
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
29
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
|
|
22
30
|
*/
|
|
23
|
-
import { type DimensionScore } from "../../_vendor/ailf-core/index.d.ts";
|
|
31
|
+
import { type AssertionScore, type DimensionScore, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
|
|
24
32
|
import type { TestResult } from "../../_vendor/ailf-core/index.d.ts";
|
|
25
33
|
/** Result of scoring a group of tests via the 4-tier engine */
|
|
26
34
|
export interface BridgedScoreResult {
|
|
@@ -33,6 +41,24 @@ export interface BridgedScoreResult {
|
|
|
33
41
|
/** Raw DimensionScore objects from the engine (0–1 scale) */
|
|
34
42
|
rawDimensions: DimensionScore[];
|
|
35
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* W0198 Phase 5 options. When `preflightForTest` is omitted (or returns
|
|
46
|
+
* `undefined`) the bridge stays purely rubric-driven — identical to the
|
|
47
|
+
* pre-W0198 behavior. When a report is returned, one `code-correctness`
|
|
48
|
+
* preflight assertion is synthesized per test and merged with the rubric
|
|
49
|
+
* via weighted-mean.
|
|
50
|
+
*/
|
|
51
|
+
export interface ScoreTestGroupOptions {
|
|
52
|
+
/** Look up the preflight report attached to a particular test, if any. */
|
|
53
|
+
preflightForTest?: (test: TestResult) => SymbolPreflightReport | undefined;
|
|
54
|
+
/**
|
|
55
|
+
* Preflight's share of the `code-correctness` dimension, in `[0, 1]`.
|
|
56
|
+
* The complementary share belongs to the LLM rubric. Defaults to
|
|
57
|
+
* `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` (0.4). Ignored when
|
|
58
|
+
* `preflightForTest` is omitted.
|
|
59
|
+
*/
|
|
60
|
+
preflightWeight?: number;
|
|
61
|
+
}
|
|
36
62
|
/**
|
|
37
63
|
* Score a group of test results using the 4-tier scoring engine.
|
|
38
64
|
*
|
|
@@ -44,6 +70,27 @@ export interface BridgedScoreResult {
|
|
|
44
70
|
* @param profile Weight profile mapping kebab-case dimension names to weights
|
|
45
71
|
* (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
|
|
46
72
|
* @param taskId Optional task identifier for traceability in TaskScore output
|
|
73
|
+
* @param options Optional W0198 Phase 5 preflight integration
|
|
47
74
|
* @returns Dimensions (0–100) and composite (0–100), matching legacy output format
|
|
48
75
|
*/
|
|
49
|
-
export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string): BridgedScoreResult;
|
|
76
|
+
export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string, options?: ScoreTestGroupOptions): BridgedScoreResult;
|
|
77
|
+
/**
|
|
78
|
+
* Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
|
|
79
|
+
* preflight report.
|
|
80
|
+
*
|
|
81
|
+
* The score is `1 - min(1, total / cap)`. With the default
|
|
82
|
+
* `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0, 1 missing → 0.667,
|
|
83
|
+
* 2 missing → 0.333, ≥3 missing → 0.0.
|
|
84
|
+
*
|
|
85
|
+
* Edge cases for the deduction config:
|
|
86
|
+
* - `cap === 0` (measurement-only config): score is 1.0 — divide-by-zero
|
|
87
|
+
* would NaN otherwise.
|
|
88
|
+
* - `cap < 0` (misconfigured): score is 1.0 and the merge silently
|
|
89
|
+
* collapses to rubric-only on this dimension. The Phase 3 Zod schema
|
|
90
|
+
* gates against this upstream so it should never reach here, but the
|
|
91
|
+
* guard preserves the never-deduct invariant if it does.
|
|
92
|
+
*
|
|
93
|
+
* `unresolved` findings never deduct (the preflight's never-deduct rule):
|
|
94
|
+
* they're not part of `total`, so they fall through to the LLM rubric.
|
|
95
|
+
*/
|
|
96
|
+
export declare function preflightToScore(report: SymbolPreflightReport, weight: number): AssertionScore;
|