@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  22. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  23. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
  25. package/dist/_vendor/ailf-core/types/index.js +1 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  27. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  28. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  29. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  30. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  31. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  32. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  33. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  34. package/dist/adapters/api-client/build-request.d.ts +1 -0
  35. package/dist/adapters/api-client/build-request.js +3 -0
  36. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  37. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
  38. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
  39. package/dist/adapters/index.d.ts +1 -0
  40. package/dist/adapters/index.js +1 -0
  41. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  42. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  43. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  44. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  45. package/dist/adapters/package-surface/index.d.ts +9 -0
  46. package/dist/adapters/package-surface/index.js +8 -0
  47. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  48. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  49. package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
  50. package/dist/adapters/task-sources/repo-schemas.js +15 -0
  51. package/dist/commands/pipeline-action.d.ts +2 -0
  52. package/dist/commands/pipeline-action.js +12 -0
  53. package/dist/commands/remote-pipeline.js +10 -2
  54. package/dist/commands/remote-results.d.ts +12 -1
  55. package/dist/commands/remote-results.js +25 -5
  56. package/dist/composition-root.js +9 -0
  57. package/dist/config/package-surface.ts +37 -0
  58. package/dist/config/preflight-scoring.ts +26 -0
  59. package/dist/index.d.ts +2 -2
  60. package/dist/index.js +1 -1
  61. package/dist/orchestration/build-app-context.js +1 -0
  62. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  63. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  64. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  65. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  66. package/dist/orchestration/steps/run-eval-step.js +27 -0
  67. package/dist/pipeline/calculate-scores.d.ts +66 -5
  68. package/dist/pipeline/calculate-scores.js +141 -27
  69. package/dist/pipeline/compiler/index.d.ts +1 -1
  70. package/dist/pipeline/compiler/index.js +1 -1
  71. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  72. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/map-request-to-config.js +1 -0
  85. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  86. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  88. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  89. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  90. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  91. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  92. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  94. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  95. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  96. package/dist/pipeline/preflight/parse-imports.js +125 -0
  97. package/dist/report-store.d.ts +8 -0
  98. package/dist/report-store.js +55 -6
  99. package/dist/sanity/document-renderers.d.ts +45 -7
  100. package/dist/sanity/document-renderers.js +99 -13
  101. package/dist/sanity/queries.d.ts +11 -11
  102. package/dist/sanity/queries.js +7 -0
  103. package/dist/sanity/symbol-index.d.ts +98 -0
  104. package/dist/sanity/symbol-index.js +615 -0
  105. package/package.json +2 -1
@@ -8,11 +8,11 @@ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
8
8
  // ---------------------------------------------------------------------------
9
9
  // Assertion resolution
10
10
  // ---------------------------------------------------------------------------
11
- export function resolveAssertions(task, options, warnings) {
11
+ export function resolveAssertions(task, options, warnings, canonicalReference) {
12
12
  const assertions = [];
13
13
  for (const a of task.assertions ?? []) {
14
14
  if (a.type === "llm-rubric" && "template" in a) {
15
- const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings);
15
+ const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings, canonicalReference, options?.preflightContext);
16
16
  if (resolved)
17
17
  assertions.push(resolved);
18
18
  }
@@ -31,7 +31,7 @@ export function resolveAssertions(task, options, warnings) {
31
31
  }
32
32
  // Doc-coverage auto-generation
33
33
  if (task.docCoverage) {
34
- const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider);
34
+ const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider, canonicalReference);
35
35
  if (docCoverageAssertion)
36
36
  assertions.push(docCoverageAssertion);
37
37
  }
@@ -40,22 +40,49 @@ export function resolveAssertions(task, options, warnings) {
40
40
  // ---------------------------------------------------------------------------
41
41
  // Doc-coverage assertion
42
42
  // ---------------------------------------------------------------------------
43
- function buildDocCoverageAssertion(rubricConfig, graderProvider) {
43
+ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalReference) {
44
44
  if (!rubricConfig?.templates["doc-coverage"])
45
45
  return null;
46
46
  const template = rubricConfig.templates["doc-coverage"];
47
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
48
48
  const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
49
  `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
50
+ // doc-coverage benefits from the same authoritative reference — the grader
51
+ // needs the doc content to judge whether the candidate actually used what
52
+ // was documented.
53
+ const rubricPrompt = canonicalReference
54
+ ? buildDocCoverageRubricPrompt(rubricValue, canonicalReference)
55
+ : undefined;
50
56
  return {
51
57
  type: "llm-rubric",
52
58
  value: rubricValue,
59
+ ...(rubricPrompt ? { rubricPrompt } : {}),
53
60
  ...(graderProvider ? { provider: graderProvider } : {}),
54
61
  ...(template.dimension
55
62
  ? { metadata: { dimension: template.dimension, maxScore: 100 } }
56
63
  : {}),
57
64
  };
58
65
  }
66
+ function buildDocCoverageRubricPrompt(rubric, reference) {
67
+ return [
68
+ "You are grading documentation coverage of a candidate response.",
69
+ "",
70
+ "AUTHORITATIVE REFERENCE — this is what the candidate had access to.",
71
+ "Score how well the candidate used what was documented here. Do not",
72
+ "penalize the candidate for missing information that is absent from",
73
+ "the reference.",
74
+ "",
75
+ "--- BEGIN REFERENCE ---",
76
+ reference,
77
+ "--- END REFERENCE ---",
78
+ "",
79
+ "RUBRIC:",
80
+ rubric,
81
+ "",
82
+ "CANDIDATE RESPONSE:",
83
+ "{{output}}",
84
+ ].join("\n");
85
+ }
59
86
  // ---------------------------------------------------------------------------
60
87
  // Baseline assertion filtering
61
88
  // ---------------------------------------------------------------------------
@@ -6,6 +6,8 @@
6
6
  * - Baseline entry with without-docs prompt and empty docs
7
7
  * - Rubric assertions with structured dimension metadata
8
8
  */
9
+ import { existsSync, readFileSync } from "node:fs";
10
+ import { resolve } from "node:path";
9
11
  import { LiteracyVariant, } from "../../../normalize-mode.js";
10
12
  import { buildBaselineAssertions, resolveAssertions } from "./assertions.js";
11
13
  import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
@@ -103,7 +105,15 @@ function buildTestCases(task, evalMode, options, warnings) {
103
105
  }
104
106
  const hasDocs = contextDocs.length > 0;
105
107
  const docsVar = hasDocs ? `file://contexts/canonical/${task.id}.md` : "";
106
- const assertions = resolveAssertions(task, options, warnings);
108
+ // When `graderContext` is "with-docs" and the task has canonical docs
109
+ // declared, read the resolved doc file so we can inject it into the
110
+ // assertion rubricPrompt as authoritative ground truth. The file is
111
+ // produced by `FetchDocsStep` (which runs before `GenerateConfigsStep`),
112
+ // so it should exist on disk by the time we're compiling.
113
+ const canonicalReference = options?.graderContext === "with-docs" && hasDocs
114
+ ? readGraderReference(task.id, options?.rootDir, warnings)
115
+ : undefined;
116
+ const assertions = resolveAssertions(task, options, warnings, canonicalReference);
107
117
  // Gold entry — canonical docs injected. Spread freeform extras first so
108
118
  // canonical keys (task / docs / __featureArea) cannot be overridden.
109
119
  const goldVars = {
@@ -142,3 +152,138 @@ function buildTestCases(task, evalMode, options, warnings) {
142
152
  }
143
153
  return tests;
144
154
  }
155
+ // ---------------------------------------------------------------------------
156
+ // Canonical doc resolution
157
+ // ---------------------------------------------------------------------------
158
+ /**
159
+ * Read the grader-context reference for a task, preferring the W0197
160
+ * symbol-reference index when one was emitted by the doc fetcher and
161
+ * falling back to the full canonical doc otherwise.
162
+ *
163
+ * Resolution is driven by the per-task manifest at
164
+ * `<rootDir>/contexts/canonical-symbols/manifest.json` (W0197). The
165
+ * manifest distinguishes three states:
166
+ * 1. Manifest missing → fetcher hadn't run with W0197. Silent fallback
167
+ * to the full canonical doc; this is the backwards-compat path.
168
+ * 2. Manifest present, entry has `symbolCount > 0` → use the per-task
169
+ * symbol-index file as the grader reference (W0197 path).
170
+ * 3. Manifest present, entry has `symbolCount === 0` → extraction ran
171
+ * and produced nothing. Observable fallback (warning surfaced)
172
+ * to the full canonical doc.
173
+ *
174
+ * Returns undefined and pushes a warning when the canonical doc itself
175
+ * is missing or unreadable, when rootDir is unset, or when a resolved
176
+ * path escapes rootDir (path-traversal guard mirroring
177
+ * `fixture-resolver.ts:resolveFileRef`).
178
+ */
179
+ function readGraderReference(taskId, rootDir, warnings) {
180
+ if (!rootDir) {
181
+ warnings.push(`graderContext "with-docs" requires rootDir — canonical reference for "${taskId}" not injected`);
182
+ return undefined;
183
+ }
184
+ const manifestEntry = readSymbolIndexManifestEntry(taskId, rootDir, warnings);
185
+ if (manifestEntry !== "missing-manifest") {
186
+ if (manifestEntry === "path-traversal")
187
+ return undefined;
188
+ if (manifestEntry === null) {
189
+ // Manifest exists but task isn't listed — shouldn't happen for tasks
190
+ // the fetcher processed. Treat as silent fallback (matches "missing
191
+ // manifest" semantics for safety) but log a warning so an unexpected
192
+ // mismatch is observable.
193
+ warnings.push(`graderContext: task "${taskId}" not present in symbol-index manifest — falling back to full canonical doc injection`);
194
+ }
195
+ else if (manifestEntry.symbolCount === 0) {
196
+ warnings.push(`graderContext: symbol index empty for "${taskId}" — falling back to full canonical doc injection`);
197
+ }
198
+ else {
199
+ // Symbol-index path: read the per-task .md file.
200
+ const symbolsRelative = manifestEntry.path;
201
+ const symbolsAbsolute = resolveWithinRoot(rootDir, symbolsRelative);
202
+ if (symbolsAbsolute === null) {
203
+ warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
204
+ return undefined;
205
+ }
206
+ if (existsSync(symbolsAbsolute)) {
207
+ const content = readFileSafe(symbolsAbsolute, symbolsRelative, warnings, taskId);
208
+ if (content && content.trim().length > 0)
209
+ return content;
210
+ }
211
+ // Manifest claimed symbols but the file is missing/unreadable —
212
+ // surface as a warning and continue to full-doc fallback.
213
+ warnings.push(`graderContext: symbol index file at ${symbolsRelative} is missing or empty despite manifest entry — falling back to full canonical doc for "${taskId}"`);
214
+ }
215
+ }
216
+ // Fall back to the full canonical doc (W0196 path).
217
+ const canonicalRelative = `contexts/canonical/${taskId}.md`;
218
+ const canonicalAbsolute = resolveWithinRoot(rootDir, canonicalRelative);
219
+ if (canonicalAbsolute === null) {
220
+ warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
221
+ return undefined;
222
+ }
223
+ if (!existsSync(canonicalAbsolute)) {
224
+ warnings.push(`graderContext "with-docs": canonical doc not found at ${canonicalRelative} — run fetch-docs first; reference not injected for "${taskId}"`);
225
+ return undefined;
226
+ }
227
+ return (readFileSafe(canonicalAbsolute, canonicalRelative, warnings, taskId) ??
228
+ undefined);
229
+ }
230
+ function readSymbolIndexManifestEntry(taskId, rootDir, warnings) {
231
+ const manifestRelative = "contexts/canonical-symbols/manifest.json";
232
+ const manifestAbsolute = resolveWithinRoot(rootDir, manifestRelative);
233
+ if (manifestAbsolute === null)
234
+ return "path-traversal";
235
+ if (!existsSync(manifestAbsolute))
236
+ return "missing-manifest";
237
+ const body = readFileSafe(manifestAbsolute, manifestRelative, warnings, taskId);
238
+ if (body === null)
239
+ return "missing-manifest";
240
+ let parsed;
241
+ try {
242
+ parsed = JSON.parse(body);
243
+ }
244
+ catch (err) {
245
+ const msg = err instanceof Error ? err.message : String(err);
246
+ warnings.push(`graderContext: symbol-index manifest at ${manifestRelative} is unparseable (${msg}) — falling back to full canonical doc for "${taskId}"`);
247
+ return "missing-manifest";
248
+ }
249
+ if (!parsed || typeof parsed !== "object")
250
+ return null;
251
+ const entries = parsed.entries;
252
+ if (!Array.isArray(entries))
253
+ return null;
254
+ for (const e of entries) {
255
+ if (!e || typeof e !== "object")
256
+ continue;
257
+ const entry = e;
258
+ if (typeof entry.taskId === "string" &&
259
+ typeof entry.path === "string" &&
260
+ typeof entry.symbolCount === "number" &&
261
+ entry.taskId === taskId) {
262
+ return {
263
+ taskId: entry.taskId,
264
+ path: entry.path,
265
+ symbolCount: entry.symbolCount,
266
+ };
267
+ }
268
+ }
269
+ return null;
270
+ }
271
+ function resolveWithinRoot(rootDir, relativePath) {
272
+ const absolutePath = resolve(rootDir, relativePath);
273
+ const normalizedBase = resolve(rootDir) + "/";
274
+ if (!absolutePath.startsWith(normalizedBase) &&
275
+ absolutePath !== resolve(rootDir)) {
276
+ return null;
277
+ }
278
+ return absolutePath;
279
+ }
280
+ function readFileSafe(absolutePath, relativePath, warnings, taskId) {
281
+ try {
282
+ return readFileSync(absolutePath, "utf-8");
283
+ }
284
+ catch (err) {
285
+ const msg = err instanceof Error ? err.message : String(err);
286
+ warnings.push(`graderContext: failed to read ${relativePath}: ${msg} — reference not injected for "${taskId}"`);
287
+ return null;
288
+ }
289
+ }
@@ -25,6 +25,8 @@ export const handler = {
25
25
  rootDir: ctx.rootDir,
26
26
  models: ctx.models,
27
27
  rubricConfig: ctx.rubricConfig,
28
+ graderContext: ctx.graderContext,
29
+ preflightContext: ctx.preflightContext,
28
30
  evalMode: ctx
29
31
  .evalMode,
30
32
  });
@@ -2,8 +2,8 @@
2
2
  * Shared types for the literacy mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
- export type { RubricResolutionInput } from "../../rubric-resolution.js";
6
- import type { RubricResolutionInput } from "../../rubric-resolution.js";
5
+ export type { PreflightRubricContext, RubricResolutionInput, } from "../../rubric-resolution.js";
6
+ import type { PreflightRubricContext, RubricResolutionInput } from "../../rubric-resolution.js";
7
7
  /** Options for compiling a literacy task */
8
8
  export interface LiteracyCompileOptions {
9
9
  /** Grader provider for LLM-graded assertions */
@@ -20,6 +20,21 @@ export interface LiteracyCompileOptions {
20
20
  }[];
21
21
  /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
22
22
  rubricConfig?: RubricResolutionInput;
23
+ /**
24
+ * Grader context policy. When `"with-docs"` and the task declares
25
+ * `context.docs`, the canonical doc content is read from
26
+ * `<rootDir>/contexts/canonical/<task.id>.md` and injected into the
27
+ * Promptfoo `rubricPrompt`.
28
+ */
29
+ graderContext?: "rubric-only" | "with-docs";
30
+ /**
31
+ * W0198 Phase 6 — when supplied, the `code-correctness` rubric is
32
+ * prefixed with a "DETERMINISTIC PREFLIGHT" instruction telling the
33
+ * grader to treat the deterministic lane's existence verdicts as
34
+ * ground truth. Sourced from the package-surface manifest at compile
35
+ * time; absence collapses cleanly to the pre-W0198 rubric.
36
+ */
37
+ preflightContext?: PreflightRubricContext;
23
38
  }
24
39
  /** Result of compiling a single literacy task */
25
40
  export interface LiteracyCompileResult {
@@ -10,9 +10,12 @@
10
10
  * tasks with templated rubrics produced empty rubric text (DOC-2029).
11
11
  *
12
12
  * @see docs/design-docs/mode-agnostic-scoring.md
13
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
13
14
  * @see config/rubrics.ts — template definitions
14
15
  */
16
+ import type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
15
17
  import type { PromptfooAssertion } from "./assertion-mapper.js";
18
+ export type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
16
19
  /** Minimal rubric config needed for template resolution */
17
20
  export interface RubricResolutionInput {
18
21
  templates: Record<string, {
@@ -31,10 +34,23 @@ export interface RubricResolutionInput {
31
34
  * scoring header, scale, and dimension metadata. The criteria are appended
32
35
  * to create the final rubric prompt.
33
36
  *
37
+ * When `canonicalReference` is supplied, it is emitted on the assertion's
38
+ * `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
39
+ * wrapped in a ground-truth framing so the grader treats the supplied
40
+ * content as authoritative. The rubric `value` itself is unchanged. Without
41
+ * it, the grader falls back on training priors and may hallucinate against
42
+ * neighboring API surfaces.
43
+ *
34
44
  * Returns null (with a warning) if the template can't be resolved.
35
45
  */
36
46
  export declare function resolveTemplatedAssertion(assertion: {
37
47
  criteria: string[];
38
48
  template: string;
39
49
  type: string;
40
- }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
50
+ }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[], canonicalReference?: string, preflightContext?: PreflightRubricContext): PromptfooAssertion | null;
51
+ /**
52
+ * Build the W0198 Phase 6 preflight preface for a `code-correctness`
53
+ * rubric. Returned with a trailing newline so it composes cleanly with
54
+ * the existing rubric body.
55
+ */
56
+ export declare function buildPreflightSection(context: PreflightRubricContext): string;
@@ -10,6 +10,7 @@
10
10
  * tasks with templated rubrics produced empty rubric text (DOC-2029).
11
11
  *
12
12
  * @see docs/design-docs/mode-agnostic-scoring.md
13
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
13
14
  * @see config/rubrics.ts — template definitions
14
15
  */
15
16
  // ---------------------------------------------------------------------------
@@ -24,9 +25,16 @@
24
25
  * scoring header, scale, and dimension metadata. The criteria are appended
25
26
  * to create the final rubric prompt.
26
27
  *
28
+ * When `canonicalReference` is supplied, it is emitted on the assertion's
29
+ * `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
30
+ * wrapped in a ground-truth framing so the grader treats the supplied
31
+ * content as authoritative. The rubric `value` itself is unchanged. Without
32
+ * it, the grader falls back on training priors and may hallucinate against
33
+ * neighboring API surfaces.
34
+ *
27
35
  * Returns null (with a warning) if the template can't be resolved.
28
36
  */
29
- export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings) {
37
+ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings, canonicalReference, preflightContext) {
30
38
  if (!rubricConfig) {
31
39
  warnings.push(`No rubric config — template "${assertion.template}" cannot be resolved`);
32
40
  return null;
@@ -38,15 +46,83 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
38
46
  }
39
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
40
48
  const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
41
- const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
+ // W0198 Phase 6 — when the deterministic preflight lane is wired and this
50
+ // rubric scores `code-correctness`, prefix a system instruction so the
51
+ // grader does not re-judge symbol existence. The lane separation is the
52
+ // whole reason W0198 exists; if both lanes weigh existence, the rubric's
53
+ // hallucinations sneak back into a dimension we want deterministic.
54
+ const preflightSection = preflightContext && template.dimension === "code-correctness"
55
+ ? buildPreflightSection(preflightContext)
56
+ : "";
57
+ const rubricValue = preflightSection +
58
+ `${template.header}\n${scaleText}\n\n` +
42
59
  `${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
43
60
  `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
61
+ const rubricPrompt = canonicalReference
62
+ ? buildRubricPromptWithReference(rubricValue, canonicalReference)
63
+ : undefined;
44
64
  return {
45
65
  type: "llm-rubric",
46
66
  value: rubricValue,
67
+ ...(rubricPrompt ? { rubricPrompt } : {}),
47
68
  ...(graderProvider ? { provider: graderProvider } : {}),
48
69
  ...(template.dimension
49
70
  ? { metadata: { dimension: template.dimension, maxScore: 100 } }
50
71
  : {}),
51
72
  };
52
73
  }
74
+ /**
75
+ * Build the W0198 Phase 6 preflight preface for a `code-correctness`
76
+ * rubric. Returned with a trailing newline so it composes cleanly with
77
+ * the existing rubric body.
78
+ */
79
+ export function buildPreflightSection(context) {
80
+ const packageList = context.packages.length > 0
81
+ ? context.packages.join(", ")
82
+ : "(no packages in manifest)";
83
+ return [
84
+ "DETERMINISTIC PREFLIGHT — A separate lane checks whether each named",
85
+ "binding the candidate imports (e.g. `import { foo } from 'pkg'`) actually",
86
+ "exports from its source package, against the actual `.d.ts` of the",
87
+ "installed version. Default and namespace imports (`import x from 'pkg'`,",
88
+ "`import * as x from 'pkg'`) are NOT preflight-checked and fall through to",
89
+ "you. Treat any existence verdict the preflight returns as ground truth —",
90
+ "do NOT include symbol-existence concerns in your `code-correctness`",
91
+ "judgment. Confine your code-correctness score to idiomatic usage, code",
92
+ "organization, type safety, and completeness against the task.",
93
+ "",
94
+ `In-scope packages (preflight rules on named bindings imported from`,
95
+ `these): ${packageList}.`,
96
+ "",
97
+ "",
98
+ ].join("\n");
99
+ }
100
+ /**
101
+ * Build the full rubric-prompt string sent to the grader when canonical
102
+ * reference content is available. The framing explicitly tells the grader
103
+ * to treat the reference as authoritative ground truth so it cannot fall
104
+ * back on training priors when checking symbol existence or API shape.
105
+ *
106
+ * Promptfoo interpolates `{{output}}` and `{{rubric}}` at grade time —
107
+ * `{{output}}` is the candidate response; `{{rubric}}` is the assertion's
108
+ * `value` field (the rubric text built above).
109
+ */
110
+ function buildRubricPromptWithReference(rubric, reference) {
111
+ return [
112
+ "You are grading a candidate response against a rubric.",
113
+ "",
114
+ "AUTHORITATIVE REFERENCE — treat this as ground truth. If a symbol, function,",
115
+ "API, or pattern appears here, it exists; do not contradict it from prior",
116
+ "knowledge. If something is absent here, do not assume it exists.",
117
+ "",
118
+ "--- BEGIN REFERENCE ---",
119
+ reference,
120
+ "--- END REFERENCE ---",
121
+ "",
122
+ "RUBRIC:",
123
+ rubric,
124
+ "",
125
+ "CANDIDATE RESPONSE:",
126
+ "{{output}}",
127
+ ].join("\n");
128
+ }
@@ -16,11 +16,19 @@
16
16
  * engine works in [0, 1]; this module handles the conversion at
17
17
  * boundaries.
18
18
  *
19
+ * W0198 Phase 5 — when a `preflightForTest` callback is provided and
20
+ * returns a `SymbolPreflightReport`, the bridge synthesizes one extra
21
+ * `AssertionScore` per test in the `code-correctness` dimension. The
22
+ * deterministic preflight and the LLM rubric merge through D0010's
23
+ * weighted dimension aggregation; the relative share is set by
24
+ * `preflightWeight` in `[0, 1]`.
25
+ *
19
26
  * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
27
  * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
28
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
29
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
22
30
  */
23
- import { type DimensionScore } from "../../_vendor/ailf-core/index.d.ts";
31
+ import { type AssertionScore, type DimensionScore, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
24
32
  import type { TestResult } from "../../_vendor/ailf-core/index.d.ts";
25
33
  /** Result of scoring a group of tests via the 4-tier engine */
26
34
  export interface BridgedScoreResult {
@@ -33,6 +41,24 @@ export interface BridgedScoreResult {
33
41
  /** Raw DimensionScore objects from the engine (0–1 scale) */
34
42
  rawDimensions: DimensionScore[];
35
43
  }
44
+ /**
45
+ * W0198 Phase 5 options. When `preflightForTest` is omitted (or returns
46
+ * `undefined`) the bridge stays purely rubric-driven — identical to the
47
+ * pre-W0198 behavior. When a report is returned, one `code-correctness`
48
+ * preflight assertion is synthesized per test and merged with the rubric
49
+ * via weighted-mean.
50
+ */
51
+ export interface ScoreTestGroupOptions {
52
+ /** Look up the preflight report attached to a particular test, if any. */
53
+ preflightForTest?: (test: TestResult) => SymbolPreflightReport | undefined;
54
+ /**
55
+ * Preflight's share of the `code-correctness` dimension, in `[0, 1]`.
56
+ * The complementary share belongs to the LLM rubric. Defaults to
57
+ * `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` (0.4). Ignored when
58
+ * `preflightForTest` is omitted.
59
+ */
60
+ preflightWeight?: number;
61
+ }
36
62
  /**
37
63
  * Score a group of test results using the 4-tier scoring engine.
38
64
  *
@@ -44,6 +70,27 @@ export interface BridgedScoreResult {
44
70
  * @param profile Weight profile mapping kebab-case dimension names to weights
45
71
  * (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
46
72
  * @param taskId Optional task identifier for traceability in TaskScore output
73
+ * @param options Optional W0198 Phase 5 preflight integration
47
74
  * @returns Dimensions (0–100) and composite (0–100), matching legacy output format
48
75
  */
49
- export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string): BridgedScoreResult;
76
+ export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string, options?: ScoreTestGroupOptions): BridgedScoreResult;
77
+ /**
78
+ * Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
79
+ * preflight report.
80
+ *
81
+ * The score is `1 - min(1, total / cap)`. With the default
82
+ * `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0, 1 missing → 0.667,
83
+ * 2 missing → 0.333, ≥3 missing → 0.0.
84
+ *
85
+ * Edge cases for the deduction config:
86
+ * - `cap === 0` (measurement-only config): score is 1.0 — divide-by-zero
87
+ * would NaN otherwise.
88
+ * - `cap < 0` (misconfigured): score is 1.0 and the merge silently
89
+ * collapses to rubric-only on this dimension. The Phase 3 Zod schema
90
+ * gates against this upstream so it should never reach here, but the
91
+ * guard preserves the never-deduct invariant if it does.
92
+ *
93
+ * `unresolved` findings never deduct (the preflight's never-deduct rule):
94
+ * they're not part of `total`, so they fall through to the LLM rubric.
95
+ */
96
+ export declare function preflightToScore(report: SymbolPreflightReport, weight: number): AssertionScore;