@sanity/ailf 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  21. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  22. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +20 -3
  23. package/dist/_vendor/ailf-core/types/index.d.ts +13 -1
  24. package/dist/_vendor/ailf-core/types/index.js +1 -0
  25. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  27. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  28. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  29. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  30. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  31. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  32. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  33. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +25 -5
  34. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +276 -95
  35. package/dist/adapters/index.d.ts +1 -0
  36. package/dist/adapters/index.js +1 -0
  37. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  38. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  39. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  40. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  41. package/dist/adapters/package-surface/index.d.ts +9 -0
  42. package/dist/adapters/package-surface/index.js +8 -0
  43. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  44. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  45. package/dist/adapters/task-sources/repo-schemas.d.ts +22 -0
  46. package/dist/adapters/task-sources/repo-schemas.js +93 -1
  47. package/dist/adapters/task-sources/repo-task-source.js +11 -2
  48. package/dist/commands/pipeline-action.d.ts +2 -0
  49. package/dist/commands/pipeline-action.js +12 -0
  50. package/dist/commands/remote-pipeline.js +9 -2
  51. package/dist/commands/remote-results.d.ts +12 -1
  52. package/dist/commands/remote-results.js +25 -5
  53. package/dist/commands/validate-tasks.js +8 -2
  54. package/dist/composition-root.js +9 -0
  55. package/dist/config/package-surface.ts +37 -0
  56. package/dist/config/preflight-scoring.ts +26 -0
  57. package/dist/index.d.ts +2 -2
  58. package/dist/index.js +1 -1
  59. package/dist/orchestration/build-app-context.js +1 -0
  60. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  61. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  62. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  63. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  64. package/dist/orchestration/steps/run-eval-step.js +27 -0
  65. package/dist/pipeline/calculate-scores.d.ts +66 -5
  66. package/dist/pipeline/calculate-scores.js +141 -27
  67. package/dist/pipeline/compiler/index.d.ts +1 -1
  68. package/dist/pipeline/compiler/index.js +1 -1
  69. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  70. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  71. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +0 -12
  72. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +0 -12
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +190 -6
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  85. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  86. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  88. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  89. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  90. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  91. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  92. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  94. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  95. package/dist/pipeline/preflight/parse-imports.js +125 -0
  96. package/dist/report-store.d.ts +8 -0
  97. package/dist/report-store.js +55 -6
  98. package/dist/sanity/document-renderers.d.ts +106 -0
  99. package/dist/sanity/document-renderers.js +307 -0
  100. package/dist/sanity/queries.d.ts +32 -11
  101. package/dist/sanity/queries.js +78 -0
  102. package/dist/sanity/symbol-index.d.ts +98 -0
  103. package/dist/sanity/symbol-index.js +615 -0
  104. package/dist/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  105. package/dist/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  106. package/dist/tasks/literacy/content-lake.task.ts +4 -10
  107. package/dist/tasks/literacy/frameworks.task.ts +2 -8
  108. package/dist/tasks/literacy/functions.task.ts +1 -4
  109. package/dist/tasks/literacy/groq.task.ts +3 -12
  110. package/dist/tasks/literacy/image-handling.task.ts +1 -4
  111. package/dist/tasks/literacy/nextjs-live.task.ts +1 -4
  112. package/dist/tasks/literacy/portable-text.task.ts +2 -8
  113. package/dist/tasks/literacy/studio-setup.task.ts +2 -8
  114. package/dist/tasks/literacy/visual-editing.task.ts +2 -8
  115. package/package.json +2 -1
  116. package/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  117. package/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  118. package/tasks/literacy/content-lake.task.ts +4 -10
  119. package/tasks/literacy/frameworks.task.ts +2 -8
  120. package/tasks/literacy/functions.task.ts +1 -4
  121. package/tasks/literacy/groq.task.ts +3 -12
  122. package/tasks/literacy/image-handling.task.ts +1 -4
  123. package/tasks/literacy/nextjs-live.task.ts +1 -4
  124. package/tasks/literacy/portable-text.task.ts +2 -8
  125. package/tasks/literacy/studio-setup.task.ts +2 -8
  126. package/tasks/literacy/visual-editing.task.ts +2 -8
@@ -6,10 +6,24 @@
6
6
  * - Baseline entry with without-docs prompt and empty docs
7
7
  * - Rubric assertions with structured dimension metadata
8
8
  */
9
+ import { existsSync, readFileSync } from "node:fs";
10
+ import { resolve } from "node:path";
9
11
  import { LiteracyVariant, } from "../../../normalize-mode.js";
10
12
  import { buildBaselineAssertions, resolveAssertions } from "./assertions.js";
11
13
  import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
12
14
  import { validateLiteracyTask } from "./validation.js";
15
+ /**
16
+ * Variable keys reserved by the AILF compilers. Authoring these via
17
+ * `prompt.vars` is rejected by `PromptVars` at compile time and by
18
+ * `TaskPromptSchema` at parse time; this constant exists to defend
19
+ * the literacy compiler at runtime against legacy-shape `*.task.ts`
20
+ * files that bypass both gates.
21
+ */
22
+ const RESERVED_PROMPT_VAR_KEYS = [
23
+ "task",
24
+ "docs",
25
+ "__featureArea",
26
+ ];
13
27
  /**
14
28
  * Compile a literacy task into Promptfoo configuration.
15
29
  */
@@ -58,20 +72,55 @@ function buildPrompts(evalMode) {
58
72
  // ---------------------------------------------------------------------------
59
73
  function buildTestCases(task, evalMode, options, warnings) {
60
74
  const tests = [];
61
- const promptText = task.prompt?.text ?? task.prompt?.template ?? "";
75
+ // W0193: type-erased read of prompt.vars so we can defensively detect
76
+ // reserved keys on legacy-shape `*.task.ts` files (the type narrow makes
77
+ // `task.prompt.vars.task` `never`, but TS task files bypass both the
78
+ // type and the parse-time schema). YAML/inline-task paths have already
79
+ // been migrated by `migratePromptShape` upstream.
80
+ const rawVars = (task.prompt?.vars ?? {});
81
+ const legacyTaskBody = typeof rawVars.task === "string" ? rawVars.task : undefined;
82
+ const promptText = task.prompt?.text ?? legacyTaskBody ?? task.prompt?.template ?? "";
62
83
  const contextDocs = task.context?.docs ?? [];
63
84
  const taskArea = task.area ?? "";
64
85
  const taskTitle = task.title;
65
- const promptVars = task.prompt?.vars ?? {};
86
+ // Strip reserved keys from the vars spread so they cannot override the
87
+ // canonical assignments below. `safePromptVars` carries only freeform
88
+ // template extras.
89
+ const safePromptVars = {};
90
+ const presentReserved = [];
91
+ for (const [key, value] of Object.entries(rawVars)) {
92
+ if (RESERVED_PROMPT_VAR_KEYS.includes(key)) {
93
+ presentReserved.push(key);
94
+ continue;
95
+ }
96
+ safePromptVars[key] = value;
97
+ }
98
+ // Single deduplicated deprecation warning per task — even when several
99
+ // reserved keys are present.
100
+ if (presentReserved.length > 0) {
101
+ warnings.push(`Literacy task "${task.id}": deprecated prompt.vars keys ` +
102
+ `(${presentReserved.join(", ")}) — use prompt.text for the prompt ` +
103
+ `body and context.docs for documentation references. The compiler ` +
104
+ `migrated them in-memory, but the task source should be updated.`);
105
+ }
66
106
  const hasDocs = contextDocs.length > 0;
67
107
  const docsVar = hasDocs ? `file://contexts/canonical/${task.id}.md` : "";
68
- const assertions = resolveAssertions(task, options, warnings);
69
- // Gold entry canonical docs injected
108
+ // When `graderContext` is "with-docs" and the task has canonical docs
109
+ // declared, read the resolved doc file so we can inject it into the
110
+ // assertion rubricPrompt as authoritative ground truth. The file is
111
+ // produced by `FetchDocsStep` (which runs before `GenerateConfigsStep`),
112
+ // so it should exist on disk by the time we're compiling.
113
+ const canonicalReference = options?.graderContext === "with-docs" && hasDocs
114
+ ? readGraderReference(task.id, options?.rootDir, warnings)
115
+ : undefined;
116
+ const assertions = resolveAssertions(task, options, warnings, canonicalReference);
117
+ // Gold entry — canonical docs injected. Spread freeform extras first so
118
+ // canonical keys (task / docs / __featureArea) cannot be overridden.
70
119
  const goldVars = {
120
+ ...safePromptVars,
71
121
  task: promptText,
72
122
  docs: docsVar,
73
123
  __featureArea: taskArea,
74
- ...promptVars,
75
124
  };
76
125
  tests.push({
77
126
  description: `${taskTitle} (gold)`,
@@ -89,10 +138,10 @@ function buildTestCases(task, evalMode, options, warnings) {
89
138
  tests.push({
90
139
  description: `${taskTitle} (baseline)`,
91
140
  vars: {
141
+ ...safePromptVars,
92
142
  task: promptText,
93
143
  docs: "",
94
144
  __featureArea: taskArea,
95
- ...promptVars,
96
145
  },
97
146
  prompts: ["without-docs"],
98
147
  ...(baselineAssertions.length > 0
@@ -103,3 +152,138 @@ function buildTestCases(task, evalMode, options, warnings) {
103
152
  }
104
153
  return tests;
105
154
  }
155
+ // ---------------------------------------------------------------------------
156
+ // Canonical doc resolution
157
+ // ---------------------------------------------------------------------------
158
+ /**
159
+ * Read the grader-context reference for a task, preferring the W0197
160
+ * symbol-reference index when one was emitted by the doc fetcher and
161
+ * falling back to the full canonical doc otherwise.
162
+ *
163
+ * Resolution is driven by the per-task manifest at
164
+ * `<rootDir>/contexts/canonical-symbols/manifest.json` (W0197). The
165
+ * manifest distinguishes three states:
166
+ * 1. Manifest missing → fetcher hadn't run with W0197. Silent fallback
167
+ * to the full canonical doc; this is the backwards-compat path.
168
+ * 2. Manifest present, entry has `symbolCount > 0` → use the per-task
169
+ * symbol-index file as the grader reference (W0197 path).
170
+ * 3. Manifest present, entry has `symbolCount === 0` → extraction ran
171
+ * and produced nothing. Observable fallback (warning surfaced)
172
+ * to the full canonical doc.
173
+ *
174
+ * Returns undefined when rootDir is unset, when the canonical doc itself
175
+ * is missing or unreadable, or when a resolved path escapes rootDir
176
+ * (path-traversal guard mirroring `fixture-resolver.ts:resolveFileRef`).
177
+ * A warning is pushed in every case except a manifest-path escape.
178
+ */
179
+ function readGraderReference(taskId, rootDir, warnings) {
180
+ if (!rootDir) {
181
+ warnings.push(`graderContext "with-docs" requires rootDir — canonical reference for "${taskId}" not injected`);
182
+ return undefined;
183
+ }
184
+ const manifestEntry = readSymbolIndexManifestEntry(taskId, rootDir, warnings);
185
+ if (manifestEntry !== "missing-manifest") {
186
+ if (manifestEntry === "path-traversal")
187
+ return undefined;
188
+ if (manifestEntry === null) {
189
+ // Manifest exists but task isn't listed — shouldn't happen for tasks
190
+ // the fetcher processed. Fall back to the full canonical doc (the same
191
+ // path as the "missing manifest" case, for safety) but push a warning
192
+ // so the unexpected mismatch is observable.
193
+ warnings.push(`graderContext: task "${taskId}" not present in symbol-index manifest — falling back to full canonical doc injection`);
194
+ }
195
+ else if (manifestEntry.symbolCount === 0) {
196
+ warnings.push(`graderContext: symbol index empty for "${taskId}" — falling back to full canonical doc injection`);
197
+ }
198
+ else {
199
+ // Symbol-index path: read the per-task .md file.
200
+ const symbolsRelative = manifestEntry.path;
201
+ const symbolsAbsolute = resolveWithinRoot(rootDir, symbolsRelative);
202
+ if (symbolsAbsolute === null) {
203
+ warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
204
+ return undefined;
205
+ }
206
+ if (existsSync(symbolsAbsolute)) {
207
+ const content = readFileSafe(symbolsAbsolute, symbolsRelative, warnings, taskId);
208
+ if (content && content.trim().length > 0)
209
+ return content;
210
+ }
211
+ // Manifest claimed symbols but the file is missing/unreadable —
212
+ // surface as a warning and continue to full-doc fallback.
213
+ warnings.push(`graderContext: symbol index file at ${symbolsRelative} is missing or empty despite manifest entry — falling back to full canonical doc for "${taskId}"`);
214
+ }
215
+ }
216
+ // Fall back to the full canonical doc (W0196 path).
217
+ const canonicalRelative = `contexts/canonical/${taskId}.md`;
218
+ const canonicalAbsolute = resolveWithinRoot(rootDir, canonicalRelative);
219
+ if (canonicalAbsolute === null) {
220
+ warnings.push(`graderContext: path traversal blocked for task "${taskId}" — canonical reference not injected`);
221
+ return undefined;
222
+ }
223
+ if (!existsSync(canonicalAbsolute)) {
224
+ warnings.push(`graderContext "with-docs": canonical doc not found at ${canonicalRelative} — run fetch-docs first; reference not injected for "${taskId}"`);
225
+ return undefined;
226
+ }
227
+ return (readFileSafe(canonicalAbsolute, canonicalRelative, warnings, taskId) ??
228
+ undefined);
229
+ }
230
+ function readSymbolIndexManifestEntry(taskId, rootDir, warnings) {
231
+ const manifestRelative = "contexts/canonical-symbols/manifest.json";
232
+ const manifestAbsolute = resolveWithinRoot(rootDir, manifestRelative);
233
+ if (manifestAbsolute === null)
234
+ return "path-traversal";
235
+ if (!existsSync(manifestAbsolute))
236
+ return "missing-manifest";
237
+ const body = readFileSafe(manifestAbsolute, manifestRelative, warnings, taskId);
238
+ if (body === null)
239
+ return "missing-manifest";
240
+ let parsed;
241
+ try {
242
+ parsed = JSON.parse(body);
243
+ }
244
+ catch (err) {
245
+ const msg = err instanceof Error ? err.message : String(err);
246
+ warnings.push(`graderContext: symbol-index manifest at ${manifestRelative} is unparseable (${msg}) — falling back to full canonical doc for "${taskId}"`);
247
+ return "missing-manifest";
248
+ }
249
+ if (!parsed || typeof parsed !== "object")
250
+ return null;
251
+ const entries = parsed.entries;
252
+ if (!Array.isArray(entries))
253
+ return null;
254
+ for (const e of entries) {
255
+ if (!e || typeof e !== "object")
256
+ continue;
257
+ const entry = e;
258
+ if (typeof entry.taskId === "string" &&
259
+ typeof entry.path === "string" &&
260
+ typeof entry.symbolCount === "number" &&
261
+ entry.taskId === taskId) {
262
+ return {
263
+ taskId: entry.taskId,
264
+ path: entry.path,
265
+ symbolCount: entry.symbolCount,
266
+ };
267
+ }
268
+ }
269
+ return null;
270
+ }
271
+ function resolveWithinRoot(rootDir, relativePath) {
272
+ const absolutePath = resolve(rootDir, relativePath);
273
+ const normalizedBase = resolve(rootDir) + "/";
274
+ if (!absolutePath.startsWith(normalizedBase) &&
275
+ absolutePath !== resolve(rootDir)) {
276
+ return null;
277
+ }
278
+ return absolutePath;
279
+ }
280
+ function readFileSafe(absolutePath, relativePath, warnings, taskId) {
281
+ try {
282
+ return readFileSync(absolutePath, "utf-8");
283
+ }
284
+ catch (err) {
285
+ const msg = err instanceof Error ? err.message : String(err);
286
+ warnings.push(`graderContext: failed to read ${relativePath}: ${msg} — reference not injected for "${taskId}"`);
287
+ return null;
288
+ }
289
+ }
@@ -25,6 +25,8 @@ export const handler = {
25
25
  rootDir: ctx.rootDir,
26
26
  models: ctx.models,
27
27
  rubricConfig: ctx.rubricConfig,
28
+ graderContext: ctx.graderContext,
29
+ preflightContext: ctx.preflightContext,
28
30
  evalMode: ctx
29
31
  .evalMode,
30
32
  });
@@ -2,8 +2,8 @@
2
2
  * Shared types for the literacy mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
- export type { RubricResolutionInput } from "../../rubric-resolution.js";
6
- import type { RubricResolutionInput } from "../../rubric-resolution.js";
5
+ export type { PreflightRubricContext, RubricResolutionInput, } from "../../rubric-resolution.js";
6
+ import type { PreflightRubricContext, RubricResolutionInput } from "../../rubric-resolution.js";
7
7
  /** Options for compiling a literacy task */
8
8
  export interface LiteracyCompileOptions {
9
9
  /** Grader provider for LLM-graded assertions */
@@ -20,6 +20,21 @@ export interface LiteracyCompileOptions {
20
20
  }[];
21
21
  /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
22
22
  rubricConfig?: RubricResolutionInput;
23
+ /**
24
+ * Grader context policy. When `"with-docs"` and the task declares
25
+ * `context.docs`, the canonical doc content is read from
26
+ * `<rootDir>/contexts/canonical/<task.id>.md` and injected into the
27
+ * Promptfoo `rubricPrompt`.
28
+ */
29
+ graderContext?: "rubric-only" | "with-docs";
30
+ /**
31
+ * W0198 Phase 6 — when supplied, the `code-correctness` rubric is
32
+ * prefixed with a "DETERMINISTIC PREFLIGHT" instruction telling the
33
+ * grader to treat the deterministic lane's existence verdicts as
34
+ * ground truth. Sourced from the package-surface manifest at compile
35
+ * time; absence collapses cleanly to the pre-W0198 rubric.
36
+ */
37
+ preflightContext?: PreflightRubricContext;
23
38
  }
24
39
  /** Result of compiling a single literacy task */
25
40
  export interface LiteracyCompileResult {
@@ -10,9 +10,12 @@
10
10
  * tasks with templated rubrics produced empty rubric text (DOC-2029).
11
11
  *
12
12
  * @see docs/design-docs/mode-agnostic-scoring.md
13
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
13
14
  * @see config/rubrics.ts — template definitions
14
15
  */
16
+ import type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
15
17
  import type { PromptfooAssertion } from "./assertion-mapper.js";
18
+ export type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
16
19
  /** Minimal rubric config needed for template resolution */
17
20
  export interface RubricResolutionInput {
18
21
  templates: Record<string, {
@@ -31,10 +34,23 @@ export interface RubricResolutionInput {
31
34
  * scoring header, scale, and dimension metadata. The criteria are appended
32
35
  * to create the final rubric prompt.
33
36
  *
37
+ * When `canonicalReference` is supplied, it is emitted on the assertion's
38
+ * `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
39
+ * wrapped in a ground-truth framing so the grader treats the supplied
40
+ * content as authoritative. The rubric `value` itself is unchanged. Without
41
+ * it, the grader falls back on training priors and may hallucinate against
42
+ * neighboring API surfaces.
43
+ *
34
44
  * Returns null (with a warning) if the template can't be resolved.
35
45
  */
36
46
  export declare function resolveTemplatedAssertion(assertion: {
37
47
  criteria: string[];
38
48
  template: string;
39
49
  type: string;
40
- }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
50
+ }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[], canonicalReference?: string, preflightContext?: PreflightRubricContext): PromptfooAssertion | null;
51
+ /**
52
+ * Build the W0198 Phase 6 preflight preface for a `code-correctness`
53
+ * rubric. Returned with a trailing newline so it composes cleanly with
54
+ * the existing rubric body.
55
+ */
56
+ export declare function buildPreflightSection(context: PreflightRubricContext): string;
@@ -10,6 +10,7 @@
10
10
  * tasks with templated rubrics produced empty rubric text (DOC-2029).
11
11
  *
12
12
  * @see docs/design-docs/mode-agnostic-scoring.md
13
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
13
14
  * @see config/rubrics.ts — template definitions
14
15
  */
15
16
  // ---------------------------------------------------------------------------
@@ -24,9 +25,16 @@
24
25
  * scoring header, scale, and dimension metadata. The criteria are appended
25
26
  * to create the final rubric prompt.
26
27
  *
28
+ * When `canonicalReference` is supplied, it is emitted on the assertion's
29
+ * `rubricPrompt` field — Promptfoo's per-assertion grader-prompt override —
30
+ * wrapped in a ground-truth framing so the grader treats the supplied
31
+ * content as authoritative. The rubric `value` itself is unchanged. Without
32
+ * it, the grader falls back on training priors and may hallucinate against
33
+ * neighboring API surfaces.
34
+ *
27
35
  * Returns null (with a warning) if the template can't be resolved.
28
36
  */
29
- export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings) {
37
+ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings, canonicalReference, preflightContext) {
30
38
  if (!rubricConfig) {
31
39
  warnings.push(`No rubric config — template "${assertion.template}" cannot be resolved`);
32
40
  return null;
@@ -38,15 +46,83 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
38
46
  }
39
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
40
48
  const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
41
- const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
+ // W0198 Phase 6 — when the deterministic preflight lane is wired and this
50
+ // rubric scores `code-correctness`, prefix a system instruction so the
51
+ // grader does not re-judge symbol existence. The lane separation is the
52
+ // whole reason W0198 exists; if both lanes weigh existence, the rubric's
53
+ // hallucinations sneak back into a dimension we want deterministic.
54
+ const preflightSection = preflightContext && template.dimension === "code-correctness"
55
+ ? buildPreflightSection(preflightContext)
56
+ : "";
57
+ const rubricValue = preflightSection +
58
+ `${template.header}\n${scaleText}\n\n` +
42
59
  `${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
43
60
  `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
61
+ const rubricPrompt = canonicalReference
62
+ ? buildRubricPromptWithReference(rubricValue, canonicalReference)
63
+ : undefined;
44
64
  return {
45
65
  type: "llm-rubric",
46
66
  value: rubricValue,
67
+ ...(rubricPrompt ? { rubricPrompt } : {}),
47
68
  ...(graderProvider ? { provider: graderProvider } : {}),
48
69
  ...(template.dimension
49
70
  ? { metadata: { dimension: template.dimension, maxScore: 100 } }
50
71
  : {}),
51
72
  };
52
73
  }
74
+ /**
75
+ * Build the W0198 Phase 6 preflight preface for a `code-correctness`
76
+ * rubric. Returned with a trailing newline so it composes cleanly with
77
+ * the existing rubric body.
78
+ */
79
+ export function buildPreflightSection(context) {
80
+ const packageList = context.packages.length > 0
81
+ ? context.packages.join(", ")
82
+ : "(no packages in manifest)";
83
+ return [
84
+ "DETERMINISTIC PREFLIGHT — A separate lane checks whether each named",
85
+ "binding the candidate imports (e.g. `import { foo } from 'pkg'`) actually",
86
+ "exports from its source package, against the actual `.d.ts` of the",
87
+ "installed version. Default and namespace imports (`import x from 'pkg'`,",
88
+ "`import * as x from 'pkg'`) are NOT preflight-checked and fall through to",
89
+ "you. Treat any existence verdict the preflight returns as ground truth —",
90
+ "do NOT include symbol-existence concerns in your `code-correctness`",
91
+ "judgment. Confine your code-correctness score to idiomatic usage, code",
92
+ "organization, type safety, and completeness against the task.",
93
+ "",
94
+ `In-scope packages (preflight rules on named bindings imported from`,
95
+ `these): ${packageList}.`,
96
+ "",
97
+ "",
98
+ ].join("\n");
99
+ }
100
+ /**
101
+ * Build the full rubric-prompt string sent to the grader when canonical
102
+ * reference content is available. The framing explicitly tells the grader
103
+ * to treat the reference as authoritative ground truth so it cannot fall
104
+ * back on training priors when checking symbol existence or API shape.
105
+ *
106
+ * Promptfoo interpolates `{{output}}` and `{{rubric}}` at grade time —
107
+ * `{{output}}` is the candidate response; `{{rubric}}` is the assertion's
108
+ * `value` field (the rubric text built above).
109
+ */
110
+ function buildRubricPromptWithReference(rubric, reference) {
111
+ return [
112
+ "You are grading a candidate response against a rubric.",
113
+ "",
114
+ "AUTHORITATIVE REFERENCE — treat this as ground truth. If a symbol, function,",
115
+ "API, or pattern appears here, it exists; do not contradict it from prior",
116
+ "knowledge. If something is absent here, do not assume it exists.",
117
+ "",
118
+ "--- BEGIN REFERENCE ---",
119
+ reference,
120
+ "--- END REFERENCE ---",
121
+ "",
122
+ "RUBRIC:",
123
+ rubric,
124
+ "",
125
+ "CANDIDATE RESPONSE:",
126
+ "{{output}}",
127
+ ].join("\n");
128
+ }
@@ -16,11 +16,19 @@
16
16
  * engine works in [0, 1]; this module handles the conversion at
17
17
  * boundaries.
18
18
  *
19
+ * W0198 Phase 5 — when a `preflightForTest` callback is provided and
20
+ * returns a `SymbolPreflightReport`, the bridge synthesizes one extra
21
+ * `AssertionScore` per test in the `code-correctness` dimension. The
22
+ * deterministic preflight and the LLM rubric merge through D0010's
23
+ * weighted dimension aggregation; the relative share is set by
24
+ * `preflightWeight` in `[0, 1]`.
25
+ *
19
26
  * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
27
  * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
28
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
29
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
22
30
  */
23
- import { type DimensionScore } from "../../_vendor/ailf-core/index.d.ts";
31
+ import { type AssertionScore, type DimensionScore, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
24
32
  import type { TestResult } from "../../_vendor/ailf-core/index.d.ts";
25
33
  /** Result of scoring a group of tests via the 4-tier engine */
26
34
  export interface BridgedScoreResult {
@@ -33,6 +41,24 @@ export interface BridgedScoreResult {
33
41
  /** Raw DimensionScore objects from the engine (0–1 scale) */
34
42
  rawDimensions: DimensionScore[];
35
43
  }
44
+ /**
45
+ * W0198 Phase 5 options. When `preflightForTest` is omitted (or returns
46
+ * `undefined`) the bridge stays purely rubric-driven — identical to the
47
+ * pre-W0198 behavior. When a report is returned, one `code-correctness`
48
+ * preflight assertion is synthesized per test and merged with the rubric
49
+ * via weighted-mean.
50
+ */
51
+ export interface ScoreTestGroupOptions {
52
+ /** Look up the preflight report attached to a particular test, if any. */
53
+ preflightForTest?: (test: TestResult) => SymbolPreflightReport | undefined;
54
+ /**
55
+ * Preflight's share of the `code-correctness` dimension, in `[0, 1]`.
56
+ * The complementary share belongs to the LLM rubric. Defaults to
57
+ * `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` (0.4). Ignored when
58
+ * `preflightForTest` is omitted.
59
+ */
60
+ preflightWeight?: number;
61
+ }
36
62
  /**
37
63
  * Score a group of test results using the 4-tier scoring engine.
38
64
  *
@@ -44,6 +70,27 @@ export interface BridgedScoreResult {
44
70
  * @param profile Weight profile mapping kebab-case dimension names to weights
45
71
  * (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
46
72
  * @param taskId Optional task identifier for traceability in TaskScore output
73
+ * @param options Optional W0198 Phase 5 preflight integration
47
74
  * @returns Dimensions (0–100) and composite (0–100), matching legacy output format
48
75
  */
49
- export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string): BridgedScoreResult;
76
+ export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string, options?: ScoreTestGroupOptions): BridgedScoreResult;
77
+ /**
78
+ * Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
79
+ * preflight report.
80
+ *
81
+ * The score is `1 - min(1, total / cap)`. With the default
82
+ * `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0, 1 missing → 0.667,
83
+ * 2 missing → 0.333, ≥3 missing → 0.0.
84
+ *
85
+ * Edge cases for the deduction config:
86
+ * - `cap === 0` (measurement-only config): score is 1.0 — divide-by-zero
87
+ * would NaN otherwise.
88
+ * - `cap < 0` (misconfigured): score is 1.0 and the merge silently
89
+ * collapses to rubric-only on this dimension. The Phase 3 Zod schema
90
+ * gates against this upstream so it should never reach here, but the
91
+ * guard preserves the never-deduct invariant if it does.
92
+ *
93
+ * `unresolved` findings never deduct (the preflight's never-deduct rule):
94
+ * they're not part of `total`, so they fall through to the LLM rubric.
95
+ */
96
+ export declare function preflightToScore(report: SymbolPreflightReport, weight: number): AssertionScore;