@sanity/ailf 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  21. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  22. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +20 -3
  23. package/dist/_vendor/ailf-core/types/index.d.ts +13 -1
  24. package/dist/_vendor/ailf-core/types/index.js +1 -0
  25. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  27. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  28. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  29. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  30. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  31. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  32. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  33. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +25 -5
  34. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +276 -95
  35. package/dist/adapters/index.d.ts +1 -0
  36. package/dist/adapters/index.js +1 -0
  37. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  38. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  39. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  40. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  41. package/dist/adapters/package-surface/index.d.ts +9 -0
  42. package/dist/adapters/package-surface/index.js +8 -0
  43. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  44. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  45. package/dist/adapters/task-sources/repo-schemas.d.ts +22 -0
  46. package/dist/adapters/task-sources/repo-schemas.js +93 -1
  47. package/dist/adapters/task-sources/repo-task-source.js +11 -2
  48. package/dist/commands/pipeline-action.d.ts +2 -0
  49. package/dist/commands/pipeline-action.js +12 -0
  50. package/dist/commands/remote-pipeline.js +9 -2
  51. package/dist/commands/remote-results.d.ts +12 -1
  52. package/dist/commands/remote-results.js +25 -5
  53. package/dist/commands/validate-tasks.js +8 -2
  54. package/dist/composition-root.js +9 -0
  55. package/dist/config/package-surface.ts +37 -0
  56. package/dist/config/preflight-scoring.ts +26 -0
  57. package/dist/index.d.ts +2 -2
  58. package/dist/index.js +1 -1
  59. package/dist/orchestration/build-app-context.js +1 -0
  60. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  61. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  62. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  63. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  64. package/dist/orchestration/steps/run-eval-step.js +27 -0
  65. package/dist/pipeline/calculate-scores.d.ts +66 -5
  66. package/dist/pipeline/calculate-scores.js +141 -27
  67. package/dist/pipeline/compiler/index.d.ts +1 -1
  68. package/dist/pipeline/compiler/index.js +1 -1
  69. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  70. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  71. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +0 -12
  72. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +0 -12
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +190 -6
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  85. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  86. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  88. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  89. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  90. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  91. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  92. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  94. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  95. package/dist/pipeline/preflight/parse-imports.js +125 -0
  96. package/dist/report-store.d.ts +8 -0
  97. package/dist/report-store.js +55 -6
  98. package/dist/sanity/document-renderers.d.ts +106 -0
  99. package/dist/sanity/document-renderers.js +307 -0
  100. package/dist/sanity/queries.d.ts +32 -11
  101. package/dist/sanity/queries.js +78 -0
  102. package/dist/sanity/symbol-index.d.ts +98 -0
  103. package/dist/sanity/symbol-index.js +615 -0
  104. package/dist/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  105. package/dist/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  106. package/dist/tasks/literacy/content-lake.task.ts +4 -10
  107. package/dist/tasks/literacy/frameworks.task.ts +2 -8
  108. package/dist/tasks/literacy/functions.task.ts +1 -4
  109. package/dist/tasks/literacy/groq.task.ts +3 -12
  110. package/dist/tasks/literacy/image-handling.task.ts +1 -4
  111. package/dist/tasks/literacy/nextjs-live.task.ts +1 -4
  112. package/dist/tasks/literacy/portable-text.task.ts +2 -8
  113. package/dist/tasks/literacy/studio-setup.task.ts +2 -8
  114. package/dist/tasks/literacy/visual-editing.task.ts +2 -8
  115. package/package.json +2 -1
  116. package/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  117. package/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  118. package/tasks/literacy/content-lake.task.ts +4 -10
  119. package/tasks/literacy/frameworks.task.ts +2 -8
  120. package/tasks/literacy/functions.task.ts +1 -4
  121. package/tasks/literacy/groq.task.ts +3 -12
  122. package/tasks/literacy/image-handling.task.ts +1 -4
  123. package/tasks/literacy/nextjs-live.task.ts +1 -4
  124. package/tasks/literacy/portable-text.task.ts +2 -8
  125. package/tasks/literacy/studio-setup.task.ts +2 -8
  126. package/tasks/literacy/visual-editing.task.ts +2 -8
@@ -17,7 +17,7 @@ import { existsSync, readdirSync, readFileSync } from "fs";
17
17
  import { resolve, relative, basename } from "path";
18
18
  import { Command } from "commander";
19
19
  import { load } from "js-yaml";
20
- import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
20
+ import { detectLegacyFieldNames, migratePromptShape, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
21
21
  import { validateCanonicalTasks, formatRepoValidationResult, } from "../adapters/task-sources/repo-validation.js";
22
22
  import { discoverTsTaskFiles, loadTsTaskFile, } from "../adapters/task-sources/task-file-loader.js";
23
23
  export function createValidateTasksCommand() {
@@ -133,8 +133,14 @@ function validateTaskArray(entries, file, accumulator) {
133
133
  console.error();
134
134
  return false;
135
135
  }
136
+ // W0193: pre-migrate legacy prompt.vars.{task,docs,__featureArea} shape
137
+ // and surface deprecation warnings (non-fatal — the file still validates).
138
+ const { migrated, warnings: deprecationWarnings } = migratePromptShape(entries, file);
139
+ for (const warning of deprecationWarnings) {
140
+ console.warn(` ${warning}`);
141
+ }
136
142
  try {
137
- const tasks = parseCanonicalTaskFile(entries, file);
143
+ const tasks = parseCanonicalTaskFile(migrated, file);
138
144
  console.log(` ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
139
145
  accumulator.push(...tasks);
140
146
  return true;
@@ -29,6 +29,7 @@ import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js"
29
29
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
30
30
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
31
31
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
32
+ import { DtsPackageSurface } from "./adapters/package-surface/index.js";
32
33
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
33
34
  import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
34
35
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
@@ -65,6 +66,13 @@ export function createAppContext(config) {
65
66
  const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
66
67
  // Eval runner — Promptfoo subprocess
67
68
  const evalRunner = new PromptfooEvalAdapter(config.rootDir);
69
+ // Package-surface resolver for the W0198 symbol-resolution preflight.
70
+ // Reads installed `.d.ts` from the eval package's node_modules chain
71
+ // (anchored at `config.rootDir`). Phase 1 wires the adapter; later
72
+ // phases consume it.
73
+ const packageSurfaceResolver = new DtsPackageSurface({
74
+ resolveFromDir: config.rootDir,
75
+ });
68
76
  // Report store — Sanity Content Lake (for publish + auto-compare)
69
77
  const reportStore = createReportStore(config);
70
78
  // Sinks — loaded from config/sinks
@@ -90,6 +98,7 @@ export function createAppContext(config) {
90
98
  docFetcher,
91
99
  evalRunner,
92
100
  logger,
101
+ packageSurfaceResolver,
93
102
  progress,
94
103
  registry,
95
104
  reportStore,
@@ -0,0 +1,37 @@
1
+ /**
2
+ * package-surface.ts — Framework-level package-surface manifest for the
3
+ * W0198 symbol-resolution preflight.
4
+ *
5
+ * The manifest pins each in-scope package to a single semver-major range.
6
+ * The preflight resolver answers "does symbol X export from package Y" by
7
+ * reading the installed package's `.d.ts` against this pin. Tasks
8
+ * reference packages by name; they do not carry per-package version
9
+ * metadata (per-task overrides remain a future extension point).
10
+ *
11
+ * Bumping a major is an editorial event — one PR that updates the pin,
12
+ * regenerates cached surfaces, and re-runs the historical comparison set.
13
+ * Patch and minor releases within a pinned major flow silently because
14
+ * semver disallows the export removals that would change a deduction
15
+ * outcome.
16
+ *
17
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
18
+ */
19
+
20
+ import { definePackageSurface } from "../_vendor/ailf-core/index.js"
21
+
22
+ export default definePackageSurface({
23
+ packages: [
24
+ // Sanity App SDK — drives App SDK literacy tasks (e.g. DOC-2117).
25
+ { pkg: "@sanity/sdk-react", semverPin: "^2.0.0" },
26
+
27
+ // Sanity Studio runtime — drives Studio-side literacy tasks.
28
+ // Pinned to the installed major in this repo (^5.x). The earlier
29
+ // major (v3) is an explicit per-task override candidate when a task
30
+ // exercises legacy Studio behavior.
31
+ { pkg: "sanity", semverPin: "^5.0.0" },
32
+
33
+ // @sanity/client — drives direct-client literacy tasks. Pinned to
34
+ // the installed major in this repo (^7.x).
35
+ { pkg: "@sanity/client", semverPin: "^7.0.0" },
36
+ ],
37
+ })
@@ -0,0 +1,26 @@
1
+ /**
2
+ * preflight-scoring.ts — How heavily the W0198 deterministic preflight
3
+ * contributes to the `code-correctness` dimension.
4
+ *
5
+ * The preflight (`SymbolPreflightReport`) and the LLM rubric both feed
6
+ * into `code-correctness` per D0010's weighted dimension aggregation.
7
+ * `codeCorrectnessWeight` sets the relative share between them — `0.4`
8
+ * means preflight is 40% of the dimension, rubric 60%.
9
+ *
10
+ * Bumping this is an editorial decision: a higher weight ties more of the
11
+ * `code-correctness` score to the deterministic existence-check (less
12
+ * grader noise on the symbol-existence question, but also less elasticity
13
+ * for the rubric to penalize stylistic/correctness issues the preflight
14
+ * cannot see). A lower weight cedes more authority back to the rubric.
15
+ *
16
+ * The default of `0.4` is a starting balance; revisit once the
17
+ * `unresolved` rate stabilizes in CI.
18
+ *
19
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
20
+ */
21
+
22
+ import { definePreflightScoring } from "../_vendor/ailf-core/index.js"
23
+
24
+ export default definePreflightScoring({
25
+ codeCorrectnessWeight: 0.4,
26
+ })
package/dist/index.d.ts CHANGED
@@ -33,8 +33,8 @@
33
33
  * })
34
34
  * ```
35
35
  */
36
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
37
- export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core/index.d.ts";
36
+ export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
37
+ export type { PackageSurfaceConfig, PackageSurfaceEntry, PreflightScoringConfig, PricingEntry, PromptEntry, SourceEntry, } from "./_vendor/ailf-core/index.d.ts";
38
38
  export { env } from "./_vendor/ailf-core/index.d.ts";
39
39
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
40
40
  export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
package/dist/index.js CHANGED
@@ -36,7 +36,7 @@
36
36
  // ---------------------------------------------------------------------------
37
37
  // Configuration helpers (define* identity functions for typed authoring)
38
38
  // ---------------------------------------------------------------------------
39
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
39
+ export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
40
40
  // ---------------------------------------------------------------------------
41
41
  // Environment helper
42
42
  // ---------------------------------------------------------------------------
@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
49
49
  noCache: opts.noCache,
50
50
  noRemoteCache: opts.noRemoteCache,
51
51
  graderReplications: opts.graderReplications,
52
+ graderContext: opts.graderContext,
52
53
  outputDir: opts.outputDir,
53
54
  outputPath: opts.outputPath,
54
55
  urls: opts.urlArgs.length > 0 ? opts.urlArgs : undefined,
@@ -11,7 +11,7 @@
11
11
  * each step completes. This enables the GET /v1/jobs/:jobId polling
12
12
  * endpoint to show real-time progress.
13
13
  */
14
- import { type AppContext, type PipelineResult, type PipelineStep } from "../_vendor/ailf-core/index.d.ts";
14
+ import { type AppContext, type PipelineResult, type PipelineStep, type StepResult } from "../_vendor/ailf-core/index.d.ts";
15
15
  /**
16
16
  * Run a sequence of pipeline steps, short-circuiting on required step failure.
17
17
  *
@@ -22,3 +22,21 @@ import { type AppContext, type PipelineResult, type PipelineStep } from "../_ven
22
22
  * Lake after each step completes.
23
23
  */
24
24
  export declare function orchestratePipeline(ctx: AppContext, steps: PipelineStep[]): Promise<PipelineResult>;
25
+ /**
26
+ * Find the first optional pipeline step that returned `status: "failed"`
27
+ * in step-array order. Returns the diagnostic shape the API job document
28
+ * already accepts (`{ message, step }`), or null when no optional step
29
+ * failed.
30
+ *
31
+ * Required-step failures don't reach this code path — the orchestrator
32
+ * aborts before completion when a required step fails. This helper is
33
+ * the bridge between "step ran and failed" and the wire signal that
34
+ * external `--remote` consumers use to distinguish a clean completion
35
+ * from a degraded one.
36
+ *
37
+ * @see docs/design-docs/optional-step-failure-surfacing.md
38
+ */
39
+ export declare function getFirstOptionalFailure(steps: readonly PipelineStep[], results: Record<string, StepResult>): {
40
+ message: string;
41
+ step: string;
42
+ } | null;
@@ -236,6 +236,12 @@ export async function orchestratePipeline(ctx, steps) {
236
236
  process.env.SANITY_API_TOKEN ??
237
237
  undefined,
238
238
  });
239
+ // DOC-2121 RC-3 — surface the first configured-but-failed optional
240
+ // step on the job document so external --remote consumers can detect
241
+ // partial-completion outcomes. The pipeline still ran end to end
242
+ // (P5 / local-first) and `success: true` is preserved; the `error`
243
+ // field is the wire signal that a configured optional step failed.
244
+ const firstOptionalFailure = getFirstOptionalFailure(steps, results);
239
245
  await store.updateJob(ctx.config.jobId, {
240
246
  status: "completed",
241
247
  completedAt: new Date().toISOString(),
@@ -245,6 +251,7 @@ export async function orchestratePipeline(ctx, steps) {
245
251
  totalSteps: steps.length,
246
252
  },
247
253
  ...(state.reportId ? { reportId: state.reportId } : {}),
254
+ ...(firstOptionalFailure ? { error: firstOptionalFailure } : {}),
248
255
  });
249
256
  }
250
257
  catch {
@@ -275,6 +282,37 @@ export async function orchestratePipeline(ctx, steps) {
275
282
  };
276
283
  }
277
284
  // ---------------------------------------------------------------------------
285
+ // Optional-step failure surfacing (DOC-2121 RC-3)
286
+ // ---------------------------------------------------------------------------
287
+ /**
288
+ * Find the first optional pipeline step that returned `status: "failed"`
289
+ * in step-array order. Returns the diagnostic shape the API job document
290
+ * already accepts (`{ message, step }`), or null when no optional step
291
+ * failed.
292
+ *
293
+ * Required-step failures don't reach this code path — the orchestrator
294
+ * aborts before completion when a required step fails. This helper is
295
+ * the bridge between "step ran and failed" and the wire signal that
296
+ * external `--remote` consumers use to distinguish a clean completion
297
+ * from a degraded one.
298
+ *
299
+ * @see docs/design-docs/optional-step-failure-surfacing.md
300
+ */
301
+ export function getFirstOptionalFailure(steps, results) {
302
+ for (const step of steps) {
303
+ if (step.optional !== true)
304
+ continue;
305
+ const result = results[step.name];
306
+ if (result?.status === "failed") {
307
+ return {
308
+ message: result.error ?? `${step.name} failed`,
309
+ step: step.name,
310
+ };
311
+ }
312
+ }
313
+ return null;
314
+ }
315
+ // ---------------------------------------------------------------------------
278
316
  // Artifact export phase gate (W0053)
279
317
  // ---------------------------------------------------------------------------
280
318
  /**
@@ -14,6 +14,7 @@ import { buildCacheContext } from "../cache-context.js";
14
14
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
15
15
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
16
16
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
17
+ import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
17
18
  import { loadSource } from "../../sources.js";
18
19
  import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
19
20
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
@@ -77,6 +78,13 @@ export class CalculateScoresStep {
77
78
  catch {
78
79
  // Non-fatal — proceed without source metadata
79
80
  }
81
+ // W0198 — load preflight scoring config (silent fall-through to the
82
+ // default weight when absent). Lazy: ignored when no preflight reports
83
+ // were emitted upstream.
84
+ const preflightScoring = await loadPreflightScoring(ctx.config.rootDir).catch((err) => {
85
+ ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
86
+ return undefined;
87
+ });
80
88
  let belowCritical = [];
81
89
  try {
82
90
  const result = calculateAndWriteScores({
@@ -87,11 +95,14 @@ export class CalculateScoresStep {
87
95
  mode: ctx.config.mode === "literacy"
88
96
  ? (ctx.config.variant ?? LiteracyVariant.STANDARD)
89
97
  : ctx.config.mode,
98
+ preflightReports: state.preflightReports,
99
+ preflightWeight: preflightScoring?.codeCorrectnessWeight,
90
100
  resolvedSource,
91
101
  resultsPath: primaryResultsRun !== LiteracyVariant.STANDARD
92
102
  ? join(ctx.config.rootDir, resultsFile)
93
103
  : undefined,
94
104
  rootDir: ctx.config.rootDir,
105
+ runId: ctx.runId,
95
106
  searchMode: ctx.config.searchMode,
96
107
  source: ctx.config.source,
97
108
  });
@@ -18,6 +18,7 @@ import { getStepInputPaths } from "../../pipeline/cache.js";
18
18
  import { buildCacheContext } from "../cache-context.js";
19
19
  import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
20
20
  import { validateModelsYaml } from "../../pipeline/validate.js";
21
+ import { loadPreflightContext } from "../../pipeline/preflight/load-preflight-context.js";
21
22
  import { loadSource } from "../../sources.js";
22
23
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
23
24
  export class GenerateConfigsStep {
@@ -110,12 +111,21 @@ export class GenerateConfigsStep {
110
111
  catch {
111
112
  ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
112
113
  }
113
- // Compile for each variant
114
+ // Compile for each variant. `graderContext` defaults to "rubric-only" here
115
+ // so handlers see a definite value rather than implicit-undefined.
116
+ const graderContext = ctx.config.graderContext ?? "rubric-only";
117
+ // W0198 Phase 6 — when the package-surface manifest is authored, pass
118
+ // the in-scope package list down so the literacy mode handler can
119
+ // prefix the `code-correctness` rubric with the deterministic-lane
120
+ // system instruction. Silent fall-through when absent.
121
+ const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
114
122
  const baselineResults = this.compileAll(handler, tasks, {
115
123
  rootDir: ctx.config.rootDir,
116
124
  graderProvider: models.grader.id,
117
125
  models: baselineModels,
118
126
  rubricConfig,
127
+ graderContext,
128
+ preflightContext,
119
129
  evalMode: LiteracyVariant.STANDARD,
120
130
  });
121
131
  const agenticResults = this.compileAll(handler, tasks, {
@@ -123,6 +133,8 @@ export class GenerateConfigsStep {
123
133
  graderProvider: models.grader.id,
124
134
  models: agenticModels,
125
135
  rubricConfig,
136
+ graderContext,
137
+ preflightContext,
126
138
  evalMode: LiteracyVariant.AGENTIC,
127
139
  });
128
140
  // Log warnings
@@ -174,11 +186,14 @@ export class GenerateConfigsStep {
174
186
  catch {
175
187
  ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
176
188
  }
189
+ const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
177
190
  const merged = this.compileAll(handler, tasks, {
178
191
  rootDir: ctx.config.rootDir,
179
192
  graderProvider: models.grader.id,
180
193
  models: modeModels,
181
194
  rubricConfig,
195
+ graderContext: ctx.config.graderContext ?? "rubric-only",
196
+ preflightContext,
182
197
  });
183
198
  for (const w of merged.warnings) {
184
199
  ctx.logger.warn(` ⚠ ${w}`);
@@ -8,6 +8,8 @@
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
10
  import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
11
+ import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
12
+ import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
11
13
  import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
12
14
  import { getStepInputPaths } from "../../pipeline/cache.js";
13
15
  import { buildCacheContext } from "../cache-context.js";
@@ -90,6 +92,7 @@ export class RunEvalStep {
90
92
  graderModel: loadGraderModel(rootDir).id,
91
93
  mode: this.mode,
92
94
  rootDir,
95
+ graderContext: ctx.config.graderContext,
93
96
  });
94
97
  // Share fingerprint with downstream steps (PublishReportStep)
95
98
  state.evalFingerprint = evalFingerprint;
@@ -224,6 +227,30 @@ export class RunEvalStep {
224
227
  const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
225
228
  if (existsSync(resultsPath)) {
226
229
  await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
230
+ // W0198 Phase 4 — deterministic-lane reports per (task, model).
231
+ // Loaded lazily so test contexts that don't wire the manifest /
232
+ // resolver pay nothing; the helper is itself a no-op when its
233
+ // inputs are missing.
234
+ const packageSurface = await loadPackageSurface(rootDir).catch((err) => {
235
+ console.warn(` ⚠️ W0198 preflight: failed to load package-surface manifest — ${err instanceof Error ? err.message : String(err)}`);
236
+ return undefined;
237
+ });
238
+ const preflight = await emitSymbolPreflight({
239
+ writer: ctx.artifactWriter,
240
+ ctx,
241
+ mode: this.mode,
242
+ resultsPath,
243
+ packageSurface,
244
+ resolver: ctx.packageSurfaceResolver,
245
+ });
246
+ if (preflight.reports.size > 0) {
247
+ if (!state.preflightReports) {
248
+ state.preflightReports = new Map();
249
+ }
250
+ for (const [k, v] of preflight.reports) {
251
+ state.preflightReports.set(k, v);
252
+ }
253
+ }
227
254
  }
228
255
  // Extract Promptfoo share URL from eval results (Step 3b)
229
256
  if (ctx.evalRunner.extractShareUrl) {
@@ -1,6 +1,7 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
3
  import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
+ import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
4
5
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
6
  export interface PromptfooResultsWrapper {
6
7
  results: RawTestResult[];
@@ -60,6 +61,34 @@ export interface RawTestResult {
60
61
  };
61
62
  vars: Record<string, string>;
62
63
  }
64
+ /**
65
+ * Aggregate counts across every preflight report emitted by the run.
66
+ *
67
+ * `unresolvedRate` is `unresolved / totalFindings` in `[0, 1]`, set only
68
+ * when `totalFindings > 0`. The vacuous case (`totalFindings === 0` —
69
+ * reports exist but every candidate cited zero in-scope bindings) is
70
+ * deliberately distinguished from "every binding resolved cleanly"
71
+ * (`unresolvedRate === 0`) so a CI threshold like `unresolvedRate > 0.1`
72
+ * doesn't fire green on a run that had nothing to resolve.
73
+ */
74
+ export interface PreflightSummary {
75
+ /** Number of per-test preflight reports the run emitted. */
76
+ reportCount: number;
77
+ /** Total findings across all reports. */
78
+ totalFindings: number;
79
+ /** Findings classified `exists`. */
80
+ exists: number;
81
+ /** Findings classified `missing` — the deterministic-deduction lane. */
82
+ missing: number;
83
+ /** Findings classified `unresolved` — the resolver-couldn't-answer lane. */
84
+ unresolved: number;
85
+ /**
86
+ * `unresolved / totalFindings` in `[0, 1]`. Absent when `totalFindings`
87
+ * is zero — distinguishes "nothing to resolve" from "all resolutions
88
+ * succeeded" so CI thresholds aren't vacuously green.
89
+ */
90
+ unresolvedRate?: number;
91
+ }
63
92
  /**
64
93
  * Calculate scores grouped by model. Each model gets its own FeatureScore[]
65
94
  * and model-level aggregates.
@@ -70,7 +99,7 @@ export interface RawTestResult {
70
99
  * @returns Record keyed by model ID, or null if only one model was used
71
100
  * (per-model breakdown is redundant when there's only one model).
72
101
  */
73
- export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
102
+ export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): null | PerModelEntry[];
74
103
  /**
75
104
  * Extract grader judgments (reason text + scores) from evaluation results.
76
105
  *
@@ -91,6 +120,19 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
91
120
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
92
121
  */
93
122
  export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
123
+ /**
124
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
125
+ * resolver-health summary. Returns `undefined` when the run had no
126
+ * preflight reports (manifest disabled, resolver missing, or every
127
+ * candidate output cited zero in-scope packages) so the consumer can
128
+ * cleanly omit the field from the score summary instead of writing a
129
+ * vacuous block of zeros.
130
+ *
131
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
132
+ * production calls go through `calculateAndWriteScores`, which threads
133
+ * the result into the `EvalScoreSummary.preflight` field.
134
+ */
135
+ export declare function summarizePreflight(reports: Map<string, SymbolPreflightReport> | undefined): PreflightSummary | undefined;
94
136
  /**
95
137
  * Score knowledge-probe evaluation results.
96
138
  *
@@ -105,7 +147,7 @@ export declare function extractStoredTestResults(resultsPath: string): StoredTes
105
147
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
106
148
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
107
149
  */
108
- export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>): FeatureScore[];
150
+ export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): FeatureScore[];
109
151
  /**
110
152
  * Score agentic evaluation results. In agentic mode, all test entries are
111
153
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -113,7 +155,7 @@ export declare function scoreKnowledgeProbeResults(results: TestResult[], profil
113
155
  *
114
156
  * Returns a record keyed by feature area with the composite actual score.
115
157
  */
116
- export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
158
+ export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, ActualScoreEntry>;
117
159
  /**
118
160
  * Score agentic results broken down by model.
119
161
  *
@@ -121,7 +163,7 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
121
163
  * producing a map of model → feature → ActualScoreEntry.
122
164
  * Used to enrich the per-model breakdown with actual scores in full mode.
123
165
  */
124
- export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
166
+ export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
125
167
  /** Options for the calculate-scores main() function. */
126
168
  export interface CalculateScoresOptions {
127
169
  /** Allowed origins for source isolation reporting */
@@ -130,12 +172,31 @@ export interface CalculateScoresOptions {
130
172
  logger?: Logger;
131
173
  /** Evaluation mode (controls which result files are read) */
132
174
  mode?: string;
175
+ /**
176
+ * W0198 — symbol-preflight reports keyed by `${runId}/${mode}/${task}/${model}`,
177
+ * populated by `RunEvalStep` via `emitSymbolPreflight`. When provided, the
178
+ * scoring engine merges deterministic preflight findings into the
179
+ * `code-correctness` dimension. Absence (or empty map) collapses cleanly
180
+ * to the pre-W0198 rubric-only path.
181
+ */
182
+ preflightReports?: Map<string, SymbolPreflightReport>;
183
+ /**
184
+ * W0198 — preflight's share of `code-correctness` in `[0, 1]`. Defaults
185
+ * to `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` when omitted.
186
+ */
187
+ preflightWeight?: number;
133
188
  /** Pre-resolved source config (skips loadSource() call) */
134
189
  resolvedSource?: ResolvedSourceConfig;
135
190
  /** Path to baseline results file (default: results/latest/eval-results.json) */
136
191
  resultsPath?: string;
137
192
  /** Root directory of the eval package (required) */
138
193
  rootDir: string;
194
+ /**
195
+ * W0198 — runId axis used to look up preflight reports. Required when
196
+ * `preflightReports` is provided; without it the lookup callback can't
197
+ * reconstruct the right key.
198
+ */
199
+ runId?: string;
139
200
  /** Search mode for source verification metadata */
140
201
  searchMode?: string;
141
202
  /** Documentation source name */