@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  22. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  23. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
  25. package/dist/_vendor/ailf-core/types/index.js +1 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  27. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  28. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  29. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  30. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  31. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  32. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  33. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  34. package/dist/adapters/api-client/build-request.d.ts +1 -0
  35. package/dist/adapters/api-client/build-request.js +3 -0
  36. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  37. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
  38. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
  39. package/dist/adapters/index.d.ts +1 -0
  40. package/dist/adapters/index.js +1 -0
  41. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  42. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  43. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  44. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  45. package/dist/adapters/package-surface/index.d.ts +9 -0
  46. package/dist/adapters/package-surface/index.js +8 -0
  47. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  48. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  49. package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
  50. package/dist/adapters/task-sources/repo-schemas.js +15 -0
  51. package/dist/commands/pipeline-action.d.ts +2 -0
  52. package/dist/commands/pipeline-action.js +12 -0
  53. package/dist/commands/remote-pipeline.js +10 -2
  54. package/dist/commands/remote-results.d.ts +12 -1
  55. package/dist/commands/remote-results.js +25 -5
  56. package/dist/composition-root.js +9 -0
  57. package/dist/config/package-surface.ts +37 -0
  58. package/dist/config/preflight-scoring.ts +26 -0
  59. package/dist/index.d.ts +2 -2
  60. package/dist/index.js +1 -1
  61. package/dist/orchestration/build-app-context.js +1 -0
  62. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  63. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  64. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  65. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  66. package/dist/orchestration/steps/run-eval-step.js +27 -0
  67. package/dist/pipeline/calculate-scores.d.ts +66 -5
  68. package/dist/pipeline/calculate-scores.js +141 -27
  69. package/dist/pipeline/compiler/index.d.ts +1 -1
  70. package/dist/pipeline/compiler/index.js +1 -1
  71. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  72. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/map-request-to-config.js +1 -0
  85. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  86. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  88. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  89. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  90. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  91. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  92. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  94. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  95. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  96. package/dist/pipeline/preflight/parse-imports.js +125 -0
  97. package/dist/report-store.d.ts +8 -0
  98. package/dist/report-store.js +55 -6
  99. package/dist/sanity/document-renderers.d.ts +45 -7
  100. package/dist/sanity/document-renderers.js +99 -13
  101. package/dist/sanity/queries.d.ts +11 -11
  102. package/dist/sanity/queries.js +7 -0
  103. package/dist/sanity/symbol-index.d.ts +98 -0
  104. package/dist/sanity/symbol-index.js +615 -0
  105. package/package.json +2 -1
@@ -16,11 +16,19 @@
16
16
  * engine works in [0, 1]; this module handles the conversion at
17
17
  * boundaries.
18
18
  *
19
+ * W0198 Phase 5 — when a `preflightForTest` callback is provided and
20
+ * returns a `SymbolPreflightReport`, the bridge synthesizes one extra
21
+ * `AssertionScore` per test in the `code-correctness` dimension. The
22
+ * deterministic preflight and the LLM rubric merge through D0010's
23
+ * weighted dimension aggregation; the relative share is set by
24
+ * `preflightWeight` in `[0, 1]`.
25
+ *
19
26
  * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
27
  * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
28
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
29
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
22
30
  */
23
- import { aggregateDimensions, computeTaskScore, normalizeScore, } from "../../_vendor/ailf-core/index.js";
31
+ import { aggregateDimensions, computeTaskScore, DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, normalizeScore, } from "../../_vendor/ailf-core/index.js";
24
32
  import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.js";
25
33
  // ---------------------------------------------------------------------------
26
34
  // Public API
@@ -36,28 +44,53 @@ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.
36
44
  * @param profile Weight profile mapping kebab-case dimension names to weights
37
45
  * (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
38
46
  * @param taskId Optional task identifier for traceability in TaskScore output
47
+ * @param options Optional W0198 Phase 5 preflight integration
39
48
  * @returns Dimensions (0–100) and composite (0–100), matching legacy output format
40
49
  */
41
- export function scoreTestGroup(tests, profile, taskId) {
50
+ export function scoreTestGroup(tests, profile, taskId, options) {
42
51
  let totalCost = 0;
52
+ const preflightForTest = options?.preflightForTest;
53
+ const preflightWeight = clampWeight(options?.preflightWeight ?? DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT);
54
+ const preflightActive = typeof preflightForTest === "function" && preflightWeight > 0;
43
55
  // Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
44
56
  //
45
- // Two assertion types contribute to scoring:
57
+ // Three assertion sources contribute to scoring:
46
58
  // - llm-rubric: dimension from metadata, score from grader (0–100 → [0,1])
47
59
  // - javascript: mapped to "assertion-pass-rate" dimension (pass=1, fail=0)
60
+ // - preflight (W0198): synthesized per test from SymbolPreflightReport,
61
+ // dimension "code-correctness", weight = preflightWeight.
62
+ //
63
+ // Rubric weight reduction is per-test, not global: a test's
64
+ // `code-correctness` rubric assertion only drops to `1 - preflightWeight`
65
+ // when preflight actually contributes a paired finding for that test.
66
+ // Without this gate, tests with no preflight coverage would have their
67
+ // rubric authority silently downweighted with nothing to compensate, so
68
+ // partial-coverage runs would systematically bias the dimension toward
69
+ // tests that DO have preflight data.
48
70
  //
49
71
  // Other types (cost, trajectory, contains, etc.) are metadata or guards —
50
72
  // they don't produce dimension scores.
51
73
  const assertionScores = [];
52
74
  for (const test of tests) {
53
75
  totalCost += test.cost;
76
+ const report = preflightActive ? preflightForTest(test) : undefined;
77
+ const ccRubricWeight = report ? 1 - preflightWeight : 1;
54
78
  for (const comp of test.gradingResult.componentResults) {
55
- const converted = componentToScore(comp);
79
+ const converted = componentToScore(comp, ccRubricWeight);
56
80
  if (converted)
57
81
  assertionScores.push(converted);
58
82
  }
83
+ if (report) {
84
+ assertionScores.push(preflightToScore(report, preflightWeight));
85
+ }
59
86
  }
60
- // Step 2: Aggregate into DimensionScores (0–1 scale)
87
+ // Step 2: Aggregate into DimensionScores (0–1 scale).
88
+ //
89
+ // Use `weighted-mean` so the W0198 preflight / rubric weights inside
90
+ // `code-correctness` are honored. With everything at weight=1.0 the
91
+ // result is identical to plain `mean`, so behavior outside the
92
+ // code-correctness merge is unchanged. See `aggregateScores` in
93
+ // `packages/core/src/services/scoring-engine.ts` for the equivalence.
61
94
  const dimensionLabels = {
62
95
  "assertion-pass-rate": "Assertion Pass Rate",
63
96
  "code-correctness": "Code Correctness",
@@ -65,7 +98,7 @@ export function scoreTestGroup(tests, profile, taskId) {
65
98
  "task-completion": "Task Completion",
66
99
  };
67
100
  const rawDimensions = aggregateDimensions(assertionScores, {
68
- defaultAggregation: "mean",
101
+ defaultAggregation: "weighted-mean",
69
102
  dimensionLabels,
70
103
  });
71
104
  // Step 3: Compute weighted composite via TaskScore (0–1 scale)
@@ -102,10 +135,10 @@ export function scoreTestGroup(tests, profile, taskId) {
102
135
  * This replaces the previous llm-rubric-only filter that caused agent-harness
103
136
  * javascript assertions to be invisible to the scoring engine (DOC-2029).
104
137
  */
105
- function componentToScore(comp) {
138
+ function componentToScore(comp, rubricCodeCorrectnessWeight) {
106
139
  const type = comp.assertion?.type;
107
140
  if (type === "llm-rubric") {
108
- return llmRubricToScore(comp);
141
+ return llmRubricToScore(comp, rubricCodeCorrectnessWeight);
109
142
  }
110
143
  if (type === "javascript") {
111
144
  return javascriptAssertionToScore(comp);
@@ -118,14 +151,19 @@ function componentToScore(comp) {
118
151
  *
119
152
  * The dimension comes from metadata (set during rubric template resolution).
120
153
  * Returns null if the component doesn't map to any dimension.
154
+ *
155
+ * For the `code-correctness` dimension specifically, the assertion's
156
+ * weight is reduced when W0198's deterministic preflight is also feeding
157
+ * the same dimension; the complementary share belongs to the preflight.
121
158
  */
122
- function llmRubricToScore(comp) {
159
+ function llmRubricToScore(comp, rubricCodeCorrectnessWeight) {
123
160
  const dim = classifyRubric(comp);
124
161
  if (!dim)
125
162
  return null;
126
163
  // Parse the raw score (0–100 from the grader) and normalize to [0, 1]
127
164
  const rawScore = parseRubricScore(comp);
128
165
  const normalized = normalizeScore(rawScore, "llm-rubric");
166
+ const weight = dim === "code-correctness" ? rubricCodeCorrectnessWeight : 1.0;
129
167
  return {
130
168
  assertionType: comp.assertion?.type ?? "llm-rubric",
131
169
  dimension: dim,
@@ -133,7 +171,7 @@ function llmRubricToScore(comp) {
133
171
  pass: comp.pass,
134
172
  reason: comp.reason ?? "",
135
173
  score: normalized,
136
- weight: 1.0,
174
+ weight,
137
175
  };
138
176
  }
139
177
  /**
@@ -160,6 +198,62 @@ function javascriptAssertionToScore(comp) {
160
198
  weight: 1.0,
161
199
  };
162
200
  }
201
+ /**
202
+ * Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
203
+ * preflight report.
204
+ *
205
+ * The score is `1 - min(1, total / cap)`. With the default
206
+ * `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0, 1 missing → 0.667,
207
+ * 2 missing → 0.333, ≥3 missing → 0.0.
208
+ *
209
+ * Edge cases for the deduction config:
210
+ * - `cap === 0` (measurement-only config): score is 1.0 — divide-by-zero
211
+ * would NaN otherwise.
212
+ * - `cap < 0` (misconfigured): score is 1.0 and the merge silently
213
+ * collapses to rubric-only on this dimension. The Phase 3 Zod schema
214
+ * gates against this upstream so it should never reach here, but the
215
+ * guard preserves the never-deduct invariant if it does.
216
+ *
217
+ * `unresolved` findings never deduct (the preflight's never-deduct rule):
218
+ * they're not part of `total`, so they fall through to the LLM rubric.
219
+ */
220
+ export function preflightToScore(report, weight) {
221
+ const { perMissing, cap, total } = report.deduction;
222
+ const score = cap > 0 ? 1 - Math.min(1, total / cap) : 1;
223
+ const counts = countLanes(report);
224
+ return {
225
+ assertionType: "preflight",
226
+ dimension: "code-correctness",
227
+ latencyMs: 0,
228
+ pass: total === 0,
229
+ reason: `preflight: ${counts.exists} exists, ${counts.missing} missing, ${counts.unresolved} unresolved (deduction ${total}/${cap}, ${perMissing} per missing)`,
230
+ score,
231
+ weight,
232
+ };
233
+ }
234
+ function countLanes(report) {
235
+ let exists = 0;
236
+ let missing = 0;
237
+ let unresolved = 0;
238
+ for (const f of report.findings) {
239
+ if (f.result === "exists")
240
+ exists++;
241
+ else if (f.result === "missing")
242
+ missing++;
243
+ else
244
+ unresolved++;
245
+ }
246
+ return { exists, missing, unresolved };
247
+ }
248
+ function clampWeight(w) {
249
+ if (!Number.isFinite(w))
250
+ return 0;
251
+ if (w < 0)
252
+ return 0;
253
+ if (w > 1)
254
+ return 1;
255
+ return w;
256
+ }
163
257
  /** Convert kebab-case dimension key to camelCase (e.g., "task-completion" → "taskCompletion") */
164
258
  function kebabToCamel(kebab) {
165
259
  return kebab.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
@@ -41,6 +41,15 @@ export interface FingerprintInput {
41
41
  mode: EvalMode;
42
42
  /** Path to the packages/eval root directory */
43
43
  rootDir: string;
44
+ /**
45
+ * Grader context policy. Distinct values produce distinct rubricPrompt
46
+ * content, so the cache must treat them as different evaluations even
47
+ * when tasks + docs + grader model match.
48
+ *
49
+ * Defaults to "rubric-only" inside the hash when undefined, matching
50
+ * the EvalConfig boundary default.
51
+ */
52
+ graderContext?: "rubric-only" | "with-docs";
44
53
  }
45
54
  /**
46
55
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
@@ -38,8 +38,12 @@ import { join, relative, resolve } from "path";
38
38
  * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
39
39
  * files), file paths normalized to rootDir-relative, grader passed
40
40
  * through verbatim instead of the literal string "default".
41
+ *
42
+ * v3 (2026-05-06): grader-context policy ("rubric-only" vs "with-docs")
43
+ * affects rubricPrompt content and therefore eval output, so it must be
44
+ * hashed. Bumping invalidates v2 fingerprints.
41
45
  */
42
- const FINGERPRINT_VERSION = "eval-fingerprint-v2";
46
+ const FINGERPRINT_VERSION = "eval-fingerprint-v3";
43
47
  /**
44
48
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
45
49
  *
@@ -52,10 +56,12 @@ const FINGERPRINT_VERSION = "eval-fingerprint-v2";
52
56
  */
53
57
  export function computeEvalFingerprint(input) {
54
58
  const { graderModel, mode, rootDir, tasks } = input;
59
+ const graderContext = input.graderContext ?? "rubric-only";
55
60
  const hash = createHash("sha256");
56
61
  hash.update(`version:${FINGERPRINT_VERSION}\n`);
57
62
  hash.update(`mode:${mode}\n`);
58
63
  hash.update(`grader:${graderModel}\n`);
64
+ hash.update(`graderContext:${graderContext}\n`);
59
65
  hash.update(`tasks:${hashTaskSet(tasks)}\n`);
60
66
  // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
61
67
  // so a CI runner at /home/runner/... and a laptop at /Users/... produce
@@ -52,6 +52,7 @@ export function mapRequestToConfig(request, rootDir) {
52
52
  noAutoScope: request.noAutoScope ?? false,
53
53
  noCache: request.noCache ?? false,
54
54
  noRemoteCache: request.noRemoteCache ?? false,
55
+ graderContext: request.graderContext,
55
56
  graderReplications: request.graderReplications,
56
57
  urls: request.urls,
57
58
  headers: request.headers,
@@ -0,0 +1,67 @@
1
+ /**
2
+ * compute-preflight — pure function that turns a candidate's code +
3
+ * a `PackageSurfaceResolver` + the framework-level package-surface
4
+ * manifest into a `SymbolPreflightReport`.
5
+ *
6
+ * Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
7
+ * from package Y" entirely out of LLM judgment. The LLM rubric runs
8
+ * after this and is told the preflight's findings as ground truth.
9
+ *
10
+ * The function is pure (no I/O beyond the resolver's): it parses the
11
+ * candidate's imports, asks the resolver about each in-scope package,
12
+ * and emits one finding per imported binding.
13
+ *
14
+ * Per-binding decision tree:
15
+ *
16
+ * 1. Drop the binding if it isn't a `named` import. Default,
17
+ * namespace, and side-effect imports are intentionally not
18
+ * checked — the package surface only includes named exports
19
+ * (per the design's "named bindings only" rule), so default /
20
+ * namespace imports cannot be answered against it without
21
+ * false-deducting legitimate code.
22
+ *
23
+ * 2. Drop the binding if its `source` package is not in the
24
+ * framework-level manifest. Out-of-scope packages don't get
25
+ * findings — they are silently passed through to the LLM rubric.
26
+ *
27
+ * 3. Resolve the package surface. If the resolver throws, every
28
+ * binding from that package becomes `unresolved` — with the typed
29
+ * `PackageSurfaceResolverError` reason, else `parse-failed`. **Never deduct.**
30
+ *
31
+ * 4. If the binding is in the surface, emit `exists` (no deduction).
32
+ *
33
+ * 5. Otherwise, emit `missing` (deterministic deduction).
34
+ *
35
+ * Deduction is `total = min(missing_count * perMissing, cap)`. The
36
+ * scoring bridge (Phase 5) computes the per-dimension score from this
37
+ * report; this function stays a pure data factory.
38
+ */
39
+ import { type PackageSurfaceConfig, type PackageSurfaceResolver, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
40
+ /** Default deduction config — `−20 per missing, capped at 60`. */
41
+ export declare const DEFAULT_DEDUCTION: {
42
+ readonly perMissing: 20;
43
+ readonly cap: 60;
44
+ };
45
+ export interface ComputePreflightInput {
46
+ /** Raw candidate output. Typically the contents of a single TS/TSX/JS code block. */
47
+ readonly code: string;
48
+ /** Identity for this candidate, recorded in the report's `candidate` field. */
49
+ readonly candidate: {
50
+ readonly taskId: string;
51
+ readonly testIndex: number;
52
+ };
53
+ /** Framework-level package-surface manifest (Phase 0 / `definePackageSurface`). */
54
+ readonly packageSurface: PackageSurfaceConfig;
55
+ /** Resolver used to fetch each in-scope package's surface (Phase 1). */
56
+ readonly resolver: PackageSurfaceResolver;
57
+ /**
58
+ * Deduction config. Defaults to `DEFAULT_DEDUCTION`. Pass
59
+ * `{ perMissing: 0, cap: 0 }` to compute findings without deduction
60
+ * (e.g. for measurement-only runs).
61
+ */
62
+ readonly deduction?: {
63
+ readonly perMissing?: number;
64
+ readonly cap?: number;
65
+ };
66
+ }
67
+ export declare function computePreflight(input: ComputePreflightInput): Promise<SymbolPreflightReport>;
@@ -0,0 +1,118 @@
1
+ /**
2
+ * compute-preflight — pure function that turns a candidate's code +
3
+ * a `PackageSurfaceResolver` + the framework-level package-surface
4
+ * manifest into a `SymbolPreflightReport`.
5
+ *
6
+ * Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
7
+ * from package Y" entirely out of LLM judgment. The LLM rubric runs
8
+ * after this and is told the preflight's findings as ground truth.
9
+ *
10
+ * The function is pure (no I/O beyond the resolver's): it parses the
11
+ * candidate's imports, asks the resolver about each in-scope package,
12
+ * and emits one finding per imported binding.
13
+ *
14
+ * Per-binding decision tree:
15
+ *
16
+ * 1. Drop the binding if it isn't a `named` import. Default,
17
+ * namespace, and side-effect imports are intentionally not
18
+ * checked — the package surface only includes named exports
19
+ * (per the design's "named bindings only" rule), so default /
20
+ * namespace imports cannot be answered against it without
21
+ * false-deducting legitimate code.
22
+ *
23
+ * 2. Drop the binding if its `source` package is not in the
24
+ * framework-level manifest. Out-of-scope packages don't get
25
+ * findings — they are silently passed through to the LLM rubric.
26
+ *
27
+ * 3. Resolve the package surface. If the resolver throws, every
28
+ * binding from that package becomes `unresolved` — with the typed
29
+ * `PackageSurfaceResolverError` reason, else `parse-failed`. **Never deduct.**
30
+ *
31
+ * 4. If the binding is in the surface, emit `exists` (no deduction).
32
+ *
33
+ * 5. Otherwise, emit `missing` (deterministic deduction).
34
+ *
35
+ * Deduction is `total = min(missing_count * perMissing, cap)`. The
36
+ * scoring bridge (Phase 5) computes the per-dimension score from this
37
+ * report; this function stays a pure data factory.
38
+ */
39
+ import { PackageSurfaceResolverError, } from "../../_vendor/ailf-core/index.js";
40
+ import { parseImports } from "./parse-imports.js";
41
+ /** Default deduction config — `−20 per missing, capped at 60`. */
42
+ export const DEFAULT_DEDUCTION = {
43
+ perMissing: 20,
44
+ cap: 60,
45
+ };
46
+ export async function computePreflight(input) {
47
+ const perMissing = input.deduction?.perMissing ?? DEFAULT_DEDUCTION.perMissing;
48
+ const cap = input.deduction?.cap ?? DEFAULT_DEDUCTION.cap;
49
+ const inScope = new Set(input.packageSurface.packages.map((p) => p.pkg));
50
+ const imports = parseImports(input.code);
51
+ // Bucket named imports by package so we resolve each surface at most once.
52
+ const namedBySource = new Map();
53
+ for (const binding of imports) {
54
+ if (binding.kind !== "named")
55
+ continue;
56
+ if (!inScope.has(binding.source))
57
+ continue;
58
+ let bindings = namedBySource.get(binding.source);
59
+ if (!bindings) {
60
+ bindings = [];
61
+ namedBySource.set(binding.source, bindings);
62
+ }
63
+ if (!bindings.includes(binding.imported))
64
+ bindings.push(binding.imported);
65
+ }
66
+ const findings = [];
67
+ for (const [pkg, bindings] of namedBySource) {
68
+ let surface;
69
+ try {
70
+ surface = await input.resolver.resolveExports(pkg);
71
+ }
72
+ catch (err) {
73
+ const reason = unresolvedReasonFor(err);
74
+ for (const binding of bindings) {
75
+ findings.push({ result: "unresolved", pkg, binding, reason });
76
+ }
77
+ continue;
78
+ }
79
+ const surfaceNames = new Map();
80
+ for (const sym of surface.symbols)
81
+ surfaceNames.set(sym.name, sym.source);
82
+ for (const binding of bindings) {
83
+ const source = surfaceNames.get(binding);
84
+ if (source) {
85
+ findings.push({
86
+ result: "exists",
87
+ pkg,
88
+ version: surface.version,
89
+ binding,
90
+ source,
91
+ });
92
+ }
93
+ else {
94
+ findings.push({
95
+ result: "missing",
96
+ pkg,
97
+ version: surface.version,
98
+ binding,
99
+ });
100
+ }
101
+ }
102
+ }
103
+ const missingCount = findings.filter((f) => f.result === "missing").length;
104
+ const total = Math.min(missingCount * perMissing, cap);
105
+ return {
106
+ candidate: { ...input.candidate },
107
+ findings,
108
+ deduction: { perMissing, cap, total },
109
+ };
110
+ }
111
+ function unresolvedReasonFor(err) {
112
+ if (err instanceof PackageSurfaceResolverError) {
113
+ return err.reason;
114
+ }
115
+ // Anything else from the resolver is treated as a parse failure —
116
+ // fail-loud, never-deduct.
117
+ return "parse-failed";
118
+ }
@@ -0,0 +1,51 @@
1
+ /**
2
+ * emit-symbol-preflight — turns a Promptfoo results file into per-test
3
+ * `symbolPreflight` artifact emissions, one per (run, mode, task, model).
4
+ *
5
+ * Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
6
+ * of `RunEvalStep`. For every test row in the results file we:
7
+ *
8
+ * 1. Pull the candidate's response text out of `result.response.output`.
9
+ * 2. Run `computePreflight` against the framework-level package-surface
10
+ * manifest using the wired `PackageSurfaceResolver` from the
11
+ * `AppContext`.
12
+ * 3. Emit the report through the artifact writer at axes
13
+ * `(run, mode, task, model)` — same axes as `rawResults`.
14
+ * 4. Attach the report onto `state.preflightReports` keyed by the
15
+ * same axes so the scoring step (Phase 5) can read it without a
16
+ * second filesystem hop.
17
+ *
18
+ * Non-blocking: a missing resolver, missing manifest, missing response,
19
+ * or per-row exception logs a warning and continues. The deterministic
20
+ * lane is additive; if any of its inputs are missing the LLM rubric
21
+ * still scores the candidate normally.
22
+ */
23
+ import { type ArtifactRef, type ArtifactWriter, type PackageSurfaceConfig, type PackageSurfaceResolver, type RunId, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
24
+ /**
25
+ * Per-row preflight key. Mirrors the axis set the writer uses for
26
+ * `symbolPreflight` so the scoring step can look up reports without
27
+ * re-deriving them from disk.
28
+ */
29
+ export interface SymbolPreflightKey {
30
+ run: RunId;
31
+ mode: string;
32
+ task: string;
33
+ model: string;
34
+ }
35
+ /** Map a per-row preflight key to a stable string for in-memory lookup. */
36
+ export declare function preflightKey(key: SymbolPreflightKey): string;
37
+ export interface EmitSymbolPreflightInput {
38
+ writer: ArtifactWriter;
39
+ ctx: {
40
+ runId: RunId;
41
+ };
42
+ mode: string;
43
+ resultsPath: string;
44
+ packageSurface: PackageSurfaceConfig | undefined;
45
+ resolver: PackageSurfaceResolver | undefined;
46
+ }
47
+ export interface EmitSymbolPreflightOutput {
48
+ reports: Map<string, SymbolPreflightReport>;
49
+ refs: readonly (ArtifactRef | null)[];
50
+ }
51
+ export declare function emitSymbolPreflight(input: EmitSymbolPreflightInput): Promise<EmitSymbolPreflightOutput>;
@@ -0,0 +1,102 @@
1
+ /**
2
+ * emit-symbol-preflight — turns a Promptfoo results file into per-test
3
+ * `symbolPreflight` artifact emissions, one per (run, mode, task, model).
4
+ *
5
+ * Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
6
+ * of `RunEvalStep`. For every test row in the results file we:
7
+ *
8
+ * 1. Pull the candidate's response text out of `result.response.output`.
9
+ * 2. Run `computePreflight` against the framework-level package-surface
10
+ * manifest using the wired `PackageSurfaceResolver` from the
11
+ * `AppContext`.
12
+ * 3. Emit the report through the artifact writer at axes
13
+ * `(run, mode, task, model)` — same axes as `rawResults`.
14
+ * 4. Attach the report onto `state.preflightReports` keyed by the
15
+ * same axes so the scoring step (Phase 5) can read it without a
16
+ * second filesystem hop.
17
+ *
18
+ * Non-blocking: a missing resolver, missing manifest, missing response,
19
+ * or per-row exception logs a warning and continues. The deterministic
20
+ * lane is additive; if any of its inputs are missing the LLM rubric
21
+ * still scores the candidate normally.
22
+ */
23
+ import { readFileSync } from "node:fs";
24
+ import { resolveVariantMode, } from "../../_vendor/ailf-core/index.js";
25
+ import { computePreflight } from "./compute-preflight.js";
26
+ /** Map a per-row preflight key to a stable string for in-memory lookup. */
27
+ export function preflightKey(key) {
28
+ return `${key.run}/${key.mode}/${key.task}/${key.model}`;
29
+ }
30
+ export async function emitSymbolPreflight(input) {
31
+ const reports = new Map();
32
+ const refs = [];
33
+ if (!input.packageSurface || !input.resolver) {
34
+ // The deterministic lane is additive — when its inputs aren't wired
35
+ // (test contexts, opt-out, partial rollouts) the LLM rubric still
36
+ // grades the candidate. Stay silent.
37
+ return { reports, refs };
38
+ }
39
+ if (input.packageSurface.packages.length === 0) {
40
+ return { reports, refs };
41
+ }
42
+ let raw;
43
+ try {
44
+ raw = JSON.parse(readFileSync(input.resultsPath, "utf-8"));
45
+ }
46
+ catch (err) {
47
+ const message = err instanceof Error ? err.message : String(err);
48
+ console.warn(` ⚠️ emitSymbolPreflight: failed to read ${input.resultsPath} — ${message}`);
49
+ return { reports, refs };
50
+ }
51
+ const wrapper = raw.results && "results" in raw.results
52
+ ? raw.results
53
+ : raw;
54
+ const rows = wrapper?.results ?? [];
55
+ if (rows.length === 0)
56
+ return { reports, refs };
57
+ // Track per-(task, model) to dedupe — Promptfoo emits multiple rows
58
+ // for the same candidate when there are multiple assertions, but
59
+ // the preflight only depends on the candidate's text, not the
60
+ // assertion outcome. One report per (task, model) suffices.
61
+ const seen = new Set();
62
+ const emits = [];
63
+ for (let i = 0; i < rows.length; i++) {
64
+ const row = rows[i];
65
+ const rawTaskId = row.testCase?.description ?? "unknown-task";
66
+ const modelId = row.provider?.id ?? row.provider?.label ?? "unknown-model";
67
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, input.mode);
68
+ const baseAssoc = {
69
+ run: input.ctx.runId,
70
+ mode: axisMode,
71
+ task: axisTask,
72
+ model: modelId,
73
+ };
74
+ const key = preflightKey(baseAssoc);
75
+ if (seen.has(key))
76
+ continue;
77
+ seen.add(key);
78
+ const output = row.response?.output;
79
+ if (typeof output !== "string" || output.length === 0)
80
+ continue;
81
+ let report;
82
+ try {
83
+ report = await computePreflight({
84
+ code: output,
85
+ candidate: { taskId: axisTask, testIndex: i },
86
+ packageSurface: input.packageSurface,
87
+ resolver: input.resolver,
88
+ });
89
+ }
90
+ catch (err) {
91
+ const message = err instanceof Error ? err.message : String(err);
92
+ console.warn(` ⚠️ emitSymbolPreflight: computePreflight threw for ${key} — ${message}`);
93
+ continue;
94
+ }
95
+ reports.set(key, report);
96
+ emits.push(input.writer.emit("symbolPreflight", baseAssoc, report));
97
+ }
98
+ const settled = await Promise.all(emits);
99
+ for (const ref of settled)
100
+ refs.push(ref);
101
+ return { reports, refs };
102
+ }
@@ -0,0 +1,14 @@
1
+ /**
2
+ * load-package-surface — read the framework-level package-surface
3
+ * manifest (`config/package-surface.ts`) authored via
4
+ * `definePackageSurface()`.
5
+ *
6
+ * Returns `undefined` when the file is absent so the W0198 preflight
7
+ * step can no-op cleanly during the staged rollout. The eval package
8
+ * itself ships a manifest under `config/package-surface.ts` (Phase 0),
9
+ * so the live pipeline always finds one; the optional return path
10
+ * exists for downstream / external callers that may not have authored
11
+ * one yet.
12
+ */
13
+ import type { PackageSurfaceConfig } from "../../_vendor/ailf-core/index.d.ts";
14
+ export declare function loadPackageSurface(rootDir: string): Promise<PackageSurfaceConfig | undefined>;
@@ -0,0 +1,19 @@
1
+ /**
2
+ * load-package-surface — read the framework-level package-surface
3
+ * manifest (`config/package-surface.ts`) authored via
4
+ * `definePackageSurface()`.
5
+ *
6
+ * Returns `undefined` when the file is absent so the W0198 preflight
7
+ * step can no-op cleanly during the staged rollout. The eval package
8
+ * itself ships a manifest under `config/package-surface.ts` (Phase 0),
9
+ * so the live pipeline always finds one; the optional return path
10
+ * exists for downstream / external callers that may not have authored
11
+ * one yet.
12
+ */
13
+ import { tryLoadConfigFile } from "../compiler/config-loader.js";
14
/**
 * Load the framework-level package-surface manifest.
 *
 * Delegates the lookup to `tryLoadConfigFile`, asking for the config named
 * `"package-surface"` and passing `rootDir` through unchanged.
 *
 * @param rootDir - Directory forwarded to `tryLoadConfigFile` as its second
 *   argument.
 * @returns A promise for the `data` field of the loader result, or
 *   `undefined` when the loader hands back a falsy result.
 */
export async function loadPackageSurface(rootDir) {
    const loaded = tryLoadConfigFile("package-surface", rootDir);
    // A falsy loader result means there is nothing to return; callers treat
    // `undefined` as "no manifest".
    return loaded ? loaded.data : undefined;
}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * load-preflight-context — read the framework-level package-surface
3
+ * manifest and project it down to the rubric-side context shape.
4
+ *
5
+ * Returns `undefined` when the manifest is absent or empty so callers
6
+ * collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
7
+ * `loadPackageSurface` and `loadPreflightScoring` — one loader per
8
+ * lazily-read W0198 input, all in `pipeline/preflight/`.
9
+ *
10
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
11
+ */
12
+ import type { Logger, PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
13
+ export declare function loadPreflightContext(rootDir: string, logger?: Pick<Logger, "warn">): Promise<PreflightRubricContext | undefined>;
@@ -0,0 +1,25 @@
1
+ /**
2
+ * load-preflight-context — read the framework-level package-surface
3
+ * manifest and project it down to the rubric-side context shape.
4
+ *
5
+ * Returns `undefined` when the manifest is absent or empty so callers
6
+ * collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
7
+ * `loadPackageSurface` and `loadPreflightScoring` — one loader per
8
+ * lazily-read W0198 input, all in `pipeline/preflight/`.
9
+ *
10
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
11
+ */
12
+ import { loadPackageSurface } from "./load-package-surface.js";
13
/**
 * Build the rubric-side preflight context from the package-surface manifest.
 *
 * The manifest is read through `loadPackageSurface(rootDir)` and reduced to
 * the list of each entry's `pkg` value.
 *
 * @param rootDir - Directory forwarded to `loadPackageSurface`.
 * @param logger - Optional logger; only its `warn` method is used, and only
 *   when loading or projecting the manifest throws.
 * @returns `{ packages }` holding every entry's `pkg`, or `undefined` when
 *   the manifest is missing, has zero packages, or an error was caught.
 */
export async function loadPreflightContext(rootDir, logger) {
    try {
        const surface = await loadPackageSurface(rootDir);
        if (surface && surface.packages.length !== 0) {
            const names = surface.packages.map((entry) => entry.pkg);
            return { packages: names };
        }
        // Absent or empty manifest: no context to hand to the rubric.
        return undefined;
    }
    catch (cause) {
        // Best-effort loader: report the failure and fall back to "no context"
        // instead of propagating the error to the caller.
        let detail;
        if (cause instanceof Error) {
            detail = cause.message;
        }
        else {
            detail = String(cause);
        }
        logger?.warn(`[warn] W0198 preflight: failed to load package-surface manifest — ${detail}`);
        return undefined;
    }
}