@sanity/ailf 4.2.0 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/package-surface.ts +37 -0
- package/config/preflight-scoring.ts +26 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
- package/dist/_vendor/ailf-core/config-helpers.js +67 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
- package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
- package/dist/adapters/index.d.ts +1 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
- package/dist/adapters/package-surface/dts-package-surface.js +173 -0
- package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
- package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
- package/dist/adapters/package-surface/index.d.ts +9 -0
- package/dist/adapters/package-surface/index.js +8 -0
- package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
- package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
- package/dist/adapters/task-sources/repo-schemas.js +15 -0
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +12 -0
- package/dist/commands/remote-pipeline.js +10 -2
- package/dist/commands/remote-results.d.ts +12 -1
- package/dist/commands/remote-results.js +25 -5
- package/dist/composition-root.js +9 -0
- package/dist/config/package-surface.ts +37 -0
- package/dist/config/preflight-scoring.ts +26 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
- package/dist/orchestration/pipeline-orchestrator.js +38 -0
- package/dist/orchestration/steps/calculate-scores-step.js +11 -0
- package/dist/orchestration/steps/generate-configs-step.js +16 -1
- package/dist/orchestration/steps/run-eval-step.js +27 -0
- package/dist/pipeline/calculate-scores.d.ts +66 -5
- package/dist/pipeline/calculate-scores.js +141 -27
- package/dist/pipeline/compiler/index.d.ts +1 -1
- package/dist/pipeline/compiler/index.js +1 -1
- package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
- package/dist/pipeline/compiler/literacy-bridge.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
- package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
- package/dist/pipeline/compiler/rubric-resolution.js +78 -2
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
- package/dist/pipeline/compiler/scoring-bridge.js +104 -10
- package/dist/pipeline/eval-fingerprint.d.ts +9 -0
- package/dist/pipeline/eval-fingerprint.js +7 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
- package/dist/pipeline/preflight/compute-preflight.js +118 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
- package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
- package/dist/pipeline/preflight/load-package-surface.js +19 -0
- package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
- package/dist/pipeline/preflight/load-preflight-context.js +25 -0
- package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
- package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
- package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
- package/dist/pipeline/preflight/parse-imports.js +125 -0
- package/dist/report-store.d.ts +8 -0
- package/dist/report-store.js +55 -6
- package/dist/sanity/document-renderers.d.ts +45 -7
- package/dist/sanity/document-renderers.js +99 -13
- package/dist/sanity/queries.d.ts +11 -11
- package/dist/sanity/queries.js +7 -0
- package/dist/sanity/symbol-index.d.ts +98 -0
- package/dist/sanity/symbol-index.js +615 -0
- package/package.json +2 -1
|
@@ -16,11 +16,19 @@
|
|
|
16
16
|
* engine works in [0, 1]; this module handles the conversion at
|
|
17
17
|
* boundaries.
|
|
18
18
|
*
|
|
19
|
+
* W0198 Phase 5 — when a `preflightForTest` callback is provided and
|
|
20
|
+
* returns a `SymbolPreflightReport`, the bridge synthesizes one extra
|
|
21
|
+
* `AssertionScore` per test in the `code-correctness` dimension. The
|
|
22
|
+
* deterministic preflight and the LLM rubric merge through D0010's
|
|
23
|
+
* weighted dimension aggregation; the relative share is set by
|
|
24
|
+
* `preflightWeight` in `[0, 1]`.
|
|
25
|
+
*
|
|
19
26
|
* @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
|
|
20
27
|
* @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
|
|
21
28
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
29
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 design
|
|
22
30
|
*/
|
|
23
|
-
import { aggregateDimensions, computeTaskScore, normalizeScore, } from "../../_vendor/ailf-core/index.js";
|
|
31
|
+
import { aggregateDimensions, computeTaskScore, DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, normalizeScore, } from "../../_vendor/ailf-core/index.js";
|
|
24
32
|
import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.js";
|
|
25
33
|
// ---------------------------------------------------------------------------
|
|
26
34
|
// Public API
|
|
@@ -36,28 +44,53 @@ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.
|
|
|
36
44
|
* @param profile Weight profile mapping kebab-case dimension names to weights
|
|
37
45
|
* (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
|
|
38
46
|
* @param taskId Optional task identifier for traceability in TaskScore output
|
|
47
|
+
* @param options Optional W0198 Phase 5 preflight integration
|
|
39
48
|
* @returns Dimensions (0–100) and composite (0–100), matching legacy output format
|
|
40
49
|
*/
|
|
41
|
-
export function scoreTestGroup(tests, profile, taskId) {
|
|
50
|
+
export function scoreTestGroup(tests, profile, taskId, options) {
|
|
42
51
|
let totalCost = 0;
|
|
52
|
+
const preflightForTest = options?.preflightForTest;
|
|
53
|
+
const preflightWeight = clampWeight(options?.preflightWeight ?? DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT);
|
|
54
|
+
const preflightActive = typeof preflightForTest === "function" && preflightWeight > 0;
|
|
43
55
|
// Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
|
|
44
56
|
//
|
|
45
|
-
//
|
|
57
|
+
// Three assertion sources contribute to scoring:
|
|
46
58
|
// - llm-rubric: dimension from metadata, score from grader (0–100 → [0,1])
|
|
47
59
|
// - javascript: mapped to "assertion-pass-rate" dimension (pass=1, fail=0)
|
|
60
|
+
// - preflight (W0198): synthesized per test from SymbolPreflightReport,
|
|
61
|
+
// dimension "code-correctness", weight = preflightWeight.
|
|
62
|
+
//
|
|
63
|
+
// Rubric weight reduction is per-test, not global: a test's
|
|
64
|
+
// `code-correctness` rubric assertion only drops to `1 - preflightWeight`
|
|
65
|
+
// when preflight actually contributes a paired finding for that test.
|
|
66
|
+
// Without this gate, tests with no preflight coverage would have their
|
|
67
|
+
// rubric authority silently downweighted with nothing to compensate, so
|
|
68
|
+
// partial-coverage runs would systematically bias the dimension toward
|
|
69
|
+
// tests that DO have preflight data.
|
|
48
70
|
//
|
|
49
71
|
// Other types (cost, trajectory, contains, etc.) are metadata or guards —
|
|
50
72
|
// they don't produce dimension scores.
|
|
51
73
|
const assertionScores = [];
|
|
52
74
|
for (const test of tests) {
|
|
53
75
|
totalCost += test.cost;
|
|
76
|
+
const report = preflightActive ? preflightForTest(test) : undefined;
|
|
77
|
+
const ccRubricWeight = report ? 1 - preflightWeight : 1;
|
|
54
78
|
for (const comp of test.gradingResult.componentResults) {
|
|
55
|
-
const converted = componentToScore(comp);
|
|
79
|
+
const converted = componentToScore(comp, ccRubricWeight);
|
|
56
80
|
if (converted)
|
|
57
81
|
assertionScores.push(converted);
|
|
58
82
|
}
|
|
83
|
+
if (report) {
|
|
84
|
+
assertionScores.push(preflightToScore(report, preflightWeight));
|
|
85
|
+
}
|
|
59
86
|
}
|
|
60
|
-
// Step 2: Aggregate into DimensionScores (0–1 scale)
|
|
87
|
+
// Step 2: Aggregate into DimensionScores (0–1 scale).
|
|
88
|
+
//
|
|
89
|
+
// Use `weighted-mean` so the W0198 preflight / rubric weights inside
|
|
90
|
+
// `code-correctness` are honored. With everything at weight=1.0 the
|
|
91
|
+
// result is identical to plain `mean`, so behavior outside the
|
|
92
|
+
// code-correctness merge is unchanged. See `aggregateScores` in
|
|
93
|
+
// `packages/core/src/services/scoring-engine.ts` for the equivalence.
|
|
61
94
|
const dimensionLabels = {
|
|
62
95
|
"assertion-pass-rate": "Assertion Pass Rate",
|
|
63
96
|
"code-correctness": "Code Correctness",
|
|
@@ -65,7 +98,7 @@ export function scoreTestGroup(tests, profile, taskId) {
|
|
|
65
98
|
"task-completion": "Task Completion",
|
|
66
99
|
};
|
|
67
100
|
const rawDimensions = aggregateDimensions(assertionScores, {
|
|
68
|
-
defaultAggregation: "mean",
|
|
101
|
+
defaultAggregation: "weighted-mean",
|
|
69
102
|
dimensionLabels,
|
|
70
103
|
});
|
|
71
104
|
// Step 3: Compute weighted composite via TaskScore (0–1 scale)
|
|
@@ -102,10 +135,10 @@ export function scoreTestGroup(tests, profile, taskId) {
|
|
|
102
135
|
* This replaces the previous llm-rubric-only filter that caused agent-harness
|
|
103
136
|
* javascript assertions to be invisible to the scoring engine (DOC-2029).
|
|
104
137
|
*/
|
|
105
|
-
function componentToScore(comp) {
|
|
138
|
+
function componentToScore(comp, rubricCodeCorrectnessWeight) {
|
|
106
139
|
const type = comp.assertion?.type;
|
|
107
140
|
if (type === "llm-rubric") {
|
|
108
|
-
return llmRubricToScore(comp);
|
|
141
|
+
return llmRubricToScore(comp, rubricCodeCorrectnessWeight);
|
|
109
142
|
}
|
|
110
143
|
if (type === "javascript") {
|
|
111
144
|
return javascriptAssertionToScore(comp);
|
|
@@ -118,14 +151,19 @@ function componentToScore(comp) {
|
|
|
118
151
|
*
|
|
119
152
|
* The dimension comes from metadata (set during rubric template resolution).
|
|
120
153
|
* Returns null if the component doesn't map to any dimension.
|
|
154
|
+
*
|
|
155
|
+
* For the `code-correctness` dimension specifically, the assertion's
|
|
156
|
+
* weight is reduced when W0198's deterministic preflight is also feeding
|
|
157
|
+
* the same dimension; the complementary share belongs to the preflight.
|
|
121
158
|
*/
|
|
122
|
-
function llmRubricToScore(comp) {
|
|
159
|
+
function llmRubricToScore(comp, rubricCodeCorrectnessWeight) {
|
|
123
160
|
const dim = classifyRubric(comp);
|
|
124
161
|
if (!dim)
|
|
125
162
|
return null;
|
|
126
163
|
// Parse the raw score (0–100 from the grader) and normalize to [0, 1]
|
|
127
164
|
const rawScore = parseRubricScore(comp);
|
|
128
165
|
const normalized = normalizeScore(rawScore, "llm-rubric");
|
|
166
|
+
const weight = dim === "code-correctness" ? rubricCodeCorrectnessWeight : 1.0;
|
|
129
167
|
return {
|
|
130
168
|
assertionType: comp.assertion?.type ?? "llm-rubric",
|
|
131
169
|
dimension: dim,
|
|
@@ -133,7 +171,7 @@ function llmRubricToScore(comp) {
|
|
|
133
171
|
pass: comp.pass,
|
|
134
172
|
reason: comp.reason ?? "",
|
|
135
173
|
score: normalized,
|
|
136
|
-
weight: 1.0,
|
|
174
|
+
weight,
|
|
137
175
|
};
|
|
138
176
|
}
|
|
139
177
|
/**
|
|
@@ -160,6 +198,62 @@ function javascriptAssertionToScore(comp) {
|
|
|
160
198
|
weight: 1.0,
|
|
161
199
|
};
|
|
162
200
|
}
|
|
201
|
+
/**
 * Synthesize a `code-correctness` AssertionScore from a W0198 symbol-
 * preflight report.
 *
 * The score is `1 - clamp(total / cap, 0, 1)`. With the default
 * `{ perMissing: 20, cap: 60 }` config: 0 missing → 1.0, 1 missing → 0.667,
 * 2 missing → 0.333, ≥3 missing → 0.0.
 *
 * Edge cases for the deduction config — every one of them resolves to
 * "no deduction" (score 1.0, `pass: true`) so the never-deduct invariant
 * holds even for malformed reports:
 *   - `cap === 0` (measurement-only config): dividing would yield NaN.
 *   - `cap < 0` or a NaN `cap` (misconfigured): the merge collapses to
 *     rubric-only on this dimension. The Phase 3 Zod schema gates against
 *     this upstream so it should never reach here.
 *   - negative or non-finite `total`: without the clamp the score would
 *     exceed 1.0 or become NaN and corrupt the weighted dimension mean.
 *
 * `pass` is derived from the score (true only when nothing was deducted)
 * so the two fields can never disagree.
 *
 * `unresolved` findings never deduct (the preflight's never-deduct rule):
 * they're not part of `total`, so they fall through to the LLM rubric.
 *
 * @param report Symbol-preflight report; reads `deduction.{perMissing, cap, total}`
 *               and `findings[].result`.
 * @param weight Weight attached to the synthesized assertion, passed through as-is.
 * @returns An assertion score in the `code-correctness` dimension with `score` in [0, 1].
 */
export function preflightToScore(report, weight) {
    const { perMissing, cap, total } = report.deduction;
    // Fraction of the cap that was deducted, forced into [0, 1]. Any config
    // that cannot express a meaningful fraction counts as "nothing deducted".
    const deductible = cap > 0 && Number.isFinite(total);
    const deductedShare = deductible ? Math.min(1, Math.max(0, total / cap)) : 0;
    const score = 1 - deductedShare;
    const counts = countLanes(report);
    return {
        assertionType: "preflight",
        dimension: "code-correctness",
        latencyMs: 0,
        pass: score === 1,
        reason: `preflight: ${counts.exists} exists, ${counts.missing} missing, ${counts.unresolved} unresolved (deduction ${total}/${cap}, ${perMissing} per missing)`,
        score,
        weight,
    };
}
/**
 * Tally a report's findings by `result`. Anything that is neither
 * `"exists"` nor `"missing"` is counted as unresolved.
 */
function countLanes(report) {
    let exists = 0;
    let missing = 0;
    let unresolved = 0;
    for (const f of report.findings) {
        if (f.result === "exists")
            exists++;
        else if (f.result === "missing")
            missing++;
        else
            unresolved++;
    }
    return { exists, missing, unresolved };
}
|
|
248
|
+
/**
 * Constrain a preflight weight to the closed interval [0, 1].
 *
 * Non-finite input (NaN, ±Infinity, or anything that is not a number) and
 * negative values map to 0; values above 1 map to 1; everything else is
 * returned unchanged.
 */
function clampWeight(w) {
    const unusable = !Number.isFinite(w) || w < 0;
    if (unusable) {
        return 0;
    }
    return w > 1 ? 1 : w;
}
|
|
163
257
|
/** Convert kebab-case dimension key to camelCase (e.g., "task-completion" → "taskCompletion") */
|
|
164
258
|
function kebabToCamel(kebab) {
|
|
165
259
|
return kebab.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
|
|
@@ -41,6 +41,15 @@ export interface FingerprintInput {
|
|
|
41
41
|
mode: EvalMode;
|
|
42
42
|
/** Path to the packages/eval root directory */
|
|
43
43
|
rootDir: string;
|
|
44
|
+
/**
|
|
45
|
+
* Grader context policy. Distinct values produce distinct rubricPrompt
|
|
46
|
+
* content, so the cache must treat them as different evaluations even
|
|
47
|
+
* when tasks + docs + grader model match.
|
|
48
|
+
*
|
|
49
|
+
* Defaults to "rubric-only" inside the hash when undefined, matching
|
|
50
|
+
* the EvalConfig boundary default.
|
|
51
|
+
*/
|
|
52
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
44
53
|
}
|
|
45
54
|
/**
|
|
46
55
|
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
@@ -38,8 +38,12 @@ import { join, relative, resolve } from "path";
|
|
|
38
38
|
* v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
|
|
39
39
|
* files), file paths normalized to rootDir-relative, grader passed
|
|
40
40
|
* through verbatim instead of the literal string "default".
|
|
41
|
+
*
|
|
42
|
+
* v3 (2026-05-06): grader-context policy ("rubric-only" vs "with-docs")
|
|
43
|
+
* affects rubricPrompt content and therefore eval output, so it must be
|
|
44
|
+
* hashed. Bumping invalidates v2 fingerprints.
|
|
41
45
|
*/
|
|
42
|
-
const FINGERPRINT_VERSION = "eval-fingerprint-v2";
|
|
46
|
+
const FINGERPRINT_VERSION = "eval-fingerprint-v3";
|
|
43
47
|
/**
|
|
44
48
|
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
45
49
|
*
|
|
@@ -52,10 +56,12 @@ const FINGERPRINT_VERSION = "eval-fingerprint-v2";
|
|
|
52
56
|
*/
|
|
53
57
|
export function computeEvalFingerprint(input) {
|
|
54
58
|
const { graderModel, mode, rootDir, tasks } = input;
|
|
59
|
+
const graderContext = input.graderContext ?? "rubric-only";
|
|
55
60
|
const hash = createHash("sha256");
|
|
56
61
|
hash.update(`version:${FINGERPRINT_VERSION}\n`);
|
|
57
62
|
hash.update(`mode:${mode}\n`);
|
|
58
63
|
hash.update(`grader:${graderModel}\n`);
|
|
64
|
+
hash.update(`graderContext:${graderContext}\n`);
|
|
59
65
|
hash.update(`tasks:${hashTaskSet(tasks)}\n`);
|
|
60
66
|
// Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
|
|
61
67
|
// so a CI runner at /home/runner/... and a laptop at /Users/... produce
|
|
@@ -52,6 +52,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
52
52
|
noAutoScope: request.noAutoScope ?? false,
|
|
53
53
|
noCache: request.noCache ?? false,
|
|
54
54
|
noRemoteCache: request.noRemoteCache ?? false,
|
|
55
|
+
graderContext: request.graderContext,
|
|
55
56
|
graderReplications: request.graderReplications,
|
|
56
57
|
urls: request.urls,
|
|
57
58
|
headers: request.headers,
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* compute-preflight — pure function that turns a candidate's code +
|
|
3
|
+
* a `PackageSurfaceResolver` + the framework-level package-surface
|
|
4
|
+
* manifest into a `SymbolPreflightReport`.
|
|
5
|
+
*
|
|
6
|
+
* Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
|
|
7
|
+
* from package Y" entirely out of LLM judgment. The LLM rubric runs
|
|
8
|
+
* after this and is told the preflight's findings as ground truth.
|
|
9
|
+
*
|
|
10
|
+
* The function is pure (no I/O beyond the resolver's): it parses the
|
|
11
|
+
* candidate's imports, asks the resolver about each in-scope package,
|
|
12
|
+
* and emits one finding per imported binding.
|
|
13
|
+
*
|
|
14
|
+
* Per-binding decision tree:
|
|
15
|
+
*
|
|
16
|
+
* 1. Drop the binding if it isn't a `named` import. Default,
|
|
17
|
+
* namespace, and side-effect imports are intentionally not
|
|
18
|
+
* checked — the package surface only includes named exports
|
|
19
|
+
* (per the design's "named bindings only" rule), so default /
|
|
20
|
+
* namespace imports cannot be answered against it without
|
|
21
|
+
* false-deducting legitimate code.
|
|
22
|
+
*
|
|
23
|
+
* 2. Drop the binding if its `source` package is not in the
|
|
24
|
+
* framework-level manifest. Out-of-scope packages don't get
|
|
25
|
+
* findings — they are silently passed through to the LLM rubric.
|
|
26
|
+
*
|
|
27
|
+
* 3. Resolve the package surface. If the resolver throws a typed
|
|
28
|
+
* `PackageSurfaceResolverError`, every binding from that package
|
|
29
|
+
* becomes `unresolved` with the matching reason. **Never deduct.**
|
|
30
|
+
*
|
|
31
|
+
* 4. If the binding is in the surface, emit `exists` (no deduction).
|
|
32
|
+
*
|
|
33
|
+
* 5. Otherwise, emit `missing` (deterministic deduction).
|
|
34
|
+
*
|
|
35
|
+
* Deduction is `total = min(missing_count * perMissing, cap)`. The
|
|
36
|
+
* scoring bridge (Phase 5) computes the per-dimension score from this
|
|
37
|
+
* report; this function stays a pure data factory.
|
|
38
|
+
*/
|
|
39
|
+
import { type PackageSurfaceConfig, type PackageSurfaceResolver, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
|
|
40
|
+
/** Default deduction config — `−20 per missing, capped at 60`. */
|
|
41
|
+
export declare const DEFAULT_DEDUCTION: {
|
|
42
|
+
readonly perMissing: 20;
|
|
43
|
+
readonly cap: 60;
|
|
44
|
+
};
|
|
45
|
+
export interface ComputePreflightInput {
|
|
46
|
+
/** Raw candidate output. Typically the contents of a single TS/TSX/JS code block. */
|
|
47
|
+
readonly code: string;
|
|
48
|
+
/** Identity for this candidate, recorded in the report's `candidate` field. */
|
|
49
|
+
readonly candidate: {
|
|
50
|
+
readonly taskId: string;
|
|
51
|
+
readonly testIndex: number;
|
|
52
|
+
};
|
|
53
|
+
/** Framework-level package-surface manifest (Phase 0 / `definePackageSurface`). */
|
|
54
|
+
readonly packageSurface: PackageSurfaceConfig;
|
|
55
|
+
/** Resolver used to fetch each in-scope package's surface (Phase 1). */
|
|
56
|
+
readonly resolver: PackageSurfaceResolver;
|
|
57
|
+
/**
|
|
58
|
+
* Deduction config. Defaults to `DEFAULT_DEDUCTION`. Pass
|
|
59
|
+
* `{ perMissing: 0, cap: 0 }` to compute findings without deduction
|
|
60
|
+
* (e.g. for measurement-only runs).
|
|
61
|
+
*/
|
|
62
|
+
readonly deduction?: {
|
|
63
|
+
readonly perMissing?: number;
|
|
64
|
+
readonly cap?: number;
|
|
65
|
+
};
|
|
66
|
+
}
|
|
67
|
+
export declare function computePreflight(input: ComputePreflightInput): Promise<SymbolPreflightReport>;
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* compute-preflight — pure function that turns a candidate's code +
|
|
3
|
+
* a `PackageSurfaceResolver` + the framework-level package-surface
|
|
4
|
+
* manifest into a `SymbolPreflightReport`.
|
|
5
|
+
*
|
|
6
|
+
* Stage 1 of the W0198 two-stage grader: lifts "does symbol X export
|
|
7
|
+
* from package Y" entirely out of LLM judgment. The LLM rubric runs
|
|
8
|
+
* after this and is told the preflight's findings as ground truth.
|
|
9
|
+
*
|
|
10
|
+
* The function is pure (no I/O beyond the resolver's): it parses the
|
|
11
|
+
* candidate's imports, asks the resolver about each in-scope package,
|
|
12
|
+
* and emits one finding per imported binding.
|
|
13
|
+
*
|
|
14
|
+
* Per-binding decision tree:
|
|
15
|
+
*
|
|
16
|
+
* 1. Drop the binding if it isn't a `named` import. Default,
|
|
17
|
+
* namespace, and side-effect imports are intentionally not
|
|
18
|
+
* checked — the package surface only includes named exports
|
|
19
|
+
* (per the design's "named bindings only" rule), so default /
|
|
20
|
+
* namespace imports cannot be answered against it without
|
|
21
|
+
* false-deducting legitimate code.
|
|
22
|
+
*
|
|
23
|
+
* 2. Drop the binding if its `source` package is not in the
|
|
24
|
+
* framework-level manifest. Out-of-scope packages don't get
|
|
25
|
+
* findings — they are silently passed through to the LLM rubric.
|
|
26
|
+
*
|
|
27
|
+
* 3. Resolve the package surface. If the resolver throws a typed
|
|
28
|
+
* `PackageSurfaceResolverError`, every binding from that package
|
|
29
|
+
* becomes `unresolved` with the matching reason. **Never deduct.**
|
|
30
|
+
*
|
|
31
|
+
* 4. If the binding is in the surface, emit `exists` (no deduction).
|
|
32
|
+
*
|
|
33
|
+
* 5. Otherwise, emit `missing` (deterministic deduction).
|
|
34
|
+
*
|
|
35
|
+
* Deduction is `total = min(missing_count * perMissing, cap)`. The
|
|
36
|
+
* scoring bridge (Phase 5) computes the per-dimension score from this
|
|
37
|
+
* report; this function stays a pure data factory.
|
|
38
|
+
*/
|
|
39
|
+
import { PackageSurfaceResolverError, } from "../../_vendor/ailf-core/index.js";
|
|
40
|
+
import { parseImports } from "./parse-imports.js";
|
|
41
|
+
/** Default deduction config — `−20 per missing, capped at 60`. */
export const DEFAULT_DEDUCTION = {
    perMissing: 20,
    cap: 60,
};
/**
 * Build a symbol-preflight report for one candidate.
 *
 * Parses the candidate's imports, keeps only `named` bindings whose
 * source package appears in `input.packageSurface.packages`, resolves each
 * such package once through `input.resolver.resolveExports`, and emits one
 * finding per distinct (package, binding) pair:
 *   - `unresolved` for every binding of a package whose resolver call threw,
 *   - `exists` when the resolved surface lists the binding's name,
 *   - `missing` otherwise.
 *
 * The report's `deduction.total` is `min(missingCount * perMissing, cap)`,
 * floored at 0 so a misconfigured negative `perMissing` / `cap` can never
 * turn the deduction into a bonus.
 *
 * @param input Candidate code and identity, the package-surface manifest,
 *              the resolver, and an optional deduction override (falls
 *              back to `DEFAULT_DEDUCTION` field by field).
 * @returns A promise of `{ candidate, findings, deduction }`.
 */
export async function computePreflight(input) {
    const perMissing = input.deduction?.perMissing ?? DEFAULT_DEDUCTION.perMissing;
    const cap = input.deduction?.cap ?? DEFAULT_DEDUCTION.cap;
    const inScope = new Set(input.packageSurface.packages.map((p) => p.pkg));
    // Bucket named imports by package so we resolve each surface at most once.
    // A Set per package drops repeated bindings while keeping first-seen order.
    const namedBySource = new Map();
    for (const binding of parseImports(input.code)) {
        if (binding.kind !== "named" || !inScope.has(binding.source))
            continue;
        let names = namedBySource.get(binding.source);
        if (!names) {
            names = new Set();
            namedBySource.set(binding.source, names);
        }
        names.add(binding.imported);
    }
    const findings = [];
    let missingCount = 0;
    for (const [pkg, names] of namedBySource) {
        let surface;
        try {
            surface = await input.resolver.resolveExports(pkg);
        }
        catch (err) {
            // Resolver failure: every binding of this package is unresolved
            // and therefore never contributes to the deduction.
            const reason = unresolvedReasonFor(err);
            for (const binding of names) {
                findings.push({ result: "unresolved", pkg, binding, reason });
            }
            continue;
        }
        const surfaceNames = new Map();
        for (const sym of surface.symbols)
            surfaceNames.set(sym.name, sym.source);
        for (const binding of names) {
            // Decide existence by key presence, not by the truthiness of the
            // recorded `source`: a listed symbol with an empty or absent
            // source is still exported and must not be deducted as missing.
            if (surfaceNames.has(binding)) {
                findings.push({
                    result: "exists",
                    pkg,
                    version: surface.version,
                    binding,
                    source: surfaceNames.get(binding),
                });
            }
            else {
                missingCount++;
                findings.push({
                    result: "missing",
                    pkg,
                    version: surface.version,
                    binding,
                });
            }
        }
    }
    const total = Math.max(0, Math.min(missingCount * perMissing, cap));
    return {
        candidate: { ...input.candidate },
        findings,
        deduction: { perMissing, cap, total },
    };
}
|
|
111
|
+
/**
 * Pick the `unresolved` reason for an error raised while resolving a
 * package surface: the typed error's own `reason` when it is a
 * `PackageSurfaceResolverError`, otherwise `"parse-failed"`.
 */
function unresolvedReasonFor(err) {
    // Anything that is not the typed resolver error is treated as a parse
    // failure — fail-loud, never-deduct.
    return err instanceof PackageSurfaceResolverError ? err.reason : "parse-failed";
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emit-symbol-preflight — turns a Promptfoo results file into per-test
|
|
3
|
+
* `symbolPreflight` artifact emissions, one per (run, mode, task, model).
|
|
4
|
+
*
|
|
5
|
+
* Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
|
|
6
|
+
* of `RunEvalStep`. For every test row in the results file we:
|
|
7
|
+
*
|
|
8
|
+
* 1. Pull the candidate's response text out of `result.response.output`.
|
|
9
|
+
* 2. Run `computePreflight` against the framework-level package-surface
|
|
10
|
+
* manifest using the wired `PackageSurfaceResolver` from the
|
|
11
|
+
* `AppContext`.
|
|
12
|
+
* 3. Emit the report through the artifact writer at axes
|
|
13
|
+
* `(run, mode, task, model)` — same axes as `rawResults`.
|
|
14
|
+
* 4. Attach the report onto `state.preflightReports` keyed by the
|
|
15
|
+
* same axes so the scoring step (Phase 5) can read it without a
|
|
16
|
+
* second filesystem hop.
|
|
17
|
+
*
|
|
18
|
+
* Non-blocking: a missing resolver, missing manifest, missing response,
|
|
19
|
+
* or per-row exception logs a warning and continues. The deterministic
|
|
20
|
+
* lane is additive; if any of its inputs are missing the LLM rubric
|
|
21
|
+
* still scores the candidate normally.
|
|
22
|
+
*/
|
|
23
|
+
import { type ArtifactRef, type ArtifactWriter, type PackageSurfaceConfig, type PackageSurfaceResolver, type RunId, type SymbolPreflightReport } from "../../_vendor/ailf-core/index.d.ts";
|
|
24
|
+
/**
|
|
25
|
+
* Per-row preflight key. Mirrors the axis set the writer uses for
|
|
26
|
+
* `symbolPreflight` so the scoring step can look up reports without
|
|
27
|
+
* re-deriving them from disk.
|
|
28
|
+
*/
|
|
29
|
+
export interface SymbolPreflightKey {
|
|
30
|
+
run: RunId;
|
|
31
|
+
mode: string;
|
|
32
|
+
task: string;
|
|
33
|
+
model: string;
|
|
34
|
+
}
|
|
35
|
+
/** Map a per-row preflight key to a stable string for in-memory lookup. */
|
|
36
|
+
export declare function preflightKey(key: SymbolPreflightKey): string;
|
|
37
|
+
export interface EmitSymbolPreflightInput {
|
|
38
|
+
writer: ArtifactWriter;
|
|
39
|
+
ctx: {
|
|
40
|
+
runId: RunId;
|
|
41
|
+
};
|
|
42
|
+
mode: string;
|
|
43
|
+
resultsPath: string;
|
|
44
|
+
packageSurface: PackageSurfaceConfig | undefined;
|
|
45
|
+
resolver: PackageSurfaceResolver | undefined;
|
|
46
|
+
}
|
|
47
|
+
export interface EmitSymbolPreflightOutput {
|
|
48
|
+
reports: Map<string, SymbolPreflightReport>;
|
|
49
|
+
refs: readonly (ArtifactRef | null)[];
|
|
50
|
+
}
|
|
51
|
+
export declare function emitSymbolPreflight(input: EmitSymbolPreflightInput): Promise<EmitSymbolPreflightOutput>;
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emit-symbol-preflight — turns a Promptfoo results file into per-test
|
|
3
|
+
* `symbolPreflight` artifact emissions, one per (run, mode, task, model).
|
|
4
|
+
*
|
|
5
|
+
* Sits next to `emitPerEntryEvalResults` (W0050) in the post-eval phase
|
|
6
|
+
* of `RunEvalStep`. For every test row in the results file we:
|
|
7
|
+
*
|
|
8
|
+
* 1. Pull the candidate's response text out of `result.response.output`.
|
|
9
|
+
* 2. Run `computePreflight` against the framework-level package-surface
|
|
10
|
+
* manifest using the wired `PackageSurfaceResolver` from the
|
|
11
|
+
* `AppContext`.
|
|
12
|
+
* 3. Emit the report through the artifact writer at axes
|
|
13
|
+
* `(run, mode, task, model)` — same axes as `rawResults`.
|
|
14
|
+
* 4. Attach the report onto `state.preflightReports` keyed by the
|
|
15
|
+
* same axes so the scoring step (Phase 5) can read it without a
|
|
16
|
+
* second filesystem hop.
|
|
17
|
+
*
|
|
18
|
+
* Non-blocking: a missing resolver, missing manifest, missing response,
|
|
19
|
+
* or per-row exception logs a warning and continues. The deterministic
|
|
20
|
+
* lane is additive; if any of its inputs are missing the LLM rubric
|
|
21
|
+
* still scores the candidate normally.
|
|
22
|
+
*/
|
|
23
|
+
import { readFileSync } from "node:fs";
|
|
24
|
+
import { resolveVariantMode, } from "../../_vendor/ailf-core/index.js";
|
|
25
|
+
import { computePreflight } from "./compute-preflight.js";
|
|
26
|
+
/**
 * Flatten a preflight key into the `run/mode/task/model` string used for
 * in-memory lookups of per-row reports.
 *
 * @param key - object carrying the `run`, `mode`, `task` and `model` axes.
 * @returns the four axis values joined with `/`, in that order.
 */
export function preflightKey(key) {
    const { run, mode, task, model } = key;
    return `${run}/${mode}/${task}/${model}`;
}
|
|
30
|
+
/**
 * Compute a symbol-preflight report for every unique
 * (run, mode, task, model) row in a Promptfoo results file and emit each
 * one as a `symbolPreflight` artifact.
 *
 * The lane is best-effort: nothing in here is allowed to fail the caller.
 *   - Missing `packageSurface` / `resolver`, or an empty package list,
 *     returns empty output silently.
 *   - An unreadable, unparsable, or non-object results file logs a warning
 *     and returns empty output.
 *   - Rows that are not objects or carry no non-empty string response are
 *     skipped silently.
 *   - A `computePreflight` or `writer.emit` failure logs a warning; the
 *     remaining rows are still processed.
 *
 * @param input - writer, run context, base mode, results path, plus the
 *   package-surface manifest and resolver (either may be `undefined`).
 * @returns `reports` keyed by `preflightKey(...)`, and `refs` with one entry
 *   per attempted emission (`null` when the writer resolved with `null` or
 *   the emission failed).
 */
export async function emitSymbolPreflight(input) {
    const reports = new Map();
    const refs = [];
    const isRecord = (value) => typeof value === "object" && value !== null;
    // Emit one report. The rejection handler is attached immediately so a
    // writer failure cannot surface as an unhandled rejection while later
    // rows are still being computed, and cannot reject the whole call.
    const emitReport = async (assoc, report, key) => {
        try {
            return await input.writer.emit("symbolPreflight", assoc, report);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ emitSymbolPreflight: failed to emit symbolPreflight for ${key} — ${message}`);
            return null;
        }
    };
    if (!input.packageSurface || !input.resolver) {
        // The deterministic lane is additive — when its inputs aren't wired
        // (test contexts, opt-out, partial rollouts) the LLM rubric still
        // grades the candidate. Stay silent.
        return { reports, refs };
    }
    if (input.packageSurface.packages.length === 0) {
        return { reports, refs };
    }
    let raw;
    try {
        raw = JSON.parse(readFileSync(input.resultsPath, "utf-8"));
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ emitSymbolPreflight: failed to read ${input.resultsPath} — ${message}`);
        return { reports, refs };
    }
    if (!isRecord(raw)) {
        // Valid JSON that is not an object (`null`, a number, a string) has
        // no rows to offer; treat it like an unreadable file.
        console.warn(` ⚠️ emitSymbolPreflight: unexpected results shape in ${input.resultsPath} — expected a JSON object`);
        return { reports, refs };
    }
    // Two layouts are accepted: rows directly under `results`, or nested one
    // level deeper under `results.results`.
    const wrapper = isRecord(raw.results) && "results" in raw.results
        ? raw.results
        : raw;
    const rows = Array.isArray(wrapper.results) ? wrapper.results : [];
    if (rows.length === 0)
        return { reports, refs };
    // Track per-(task, model) to dedupe — Promptfoo emits multiple rows
    // for the same candidate when there are multiple assertions, but
    // the preflight only depends on the candidate's text, not the
    // assertion outcome. One report per (task, model) suffices.
    const seen = new Set();
    const emits = [];
    for (let i = 0; i < rows.length; i++) {
        const row = rows[i];
        if (!isRecord(row))
            continue;
        const rawTaskId = row.testCase?.description ?? "unknown-task";
        const modelId = row.provider?.id ?? row.provider?.label ?? "unknown-model";
        const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, input.mode);
        const baseAssoc = {
            run: input.ctx.runId,
            mode: axisMode,
            task: axisTask,
            model: modelId,
        };
        const key = preflightKey(baseAssoc);
        if (seen.has(key))
            continue;
        const output = row.response?.output;
        if (typeof output !== "string" || output.length === 0)
            continue;
        // Claim the key only once a usable response is in hand, so an empty
        // first row does not mask a later row for the same candidate. It is
        // claimed before computing so a throwing candidate is not retried
        // (and re-logged) for every duplicate row.
        seen.add(key);
        let report;
        try {
            report = await computePreflight({
                code: output,
                candidate: { taskId: axisTask, testIndex: i },
                packageSurface: input.packageSurface,
                resolver: input.resolver,
            });
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ emitSymbolPreflight: computePreflight threw for ${key} — ${message}`);
            continue;
        }
        reports.set(key, report);
        emits.push(emitReport(baseAssoc, report, key));
    }
    const settled = await Promise.all(emits);
    for (const ref of settled)
        refs.push(ref);
    return { reports, refs };
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* load-package-surface — read the framework-level package-surface
|
|
3
|
+
* manifest (`config/package-surface.ts`) authored via
|
|
4
|
+
* `definePackageSurface()`.
|
|
5
|
+
*
|
|
6
|
+
* Returns `undefined` when the file is absent so the W0198 preflight
|
|
7
|
+
* step can no-op cleanly during the staged rollout. The eval package
|
|
8
|
+
* itself ships a manifest under `config/package-surface.ts` (Phase 0),
|
|
9
|
+
* so the live pipeline always finds one; the optional return path
|
|
10
|
+
* exists for downstream / external callers that may not have authored
|
|
11
|
+
* one yet.
|
|
12
|
+
*/
|
|
13
|
+
import type { PackageSurfaceConfig } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
/**
 * Load the `package-surface` config for the given root directory.
 *
 * @param rootDir - directory handed to the config loader.
 * @returns the loaded manifest data, or `undefined` when the loader finds
 *   nothing.
 */
export declare function loadPackageSurface(rootDir: string): Promise<PackageSurfaceConfig | undefined>;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* load-package-surface — read the framework-level package-surface
|
|
3
|
+
* manifest (`config/package-surface.ts`) authored via
|
|
4
|
+
* `definePackageSurface()`.
|
|
5
|
+
*
|
|
6
|
+
* Returns `undefined` when the file is absent so the W0198 preflight
|
|
7
|
+
* step can no-op cleanly during the staged rollout. The eval package
|
|
8
|
+
* itself ships a manifest under `config/package-surface.ts` (Phase 0),
|
|
9
|
+
* so the live pipeline always finds one; the optional return path
|
|
10
|
+
* exists for downstream / external callers that may not have authored
|
|
11
|
+
* one yet.
|
|
12
|
+
*/
|
|
13
|
+
import { tryLoadConfigFile } from "../compiler/config-loader.js";
|
|
14
|
+
/**
 * Look up the `package-surface` config under `rootDir` via
 * `tryLoadConfigFile` and hand back its `data` payload.
 *
 * @param rootDir - directory passed through to the config loader.
 * @returns the loader result's `data`, or `undefined` when the loader
 *   returns a falsy result.
 */
export async function loadPackageSurface(rootDir) {
    const loaded = tryLoadConfigFile("package-surface", rootDir);
    return loaded ? loaded.data : undefined;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* load-preflight-context — read the framework-level package-surface
|
|
3
|
+
* manifest and project it down to the rubric-side context shape.
|
|
4
|
+
*
|
|
5
|
+
* Returns `undefined` when the manifest is absent or empty so callers
|
|
6
|
+
* collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
|
|
7
|
+
* `loadPackageSurface` and `loadPreflightScoring` — one loader per
|
|
8
|
+
* lazily-read W0198 input, all in `pipeline/preflight/`.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
|
|
11
|
+
*/
|
|
12
|
+
import type { Logger, PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
|
|
13
|
+
/**
 * Load the package-surface manifest for `rootDir` and reduce it to the list
 * of package names used as rubric context.
 *
 * @param rootDir - directory the manifest is loaded from.
 * @param logger - optional logger; its `warn` receives a message when
 *   loading fails.
 * @returns `{ packages }` with one entry per manifest package, or
 *   `undefined` when the manifest is absent, empty, or fails to load.
 */
export declare function loadPreflightContext(rootDir: string, logger?: Pick<Logger, "warn">): Promise<PreflightRubricContext | undefined>;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* load-preflight-context — read the framework-level package-surface
|
|
3
|
+
* manifest and project it down to the rubric-side context shape.
|
|
4
|
+
*
|
|
5
|
+
* Returns `undefined` when the manifest is absent or empty so callers
|
|
6
|
+
* collapse cleanly to the pre-W0198 rubric. Mirrors the convention of
|
|
7
|
+
* `loadPackageSurface` and `loadPreflightScoring` — one loader per
|
|
8
|
+
* lazily-read W0198 input, all in `pipeline/preflight/`.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md — Phase 6
|
|
11
|
+
*/
|
|
12
|
+
import { loadPackageSurface } from "./load-package-surface.js";
|
|
13
|
+
/**
 * Project the package-surface manifest down to the rubric-side context:
 * just the `pkg` value of every manifest entry.
 *
 * Any failure while loading or projecting is reported through
 * `logger.warn` (when a logger is supplied) and collapses to `undefined`.
 *
 * @param rootDir - directory passed to `loadPackageSurface`.
 * @param logger - optional logger exposing `warn`.
 * @returns `{ packages }`, or `undefined` when there is no manifest, the
 *   manifest lists no packages, or loading threw.
 */
export async function loadPreflightContext(rootDir, logger) {
    try {
        const surface = await loadPackageSurface(rootDir);
        if (!surface) {
            return undefined;
        }
        if (surface.packages.length === 0) {
            return undefined;
        }
        const packages = surface.packages.map((entry) => entry.pkg);
        return { packages };
    }
    catch (err) {
        let message;
        if (err instanceof Error) {
            message = err.message;
        }
        else {
            message = String(err);
        }
        logger?.warn(`[warn] W0198 preflight: failed to load package-surface manifest — ${message}`);
        return undefined;
    }
}
|