@sanity/ailf 4.2.0 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/package-surface.ts +37 -0
- package/config/preflight-scoring.ts +26 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
- package/dist/_vendor/ailf-core/config-helpers.js +67 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
- package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
- package/dist/adapters/index.d.ts +1 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
- package/dist/adapters/package-surface/dts-package-surface.js +173 -0
- package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
- package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
- package/dist/adapters/package-surface/index.d.ts +9 -0
- package/dist/adapters/package-surface/index.js +8 -0
- package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
- package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
- package/dist/adapters/task-sources/repo-schemas.js +15 -0
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +12 -0
- package/dist/commands/remote-pipeline.js +10 -2
- package/dist/commands/remote-results.d.ts +12 -1
- package/dist/commands/remote-results.js +25 -5
- package/dist/composition-root.js +9 -0
- package/dist/config/package-surface.ts +37 -0
- package/dist/config/preflight-scoring.ts +26 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
- package/dist/orchestration/pipeline-orchestrator.js +38 -0
- package/dist/orchestration/steps/calculate-scores-step.js +11 -0
- package/dist/orchestration/steps/generate-configs-step.js +16 -1
- package/dist/orchestration/steps/run-eval-step.js +27 -0
- package/dist/pipeline/calculate-scores.d.ts +66 -5
- package/dist/pipeline/calculate-scores.js +141 -27
- package/dist/pipeline/compiler/index.d.ts +1 -1
- package/dist/pipeline/compiler/index.js +1 -1
- package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
- package/dist/pipeline/compiler/literacy-bridge.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
- package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
- package/dist/pipeline/compiler/rubric-resolution.js +78 -2
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
- package/dist/pipeline/compiler/scoring-bridge.js +104 -10
- package/dist/pipeline/eval-fingerprint.d.ts +9 -0
- package/dist/pipeline/eval-fingerprint.js +7 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
- package/dist/pipeline/preflight/compute-preflight.js +118 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
- package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
- package/dist/pipeline/preflight/load-package-surface.js +19 -0
- package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
- package/dist/pipeline/preflight/load-preflight-context.js +25 -0
- package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
- package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
- package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
- package/dist/pipeline/preflight/parse-imports.js +125 -0
- package/dist/report-store.d.ts +8 -0
- package/dist/report-store.js +55 -6
- package/dist/sanity/document-renderers.d.ts +45 -7
- package/dist/sanity/document-renderers.js +99 -13
- package/dist/sanity/queries.d.ts +11 -11
- package/dist/sanity/queries.js +7 -0
- package/dist/sanity/symbol-index.d.ts +98 -0
- package/dist/sanity/symbol-index.js +615 -0
- package/package.json +2 -1
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
10
|
import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
11
|
+
import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
|
|
12
|
+
import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
|
|
11
13
|
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
12
14
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
13
15
|
import { buildCacheContext } from "../cache-context.js";
|
|
@@ -90,6 +92,7 @@ export class RunEvalStep {
|
|
|
90
92
|
graderModel: loadGraderModel(rootDir).id,
|
|
91
93
|
mode: this.mode,
|
|
92
94
|
rootDir,
|
|
95
|
+
graderContext: ctx.config.graderContext,
|
|
93
96
|
});
|
|
94
97
|
// Share fingerprint with downstream steps (PublishReportStep)
|
|
95
98
|
state.evalFingerprint = evalFingerprint;
|
|
@@ -224,6 +227,30 @@ export class RunEvalStep {
|
|
|
224
227
|
const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
|
|
225
228
|
if (existsSync(resultsPath)) {
|
|
226
229
|
await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
|
|
230
|
+
// W0198 Phase 4 — deterministic-lane reports per (task, model).
|
|
231
|
+
// Loaded lazily so test contexts that don't wire the manifest /
|
|
232
|
+
// resolver pay nothing; the helper is itself a no-op when its
|
|
233
|
+
// inputs are missing.
|
|
234
|
+
const packageSurface = await loadPackageSurface(rootDir).catch((err) => {
|
|
235
|
+
console.warn(` ⚠️ W0198 preflight: failed to load package-surface manifest — ${err instanceof Error ? err.message : String(err)}`);
|
|
236
|
+
return undefined;
|
|
237
|
+
});
|
|
238
|
+
const preflight = await emitSymbolPreflight({
|
|
239
|
+
writer: ctx.artifactWriter,
|
|
240
|
+
ctx,
|
|
241
|
+
mode: this.mode,
|
|
242
|
+
resultsPath,
|
|
243
|
+
packageSurface,
|
|
244
|
+
resolver: ctx.packageSurfaceResolver,
|
|
245
|
+
});
|
|
246
|
+
if (preflight.reports.size > 0) {
|
|
247
|
+
if (!state.preflightReports) {
|
|
248
|
+
state.preflightReports = new Map();
|
|
249
|
+
}
|
|
250
|
+
for (const [k, v] of preflight.reports) {
|
|
251
|
+
state.preflightReports.set(k, v);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
227
254
|
}
|
|
228
255
|
// Extract Promptfoo share URL from eval results (Step 3b)
|
|
229
256
|
if (ctx.evalRunner.extractShareUrl) {
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
3
|
import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
|
+
import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
|
|
4
5
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
5
6
|
export interface PromptfooResultsWrapper {
|
|
6
7
|
results: RawTestResult[];
|
|
@@ -60,6 +61,34 @@ export interface RawTestResult {
|
|
|
60
61
|
};
|
|
61
62
|
vars: Record<string, string>;
|
|
62
63
|
}
|
|
64
|
+
/**
|
|
65
|
+
* Aggregate counts across every preflight report emitted by the run.
|
|
66
|
+
*
|
|
67
|
+
* `unresolvedRate` is `unresolved / totalFindings` in `[0, 1]`, set only
|
|
68
|
+
* when `totalFindings > 0`. The vacuous case (`totalFindings === 0` —
|
|
69
|
+
* reports exist but every candidate cited zero in-scope bindings) is
|
|
70
|
+
* deliberately distinguished from "every binding resolved cleanly"
|
|
71
|
+
* (`unresolvedRate === 0`) so a CI threshold like `unresolvedRate > 0.1`
|
|
72
|
+
* doesn't fire green on a run that had nothing to resolve.
|
|
73
|
+
*/
|
|
74
|
+
export interface PreflightSummary {
|
|
75
|
+
/** Number of per-test preflight reports the run emitted. */
|
|
76
|
+
reportCount: number;
|
|
77
|
+
/** Total findings across all reports. */
|
|
78
|
+
totalFindings: number;
|
|
79
|
+
/** Findings classified `exists`. */
|
|
80
|
+
exists: number;
|
|
81
|
+
/** Findings classified `missing` — the deterministic-deduction lane. */
|
|
82
|
+
missing: number;
|
|
83
|
+
/** Findings classified `unresolved` — the resolver-couldn't-answer lane. */
|
|
84
|
+
unresolved: number;
|
|
85
|
+
/**
|
|
86
|
+
* `unresolved / totalFindings` in `[0, 1]`. Absent when `totalFindings`
|
|
87
|
+
* is zero — distinguishes "nothing to resolve" from "all resolutions
|
|
88
|
+
* succeeded" so CI thresholds aren't vacuously green.
|
|
89
|
+
*/
|
|
90
|
+
unresolvedRate?: number;
|
|
91
|
+
}
|
|
63
92
|
/**
|
|
64
93
|
* Calculate scores grouped by model. Each model gets its own FeatureScore[]
|
|
65
94
|
* and model-level aggregates.
|
|
@@ -70,7 +99,7 @@ export interface RawTestResult {
|
|
|
70
99
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
71
100
|
* (per-model breakdown is redundant when there's only one model).
|
|
72
101
|
*/
|
|
73
|
-
export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number
|
|
102
|
+
export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): null | PerModelEntry[];
|
|
74
103
|
/**
|
|
75
104
|
* Extract grader judgments (reason text + scores) from evaluation results.
|
|
76
105
|
*
|
|
@@ -91,6 +120,19 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
|
|
|
91
120
|
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
92
121
|
*/
|
|
93
122
|
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
123
|
+
/**
|
|
124
|
+
* W0198 — aggregate every per-test `SymbolPreflightReport` into a single
|
|
125
|
+
* resolver-health summary. Returns `undefined` when the run had no
|
|
126
|
+
* preflight reports (manifest disabled, resolver missing, or every
|
|
127
|
+
* candidate output cited zero in-scope packages) so the consumer can
|
|
128
|
+
* cleanly omit the field from the score summary instead of writing a
|
|
129
|
+
* vacuous block of zeros.
|
|
130
|
+
*
|
|
131
|
+
* Exported for the dedicated unit test in `preflight-summary.test.ts`;
|
|
132
|
+
* production calls go through `calculateAndWriteScores`, which threads
|
|
133
|
+
* the result into the `EvalScoreSummary.preflight` field.
|
|
134
|
+
*/
|
|
135
|
+
export declare function summarizePreflight(reports: Map<string, SymbolPreflightReport> | undefined): PreflightSummary | undefined;
|
|
94
136
|
/**
|
|
95
137
|
* Score knowledge-probe evaluation results.
|
|
96
138
|
*
|
|
@@ -105,7 +147,7 @@ export declare function extractStoredTestResults(resultsPath: string): StoredTes
|
|
|
105
147
|
* currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
|
|
106
148
|
* docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
|
|
107
149
|
*/
|
|
108
|
-
export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number
|
|
150
|
+
export declare function scoreKnowledgeProbeResults(results: TestResult[], profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): FeatureScore[];
|
|
109
151
|
/**
|
|
110
152
|
* Score agentic evaluation results. In agentic mode, all test entries are
|
|
111
153
|
* gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
|
|
@@ -113,7 +155,7 @@ export declare function scoreKnowledgeProbeResults(results: TestResult[], profil
|
|
|
113
155
|
*
|
|
114
156
|
* Returns a record keyed by feature area with the composite actual score.
|
|
115
157
|
*/
|
|
116
|
-
export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number
|
|
158
|
+
export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, ActualScoreEntry>;
|
|
117
159
|
/**
|
|
118
160
|
* Score agentic results broken down by model.
|
|
119
161
|
*
|
|
@@ -121,7 +163,7 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
|
|
|
121
163
|
* producing a map of model → feature → ActualScoreEntry.
|
|
122
164
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
123
165
|
*/
|
|
124
|
-
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number
|
|
166
|
+
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
|
|
125
167
|
/** Options for the calculate-scores main() function. */
|
|
126
168
|
export interface CalculateScoresOptions {
|
|
127
169
|
/** Allowed origins for source isolation reporting */
|
|
@@ -130,12 +172,31 @@ export interface CalculateScoresOptions {
|
|
|
130
172
|
logger?: Logger;
|
|
131
173
|
/** Evaluation mode (controls which result files are read) */
|
|
132
174
|
mode?: string;
|
|
175
|
+
/**
|
|
176
|
+
* W0198 — symbol-preflight reports keyed by `${runId}/${mode}/${task}/${model}`,
|
|
177
|
+
* populated by `RunEvalStep` via `emitSymbolPreflight`. When provided, the
|
|
178
|
+
* scoring engine merges deterministic preflight findings into the
|
|
179
|
+
* `code-correctness` dimension. Absence (or empty map) collapses cleanly
|
|
180
|
+
* to the pre-W0198 rubric-only path.
|
|
181
|
+
*/
|
|
182
|
+
preflightReports?: Map<string, SymbolPreflightReport>;
|
|
183
|
+
/**
|
|
184
|
+
* W0198 — preflight's share of `code-correctness` in `[0, 1]`. Defaults
|
|
185
|
+
* to `DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT` when omitted.
|
|
186
|
+
*/
|
|
187
|
+
preflightWeight?: number;
|
|
133
188
|
/** Pre-resolved source config (skips loadSource() call) */
|
|
134
189
|
resolvedSource?: ResolvedSourceConfig;
|
|
135
190
|
/** Path to baseline results file (default: results/latest/eval-results.json) */
|
|
136
191
|
resultsPath?: string;
|
|
137
192
|
/** Root directory of the eval package (required) */
|
|
138
193
|
rootDir: string;
|
|
194
|
+
/**
|
|
195
|
+
* W0198 — runId axis used to look up preflight reports. Required when
|
|
196
|
+
* `preflightReports` is provided; otherwise the lookup callback can't
|
|
197
|
+
* reconstruct the right key.
|
|
198
|
+
*/
|
|
199
|
+
runId?: string;
|
|
139
200
|
/** Search mode for source verification metadata */
|
|
140
201
|
searchMode?: string;
|
|
141
202
|
/** Documentation source name */
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
*/
|
|
30
30
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
31
31
|
import { join } from "path";
|
|
32
|
-
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
32
|
+
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
33
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
34
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
35
|
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
@@ -38,7 +38,7 @@ import { loadRubricTemplates } from "./rubric-loader.js";
|
|
|
38
38
|
import { resolveProfile } from "./profile-resolution.js";
|
|
39
39
|
import { loadSource } from "../sources.js";
|
|
40
40
|
import { LiteracyVariant } from "./normalize-mode.js";
|
|
41
|
-
import { scoreTestGroup } from "./compiler/scoring-bridge.js";
|
|
41
|
+
import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
|
|
42
42
|
// Re-export from core for backward compatibility.
|
|
43
43
|
// Existing imports from this file continue to work unchanged.
|
|
44
44
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -52,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
|
|
|
52
52
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
53
53
|
* (per-model breakdown is redundant when there's only one model).
|
|
54
54
|
*/
|
|
55
|
-
export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
|
|
55
|
+
export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile, preflightOptions) {
|
|
56
56
|
const results = readAndNormalizeResults(resultsPath);
|
|
57
57
|
// Group results by provider
|
|
58
58
|
const byModel = {};
|
|
@@ -72,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
|
|
|
72
72
|
}
|
|
73
73
|
const perModel = [];
|
|
74
74
|
for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
|
|
75
|
-
const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
|
|
75
|
+
const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId, preflightOptions);
|
|
76
76
|
const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
|
|
77
77
|
const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
|
|
78
78
|
const avgScore = scores.length > 0
|
|
@@ -408,9 +408,111 @@ function buildSourceVerification(root, source, verificationCtx) {
|
|
|
408
408
|
* Calculate overall scores (all models combined).
|
|
409
409
|
* This is the original scoring path — backward compatible.
|
|
410
410
|
*/
|
|
411
|
-
function calculateScores(resultsPath, goldProfile, baselineProfile) {
|
|
411
|
+
function calculateScores(resultsPath, goldProfile, baselineProfile, preflightOptions) {
|
|
412
412
|
const results = readAndNormalizeResults(resultsPath);
|
|
413
|
-
return scoreResults(results, goldProfile, baselineProfile);
|
|
413
|
+
return scoreResults(results, goldProfile, baselineProfile, undefined, preflightOptions);
|
|
414
|
+
}
|
|
415
|
+
/**
|
|
416
|
+
* W0198 — build a `ScoreTestGroupOptions` that the scoring bridge can
|
|
417
|
+
* use to look up a `SymbolPreflightReport` for any given `TestResult`.
|
|
418
|
+
*
|
|
419
|
+
* Mirrors the keying scheme `emitSymbolPreflight` uses:
|
|
420
|
+
* `${runId}/${mode}/${task}/${model}` where `(mode, task)` come from
|
|
421
|
+
* `resolveVariantMode(test.description, defaultMode)`.
|
|
422
|
+
*
|
|
423
|
+
* Returns `undefined` (effectively a no-op) when reports are absent,
|
|
424
|
+
* empty, or the runId hasn't been provided — those collapse cleanly
|
|
425
|
+
* to the pre-W0198 path. The runId branch logs a warning when reports
|
|
426
|
+
* exist but the caller forgot to wire `runId` so the silent
|
|
427
|
+
* preflight-disabled state doesn't go unobserved.
|
|
428
|
+
*/
|
|
429
|
+
function makePreflightOptions(reports, runId, defaultMode, weight, logger) {
|
|
430
|
+
if (!reports || reports.size === 0)
|
|
431
|
+
return undefined;
|
|
432
|
+
if (!runId) {
|
|
433
|
+
logger?.warn(`[warn] W0198 preflight: ${reports.size} preflight report(s) provided but no runId — skipping merge into code-correctness`);
|
|
434
|
+
return undefined;
|
|
435
|
+
}
|
|
436
|
+
return {
|
|
437
|
+
preflightWeight: weight,
|
|
438
|
+
preflightForTest: (test) => {
|
|
439
|
+
const modelId = test.providerId ?? test.providerLabel ?? "unknown-model";
|
|
440
|
+
const { mode: axisMode, task } = resolveVariantMode(test.description, defaultMode);
|
|
441
|
+
const key = `${runId}/${axisMode}/${task}/${modelId}`;
|
|
442
|
+
return reports.get(key);
|
|
443
|
+
},
|
|
444
|
+
};
|
|
445
|
+
}
|
|
446
|
+
/**
|
|
447
|
+
* W0198 — aggregate every per-test `SymbolPreflightReport` into a single
|
|
448
|
+
* resolver-health summary. Returns `undefined` when the run had no
|
|
449
|
+
* preflight reports (manifest disabled, resolver missing, or every
|
|
450
|
+
* candidate output cited zero in-scope packages) so the consumer can
|
|
451
|
+
* cleanly omit the field from the score summary instead of writing a
|
|
452
|
+
* vacuous block of zeros.
|
|
453
|
+
*
|
|
454
|
+
* Exported for the dedicated unit test in `preflight-summary.test.ts`;
|
|
455
|
+
* production calls go through `calculateAndWriteScores`, which threads
|
|
456
|
+
* the result into the `EvalScoreSummary.preflight` field.
|
|
457
|
+
*/
|
|
458
|
+
export function summarizePreflight(reports) {
|
|
459
|
+
if (!reports || reports.size === 0)
|
|
460
|
+
return undefined;
|
|
461
|
+
let totalFindings = 0;
|
|
462
|
+
let exists = 0;
|
|
463
|
+
let missing = 0;
|
|
464
|
+
let unresolved = 0;
|
|
465
|
+
for (const report of reports.values()) {
|
|
466
|
+
for (const finding of report.findings) {
|
|
467
|
+
totalFindings++;
|
|
468
|
+
if (finding.result === "exists") {
|
|
469
|
+
exists++;
|
|
470
|
+
}
|
|
471
|
+
else if (finding.result === "missing") {
|
|
472
|
+
missing++;
|
|
473
|
+
}
|
|
474
|
+
else if (finding.result === "unresolved") {
|
|
475
|
+
unresolved++;
|
|
476
|
+
}
|
|
477
|
+
else {
|
|
478
|
+
// Exhaustiveness guard: a future fourth `result` variant lands
|
|
479
|
+
// here and surfaces as a build error rather than silently
|
|
480
|
+
// counting into `unresolved`.
|
|
481
|
+
const _exhaustive = finding;
|
|
482
|
+
void _exhaustive;
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
return {
|
|
487
|
+
reportCount: reports.size,
|
|
488
|
+
totalFindings,
|
|
489
|
+
exists,
|
|
490
|
+
missing,
|
|
491
|
+
unresolved,
|
|
492
|
+
...(totalFindings > 0 && { unresolvedRate: unresolved / totalFindings }),
|
|
493
|
+
};
|
|
494
|
+
}
|
|
495
|
+
/**
|
|
496
|
+
* Print the preflight summary to the run log. Format mirrors the other
|
|
497
|
+
* single-line health signals (URL fetch, agent isolation) so CI grep can
|
|
498
|
+
* extract `unresolvedRate` directly from the log when score-summary.json
|
|
499
|
+
* isn't already in scope.
|
|
500
|
+
*/
|
|
501
|
+
function printPreflightSummary(summary, log) {
|
|
502
|
+
if (!summary)
|
|
503
|
+
return;
|
|
504
|
+
// `unresolvedRate` is absent when the run produced reports but no
|
|
505
|
+
// findings — distinguish vacuous-green from all-resolved so CI doesn't
|
|
506
|
+
// misread the threshold.
|
|
507
|
+
const rateLabel = summary.unresolvedRate === undefined
|
|
508
|
+
? "n/a (no findings)"
|
|
509
|
+
: `${(summary.unresolvedRate * 100).toFixed(1)}%`;
|
|
510
|
+
log.info("-".repeat(80));
|
|
511
|
+
log.info("SYMBOL PREFLIGHT (W0198)");
|
|
512
|
+
log.info("-".repeat(80));
|
|
513
|
+
log.info(` ${summary.reportCount} report(s), ${summary.totalFindings} finding(s): ${summary.exists} exists / ${summary.missing} missing / ${summary.unresolved} unresolved`);
|
|
514
|
+
log.info(` unresolvedRate: ${rateLabel} (resolver-health signal — not a candidate score factor)`);
|
|
515
|
+
log.info("");
|
|
414
516
|
}
|
|
415
517
|
/**
|
|
416
518
|
* Extracts agent behavior summary from a test result's metadata.
|
|
@@ -644,7 +746,7 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
644
746
|
* @param baselineProfile Weight profile for baseline (without-docs) entries
|
|
645
747
|
* @param modelId Optional model identifier to tag each FeatureScore
|
|
646
748
|
*/
|
|
647
|
-
function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
749
|
+
function scoreResults(results, goldProfile, baselineProfile, modelId, preflightOptions) {
|
|
648
750
|
// Group by feature + docs/no-docs
|
|
649
751
|
const byFeature = {};
|
|
650
752
|
for (const result of results) {
|
|
@@ -663,12 +765,12 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
663
765
|
const scores = [];
|
|
664
766
|
for (const [feature, data] of Object.entries(byFeature)) {
|
|
665
767
|
// --- With docs (gold / ceiling) — scored via 4-tier engine ---
|
|
666
|
-
const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
|
|
768
|
+
const gold = scoreTestGroup(data.withDocs, goldProfile, feature, preflightOptions);
|
|
667
769
|
// --- Without docs (baseline / floor) ---
|
|
668
770
|
// Uses the baseline profile (e.g. "output-only") which may exclude
|
|
669
771
|
// dimensions like doc-coverage that are undefined without docs.
|
|
670
772
|
// See docs/design-docs/named-scoring-profiles.md.
|
|
671
|
-
const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
|
|
773
|
+
const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature, preflightOptions);
|
|
672
774
|
const featureCost = gold.totalCost + baseline.totalCost;
|
|
673
775
|
const ceilingScore = gold.composite;
|
|
674
776
|
const floorScore = baseline.composite;
|
|
@@ -709,7 +811,7 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
709
811
|
* Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
|
|
710
812
|
* are set to 0 for backward compatibility with downstream consumers.
|
|
711
813
|
*/
|
|
712
|
-
function scoreAgentHarnessResults(results, profile) {
|
|
814
|
+
function scoreAgentHarnessResults(results, profile, preflightOptions) {
|
|
713
815
|
// Group by task ID (extracted from description: "task-id — Title")
|
|
714
816
|
const byTask = {};
|
|
715
817
|
for (const result of results) {
|
|
@@ -721,7 +823,7 @@ function scoreAgentHarnessResults(results, profile) {
|
|
|
721
823
|
}
|
|
722
824
|
const scores = [];
|
|
723
825
|
for (const [taskId, taskResults] of Object.entries(byTask)) {
|
|
724
|
-
const scored = scoreTestGroup(taskResults, profile, taskId);
|
|
826
|
+
const scored = scoreTestGroup(taskResults, profile, taskId, preflightOptions);
|
|
725
827
|
const totalCost = scored.totalCost;
|
|
726
828
|
// Detect feature area for backward compat (used by report grouping)
|
|
727
829
|
const feature = taskResults[0]?.vars.__featureArea ??
|
|
@@ -774,7 +876,7 @@ function extractTaskId(description) {
|
|
|
774
876
|
* currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
|
|
775
877
|
* docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
|
|
776
878
|
*/
|
|
777
|
-
export function scoreKnowledgeProbeResults(results, profile) {
|
|
879
|
+
export function scoreKnowledgeProbeResults(results, profile, preflightOptions) {
|
|
778
880
|
const byFeature = {};
|
|
779
881
|
for (const result of results) {
|
|
780
882
|
const feature = result.vars.__featureArea || detectFeatureArea(result.description);
|
|
@@ -785,7 +887,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
|
|
|
785
887
|
}
|
|
786
888
|
const scores = [];
|
|
787
889
|
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
788
|
-
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
890
|
+
const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
|
|
789
891
|
scores.push({
|
|
790
892
|
assertionPassRate: scored.dimensions.assertionPassRate,
|
|
791
893
|
ceilingScore: 0,
|
|
@@ -817,7 +919,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
|
|
|
817
919
|
* Returns a record keyed by feature area with the composite actual score.
|
|
818
920
|
*/
|
|
819
921
|
// ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
|
|
820
|
-
export function scoreAgenticResults(resultsPath, profile) {
|
|
922
|
+
export function scoreAgenticResults(resultsPath, profile, preflightOptions) {
|
|
821
923
|
const results = readAndNormalizeResults(resultsPath);
|
|
822
924
|
// Group by feature area
|
|
823
925
|
const byFeature = {};
|
|
@@ -830,7 +932,7 @@ export function scoreAgenticResults(resultsPath, profile) {
|
|
|
830
932
|
}
|
|
831
933
|
const entries = {};
|
|
832
934
|
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
833
|
-
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
935
|
+
const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
|
|
834
936
|
entries[feature] = {
|
|
835
937
|
actualScore: scored.composite,
|
|
836
938
|
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
@@ -849,7 +951,7 @@ export function scoreAgenticResults(resultsPath, profile) {
|
|
|
849
951
|
* producing a map of model → feature → ActualScoreEntry.
|
|
850
952
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
851
953
|
*/
|
|
852
|
-
export function scoreAgenticResultsPerModel(resultsPath, profile) {
|
|
954
|
+
export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptions) {
|
|
853
955
|
const results = readAndNormalizeResults(resultsPath);
|
|
854
956
|
// Group by model, then feature
|
|
855
957
|
const byModel = {};
|
|
@@ -866,7 +968,7 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
|
|
|
866
968
|
for (const [modelId, features] of Object.entries(byModel)) {
|
|
867
969
|
perModel[modelId] = {};
|
|
868
970
|
for (const [feature, featureResults] of Object.entries(features)) {
|
|
869
|
-
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
971
|
+
const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
|
|
870
972
|
perModel[modelId][feature] = {
|
|
871
973
|
actualScore: scored.composite,
|
|
872
974
|
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
@@ -912,6 +1014,15 @@ export function calculateAndWriteScores(options) {
|
|
|
912
1014
|
}
|
|
913
1015
|
// Determine mode — controls which result files are read
|
|
914
1016
|
const mode = options.mode ?? LiteracyVariant.STANDARD;
|
|
1017
|
+
// W0198 — assemble preflight options once. The helper returns
|
|
1018
|
+
// `undefined` when reports / runId are missing, so all downstream
|
|
1019
|
+
// callers handle the no-preflight case uniformly.
|
|
1020
|
+
const preflightOptions = makePreflightOptions(options.preflightReports, options.runId, mode, options.preflightWeight, log);
|
|
1021
|
+
// W0198 — resolver-health summary. Independent of `preflightOptions`
|
|
1022
|
+
// (which gates the score merge): when reports exist but the runId is
|
|
1023
|
+
// missing, scoring stays on the rubric-only path while telemetry still
|
|
1024
|
+
// surfaces, so the resolver's drift remains visible.
|
|
1025
|
+
const preflightSummary = summarizePreflight(options.preflightReports);
|
|
915
1026
|
const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
|
|
916
1027
|
// Agentic results path (only used in full mode)
|
|
917
1028
|
const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
|
|
@@ -940,7 +1051,7 @@ export function calculateAndWriteScores(options) {
|
|
|
940
1051
|
const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
|
|
941
1052
|
log.debug("Agent-harness scoring profile", agentProfile);
|
|
942
1053
|
const results = readAndNormalizeResults(baselineResultsPath);
|
|
943
|
-
const scores = scoreAgentHarnessResults(results, agentProfile);
|
|
1054
|
+
const scores = scoreAgentHarnessResults(results, agentProfile, preflightOptions);
|
|
944
1055
|
log.debug("Agent-harness scores calculated", {
|
|
945
1056
|
taskCount: scores.length,
|
|
946
1057
|
tasks: scores.map((s) => ({
|
|
@@ -960,7 +1071,7 @@ export function calculateAndWriteScores(options) {
|
|
|
960
1071
|
const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
|
|
961
1072
|
graderCost, null, // no per-model breakdown
|
|
962
1073
|
null, // no source isolation
|
|
963
|
-
sourceVerification, "agent-harness", log);
|
|
1074
|
+
sourceVerification, "agent-harness", log, preflightSummary);
|
|
964
1075
|
// Persist
|
|
965
1076
|
const outDir = join(ROOT, "results", "latest");
|
|
966
1077
|
mkdirSync(outDir, { recursive: true });
|
|
@@ -992,7 +1103,7 @@ export function calculateAndWriteScores(options) {
|
|
|
992
1103
|
const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
|
|
993
1104
|
log.debug("Knowledge-probe scoring profile", probeProfile);
|
|
994
1105
|
const results = readAndNormalizeResults(baselineResultsPath);
|
|
995
|
-
const scores = scoreKnowledgeProbeResults(results, probeProfile);
|
|
1106
|
+
const scores = scoreKnowledgeProbeResults(results, probeProfile, preflightOptions);
|
|
996
1107
|
log.debug("Knowledge-probe scores calculated", {
|
|
997
1108
|
featureCount: scores.length,
|
|
998
1109
|
features: scores.map((s) => ({
|
|
@@ -1012,7 +1123,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1012
1123
|
const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
|
|
1013
1124
|
graderCost, null, // no per-model breakdown for now
|
|
1014
1125
|
null, // no source isolation — KP doesn't fetch sources
|
|
1015
|
-
sourceVerification, "knowledge-probe", log);
|
|
1126
|
+
sourceVerification, "knowledge-probe", log, preflightSummary);
|
|
1016
1127
|
// Persist
|
|
1017
1128
|
const outDir = join(ROOT, "results", "latest");
|
|
1018
1129
|
mkdirSync(outDir, { recursive: true });
|
|
@@ -1041,7 +1152,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1041
1152
|
gold: goldProfile,
|
|
1042
1153
|
baseline: baselineProfileWeights,
|
|
1043
1154
|
});
|
|
1044
|
-
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
1155
|
+
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
|
|
1045
1156
|
log.debug("Baseline scores calculated", {
|
|
1046
1157
|
featureCount: baselineScores.length,
|
|
1047
1158
|
features: baselineScores.map((s) => ({
|
|
@@ -1051,7 +1162,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1051
1162
|
docLift: s.docLift,
|
|
1052
1163
|
})),
|
|
1053
1164
|
});
|
|
1054
|
-
const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
1165
|
+
const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
|
|
1055
1166
|
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
1056
1167
|
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
1057
1168
|
allowedOrigins: options.allowedOrigins,
|
|
@@ -1067,7 +1178,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1067
1178
|
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
1068
1179
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
1069
1180
|
const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
|
|
1070
|
-
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
|
|
1181
|
+
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
|
|
1071
1182
|
log.debug("Agentic scores calculated", {
|
|
1072
1183
|
featureCount: Object.keys(agenticScores).length,
|
|
1073
1184
|
features: Object.entries(agenticScores).map(([f, s]) => ({
|
|
@@ -1080,7 +1191,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1080
1191
|
evaluationMode = LiteracyVariant.FULL;
|
|
1081
1192
|
// Merge agentic actual scores into the per-model breakdown
|
|
1082
1193
|
if (perModel) {
|
|
1083
|
-
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
|
|
1194
|
+
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile, preflightOptions);
|
|
1084
1195
|
for (const entry of perModel) {
|
|
1085
1196
|
const modelAgentic = agenticPerModel[entry.modelId];
|
|
1086
1197
|
if (modelAgentic) {
|
|
@@ -1115,7 +1226,7 @@ export function calculateAndWriteScores(options) {
|
|
|
1115
1226
|
? LiteracyVariant.OBSERVED
|
|
1116
1227
|
: LiteracyVariant.STANDARD;
|
|
1117
1228
|
}
|
|
1118
|
-
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
|
|
1229
|
+
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary);
|
|
1119
1230
|
// Persist
|
|
1120
1231
|
const outDir = join(ROOT, "results", "latest");
|
|
1121
1232
|
mkdirSync(outDir, { recursive: true });
|
|
@@ -1269,7 +1380,7 @@ function printPerModelReport(perModel, log) {
|
|
|
1269
1380
|
// ---------------------------------------------------------------------------
|
|
1270
1381
|
// Main
|
|
1271
1382
|
// ---------------------------------------------------------------------------
|
|
1272
|
-
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
|
|
1383
|
+
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary) {
|
|
1273
1384
|
const _log = log ?? new ConsoleLogger();
|
|
1274
1385
|
_log.info("\n" + "=".repeat(80));
|
|
1275
1386
|
_log.info(" SANITY AI LITERACY SCORE REPORT");
|
|
@@ -1428,6 +1539,8 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1428
1539
|
if (perModel) {
|
|
1429
1540
|
printPerModelReport(perModel, _log);
|
|
1430
1541
|
}
|
|
1542
|
+
// W0198 — symbol preflight resolver-health summary
|
|
1543
|
+
printPreflightSummary(preflightSummary, _log);
|
|
1431
1544
|
// URL References
|
|
1432
1545
|
printUrlReport(urlRefs, _log);
|
|
1433
1546
|
// Agent Behavior (only present when run with instrumented provider)
|
|
@@ -1557,6 +1670,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1557
1670
|
}
|
|
1558
1671
|
: undefined,
|
|
1559
1672
|
...(perModel && { perModel }),
|
|
1673
|
+
...(preflightSummary && { preflight: preflightSummary }),
|
|
1560
1674
|
...(sourceIsolation && { sourceIsolation }),
|
|
1561
1675
|
...(sourceVerification && { sourceVerification }),
|
|
1562
1676
|
timestamp: new Date().toISOString(),
|
|
@@ -21,6 +21,6 @@ export { checkBudget, classifyToolCall, classifyToolCalls, collectTrace, compute
|
|
|
21
21
|
export { registerSanityLiteracyPreset, sanityLiteracyPreset, } from "./presets/index.js";
|
|
22
22
|
export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from "./ignore-fields.js";
|
|
23
23
|
export { simpleHash } from "./hash.js";
|
|
24
|
-
export { scoreTestGroup, type BridgedScoreResult } from "./scoring-bridge.js";
|
|
24
|
+
export { preflightToScore, scoreTestGroup, type BridgedScoreResult, type ScoreTestGroupOptions, } from "./scoring-bridge.js";
|
|
25
25
|
export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
|
|
26
26
|
export type { ConfigLoadResult } from "./config-loader.js";
|
|
@@ -37,6 +37,6 @@ export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from
|
|
|
37
37
|
// Hash utility
|
|
38
38
|
export { simpleHash } from "./hash.js";
|
|
39
39
|
// Scoring bridge — 4-tier engine integration
|
|
40
|
-
export { scoreTestGroup } from "./scoring-bridge.js";
|
|
40
|
+
export { preflightToScore, scoreTestGroup, } from "./scoring-bridge.js";
|
|
41
41
|
// Unified config loader
|
|
42
42
|
export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
*/
|
|
21
21
|
import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
22
22
|
import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
|
|
23
|
+
import type { PreflightRubricContext } from "./rubric-resolution.js";
|
|
23
24
|
import { type LiteracyEvalSubMode } from "../normalize-mode.js";
|
|
24
25
|
/** Options for compiling all literacy tasks via the new compiler */
|
|
25
26
|
export interface LiteracyBridgeOptions {
|
|
@@ -35,6 +36,14 @@ export interface LiteracyBridgeOptions {
|
|
|
35
36
|
label: string;
|
|
36
37
|
config?: Record<string, unknown>;
|
|
37
38
|
}[];
|
|
39
|
+
/** Grader context policy passed through to `compileLiteracyTask`. */
|
|
40
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
41
|
+
/**
|
|
42
|
+
* W0198 Phase 6 — preflight context passed through to every task's
|
|
43
|
+
* `code-correctness` rubric so the grader treats the deterministic
|
|
44
|
+
* lane's existence verdicts as ground truth.
|
|
45
|
+
*/
|
|
46
|
+
preflightContext?: PreflightRubricContext;
|
|
38
47
|
}
|
|
39
48
|
/** Result of compiling all literacy tasks */
|
|
40
49
|
export interface LiteracyBridgeResult {
|
|
@@ -73,6 +73,8 @@ export function compileLiteracyTasks(tasks, options) {
|
|
|
73
73
|
evalMode: options.evalMode,
|
|
74
74
|
models: options.models,
|
|
75
75
|
rubricConfig,
|
|
76
|
+
graderContext: options.graderContext,
|
|
77
|
+
preflightContext: options.preflightContext,
|
|
76
78
|
};
|
|
77
79
|
for (const node of orderedNodes) {
|
|
78
80
|
const task = taskMap.get(node.taskId);
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
8
|
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
9
9
|
import type { LiteracyCompileOptions } from "./types.js";
|
|
10
|
-
export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[];
|
|
10
|
+
export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
|
|
11
11
|
/**
|
|
12
12
|
* Build baseline assertions matching the legacy expand-tasks behavior.
|
|
13
13
|
*
|