@sanity/ailf 7.0.1 → 7.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +12 -13
- package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/report.js +2 -0
- package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/team.js +63 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
- package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
- package/dist/_vendor/ailf-core/types/team.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
- package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
- package/dist/_vendor/ailf-shared/event-types.js +23 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
- package/dist/_vendor/ailf-shared/index.d.ts +5 -3
- package/dist/_vendor/ailf-shared/index.js +5 -2
- package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
- package/dist/_vendor/ailf-shared/member-roles.js +16 -0
- package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
- package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
- package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
- package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
- package/dist/adapters/task-sources/repo-task-source.js +2 -1
- package/dist/commands/pipeline-action.d.ts +4 -3
- package/dist/commands/pipeline-action.js +7 -5
- package/dist/commands/run.js +2 -2
- package/dist/config/rubrics.ts +12 -13
- package/dist/job-store.d.ts +18 -0
- package/dist/job-store.js +34 -0
- package/dist/orchestration/build-app-context.js +8 -1
- package/dist/orchestration/pipeline-orchestrator.js +46 -1
- package/dist/orchestration/steps/compare-step.d.ts +7 -0
- package/dist/orchestration/steps/compare-step.js +59 -23
- package/dist/orchestration/steps/fetch-docs-step.js +3 -0
- package/dist/orchestration/steps/finalize-run-step.js +2 -0
- package/dist/orchestration/steps/gap-analysis-step.js +9 -8
- package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
- package/dist/orchestration/steps/generate-configs-step.js +47 -13
- package/dist/orchestration/steps/grader-consistency-step.js +11 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
- package/dist/orchestration/steps/publish-report-step.js +36 -8
- package/dist/pipeline/cache-hit-restore.d.ts +14 -1
- package/dist/pipeline/cache-hit-restore.js +17 -0
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +123 -29
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
- package/dist/pipeline/compiler/provider-assembler.js +16 -3
- package/dist/pipeline/failure-modes.d.ts +20 -10
- package/dist/pipeline/failure-modes.js +84 -15
- package/dist/pipeline/map-request-to-config.js +2 -0
- package/dist/pipeline/normalize-mode.d.ts +1 -1
- package/dist/pipeline/normalize-mode.js +2 -0
- package/dist/pipeline/run-context.d.ts +16 -1
- package/dist/pipeline/run-context.js +12 -1
- package/dist/pipeline/validate.d.ts +8 -4
- package/dist/pipeline/validate.js +8 -18
- package/dist/report-store.d.ts +14 -1
- package/dist/report-store.js +32 -0
- package/dist/sanity/client.js +2 -2
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
package/dist/job-store.d.ts
CHANGED
|
@@ -101,4 +101,22 @@ export declare class JobStore {
|
|
|
101
101
|
* Update a job's status and optional associated data.
|
|
102
102
|
*/
|
|
103
103
|
updateJob(jobId: string, update: Partial<Pick<JobDocument, "completedAt" | "error" | "execution" | "progress" | "reportId" | "startedAt" | "status">>): Promise<boolean>;
|
|
104
|
+
/**
|
|
105
|
+
* Patch the parent ailf.evalRequest doc when this job reaches a terminal
|
|
106
|
+
* state. The webhook handler writes `jobId` onto the evalRequest at
|
|
107
|
+
* dispatch time, so we look the parent up by that field.
|
|
108
|
+
*
|
|
109
|
+
* Best-effort: returns `false` on lookup miss or write failure, never
|
|
110
|
+
* throws. Closes the S1-B gap from the 2026-05-24 new-eval audit — until
|
|
111
|
+
* this runs, evalRequest docs stay `status: "dispatched"` indefinitely
|
|
112
|
+
* and the dashboard can't surface completion or errors to users.
|
|
113
|
+
*
|
|
114
|
+
* @returns true on successful patch, false on lookup miss or write error
|
|
115
|
+
*/
|
|
116
|
+
patchEvalRequestForJob(jobId: string, patch: {
|
|
117
|
+
status: "completed" | "failed";
|
|
118
|
+
completedAt: string;
|
|
119
|
+
reportId?: string;
|
|
120
|
+
error?: string;
|
|
121
|
+
}): Promise<boolean>;
|
|
104
122
|
}
|
package/dist/job-store.js
CHANGED
|
@@ -149,6 +149,40 @@ export class JobStore {
|
|
|
149
149
|
return false;
|
|
150
150
|
}
|
|
151
151
|
}
|
|
152
|
+
/**
|
|
153
|
+
* Patch the parent ailf.evalRequest doc when this job reaches a terminal
|
|
154
|
+
* state. The webhook handler writes `jobId` onto the evalRequest at
|
|
155
|
+
* dispatch time, so we look the parent up by that field.
|
|
156
|
+
*
|
|
157
|
+
* Best-effort: returns `false` on lookup miss or write failure, never
|
|
158
|
+
* throws. Closes the S1-B gap from the 2026-05-24 new-eval audit — until
|
|
159
|
+
* this runs, evalRequest docs stay `status: "dispatched"` indefinitely
|
|
160
|
+
* and the dashboard can't surface completion or errors to users.
|
|
161
|
+
*
|
|
162
|
+
* @returns true on successful patch, false on lookup miss or write error
|
|
163
|
+
*/
|
|
164
|
+
async patchEvalRequestForJob(jobId, patch) {
|
|
165
|
+
try {
|
|
166
|
+
const evalRequest = await this.client.fetch(`*[_type == "ailf.evalRequest" && jobId == $jobId][0]{_id}`, { jobId });
|
|
167
|
+
if (!evalRequest?._id) {
|
|
168
|
+
return false;
|
|
169
|
+
}
|
|
170
|
+
await this.client
|
|
171
|
+
.patch(evalRequest._id)
|
|
172
|
+
.set({
|
|
173
|
+
status: patch.status,
|
|
174
|
+
completedAt: patch.completedAt,
|
|
175
|
+
...(patch.reportId ? { reportId: patch.reportId } : {}),
|
|
176
|
+
...(patch.error ? { error: patch.error } : {}),
|
|
177
|
+
})
|
|
178
|
+
.commit();
|
|
179
|
+
return true;
|
|
180
|
+
}
|
|
181
|
+
catch (error) {
|
|
182
|
+
console.warn(` ⚠️ Failed to patch ailf.evalRequest for jobId ${jobId}: ${error instanceof Error ? error.message : String(error)}`);
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
152
186
|
}
|
|
153
187
|
// ---------------------------------------------------------------------------
|
|
154
188
|
// Helpers
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* Once all commands construct ResolvedConfig directly (or use --config),
|
|
9
9
|
* this bridge can be deleted.
|
|
10
10
|
*/
|
|
11
|
+
import { isLiteracyVariant } from "../_vendor/ailf-shared/index.js";
|
|
11
12
|
import { createAppContext } from "../composition-root.js";
|
|
12
13
|
import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
|
|
13
14
|
/**
|
|
@@ -18,10 +19,16 @@ import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
|
|
|
18
19
|
* are derived (e.g., areas from areaOption).
|
|
19
20
|
*/
|
|
20
21
|
export function mapToResolvedConfig(opts, rootDir) {
|
|
22
|
+
// `opts.variant` is a free-form string from CLI / config flags; narrow it
|
|
23
|
+
// to the closed `LiteracyVariant` set so downstream consumers (the report
|
|
24
|
+
// provenance derivation, in particular) never see a bogus string.
|
|
25
|
+
// Unknown values silently drop to undefined — the legacy behavior — but a
|
|
26
|
+
// narrowing surface is in place for the day we want to error here.
|
|
27
|
+
const variant = isLiteracyVariant(opts.variant) ? opts.variant : undefined;
|
|
21
28
|
return {
|
|
22
29
|
rootDir,
|
|
23
30
|
mode: opts.mode,
|
|
24
|
-
variant
|
|
31
|
+
variant,
|
|
25
32
|
noAutoScope: opts.noAutoScope ?? false,
|
|
26
33
|
debug: opts.debug,
|
|
27
34
|
areas: opts.areaOption
|
|
@@ -69,6 +69,35 @@ async function reportJobProgress(ctx, stepName, completedSteps, totalSteps, stat
|
|
|
69
69
|
ctx.logger.warn(`Failed to report job progress for step "${stepName}" — continuing`);
|
|
70
70
|
}
|
|
71
71
|
}
|
|
72
|
+
/**
 * Propagate a terminal job state (completed | failed) from the underlying
 * ailf.job to its parent ailf.evalRequest doc.
 *
 * Thin wrapper over `JobStore.patchEvalRequestForJob`: it lazily imports the
 * store, constructs it with a token taken from `AILF_REPORT_SANITY_API_TOKEN`
 * (falling back to `SANITY_API_TOKEN`), and delegates the patch.
 *
 * Best-effort: a `false` result from the store (lookup miss or write
 * failure) is logged at debug level, and anything thrown while importing,
 * constructing, or calling the store is caught and logged as a warning.
 *
 * Closes the S1-B gap from the 2026-05-24 new-eval audit — until this
 * runs, evalRequest docs stay `status: "dispatched"` indefinitely and
 * the dashboard can't surface completion or errors to users.
 */
async function patchEvalRequestForJob(ctx, jobId, patch) {
    try {
        const jobStoreModule = await import("../job-store.js");
        const token = process.env.AILF_REPORT_SANITY_API_TOKEN ??
            process.env.SANITY_API_TOKEN ??
            undefined;
        const evalRequestStore = new jobStoreModule.JobStore({ token });
        const wasPatched = await evalRequestStore.patchEvalRequestForJob(jobId, patch);
        if (wasPatched) {
            return;
        }
        ctx.logger.debug(`No ailf.evalRequest patched for jobId ${jobId} — lookup miss or write failure`);
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        ctx.logger.warn(`Failed to patch ailf.evalRequest for jobId ${jobId}: ${detail}`);
    }
}
|
|
72
101
|
// ---------------------------------------------------------------------------
|
|
73
102
|
// Artifact capture
|
|
74
103
|
// ---------------------------------------------------------------------------
|
|
@@ -188,6 +217,11 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
188
217
|
message: failedError,
|
|
189
218
|
step: step.name,
|
|
190
219
|
}, jobUpdates);
|
|
220
|
+
await patchEvalRequestForJob(ctx, ctx.config.jobId, {
|
|
221
|
+
status: "failed",
|
|
222
|
+
completedAt: new Date().toISOString(),
|
|
223
|
+
error: `${step.name}: ${failedError}`,
|
|
224
|
+
});
|
|
191
225
|
}
|
|
192
226
|
// Capture pipeline context before exiting. `job-updates` was an
|
|
193
227
|
// observability-only capture not tied to a registered artifact type;
|
|
@@ -242,9 +276,10 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
242
276
|
// (P5 / local-first) and `success: true` is preserved; the `error`
|
|
243
277
|
// field is the wire signal that a configured optional step failed.
|
|
244
278
|
const firstOptionalFailure = getFirstOptionalFailure(steps, results);
|
|
279
|
+
const completedAt = new Date().toISOString();
|
|
245
280
|
await store.updateJob(ctx.config.jobId, {
|
|
246
281
|
status: "completed",
|
|
247
|
-
completedAt
|
|
282
|
+
completedAt,
|
|
248
283
|
progress: {
|
|
249
284
|
currentStep: "complete",
|
|
250
285
|
completedSteps: steps.length,
|
|
@@ -253,6 +288,16 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
253
288
|
...(state.reportId ? { reportId: state.reportId } : {}),
|
|
254
289
|
...(firstOptionalFailure ? { error: firstOptionalFailure } : {}),
|
|
255
290
|
});
|
|
291
|
+
await patchEvalRequestForJob(ctx, ctx.config.jobId, {
|
|
292
|
+
status: "completed",
|
|
293
|
+
completedAt,
|
|
294
|
+
...(state.reportId ? { reportId: state.reportId } : {}),
|
|
295
|
+
...(firstOptionalFailure
|
|
296
|
+
? {
|
|
297
|
+
error: `${firstOptionalFailure.step}: ${firstOptionalFailure.message}`,
|
|
298
|
+
}
|
|
299
|
+
: {}),
|
|
300
|
+
});
|
|
256
301
|
}
|
|
257
302
|
catch {
|
|
258
303
|
ctx.logger.warn("Failed to report job completion — continuing");
|
|
@@ -4,6 +4,13 @@
|
|
|
4
4
|
* This step is already pure (no execSync, no env vars) — the logic is
|
|
5
5
|
* inlined directly from the former pipeline/steps/compare-step.ts.
|
|
6
6
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Baseline resolution order (highest priority first):
|
|
9
|
+
* 1. `compareBaselineReportId` — fetch the named report doc
|
|
10
|
+
* and use its `summary` (a ReportSummary, which is a
|
|
11
|
+
* superset of ComparableSummary) as the baseline.
|
|
12
|
+
* 2. `compareBaseline` — local filesystem path (CLI ergonomics).
|
|
13
|
+
* 3. Latest baseline in `results/baselines/`.
|
|
7
14
|
*/
|
|
8
15
|
import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
9
16
|
export declare class CompareStep implements PipelineStep {
|
|
@@ -4,6 +4,13 @@
|
|
|
4
4
|
* This step is already pure (no execSync, no env vars) — the logic is
|
|
5
5
|
* inlined directly from the former pipeline/steps/compare-step.ts.
|
|
6
6
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Baseline resolution order (highest priority first):
|
|
9
|
+
* 1. `compareBaselineReportId` — fetch the named report doc
|
|
10
|
+
* and use its `summary` (a ReportSummary, which is a
|
|
11
|
+
* superset of ComparableSummary) as the baseline.
|
|
12
|
+
* 2. `compareBaseline` — local filesystem path (CLI ergonomics).
|
|
13
|
+
* 3. Latest baseline in `results/baselines/`.
|
|
7
14
|
*/
|
|
8
15
|
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
9
16
|
import { join, resolve } from "path";
|
|
@@ -29,39 +36,68 @@ export class CompareStep {
|
|
|
29
36
|
}
|
|
30
37
|
// Load experiment (current run)
|
|
31
38
|
const experiment = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
|
|
32
|
-
// Resolve baseline
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
const baselinesDir = resolve(rootDir, "results", "baselines");
|
|
39
|
-
if (!existsSync(baselinesDir)) {
|
|
39
|
+
// Resolve baseline. Pinned report id wins over local FS, which wins
|
|
40
|
+
// over auto-discovery of the most recent file in `results/baselines/`.
|
|
41
|
+
let baseline;
|
|
42
|
+
const pinnedReportId = ctx.config.compareBaselineReportId;
|
|
43
|
+
if (pinnedReportId) {
|
|
44
|
+
if (!ctx.reportStore) {
|
|
40
45
|
return {
|
|
41
|
-
reason: "
|
|
46
|
+
reason: "compareBaselineReportId set but no reportStore is configured. " +
|
|
47
|
+
"Check Sanity credentials in .ailf/config.yaml.",
|
|
42
48
|
status: "skipped",
|
|
43
49
|
};
|
|
44
50
|
}
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
const result = await ctx.reportStore.loadBaselineFromReport(pinnedReportId);
|
|
52
|
+
if (result.kind === "error") {
|
|
53
|
+
return {
|
|
54
|
+
durationMs: Date.now() - start,
|
|
55
|
+
error: `Failed to load baseline report ${pinnedReportId}: ${result.message}`,
|
|
56
|
+
status: "failed",
|
|
57
|
+
};
|
|
58
|
+
}
|
|
59
|
+
if (result.kind === "not_found") {
|
|
50
60
|
return {
|
|
51
|
-
reason:
|
|
61
|
+
reason: `Baseline report ${pinnedReportId} not found.`,
|
|
52
62
|
status: "skipped",
|
|
53
63
|
};
|
|
54
64
|
}
|
|
55
|
-
|
|
65
|
+
baseline = result.baseline;
|
|
56
66
|
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
67
|
+
else {
|
|
68
|
+
let resolvedBaselinePath;
|
|
69
|
+
if (ctx.config.compareBaseline) {
|
|
70
|
+
resolvedBaselinePath = resolve(ctx.config.compareBaseline);
|
|
71
|
+
}
|
|
72
|
+
else {
|
|
73
|
+
const baselinesDir = resolve(rootDir, "results", "baselines");
|
|
74
|
+
if (!existsSync(baselinesDir)) {
|
|
75
|
+
return {
|
|
76
|
+
reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
|
|
77
|
+
status: "skipped",
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
const files = readdirSync(baselinesDir)
|
|
81
|
+
.filter((f) => f.endsWith(".json"))
|
|
82
|
+
.sort()
|
|
83
|
+
.reverse();
|
|
84
|
+
if (files.length === 0) {
|
|
85
|
+
return {
|
|
86
|
+
reason: "No baseline files found. Run 'pnpm baseline:save' first.",
|
|
87
|
+
status: "skipped",
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
resolvedBaselinePath = join(baselinesDir, files[0]);
|
|
91
|
+
}
|
|
92
|
+
if (!existsSync(resolvedBaselinePath)) {
|
|
93
|
+
return {
|
|
94
|
+
durationMs: Date.now() - start,
|
|
95
|
+
error: `Baseline file not found: ${resolvedBaselinePath}`,
|
|
96
|
+
status: "failed",
|
|
97
|
+
};
|
|
98
|
+
}
|
|
99
|
+
baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
|
|
63
100
|
}
|
|
64
|
-
const baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
|
|
65
101
|
// Run comparison
|
|
66
102
|
const options = ctx.config.compareThreshold
|
|
67
103
|
? { noiseThreshold: ctx.config.compareThreshold }
|
|
@@ -37,6 +37,9 @@ export class FetchDocsStep {
|
|
|
37
37
|
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
38
38
|
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
39
39
|
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
40
|
+
...(ctx.config.changedDocs?.length
|
|
41
|
+
? { changedDocs: ctx.config.changedDocs }
|
|
42
|
+
: {}),
|
|
40
43
|
};
|
|
41
44
|
const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
42
45
|
// Bridge: narrow to literacy tasks for canonical doc access
|
|
@@ -84,6 +84,8 @@ export class FinalizeRunStep {
|
|
|
84
84
|
rootDir: ctx.config.rootDir,
|
|
85
85
|
source: resolvedSource,
|
|
86
86
|
taskIds: ctx.config.tasks,
|
|
87
|
+
variant: ctx.config.variant,
|
|
88
|
+
requestedModelIds: ctx.config.models,
|
|
87
89
|
});
|
|
88
90
|
// W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
|
|
89
91
|
// `AccumulatingArtifactWriter`, which keeps a map of every ref any
|
|
@@ -82,14 +82,15 @@ export class GapAnalysisStep {
|
|
|
82
82
|
const resolveRefs = (slugs) => slugs
|
|
83
83
|
.map((slug) => {
|
|
84
84
|
const m = refBySlug.get(slug);
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
}
|
|
92
|
-
:
|
|
85
|
+
if (!m)
|
|
86
|
+
return { documentId: "", slug, title: slug };
|
|
87
|
+
return {
|
|
88
|
+
documentId: m._id,
|
|
89
|
+
revision: m._rev,
|
|
90
|
+
slug: m.slug,
|
|
91
|
+
...(m.path ? { path: m.path } : {}),
|
|
92
|
+
title: m.title,
|
|
93
|
+
};
|
|
93
94
|
})
|
|
94
95
|
.filter((r) => r.documentId !== "");
|
|
95
96
|
// ── Build description→docs mapping from TaskSource ─────────
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* When the variant is "full", the handler is called twice (baseline + agentic)
|
|
9
9
|
* and three YAML files are written. Other modes produce one YAML file.
|
|
10
10
|
*/
|
|
11
|
-
import { type AppContext, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
11
|
+
import { type AppContext, type ModelsConfig, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
12
12
|
export declare class GenerateConfigsStep implements PipelineStep {
|
|
13
13
|
readonly name = "generate-configs";
|
|
14
14
|
/** Task IDs from the last loadTasks call (pre-filter), for error messages. */
|
|
@@ -42,3 +42,34 @@ export declare class GenerateConfigsStep implements PipelineStep {
|
|
|
42
42
|
cacheInputs(ctx: AppContext): string[];
|
|
43
43
|
cacheContext(ctx: AppContext): string[];
|
|
44
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* Merge multiple compile results into one.
|
|
47
|
+
*
|
|
48
|
+
* Note: `providers` and `prompts` are taken from the first result only.
|
|
49
|
+
* This is correct for single-mode compilation where all tasks share the
|
|
50
|
+
* same provider set. Cross-mode merging with per-task provider overrides
|
|
51
|
+
* would need deduplication here.
|
|
52
|
+
*/
|
|
53
|
+
/**
|
|
54
|
+
* Apply `PipelineRequest.models` to the loaded model cohort (W0281).
|
|
55
|
+
*
|
|
56
|
+
* Returns one of three outcomes:
|
|
57
|
+
* - `unfiltered` — caller didn't pin any models; pass through.
|
|
58
|
+
* - `filtered` — at least one requested ID matched the cohort; unknown
|
|
59
|
+
* IDs are reported via a structured warning so callers
|
|
60
|
+
* can detect typos.
|
|
61
|
+
* - `no-match` — every requested ID is unknown. The caller turns this
|
|
62
|
+
* into a failed step result so the rejection reason
|
|
63
|
+
* surfaces on the job's `error` field instead of being dropped silently.
|
|
64
|
+
*/
|
|
65
|
+
export type FilterModelsResult = {
|
|
66
|
+
kind: "unfiltered";
|
|
67
|
+
models: ModelsConfig;
|
|
68
|
+
} | {
|
|
69
|
+
kind: "filtered";
|
|
70
|
+
models: ModelsConfig;
|
|
71
|
+
} | {
|
|
72
|
+
kind: "no-match";
|
|
73
|
+
reason: string;
|
|
74
|
+
};
|
|
75
|
+
export declare function filterModelsByRequest(loaded: ModelsConfig, requested: string[] | undefined, logger: import("@sanity/ailf-core").Logger): FilterModelsResult;
|
|
@@ -67,12 +67,32 @@ export class GenerateConfigsStep {
|
|
|
67
67
|
};
|
|
68
68
|
}
|
|
69
69
|
// Load models
|
|
70
|
-
const { loadModelsAndProviders } = await import("../../pipeline/compiler/provider-assembler.js");
|
|
70
|
+
const { loadModelsAndProviders, loadModelsYaml } = await import("../../pipeline/compiler/provider-assembler.js");
|
|
71
71
|
const overrides = configToSourceOverrides(ctx.config);
|
|
72
72
|
const resolvedSource = ctx.config.source
|
|
73
73
|
? loadSource(ctx.config.source, overrides)
|
|
74
74
|
: undefined;
|
|
75
|
-
|
|
75
|
+
// W0281: when the caller pinned a subset of models via
|
|
76
|
+
// `PipelineRequest.models`, filter the cohort BEFORE provider
|
|
77
|
+
// assembly. Filtering only the returned `models` field would silently
|
|
78
|
+
// defeat the filter — promptfoo decides which LLMs to call from the
|
|
79
|
+
// providers array, which is assembled from the unfiltered set unless
|
|
80
|
+
// we hand the assembler a pre-filtered ModelsConfig. Unknown IDs are
|
|
81
|
+
// surfaced via a structured warning AND a failed step result (whose
|
|
82
|
+
// message lands on `ailf.job.error`) so callers can detect typos
|
|
83
|
+
// instead of silently running the full default cohort.
|
|
84
|
+
const rawModels = loadModelsYaml(ctx.config.rootDir);
|
|
85
|
+
const filtered = filterModelsByRequest(rawModels, ctx.config.models, ctx.logger);
|
|
86
|
+
if (filtered.kind === "no-match") {
|
|
87
|
+
return {
|
|
88
|
+
durationMs: Date.now() - start,
|
|
89
|
+
error: filtered.reason,
|
|
90
|
+
status: "failed",
|
|
91
|
+
};
|
|
92
|
+
}
|
|
93
|
+
const loaded = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins, filtered.models);
|
|
94
|
+
const models = loaded.models;
|
|
95
|
+
const providers = loaded.providers;
|
|
76
96
|
// Literacy mode: variant expansion (baseline + agentic → 3 YAML files)
|
|
77
97
|
if (mode === "literacy") {
|
|
78
98
|
return this.compileLiteracyVariants(ctx, handler, tasks, models, providers, start);
|
|
@@ -239,6 +259,9 @@ export class GenerateConfigsStep {
|
|
|
239
259
|
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
240
260
|
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
241
261
|
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
262
|
+
...(ctx.config.changedDocs?.length
|
|
263
|
+
? { changedDocs: ctx.config.changedDocs }
|
|
264
|
+
: {}),
|
|
242
265
|
};
|
|
243
266
|
const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
244
267
|
// Mode filter — the adapter may return a mixed-mode set (e.g. a user's
|
|
@@ -345,17 +368,28 @@ export class GenerateConfigsStep {
|
|
|
345
368
|
return buildCacheContext(ctx.config);
|
|
346
369
|
}
|
|
347
370
|
}
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
371
|
+
export function filterModelsByRequest(loaded, requested, logger) {
|
|
372
|
+
if (!requested || requested.length === 0) {
|
|
373
|
+
return { kind: "unfiltered", models: loaded };
|
|
374
|
+
}
|
|
375
|
+
const availableIds = new Set(loaded.models.map((m) => m.id));
|
|
376
|
+
const requestedSet = new Set(requested);
|
|
377
|
+
const kept = loaded.models.filter((m) => requestedSet.has(m.id));
|
|
378
|
+
const unknown = requested.filter((id) => !availableIds.has(id));
|
|
379
|
+
if (kept.length === 0) {
|
|
380
|
+
const reason = `[generate-configs] PipelineRequest.models rejected — none of ` +
|
|
381
|
+
`[${requested.join(", ")}] match config/models.ts. ` +
|
|
382
|
+
`Available IDs: ${[...availableIds].join(", ") || "(none configured)"}.`;
|
|
383
|
+
logger.warn(reason);
|
|
384
|
+
return { kind: "no-match", reason };
|
|
385
|
+
}
|
|
386
|
+
if (unknown.length > 0) {
|
|
387
|
+
logger.warn(`[generate-configs] PipelineRequest.models partial match — ignoring ` +
|
|
388
|
+
`unknown ID(s) [${unknown.join(", ")}]; ` +
|
|
389
|
+
`running ${kept.length}/${requested.length} requested.`);
|
|
390
|
+
}
|
|
391
|
+
return { kind: "filtered", models: { ...loaded, models: kept } };
|
|
392
|
+
}
|
|
359
393
|
function mergeCompileResults(results) {
|
|
360
394
|
const tests = results.flatMap((r) => r.tests);
|
|
361
395
|
const warnings = results.flatMap((r) => r.warnings);
|
|
@@ -18,7 +18,18 @@ export class GraderConsistencyStep {
|
|
|
18
18
|
}
|
|
19
19
|
async execute(ctx) {
|
|
20
20
|
const start = Date.now();
|
|
21
|
+
// Default-on-omit is 5 (matches consistency-analysis-friendly defaults).
|
|
22
|
+
// The dashboard sends 1 by default for cost reasons (see W0283 / new-eval
|
|
23
|
+
// audit S1-E). When the resolved value is <2, the analysis can't compute
|
|
24
|
+
// variance — skip instead of failing so the job doesn't carry a
|
|
25
|
+
// misleading `error.step: "grader-consistency"`.
|
|
21
26
|
const replications = ctx.config.graderReplications ?? 5;
|
|
27
|
+
if (replications < 2) {
|
|
28
|
+
return {
|
|
29
|
+
reason: `graderReplications=${replications} (<2) — consistency analysis requires at least 2 replications`,
|
|
30
|
+
status: "skipped",
|
|
31
|
+
};
|
|
32
|
+
}
|
|
22
33
|
const primaryResultsRun = ctx.config.mode === "literacy"
|
|
23
34
|
? ctx.config.variant === LiteracyVariant.FULL
|
|
24
35
|
? LiteracyVariant.STANDARD
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
* - P5: Local-first (pipeline never fails because of a store write)
|
|
11
11
|
* - P6: Sinks are fire-and-forget (failures logged, not thrown)
|
|
12
12
|
*/
|
|
13
|
-
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
13
|
+
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
import { type ProvenanceInput } from "../../pipeline/provenance.js";
|
|
14
15
|
export declare class PublishReportStep implements PipelineStep {
|
|
15
16
|
private readonly pipelineStart;
|
|
16
17
|
private readonly options;
|
|
@@ -24,3 +25,13 @@ export declare class PublishReportStep implements PipelineStep {
|
|
|
24
25
|
check(): ValidationIssue[];
|
|
25
26
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
26
27
|
}
|
|
28
|
+
/**
|
|
29
|
+
* Assemble provenance input from the score summary and pipeline context.
|
|
30
|
+
*
|
|
31
|
+
* Exported for unit testing — direct consumers should still call
|
|
32
|
+
* `buildProvenance` (which calls this transitively via the publish step).
|
|
33
|
+
*/
|
|
34
|
+
export declare function buildProvenanceInput(summary: ScoreSummary, ctx: AppContext, options: {
|
|
35
|
+
evalFingerprint?: string;
|
|
36
|
+
promptfooUrls?: PromptfooUrlEntry[];
|
|
37
|
+
}, autoScope?: ReportAutoScope): ProvenanceInput;
|
|
@@ -194,26 +194,52 @@ export class PublishReportStep {
|
|
|
194
194
|
// ---------------------------------------------------------------------------
|
|
195
195
|
/**
|
|
196
196
|
* Assemble provenance input from the score summary and pipeline context.
|
|
197
|
+
*
|
|
198
|
+
* Exported for unit testing — direct consumers should still call
|
|
199
|
+
* `buildProvenance` (which calls this transitively via the publish step).
|
|
197
200
|
*/
|
|
198
|
-
function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
201
|
+
export function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
199
202
|
const areas = summary.scores.map((s) => s.feature);
|
|
200
203
|
const mode = ctx.config.mode;
|
|
201
204
|
// Read document IDs from config
|
|
202
205
|
const sanityDocumentIds = ctx.config.sanityDocumentArgs;
|
|
203
206
|
// Read task filter from config
|
|
204
207
|
const taskIds = ctx.config.tasks;
|
|
205
|
-
// Build source from summary metadata or config
|
|
208
|
+
// Build source from summary metadata or config. Resolution order:
|
|
209
|
+
// 1. summary.source — written by calculate-scores after a successful
|
|
210
|
+
// `loadSource` round-trip.
|
|
211
|
+
// 2. ctx.config.source — the caller-requested source name. Preserves
|
|
212
|
+
// the user's intent when `loadSource` failed silently upstream
|
|
213
|
+
// (calculate-scores-step:104-108 swallows the throw, leaving
|
|
214
|
+
// summary.source undefined). Without this fallback, the report
|
|
215
|
+
// reads "production" regardless of what the dashboard sent.
|
|
216
|
+
// 3. "production" — last-resort built-in default.
|
|
217
|
+
//
|
|
218
|
+
// Per-field fallbacks (dataset/projectId/perspective) only fire when
|
|
219
|
+
// `summary.source` itself is absent — i.e. the loadSource throw was
|
|
220
|
+
// swallowed. When summary.source is present, trust what the fetch
|
|
221
|
+
// actually used; papering over a missing `perspective` from
|
|
222
|
+
// `ctx.config.perspectiveOverride` makes provenance claim a release
|
|
223
|
+
// was used when it wasn't (W0295).
|
|
224
|
+
const sourceResolved = summary.source?.name !== undefined;
|
|
225
|
+
if (!sourceResolved && ctx.config.source) {
|
|
226
|
+
ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
|
|
227
|
+
}
|
|
206
228
|
const source = {
|
|
207
229
|
baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
|
|
208
|
-
dataset:
|
|
230
|
+
dataset: sourceResolved
|
|
231
|
+
? (summary.source.dataset ?? "next")
|
|
232
|
+
: (ctx.config.datasetOverride ?? "next"),
|
|
209
233
|
documentIds: [],
|
|
210
234
|
llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
|
|
211
|
-
name: summary.source?.name ?? "production",
|
|
212
|
-
perspective:
|
|
213
|
-
|
|
214
|
-
undefined,
|
|
235
|
+
name: summary.source?.name ?? ctx.config.source ?? "production",
|
|
236
|
+
perspective: sourceResolved
|
|
237
|
+
? summary.source.perspective
|
|
238
|
+
: (ctx.config.perspectiveOverride ?? undefined),
|
|
215
239
|
priorityDomain: "sanity.io",
|
|
216
|
-
projectId:
|
|
240
|
+
projectId: sourceResolved
|
|
241
|
+
? summary.source.projectId
|
|
242
|
+
: (ctx.config.projectIdOverride ?? "3do82whm"),
|
|
217
243
|
studioOrigin: "https://admin.sanity.io",
|
|
218
244
|
urls: [],
|
|
219
245
|
};
|
|
@@ -235,6 +261,8 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
235
261
|
source,
|
|
236
262
|
sourceReportId: ctx.config.sourceReportId,
|
|
237
263
|
taskIds,
|
|
264
|
+
variant: ctx.config.variant,
|
|
265
|
+
requestedModelIds: ctx.config.models,
|
|
238
266
|
};
|
|
239
267
|
}
|
|
240
268
|
/**
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
-
import type
|
|
11
|
+
import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
12
12
|
/**
|
|
13
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
14
14
|
* that doesn't already carry one.
|
|
@@ -29,6 +29,19 @@ import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
|
29
29
|
* `sourceRunId` equals the runId encoded in its `path` (= where the bytes
|
|
30
30
|
* physically live).
|
|
31
31
|
*
|
|
32
|
+
* **Post-hoc artifacts are dropped.** Refs whose descriptor has
|
|
33
|
+
* `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
|
|
34
|
+
* report's slot points at the *previous* run's path, but the post-hoc
|
|
35
|
+
* producer fires again on the new run and emits a fresh ref anchored at
|
|
36
|
+
* the new runId. Injecting the cached cross-run ref into the accumulator
|
|
37
|
+
* makes `FinalizeRunStep` embed the stale path into the on-GCS
|
|
38
|
+
* `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
|
|
39
|
+
* Sanity report doc, leaving the GCS manifest stale. Dropping the ref
|
|
40
|
+
* here keeps the GCS manifest consistent with the cache-miss shape (no
|
|
41
|
+
* post-hoc slot until the post-hoc emit lands), and the reader-side
|
|
42
|
+
* fallback resolves diagnosis via the Sanity doc, which the post-hoc
|
|
43
|
+
* patch keeps correct.
|
|
44
|
+
*
|
|
32
45
|
* Pure function; safe to call without side effects.
|
|
33
46
|
*/
|
|
34
47
|
export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
+
import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
|
|
11
12
|
/**
|
|
12
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
13
14
|
* that doesn't already carry one.
|
|
@@ -28,6 +29,19 @@
|
|
|
28
29
|
* `sourceRunId` equals the runId encoded in its `path` (= where the bytes
|
|
29
30
|
* physically live).
|
|
30
31
|
*
|
|
32
|
+
* **Post-hoc artifacts are dropped.** Refs whose descriptor has
|
|
33
|
+
* `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
|
|
34
|
+
* report's slot points at the *previous* run's path, but the post-hoc
|
|
35
|
+
* producer fires again on the new run and emits a fresh ref anchored at
|
|
36
|
+
* the new runId. Injecting the cached cross-run ref into the accumulator
|
|
37
|
+
* makes `FinalizeRunStep` embed the stale path into the on-GCS
|
|
38
|
+
* `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
|
|
39
|
+
* Sanity report doc, leaving the GCS manifest stale. Dropping the ref
|
|
40
|
+
* here keeps the GCS manifest consistent with the cache-miss shape (no
|
|
41
|
+
* post-hoc slot until the post-hoc emit lands), and the reader-side
|
|
42
|
+
* fallback resolves diagnosis via the Sanity doc, which the post-hoc
|
|
43
|
+
* patch keeps correct.
|
|
44
|
+
*
|
|
31
45
|
* Pure function; safe to call without side effects.
|
|
32
46
|
*/
|
|
33
47
|
export function remapToCacheHitRefs(source, opts) {
|
|
@@ -35,6 +49,9 @@ export function remapToCacheHitRefs(source, opts) {
|
|
|
35
49
|
for (const [type, ref] of Object.entries(source)) {
|
|
36
50
|
if (!ref)
|
|
37
51
|
continue;
|
|
52
|
+
const descriptor = ARTIFACT_REGISTRY[type];
|
|
53
|
+
if (descriptor?.writePolicy === "post-hoc")
|
|
54
|
+
continue;
|
|
38
55
|
const typed = ref;
|
|
39
56
|
out[type] = {
|
|
40
57
|
...typed,
|