@sanity/ailf 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
- package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
- package/dist/commands/check-staleness.js +12 -4
- package/dist/orchestration/steps/run-eval-step.js +39 -29
- package/dist/pipeline/cache-hit-restore.d.ts +24 -0
- package/dist/pipeline/cache-hit-restore.js +32 -0
- package/dist/pipeline/eval-fingerprint.d.ts +33 -35
- package/dist/pipeline/eval-fingerprint.js +124 -106
- package/dist/report-store.js +3 -0
- package/package.json +1 -1
|
@@ -138,12 +138,21 @@ function toTitleCase(id) {
|
|
|
138
138
|
// ---------------------------------------------------------------------------
|
|
139
139
|
const RECOMMENDATION_TOP_N = 3;
|
|
140
140
|
function slimRecommendations(full) {
|
|
141
|
+
// Cache-hit pass-through: when the pipeline restores a previously
|
|
142
|
+
// published report on a remote cache hit, `score-summary.json` carries
|
|
143
|
+
// recommendations in their already-slim shape (no `.gaps` field).
|
|
144
|
+
// Re-slimming would crash on `for (gap of undefined)`; the slim shape
|
|
145
|
+
// has no full-fidelity data to recover, so we return it verbatim.
|
|
146
|
+
if (!Array.isArray(full.gaps)) {
|
|
147
|
+
return full;
|
|
148
|
+
}
|
|
149
|
+
const fullReport = full;
|
|
141
150
|
const counts = {};
|
|
142
|
-
for (const gap of
|
|
151
|
+
for (const gap of fullReport.gaps) {
|
|
143
152
|
counts[gap.area] = (counts[gap.area] ?? 0) + 1;
|
|
144
153
|
}
|
|
145
154
|
// Sort by priority descending, break ties by estimatedLift.
|
|
146
|
-
const sorted = [...
|
|
155
|
+
const sorted = [...fullReport.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
|
|
147
156
|
(b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
|
|
148
157
|
const top3 = sorted
|
|
149
158
|
.slice(0, RECOMMENDATION_TOP_N)
|
|
@@ -156,8 +165,8 @@ function slimRecommendations(full) {
|
|
|
156
165
|
return {
|
|
157
166
|
counts,
|
|
158
167
|
top3,
|
|
159
|
-
totalGaps:
|
|
160
|
-
totalPotentialLift:
|
|
168
|
+
totalGaps: fullReport.gaps.length,
|
|
169
|
+
totalPotentialLift: fullReport.totalPotentialLift,
|
|
161
170
|
};
|
|
162
171
|
}
|
|
163
172
|
/**
|
|
@@ -1364,6 +1364,15 @@ export interface ArtifactRefEntry {
|
|
|
1364
1364
|
* - `truncated` on the bulk row indicates the single-object body was capped.
|
|
1365
1365
|
* - `preview` on the bulk row carries a descriptor-typed summary for list
|
|
1366
1366
|
* views; wiring lands in W0051.
|
|
1367
|
+
*
|
|
1368
|
+
* D0040/W0135 extension:
|
|
1369
|
+
* - `sourceRunId` declares that this ref's bytes physically live under a
|
|
1370
|
+
* different run's storage prefix than the manifest containing it.
|
|
1371
|
+
* `path` is already self-contained and authoritative for resolution;
|
|
1372
|
+
* `sourceRunId` is purely a lineage marker for retention, GC,
|
|
1373
|
+
* observability, and BigQuery joins. Set by the cache-hit branch in
|
|
1374
|
+
* `RunEvalStep` when a new run reuses a prior report's artifacts;
|
|
1375
|
+
* unset on cold-path producers.
|
|
1367
1376
|
*/
|
|
1368
1377
|
export interface ArtifactRef {
|
|
1369
1378
|
store: "gcs" | "local";
|
|
@@ -1381,6 +1390,7 @@ export interface ArtifactRef {
|
|
|
1381
1390
|
entries?: ArtifactRefEntry[];
|
|
1382
1391
|
truncated?: boolean;
|
|
1383
1392
|
preview?: unknown;
|
|
1393
|
+
sourceRunId?: RunId;
|
|
1384
1394
|
}
|
|
1385
1395
|
/**
|
|
1386
1396
|
* Catalog of artifact refs produced by a single pipeline run.
|
|
@@ -39,6 +39,19 @@ export declare class AccumulatingArtifactWriter implements ArtifactWriter {
|
|
|
39
39
|
getAccumulatedArtifactRefs(): ArtifactManifest;
|
|
40
40
|
/** Test-only. Clears accumulated refs without touching the inner writer. */
|
|
41
41
|
_resetAccumulated(): void;
|
|
42
|
+
/**
|
|
43
|
+
* Merge externally-supplied refs into the accumulator without touching
|
|
44
|
+
* the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
|
|
45
|
+
* W0135) to restore a cached report's `Report.artifactManifest` so the
|
|
46
|
+
* new run's `RunManifest` advertises the cached artifacts via cross-run
|
|
47
|
+
* paths instead of skipping them entirely.
|
|
48
|
+
*
|
|
49
|
+
* The injected refs already carry `path`, `bucket`, `entries`, etc. as
|
|
50
|
+
* the source run wrote them — we don't synthesize new paths, we copy.
|
|
51
|
+
* Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
|
|
52
|
+
* downstream tooling can follow the lineage.
|
|
53
|
+
*/
|
|
54
|
+
injectAccumulated(refs: ArtifactManifest): void;
|
|
42
55
|
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
56
|
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
44
57
|
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
@@ -46,6 +46,25 @@ export class AccumulatingArtifactWriter {
|
|
|
46
46
|
delete this.accumulated[k];
|
|
47
47
|
}
|
|
48
48
|
}
|
|
49
|
+
/**
|
|
50
|
+
* Merge externally-supplied refs into the accumulator without touching
|
|
51
|
+
* the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
|
|
52
|
+
* W0135) to restore a cached report's `Report.artifactManifest` so the
|
|
53
|
+
* new run's `RunManifest` advertises the cached artifacts via cross-run
|
|
54
|
+
* paths instead of skipping them entirely.
|
|
55
|
+
*
|
|
56
|
+
* The injected refs already carry `path`, `bucket`, `entries`, etc. as
|
|
57
|
+
* the source run wrote them — we don't synthesize new paths, we copy.
|
|
58
|
+
* Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
|
|
59
|
+
* downstream tooling can follow the lineage.
|
|
60
|
+
*/
|
|
61
|
+
injectAccumulated(refs) {
|
|
62
|
+
for (const [type, ref] of Object.entries(refs)) {
|
|
63
|
+
if (!ref)
|
|
64
|
+
continue;
|
|
65
|
+
this.mergeRef(type, ref);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
49
68
|
// ---- ArtifactWriter surface --------------------------------------------
|
|
50
69
|
async emit(type, association, payload) {
|
|
51
70
|
const ref = await this.inner.emit(type, association, payload);
|
|
@@ -21,11 +21,19 @@ export function createCheckStalenessCommand() {
|
|
|
21
21
|
// weekly-digest.ts and composition-root.ts — AILF_REPORT_* wins over
|
|
22
22
|
// the evaluated-source SANITY_* defaults so the staleness probe tracks
|
|
23
23
|
// the actual report dataset even when it diverges from the eval source.
|
|
24
|
+
//
|
|
25
|
+
// Both the `??` fallbacks and the conditional spreads below matter: passing `{ projectId: undefined }` would
|
|
26
|
+
// clobber `getSanityClient`'s built-in default via spread. The
|
|
27
|
+
// staleness workflow only sets SANITY_PROJECT_ID, so without the
|
|
28
|
+
// fallback the probe crashes with "Configuration must contain
|
|
29
|
+
// `projectId`" instead of doing its job (issue #272).
|
|
30
|
+
const projectId = process.env.AILF_REPORT_PROJECT_ID ?? process.env.SANITY_PROJECT_ID;
|
|
31
|
+
const dataset = process.env.AILF_REPORT_DATASET ?? process.env.SANITY_DATASET;
|
|
32
|
+
const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
|
|
24
33
|
const client = getSanityClient({
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
token:
|
|
28
|
-
process.env.SANITY_API_TOKEN,
|
|
34
|
+
...(projectId ? { projectId } : {}),
|
|
35
|
+
...(dataset ? { dataset } : {}),
|
|
36
|
+
...(token ? { token } : {}),
|
|
29
37
|
});
|
|
30
38
|
const maxAgeDays = opts.maxAge;
|
|
31
39
|
// Bound the GROQ sort with a `completedAt > $floor` filter. Beyond
|
|
@@ -8,10 +8,13 @@
|
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
10
|
import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
11
|
+
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
11
12
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
12
13
|
import { buildCacheContext } from "../cache-context.js";
|
|
14
|
+
import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
|
|
13
15
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
14
16
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
17
|
+
import { loadGraderModel } from "../../pipeline/grader-api.js";
|
|
15
18
|
import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
|
|
16
19
|
export class RunEvalStep {
|
|
17
20
|
mode;
|
|
@@ -39,31 +42,31 @@ export class RunEvalStep {
|
|
|
39
42
|
status: "failed",
|
|
40
43
|
};
|
|
41
44
|
}
|
|
45
|
+
// Load the task set once and reuse it for both the literacy precondition
|
|
46
|
+
// check and the fingerprint. Mirrors the area/task filter applied by
|
|
47
|
+
// fetch-docs so we only see tasks that were actually fetched.
|
|
48
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
49
|
+
? {
|
|
50
|
+
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
51
|
+
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
52
|
+
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
53
|
+
}
|
|
54
|
+
: undefined;
|
|
55
|
+
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
56
|
+
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
57
|
+
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
58
|
+
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
59
|
+
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
60
|
+
}
|
|
42
61
|
// Precondition: canonical context files exist for filtered tasks.
|
|
43
62
|
// Only applies to literacy mode — other modes don't use canonical doc contexts.
|
|
44
63
|
if (this.mode === "literacy") {
|
|
45
|
-
// Must apply the same area/task filter as fetch-docs so we only
|
|
46
|
-
// check contexts that were actually fetched.
|
|
47
|
-
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
48
|
-
? {
|
|
49
|
-
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
50
|
-
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
51
|
-
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
52
|
-
}
|
|
53
|
-
: undefined;
|
|
54
|
-
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
55
|
-
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
56
|
-
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
57
|
-
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
58
|
-
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
59
|
-
}
|
|
60
64
|
// Only check context files for tasks that have canonical docs.
|
|
61
65
|
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
62
66
|
// have no docs to fetch), so no context file is written for them.
|
|
63
67
|
// The generated Promptfoo config still includes their "without-docs"
|
|
64
68
|
// variant (testing model knowledge alone), which doesn't need a
|
|
65
69
|
// context file.
|
|
66
|
-
// Bridge: narrow to literacy tasks with docs
|
|
67
70
|
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
68
71
|
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
69
72
|
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
@@ -83,14 +86,8 @@ export class RunEvalStep {
|
|
|
83
86
|
if (!debug?.enabled) {
|
|
84
87
|
try {
|
|
85
88
|
evalFingerprint = computeEvalFingerprint({
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
areas: ctx.config.areas,
|
|
89
|
-
taskIds: ctx.config.tasks,
|
|
90
|
-
tags: ctx.config.tags,
|
|
91
|
-
}
|
|
92
|
-
: undefined,
|
|
93
|
-
graderModel: "default",
|
|
89
|
+
tasks,
|
|
90
|
+
graderModel: loadGraderModel(rootDir).id,
|
|
94
91
|
mode: this.mode,
|
|
95
92
|
rootDir,
|
|
96
93
|
});
|
|
@@ -119,11 +116,22 @@ export class RunEvalStep {
|
|
|
119
116
|
state.promptfooUrls ??= [];
|
|
120
117
|
state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
|
|
121
118
|
}
|
|
122
|
-
//
|
|
123
|
-
//
|
|
124
|
-
//
|
|
125
|
-
//
|
|
126
|
-
//
|
|
119
|
+
// D0040 / W0135 — restore the cached report's artifact manifest into
|
|
120
|
+
// the accumulator so the new run's RunManifest advertises the cached
|
|
121
|
+
// artifacts via cross-run lineage (`sourceRunId`) instead of skipping
|
|
122
|
+
// them entirely. Without this, Studio drill-downs on the new report
|
|
123
|
+
// 404 because per-entry GCS objects were never written under the new
|
|
124
|
+
// runId. Bytes are not duplicated; the original prefix is untouched.
|
|
125
|
+
if (remoteCacheResult.artifactManifest &&
|
|
126
|
+
remoteCacheResult.sourceRunId &&
|
|
127
|
+
ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
|
|
128
|
+
const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
|
|
129
|
+
ctx.artifactWriter.injectAccumulated(restored);
|
|
130
|
+
const count = Object.keys(restored).length;
|
|
131
|
+
if (count > 0) {
|
|
132
|
+
console.log(` ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
127
135
|
return {
|
|
128
136
|
durationMs: Date.now() - start,
|
|
129
137
|
status: "success",
|
|
@@ -241,9 +249,11 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
|
|
|
241
249
|
console.log(` ✅ Remote cache hit — reusing report ${cachedReport.id} from ${cachedReport.completedAt}`);
|
|
242
250
|
console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
|
|
243
251
|
return {
|
|
252
|
+
artifactManifest: cachedReport.artifactManifest,
|
|
244
253
|
completedAt: cachedReport.completedAt,
|
|
245
254
|
promptfooUrls: cachedReport.provenance?.promptfooUrls,
|
|
246
255
|
reportId: cachedReport.id,
|
|
256
|
+
sourceRunId: cachedReport.provenance?.runId,
|
|
247
257
|
};
|
|
248
258
|
}
|
|
249
259
|
catch (err) {
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cache-hit-restore.ts — helpers for the eval cache-hit branch in
|
|
3
|
+
* `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
|
|
4
|
+
* refs so the new run's manifest advertises the cached artifacts via
|
|
5
|
+
* cross-run lineage instead of pointing at GCS objects that were never
|
|
6
|
+
* written under the new runId.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
|
+
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
|
+
*/
|
|
11
|
+
import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
12
|
+
/**
|
|
13
|
+
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
|
|
14
|
+
*
|
|
15
|
+
* The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
|
|
16
|
+
* unchanged — they already point at the source run's storage. Only
|
|
17
|
+
* `sourceRunId` is added so retention/GC and observability tooling can
|
|
18
|
+
* follow the cross-run dependency.
|
|
19
|
+
*
|
|
20
|
+
* Pure function; safe to call without side effects.
|
|
21
|
+
*/
|
|
22
|
+
export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
|
|
23
|
+
sourceRunId: RunId;
|
|
24
|
+
}): ArtifactManifest;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cache-hit-restore.ts — helpers for the eval cache-hit branch in
|
|
3
|
+
* `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
|
|
4
|
+
* refs so the new run's manifest advertises the cached artifacts via
|
|
5
|
+
* cross-run lineage instead of pointing at GCS objects that were never
|
|
6
|
+
* written under the new runId.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
|
+
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Copy an artifact manifest and make sure every ref carries a
 * `sourceRunId` lineage marker.
 *
 * The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
 * unchanged (shallow copy) — they already point at the run that wrote
 * the bytes.
 *
 * A ref that already has a `sourceRunId` keeps it. This matters for
 * chained cache hits: when the cached report was itself restored from an
 * earlier run, its refs still point at that earlier run's storage, so
 * restamping them with the intermediate run's id would make retention/GC
 * tooling track the wrong run. Only refs without a marker receive
 * `opts.sourceRunId`.
 *
 * Pure function: `source` and its refs are not mutated.
 *
 * @param source Manifest keyed by artifact type; falsy slots are dropped.
 * @param opts.sourceRunId Run id to stamp on refs that have no lineage yet.
 * @returns A new manifest containing only the non-empty refs.
 */
export function remapToCacheHitRefs(source, opts) {
    const out = {};
    for (const [type, ref] of Object.entries(source)) {
        if (!ref)
            continue;
        out[type] = {
            ...ref,
            // Preserve the original producer on chained cache hits.
            sourceRunId: ref.sourceRunId ?? opts.sourceRunId,
        };
    }
    return out;
}
|
|
@@ -6,30 +6,35 @@
|
|
|
6
6
|
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
7
|
* identical fingerprint and skip the expensive eval step.
|
|
8
8
|
*
|
|
9
|
-
* The fingerprint captures
|
|
9
|
+
* The fingerprint captures:
|
|
10
10
|
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
-
* - Model configuration (which models, their settings)
|
|
12
11
|
* - Grader model identity (different graders score differently)
|
|
13
|
-
* -
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
* -
|
|
18
|
-
* -
|
|
12
|
+
* - The task set that was actually loaded for this run, in its canonical
|
|
13
|
+
* shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
|
|
14
|
+
* Studio-authored task edits in the Content Lake are picked up — pre-v2
|
|
15
|
+
* the fingerprint walked `tasks/` on disk and missed them entirely).
|
|
16
|
+
* - Repo-tracked config (models, prompts, rubrics) and reference solutions.
|
|
17
|
+
* - Fetched canonical doc content (contexts/canonical/*.md).
|
|
19
18
|
*
|
|
20
19
|
* The fingerprint intentionally EXCLUDES:
|
|
21
|
-
* - Source name/URL (content matters, not origin)
|
|
22
|
-
* - Git metadata (informational, not eval-affecting)
|
|
23
|
-
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
-
* - Report tags (human labels)
|
|
20
|
+
* - Source name/URL (content matters, not origin).
|
|
21
|
+
* - Git metadata (informational, not eval-affecting).
|
|
22
|
+
* - Trigger type (manual vs CI → same inputs → same results).
|
|
23
|
+
* - Report tags (human labels).
|
|
25
24
|
*
|
|
26
25
|
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
26
|
*/
|
|
28
|
-
import type {
|
|
27
|
+
import type { GeneralizedTaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
28
|
+
import type { EvalMode } from "../_vendor/ailf-shared/index.d.ts";
|
|
29
29
|
/** Inputs needed to compute an evaluation fingerprint. */
|
|
30
30
|
export interface FingerprintInput {
|
|
31
|
-
/**
|
|
32
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Task definitions returned by `ctx.taskSource.loadTasks(filter)` after
|
|
33
|
+
* any release-auto-scope narrowing has been applied. The fingerprint
|
|
34
|
+
* captures whatever set the pipeline is actually about to evaluate, so
|
|
35
|
+
* filter changes are reflected implicitly.
|
|
36
|
+
*/
|
|
37
|
+
tasks: readonly GeneralizedTaskDefinition[];
|
|
33
38
|
/** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101") */
|
|
34
39
|
graderModel: string;
|
|
35
40
|
/** Evaluation mode */
|
|
@@ -37,30 +42,23 @@ export interface FingerprintInput {
|
|
|
37
42
|
/** Path to the packages/eval root directory */
|
|
38
43
|
rootDir: string;
|
|
39
44
|
}
|
|
40
|
-
/**
|
|
41
|
-
* Collect all file paths that contribute to the evaluation fingerprint.
|
|
42
|
-
*
|
|
43
|
-
* This is similar to `getStepInputPaths()` in `cache.ts` but is more
|
|
44
|
-
* comprehensive and explicitly designed for cross-environment cache keys:
|
|
45
|
-
*
|
|
46
|
-
* - Includes `config/prompts` and `config/rubrics` directly
|
|
47
|
-
* (the local cache only includes them indirectly via generated configs)
|
|
48
|
-
* - Includes `config/models` (model configuration)
|
|
49
|
-
* - Includes task definitions and reference solutions
|
|
50
|
-
* - Includes the actual documentation content (contexts/canonical/*.md)
|
|
51
|
-
* - Respects filter flags to only include relevant files
|
|
52
|
-
*/
|
|
53
|
-
export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
|
|
54
45
|
/**
|
|
55
46
|
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
56
47
|
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
* and adds non-file context (mode, grader model, filter flags) as
|
|
62
|
-
* additional context strings.
|
|
48
|
+
* Identical inputs always produce the same fingerprint, regardless of the
|
|
49
|
+
* environment (local, CI, etc.). Cross-environment portability relies on
|
|
50
|
+
* (a) tasks coming from the same Content Lake source and (b) file paths
|
|
51
|
+
* being hashed as rootDir-relative.
|
|
63
52
|
*
|
|
64
53
|
* @returns SHA-256 hex string (64 characters)
|
|
65
54
|
*/
|
|
66
55
|
export declare function computeEvalFingerprint(input: FingerprintInput): string;
|
|
56
|
+
/**
|
|
57
|
+
* Collect repo-tracked + fetched file paths that contribute to the
|
|
58
|
+
* fingerprint. Tasks are NOT collected here — they come from
|
|
59
|
+
* `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
|
|
60
|
+
* input on `computeEvalFingerprint`.
|
|
61
|
+
*
|
|
62
|
+
* Exported for the debug-fingerprint diagnostic script.
|
|
63
|
+
*/
|
|
64
|
+
export declare function collectFingerprintFilePaths(rootDir: string): string[];
|
|
@@ -6,28 +6,27 @@
|
|
|
6
6
|
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
7
|
* identical fingerprint and skip the expensive eval step.
|
|
8
8
|
*
|
|
9
|
-
* The fingerprint captures
|
|
9
|
+
* The fingerprint captures:
|
|
10
10
|
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
-
* - Model configuration (which models, their settings)
|
|
12
11
|
* - Grader model identity (different graders score differently)
|
|
13
|
-
* -
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
* -
|
|
18
|
-
* -
|
|
12
|
+
* - The task set that was actually loaded for this run, in its canonical
|
|
13
|
+
* shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
|
|
14
|
+
* Studio-authored task edits in the Content Lake are picked up — pre-v2
|
|
15
|
+
* the fingerprint walked `tasks/` on disk and missed them entirely).
|
|
16
|
+
* - Repo-tracked config (models, prompts, rubrics) and reference solutions.
|
|
17
|
+
* - Fetched canonical doc content (contexts/canonical/*.md).
|
|
19
18
|
*
|
|
20
19
|
* The fingerprint intentionally EXCLUDES:
|
|
21
|
-
* - Source name/URL (content matters, not origin)
|
|
22
|
-
* - Git metadata (informational, not eval-affecting)
|
|
23
|
-
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
-
* - Report tags (human labels)
|
|
20
|
+
* - Source name/URL (content matters, not origin).
|
|
21
|
+
* - Git metadata (informational, not eval-affecting).
|
|
22
|
+
* - Trigger type (manual vs CI → same inputs → same results).
|
|
23
|
+
* - Report tags (human labels).
|
|
25
24
|
*
|
|
26
25
|
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
26
|
*/
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
27
|
+
import { createHash } from "crypto";
|
|
28
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "fs";
|
|
29
|
+
import { join, relative, resolve } from "path";
|
|
31
30
|
// ---------------------------------------------------------------------------
|
|
32
31
|
// Constants
|
|
33
32
|
// ---------------------------------------------------------------------------
|
|
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
|
|
|
35
34
|
* Version prefix for the fingerprint hash. Bumping this invalidates all
|
|
36
35
|
* existing fingerprints in the Content Lake without needing to clear the
|
|
37
36
|
* store. Change this when adding new inputs to the hash.
|
|
37
|
+
*
|
|
38
|
+
* v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
|
|
39
|
+
* files), file paths normalized to rootDir-relative, grader passed
|
|
40
|
+
* through verbatim instead of the literal string "default".
|
|
38
41
|
*/
|
|
39
|
-
const FINGERPRINT_VERSION = "eval-fingerprint-
|
|
42
|
+
const FINGERPRINT_VERSION = "eval-fingerprint-v2";
|
|
40
43
|
/**
 * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
 *
 * The digest covers, in order: the fingerprint version, the evaluation
 * mode, the grader model id, a digest of the task set, and then every
 * file returned by `collectFingerprintFilePaths` (its rootDir-relative
 * path followed by its bytes, or a `__missing__` marker when the file
 * has disappeared since collection).
 *
 * Cross-environment portability relies on (a) tasks coming from the same
 * Content Lake source and (b) file paths being hashed as rootDir-relative
 * with `/` separators. `path.relative` yields `\`-separated paths on
 * Windows, so they are normalized before hashing, and files are ordered
 * by that normalized path — otherwise both the hashed path line and the
 * sort order would differ between Windows and POSIX for identical content.
 * On POSIX hosts the output is the same as hashing the raw relative path.
 *
 * @param input Grader model id, mode, rootDir and the loaded task set.
 * @returns SHA-256 hex string (64 characters)
 */
export function computeEvalFingerprint(input) {
    const { graderModel, mode, rootDir, tasks } = input;
    const hash = createHash("sha256");
    hash.update(`version:${FINGERPRINT_VERSION}\n`);
    hash.update(`mode:${mode}\n`);
    hash.update(`grader:${graderModel}\n`);
    hash.update(`tasks:${hashTaskSet(tasks)}\n`);
    // Hash repo-tracked + fetched files under a host-independent name so a
    // CI runner at /home/runner/..., a laptop at /Users/... and a Windows
    // checkout all produce the same hash for byte-identical content.
    const toPortablePath = (absolutePath) => {
        const rel = relative(rootDir, absolutePath);
        // Only rewrite on Windows: on POSIX a backslash is a legal
        // filename character and must be hashed verbatim.
        return process.platform === "win32" ? rel.split("\\").join("/") : rel;
    };
    const files = collectFingerprintFilePaths(rootDir)
        .map((absolutePath) => ({ absolutePath, portablePath: toPortablePath(absolutePath) }))
        .sort((a, b) => byteCompare(a.portablePath, b.portablePath));
    for (const { absolutePath, portablePath } of files) {
        hash.update(`path:${portablePath}\n`);
        if (existsSync(absolutePath)) {
            hash.update(readFileSync(absolutePath));
        }
        else {
            // The file vanished between collection and hashing; record
            // that fact instead of silently skipping the entry.
            hash.update("__missing__\n");
        }
        hash.update("\n---\n");
    }
    return hash.digest("hex");
}
|
|
76
|
+
/**
 * List the on-disk files whose content feeds the evaluation fingerprint.
 *
 * Three groups are gathered, in this order:
 *   1. `config/{models,prompts,rubrics}` for every supported extension
 *      that exists on disk;
 *   2. everything `collectFilesRecursive` finds under
 *      `canonical/reference-solutions`;
 *   3. the `*.md` files directly inside `contexts/canonical`, ordered by
 *      UTF-8 byte order.
 *
 * Task definitions are deliberately absent: they reach the hash through
 * the `tasks` input of `computeEvalFingerprint`, not through files.
 *
 * Exported for the debug-fingerprint diagnostic script.
 *
 * @param rootDir Directory the relative locations above are resolved against.
 * @returns Absolute paths of the contributing files.
 */
export function collectFingerprintFilePaths(rootDir) {
    const collected = [];
    // 1. Config files — every existing variant is included.
    const stems = ["models", "prompts", "rubrics"];
    const suffixes = [".ts", ".js", ".yaml", ".yml", ".json"];
    stems.forEach((stem) => {
        suffixes.forEach((suffix) => {
            const candidate = resolve(rootDir, `config/${stem}${suffix}`);
            if (existsSync(candidate)) {
                collected.push(candidate);
            }
        });
    });
    // 2. Reference solutions — walked recursively (mixed languages,
    //    nested by area).
    const solutionsRoot = resolve(rootDir, "canonical/reference-solutions");
    if (existsSync(solutionsRoot)) {
        collectFilesRecursive(solutionsRoot, collected);
    }
    // 3. Fetched canonical doc contexts — these change when the source
    //    documentation changes, which the task set alone would not show.
    const contextsRoot = resolve(rootDir, "contexts/canonical");
    if (existsSync(contextsRoot)) {
        const markdownNames = readdirSync(contextsRoot).filter((name) => name.endsWith(".md"));
        markdownNames.sort(byteCompare);
        for (const name of markdownNames) {
            collected.push(join(contextsRoot, name));
        }
    }
    return collected;
}
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
// Canonical serialization — byte-stable across runtimes
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
115
117
|
/**
 * Order two strings by the bytes of their UTF-8 encoding.
 *
 * Used wherever a sort feeds the fingerprint hash. `localeCompare` is
 * unsuitable because it goes through ICU, whose tables can vary between
 * Node builds (small-icu vs full-icu) and platforms. The default `<`
 * comparison works on UTF-16 code units, which disagrees with UTF-8 byte
 * order once surrogate pairs are involved. Comparing the encoded buffers
 * is well-defined and runtime-independent.
 *
 * @param a Left operand.
 * @param b Right operand.
 * @returns -1, 0 or 1, as returned by the Buffer comparison.
 */
function byteCompare(a, b) {
    const left = Buffer.from(a, "utf8");
    const right = Buffer.from(b, "utf8");
    return left.compare(right);
}
|
|
130
|
+
/**
 * Produce a structurally identical copy of `value` whose object keys are
 * inserted in UTF-8 byte order, so that serializing it does not depend on
 * the key order of the input.
 *
 * - Arrays: each element is normalized; element order is kept (callers
 *   pre-sort when order should not matter).
 * - Non-null objects: rebuilt from their own enumerable string keys,
 *   inserted in sorted order, with each value normalized recursively.
 * - Everything else (primitives, `null`, `undefined`): returned as-is.
 */
function canonicalize(value) {
    if (Array.isArray(value)) {
        return value.map((item) => canonicalize(item));
    }
    if (value === null || typeof value !== "object") {
        return value;
    }
    const normalized = {};
    const orderedKeys = Object.keys(value).sort(byteCompare);
    orderedKeys.forEach((key) => {
        normalized[key] = canonicalize(value[key]);
    });
    return normalized;
}
|
|
148
|
+
/**
 * Digest a task set so the result does not depend on the order in which
 * the source returned the tasks, nor on the key order inside each task.
 *
 * Every task is canonicalized and serialized once; the serialized rows
 * are then ordered by `(id, canonical-json)` and hashed as one JSON
 * array. The JSON tiebreak is required because the Content Lake
 * currently holds duplicate `ailf.task` documents that share an
 * `id.current` but differ in body (DOC-2096). Sorting on id alone would
 * leave such duplicates in source order — which is undefined for equal
 * `(area, id)` rows — and the hash could change between runs over the
 * same dataset.
 *
 * @param tasks Task definitions about to be evaluated.
 * @returns SHA-256 hex digest of the ordered, canonical task array.
 */
function hashTaskSet(tasks) {
    const rows = [];
    for (const task of tasks) {
        rows.push({ id: task.id, json: JSON.stringify(canonicalize(task)) });
    }
    rows.sort((left, right) => {
        const byId = byteCompare(left.id, right.id);
        return byId !== 0 ? byId : byteCompare(left.json, right.json);
    });
    const digest = createHash("sha256");
    digest.update(`[${rows.map((row) => row.json).join(",")}]`);
    return digest.digest("hex");
}
|
|
153
171
|
// ---------------------------------------------------------------------------
|
|
154
|
-
//
|
|
172
|
+
// File walk
|
|
155
173
|
// ---------------------------------------------------------------------------
|
|
156
174
|
/**
|
|
157
175
|
* Recursively collect all file paths under a directory.
|
|
158
176
|
* Skips hidden files and directories (starting with '.').
|
|
159
177
|
*/
|
|
160
178
|
function collectFilesRecursive(dir, paths) {
|
|
161
|
-
const entries = readdirSync(dir);
|
|
179
|
+
const entries = readdirSync(dir).sort(byteCompare);
|
|
162
180
|
for (const entry of entries) {
|
|
163
181
|
if (entry.startsWith("."))
|
|
164
182
|
continue;
|
package/dist/report-store.js
CHANGED
|
@@ -286,7 +286,10 @@ export function generateReportId() {
|
|
|
286
286
|
* metadata (_id, _type, _rev, etc.) that we strip.
|
|
287
287
|
*/
|
|
288
288
|
function toReport(doc) {
|
|
289
|
+
const summary = doc.summary;
|
|
290
|
+
const artifactManifest = summary?.artifactManifest;
|
|
289
291
|
return {
|
|
292
|
+
artifactManifest,
|
|
290
293
|
comparison: doc.comparison,
|
|
291
294
|
completedAt: doc.completedAt,
|
|
292
295
|
durationMs: doc.durationMs,
|