@sanity/ailf 4.0.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -138,12 +138,21 @@ function toTitleCase(id) {
138
138
  // ---------------------------------------------------------------------------
139
139
  const RECOMMENDATION_TOP_N = 3;
140
140
  function slimRecommendations(full) {
141
+ // Cache-hit pass-through: when the pipeline restores a previously
142
+ // published report on a remote cache hit, `score-summary.json` carries
143
+ // recommendations in their already-slim shape (no `.gaps` field).
144
+ // Re-slimming would crash on `for (gap of undefined)`; the slim shape
145
+ // has no full-fidelity data to recover, so we return it verbatim.
146
+ if (!Array.isArray(full.gaps)) {
147
+ return full;
148
+ }
149
+ const fullReport = full;
141
150
  const counts = {};
142
- for (const gap of full.gaps) {
151
+ for (const gap of fullReport.gaps) {
143
152
  counts[gap.area] = (counts[gap.area] ?? 0) + 1;
144
153
  }
145
154
  // Sort by priority descending, break ties by estimatedLift.
146
- const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
155
+ const sorted = [...fullReport.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
147
156
  (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
148
157
  const top3 = sorted
149
158
  .slice(0, RECOMMENDATION_TOP_N)
@@ -156,8 +165,8 @@ function slimRecommendations(full) {
156
165
  return {
157
166
  counts,
158
167
  top3,
159
- totalGaps: full.gaps.length,
160
- totalPotentialLift: full.totalPotentialLift,
168
+ totalGaps: fullReport.gaps.length,
169
+ totalPotentialLift: fullReport.totalPotentialLift,
161
170
  };
162
171
  }
163
172
  /**
@@ -1364,6 +1364,15 @@ export interface ArtifactRefEntry {
1364
1364
  * - `truncated` on the bulk row indicates the single-object body was capped.
1365
1365
  * - `preview` on the bulk row carries a descriptor-typed summary for list
1366
1366
  * views; wiring lands in W0051.
1367
+ *
1368
+ * D0040/W0135 extension:
1369
+ * - `sourceRunId` declares that this ref's bytes physically live under a
1370
+ * different run's storage prefix than the manifest containing it.
1371
+ * `path` is already self-contained and authoritative for resolution;
1372
+ * `sourceRunId` is purely a lineage marker for retention, GC,
1373
+ * observability, and BigQuery joins. Set by the cache-hit branch in
1374
+ * `RunEvalStep` when a new run reuses a prior report's artifacts;
1375
+ * unset on cold-path producers.
1367
1376
  */
1368
1377
  export interface ArtifactRef {
1369
1378
  store: "gcs" | "local";
@@ -1381,6 +1390,7 @@ export interface ArtifactRef {
1381
1390
  entries?: ArtifactRefEntry[];
1382
1391
  truncated?: boolean;
1383
1392
  preview?: unknown;
1393
+ sourceRunId?: RunId;
1384
1394
  }
1385
1395
  /**
1386
1396
  * Catalog of artifact refs produced by a single pipeline run.
@@ -39,6 +39,19 @@ export declare class AccumulatingArtifactWriter implements ArtifactWriter {
39
39
  getAccumulatedArtifactRefs(): ArtifactManifest;
40
40
  /** Test-only. Clears accumulated refs without touching the inner writer. */
41
41
  _resetAccumulated(): void;
42
+ /**
43
+ * Merge externally-supplied refs into the accumulator without touching
44
+ * the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
45
+ * W0135) to restore a cached report's `Report.artifactManifest` so the
46
+ * new run's `RunManifest` advertises the cached artifacts via cross-run
47
+ * paths instead of skipping them entirely.
48
+ *
49
+ * The injected refs already carry `path`, `bucket`, `entries`, etc. as
50
+ * the source run wrote them — we don't synthesize new paths, we copy.
51
+ * Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
52
+ * downstream tooling can follow the lineage.
53
+ */
54
+ injectAccumulated(refs: ArtifactManifest): void;
42
55
  emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
43
56
  appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
44
57
  writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
@@ -46,6 +46,25 @@ export class AccumulatingArtifactWriter {
46
46
  delete this.accumulated[k];
47
47
  }
48
48
  }
49
+ /**
50
+ * Merge externally-supplied refs into the accumulator without touching
51
+ * the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
52
+ * W0135) to restore a cached report's `Report.artifactManifest` so the
53
+ * new run's `RunManifest` advertises the cached artifacts via cross-run
54
+ * paths instead of skipping them entirely.
55
+ *
56
+ * The injected refs already carry `path`, `bucket`, `entries`, etc. as
57
+ * the source run wrote them — we don't synthesize new paths, we copy.
58
+ * Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
59
+ * downstream tooling can follow the lineage.
60
+ */
61
+ injectAccumulated(refs) {
62
+ for (const [type, ref] of Object.entries(refs)) {
63
+ if (!ref)
64
+ continue;
65
+ this.mergeRef(type, ref);
66
+ }
67
+ }
49
68
  // ---- ArtifactWriter surface --------------------------------------------
50
69
  async emit(type, association, payload) {
51
70
  const ref = await this.inner.emit(type, association, payload);
@@ -21,11 +21,19 @@ export function createCheckStalenessCommand() {
21
21
  // weekly-digest.ts and composition-root.ts — AILF_REPORT_* wins over
22
22
  // the evaluated-source SANITY_* defaults so the staleness probe tracks
23
23
  // the actual report dataset even when it diverges from the eval source.
24
+ //
25
+ // The `??` fallbacks matter: passing `{ projectId: undefined }` would
26
+ // clobber `getSanityClient`'s built-in default via spread. The
27
+ // staleness workflow only sets SANITY_PROJECT_ID, so without the
28
+ // fallback the probe crashes with "Configuration must contain
29
+ // `projectId`" instead of doing its job (issue #272).
30
+ const projectId = process.env.AILF_REPORT_PROJECT_ID ?? process.env.SANITY_PROJECT_ID;
31
+ const dataset = process.env.AILF_REPORT_DATASET ?? process.env.SANITY_DATASET;
32
+ const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
24
33
  const client = getSanityClient({
25
- dataset: process.env.AILF_REPORT_DATASET,
26
- projectId: process.env.AILF_REPORT_PROJECT_ID,
27
- token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
28
- process.env.SANITY_API_TOKEN,
34
+ ...(projectId ? { projectId } : {}),
35
+ ...(dataset ? { dataset } : {}),
36
+ ...(token ? { token } : {}),
29
37
  });
30
38
  const maxAgeDays = opts.maxAge;
31
39
  // Bound the GROQ sort with a `completedAt > $floor` filter. Beyond
@@ -8,10 +8,13 @@
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
10
  import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
11
+ import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
11
12
  import { getStepInputPaths } from "../../pipeline/cache.js";
12
13
  import { buildCacheContext } from "../cache-context.js";
14
+ import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
13
15
  import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
14
16
  import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
17
+ import { loadGraderModel } from "../../pipeline/grader-api.js";
15
18
  import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
16
19
  export class RunEvalStep {
17
20
  mode;
@@ -39,31 +42,31 @@ export class RunEvalStep {
39
42
  status: "failed",
40
43
  };
41
44
  }
45
+ // Load the task set once and reuse it for both the literacy precondition
46
+ // check and the fingerprint. Mirrors the area/task filter applied by
47
+ // fetch-docs so we only see tasks that were actually fetched.
48
+ const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
49
+ ? {
50
+ ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
51
+ ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
52
+ ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
53
+ }
54
+ : undefined;
55
+ let tasks = await ctx.taskSource.loadTasks(filter);
56
+ // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
57
+ if (state.releaseAutoScope && !ctx.config.noAutoScope) {
58
+ const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
59
+ tasks = tasks.filter((t) => scopedIds.has(t.id));
60
+ }
42
61
  // Precondition: canonical context files exist for filtered tasks.
43
62
  // Only applies to literacy mode — other modes don't use canonical doc contexts.
44
63
  if (this.mode === "literacy") {
45
- // Must apply the same area/task filter as fetch-docs so we only
46
- // check contexts that were actually fetched.
47
- const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
48
- ? {
49
- ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
50
- ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
51
- ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
52
- }
53
- : undefined;
54
- let tasks = await ctx.taskSource.loadTasks(filter);
55
- // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
56
- if (state.releaseAutoScope && !ctx.config.noAutoScope) {
57
- const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
58
- tasks = tasks.filter((t) => scopedIds.has(t.id));
59
- }
60
64
  // Only check context files for tasks that have canonical docs.
61
65
  // Tasks without canonical docs are skipped by FetchDocsStep (they
62
66
  // have no docs to fetch), so no context file is written for them.
63
67
  // The generated Promptfoo config still includes their "without-docs"
64
68
  // variant (testing model knowledge alone), which doesn't need a
65
69
  // context file.
66
- // Bridge: narrow to literacy tasks with docs
67
70
  const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
68
71
  const taskIds = tasksWithDocs.map((t) => t.id);
69
72
  const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
@@ -83,14 +86,8 @@ export class RunEvalStep {
83
86
  if (!debug?.enabled) {
84
87
  try {
85
88
  evalFingerprint = computeEvalFingerprint({
86
- filter: ctx.config.areas || ctx.config.tasks || ctx.config.tags
87
- ? {
88
- areas: ctx.config.areas,
89
- taskIds: ctx.config.tasks,
90
- tags: ctx.config.tags,
91
- }
92
- : undefined,
93
- graderModel: "default",
89
+ tasks,
90
+ graderModel: loadGraderModel(rootDir).id,
94
91
  mode: this.mode,
95
92
  rootDir,
96
93
  });
@@ -119,11 +116,22 @@ export class RunEvalStep {
119
116
  state.promptfooUrls ??= [];
120
117
  state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
121
118
  }
122
- // W0050 — score-summary-cached was an unregistered capture;
123
- // scoreSummary is already emitted by calculate-scores-step on the
124
- // non-cached path, which also runs when we have a remote cache hit
125
- // (populating state.remoteCacheHits — CalculateScoresStep still
126
- // invokes for the score-summary emit). Dropped here.
119
+ // D0040 / W0135 — restore the cached report's artifact manifest into
120
+ // the accumulator so the new run's RunManifest advertises the cached
121
+ // artifacts via cross-run lineage (`sourceRunId`) instead of skipping
122
+ // them entirely. Without this, Studio drill-downs on the new report
123
+ // 404 because per-entry GCS objects were never written under the new
124
+ // runId. Bytes are not duplicated; the original prefix is untouched.
125
+ if (remoteCacheResult.artifactManifest &&
126
+ remoteCacheResult.sourceRunId &&
127
+ ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
128
+ const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
129
+ ctx.artifactWriter.injectAccumulated(restored);
130
+ const count = Object.keys(restored).length;
131
+ if (count > 0) {
132
+ console.log(` ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
133
+ }
134
+ }
127
135
  return {
128
136
  durationMs: Date.now() - start,
129
137
  status: "success",
@@ -241,9 +249,11 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
241
249
  console.log(` ✅ Remote cache hit — reusing report ${cachedReport.id} from ${cachedReport.completedAt}`);
242
250
  console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
243
251
  return {
252
+ artifactManifest: cachedReport.artifactManifest,
244
253
  completedAt: cachedReport.completedAt,
245
254
  promptfooUrls: cachedReport.provenance?.promptfooUrls,
246
255
  reportId: cachedReport.id,
256
+ sourceRunId: cachedReport.provenance?.runId,
247
257
  };
248
258
  }
249
259
  catch (err) {
@@ -0,0 +1,24 @@
1
+ /**
2
+ * cache-hit-restore.ts — helpers for the eval cache-hit branch in
3
+ * `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
4
+ * refs so the new run's manifest advertises the cached artifacts via
5
+ * cross-run lineage instead of pointing at GCS objects that were never
6
+ * written under the new runId.
7
+ *
8
+ * @see docs/decisions/D0040-artifact-ref-source-run-id.md
9
+ * @see docs/design-docs/cache-hit-artifact-restoration.md
10
+ */
11
+ import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
12
+ /**
13
+ * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
14
+ *
15
+ * The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
16
+ * unchanged — they already point at the source run's storage. Only
17
+ * `sourceRunId` is added so retention/GC and observability tooling can
18
+ * follow the cross-run dependency.
19
+ *
20
+ * Pure function; safe to call without side effects.
21
+ */
22
+ export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
23
+ sourceRunId: RunId;
24
+ }): ArtifactManifest;
@@ -0,0 +1,32 @@
1
+ /**
2
+ * cache-hit-restore.ts — helpers for the eval cache-hit branch in
3
+ * `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
4
+ * refs so the new run's manifest advertises the cached artifacts via
5
+ * cross-run lineage instead of pointing at GCS objects that were never
6
+ * written under the new runId.
7
+ *
8
+ * @see docs/decisions/D0040-artifact-ref-source-run-id.md
9
+ * @see docs/design-docs/cache-hit-artifact-restoration.md
10
+ */
11
+ /**
12
+ * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
13
+ *
14
+ * The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
15
+ * unchanged — they already point at the source run's storage. Only
16
+ * `sourceRunId` is added so retention/GC and observability tooling can
17
+ * follow the cross-run dependency.
18
+ *
19
+ * Pure function; safe to call without side effects.
20
+ */
21
+ export function remapToCacheHitRefs(source, opts) {
22
+ const out = {};
23
+ for (const [type, ref] of Object.entries(source)) {
24
+ if (!ref)
25
+ continue;
26
+ out[type] = {
27
+ ...ref,
28
+ sourceRunId: opts.sourceRunId,
29
+ };
30
+ }
31
+ return out;
32
+ }
@@ -6,30 +6,35 @@
6
6
  * pipeline can query the Sanity Content Lake for a previous report with an
7
7
  * identical fingerprint and skip the expensive eval step.
8
8
  *
9
- * The fingerprint captures everything that would change evaluation results:
9
+ * The fingerprint captures:
10
10
  * - Evaluation mode (baseline, observed, agentic)
11
- * - Model configuration (which models, their settings)
12
11
  * - Grader model identity (different graders score differently)
13
- * - Prompt templates (different instructions → different outputs)
14
- * - Rubric templates (different criteria → different scores)
15
- * - Task definitions (what's being evaluated)
16
- * - Reference solutions (used by grader assertions)
17
- * - Documentation content (the docs being evaluated — the primary variable)
18
- * - Filter flags (which subset of tasks is included)
12
+ * - The task set that was actually loaded for this run, in its canonical
13
+ * shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
14
+ * Studio-authored task edits in the Content Lake are picked up — pre-v2
15
+ * the fingerprint walked `tasks/` on disk and missed them entirely).
16
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
17
+ * - Fetched canonical doc content (contexts/canonical/*.md).
19
18
  *
20
19
  * The fingerprint intentionally EXCLUDES:
21
- * - Source name/URL (content matters, not origin)
22
- * - Git metadata (informational, not eval-affecting)
23
- * - Trigger type (manual vs CI → same inputs → same results)
24
- * - Report tags (human labels)
20
+ * - Source name/URL (content matters, not origin).
21
+ * - Git metadata (informational, not eval-affecting).
22
+ * - Trigger type (manual vs CI → same inputs → same results).
23
+ * - Report tags (human labels).
25
24
  *
26
25
  * @see docs/design-docs/content-lake-eval-caching.md
27
26
  */
28
- import type { EvalMode, FilterOptions } from "./types.js";
27
+ import type { GeneralizedTaskDefinition } from "../_vendor/ailf-core/index.d.ts";
28
+ import type { EvalMode } from "../_vendor/ailf-shared/index.d.ts";
29
29
  /** Inputs needed to compute an evaluation fingerprint. */
30
30
  export interface FingerprintInput {
31
- /** Filter options (areas, taskIds) — determines which tasks are included */
32
- filter?: FilterOptions;
31
+ /**
32
+ * Task definitions returned by `ctx.taskSource.loadTasks(filter)` after
33
+ * any release-auto-scope narrowing has been applied. The fingerprint
34
+ * captures whatever set the pipeline is actually about to evaluate, so
35
+ * filter changes are reflected implicitly.
36
+ */
37
+ tasks: readonly GeneralizedTaskDefinition[];
33
38
  /** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101") */
34
39
  graderModel: string;
35
40
  /** Evaluation mode */
@@ -37,30 +42,23 @@ export interface FingerprintInput {
37
42
  /** Path to the packages/eval root directory */
38
43
  rootDir: string;
39
44
  }
40
- /**
41
- * Collect all file paths that contribute to the evaluation fingerprint.
42
- *
43
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
44
- * comprehensive and explicitly designed for cross-environment cache keys:
45
- *
46
- * - Includes `config/prompts` and `config/rubrics` directly
47
- * (the local cache only includes them indirectly via generated configs)
48
- * - Includes `config/models` (model configuration)
49
- * - Includes task definitions and reference solutions
50
- * - Includes the actual documentation content (contexts/canonical/*.md)
51
- * - Respects filter flags to only include relevant files
52
- */
53
- export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
54
45
  /**
55
46
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
56
47
  *
57
- * The fingerprint is content-addressed: identical inputs always produce
58
- * the same fingerprint, regardless of the environment (local, CI, etc.).
59
- *
60
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
61
- * and adds non-file context (mode, grader model, filter flags) as
62
- * additional context strings.
48
+ * Identical inputs always produce the same fingerprint, regardless of the
49
+ * environment (local, CI, etc.). Cross-environment portability relies on
50
+ * (a) tasks coming from the same Content Lake source and (b) file paths
51
+ * being hashed as rootDir-relative.
63
52
  *
64
53
  * @returns SHA-256 hex string (64 characters)
65
54
  */
66
55
  export declare function computeEvalFingerprint(input: FingerprintInput): string;
56
+ /**
57
+ * Collect repo-tracked + fetched file paths that contribute to the
58
+ * fingerprint. Tasks are NOT collected here — they come from
59
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
60
+ * input on `computeEvalFingerprint`.
61
+ *
62
+ * Exported for the debug-fingerprint diagnostic script.
63
+ */
64
+ export declare function collectFingerprintFilePaths(rootDir: string): string[];
@@ -6,28 +6,27 @@
6
6
  * pipeline can query the Sanity Content Lake for a previous report with an
7
7
  * identical fingerprint and skip the expensive eval step.
8
8
  *
9
- * The fingerprint captures everything that would change evaluation results:
9
+ * The fingerprint captures:
10
10
  * - Evaluation mode (baseline, observed, agentic)
11
- * - Model configuration (which models, their settings)
12
11
  * - Grader model identity (different graders score differently)
13
- * - Prompt templates (different instructions → different outputs)
14
- * - Rubric templates (different criteria → different scores)
15
- * - Task definitions (what's being evaluated)
16
- * - Reference solutions (used by grader assertions)
17
- * - Documentation content (the docs being evaluated — the primary variable)
18
- * - Filter flags (which subset of tasks is included)
12
+ * - The task set that was actually loaded for this run, in its canonical
13
+ * shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
14
+ * Studio-authored task edits in the Content Lake are picked up — pre-v2
15
+ * the fingerprint walked `tasks/` on disk and missed them entirely).
16
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
17
+ * - Fetched canonical doc content (contexts/canonical/*.md).
19
18
  *
20
19
  * The fingerprint intentionally EXCLUDES:
21
- * - Source name/URL (content matters, not origin)
22
- * - Git metadata (informational, not eval-affecting)
23
- * - Trigger type (manual vs CI → same inputs → same results)
24
- * - Report tags (human labels)
20
+ * - Source name/URL (content matters, not origin).
21
+ * - Git metadata (informational, not eval-affecting).
22
+ * - Trigger type (manual vs CI → same inputs → same results).
23
+ * - Report tags (human labels).
25
24
  *
26
25
  * @see docs/design-docs/content-lake-eval-caching.md
27
26
  */
28
- import { existsSync, readdirSync, statSync } from "fs";
29
- import { join, resolve } from "path";
30
- import { hashFiles } from "./cache.js";
27
+ import { createHash } from "crypto";
28
+ import { existsSync, readdirSync, readFileSync, statSync } from "fs";
29
+ import { join, relative, resolve } from "path";
31
30
  // ---------------------------------------------------------------------------
32
31
  // Constants
33
32
  // ---------------------------------------------------------------------------
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
35
34
  * Version prefix for the fingerprint hash. Bumping this invalidates all
36
35
  * existing fingerprints in the Content Lake without needing to clear the
37
36
  * store. Change this when adding new inputs to the hash.
37
+ *
38
+ * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
39
+ * files), file paths normalized to rootDir-relative, grader passed
40
+ * through verbatim instead of the literal string "default".
38
41
  */
39
- const FINGERPRINT_VERSION = "eval-fingerprint-v1";
42
+ const FINGERPRINT_VERSION = "eval-fingerprint-v2";
40
43
  /**
41
- * Collect all file paths that contribute to the evaluation fingerprint.
44
+ * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
42
45
  *
43
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
44
- * comprehensive and explicitly designed for cross-environment cache keys:
46
+ * Identical inputs always produce the same fingerprint, regardless of the
47
+ * environment (local, CI, etc.). Cross-environment portability relies on
48
+ * (a) tasks coming from the same Content Lake source and (b) file paths
49
+ * being hashed as rootDir-relative.
45
50
  *
46
- * - Includes `config/prompts` and `config/rubrics` directly
47
- * (the local cache only includes them indirectly via generated configs)
48
- * - Includes `config/models` (model configuration)
49
- * - Includes task definitions and reference solutions
50
- * - Includes the actual documentation content (contexts/canonical/*.md)
51
- * - Respects filter flags to only include relevant files
51
+ * @returns SHA-256 hex string (64 characters)
52
52
  */
53
- export function collectFingerprintInputPaths(rootDir, filter) {
53
+ export function computeEvalFingerprint(input) {
54
+ const { graderModel, mode, rootDir, tasks } = input;
55
+ const hash = createHash("sha256");
56
+ hash.update(`version:${FINGERPRINT_VERSION}\n`);
57
+ hash.update(`mode:${mode}\n`);
58
+ hash.update(`grader:${graderModel}\n`);
59
+ hash.update(`tasks:${hashTaskSet(tasks)}\n`);
60
+ // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
61
+ // so a CI runner at /home/runner/... and a laptop at /Users/... produce
62
+ // the same hash for byte-identical content.
63
+ const filePaths = collectFingerprintFilePaths(rootDir);
64
+ for (const p of [...filePaths].sort(byteCompare)) {
65
+ hash.update(`path:${relative(rootDir, p)}\n`);
66
+ if (existsSync(p)) {
67
+ hash.update(readFileSync(p));
68
+ }
69
+ else {
70
+ hash.update("__missing__\n");
71
+ }
72
+ hash.update("\n---\n");
73
+ }
74
+ return hash.digest("hex");
75
+ }
76
+ /**
77
+ * Collect repo-tracked + fetched file paths that contribute to the
78
+ * fingerprint. Tasks are NOT collected here — they come from
79
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
80
+ * input on `computeEvalFingerprint`.
81
+ *
82
+ * Exported for the debug-fingerprint diagnostic script.
83
+ */
84
+ export function collectFingerprintFilePaths(rootDir) {
54
85
  const r = (rel) => resolve(rootDir, rel);
55
86
  const paths = [];
56
- // -----------------------------------------------------------------------
57
- // Config files — always included
58
- // -----------------------------------------------------------------------
59
- // Check all supported extensions in priority order
87
+ // Config files (any of the supported extensions)
60
88
  const configNames = ["models", "prompts", "rubrics"];
61
89
  const configExts = [".ts", ".js", ".yaml", ".yml", ".json"];
62
- const configFiles = configNames.flatMap((name) => configExts.map((ext) => `config/${name}${ext}`));
63
- for (const f of configFiles) {
64
- const p = r(f);
65
- if (existsSync(p))
66
- paths.push(p);
67
- }
68
- // -----------------------------------------------------------------------
69
- // Task files — filtered if --area is set
70
- // -----------------------------------------------------------------------
71
- const tasksDir = r("tasks");
72
- if (existsSync(tasksDir)) {
73
- const taskFiles = readdirSync(tasksDir)
74
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
75
- .filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
76
- for (const f of taskFiles) {
77
- // If area filter is set, only include matching task files
78
- if (filter?.areas && filter.areas.length > 0) {
79
- const stem = f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "");
80
- if (!filter.areas.includes(stem))
81
- continue;
82
- }
83
- paths.push(join(tasksDir, f));
90
+ for (const name of configNames) {
91
+ for (const ext of configExts) {
92
+ const p = r(`config/${name}${ext}`);
93
+ if (existsSync(p))
94
+ paths.push(p);
84
95
  }
85
96
  }
86
- // -----------------------------------------------------------------------
87
- // Reference solutions — all included (they're referenced by tasks)
88
- // -----------------------------------------------------------------------
97
+ // Reference solutions — recursive (mixed languages, nested by area)
89
98
  const refDir = r("canonical/reference-solutions");
90
- if (existsSync(refDir)) {
99
+ if (existsSync(refDir))
91
100
  collectFilesRecursive(refDir, paths);
92
- }
93
- // -----------------------------------------------------------------------
94
- // Canonical context files — the documentation content being evaluated
95
- // This is the KEY differentiator from the local cache (which doesn't
96
- // include Sanity document content in the fetch-docs cache key).
97
- // -----------------------------------------------------------------------
101
+ // Canonical context files — the fetched documentation content. These
102
+ // change whenever the Content Lake source shifts, so they capture
103
+ // doc-level edits that the task set itself wouldn't reflect.
98
104
  const canonicalDir = r("contexts/canonical");
99
105
  if (existsSync(canonicalDir)) {
100
106
  const contextFiles = readdirSync(canonicalDir)
101
107
  .filter((f) => f.endsWith(".md"))
102
- .sort();
103
- for (const f of contextFiles) {
104
- // If area or task filter is set, we include all context files anyway
105
- // because context filenames map to task IDs, and task-to-area mapping
106
- // requires reading the YAML. It's safer to include all — a superset
107
- // doesn't cause false cache hits, only potential false misses when
108
- // a non-matching context changes. This is acceptable: the filter
109
- // flags in the context strings differentiate the fingerprints.
108
+ .sort(byteCompare);
109
+ for (const f of contextFiles)
110
110
  paths.push(join(canonicalDir, f));
111
- }
112
111
  }
113
112
  return paths;
114
113
  }
114
+ // ---------------------------------------------------------------------------
115
+ // Canonical serialization — byte-stable across runtimes
116
+ // ---------------------------------------------------------------------------
115
117
  /**
116
- * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
117
- *
118
- * The fingerprint is content-addressed: identical inputs always produce
119
- * the same fingerprint, regardless of the environment (local, CI, etc.).
120
- *
121
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
122
- * and adds non-file context (mode, grader model, filter flags) as
123
- * additional context strings.
118
+ * Compare two strings by their UTF-8 byte representation.
124
119
  *
125
- * @returns SHA-256 hex string (64 characters)
120
+ * Why this and not `localeCompare` or default `<`: `localeCompare` calls
121
+ * into ICU, whose tables can vary between Node builds (small-icu vs
122
+ * full-icu) and platforms. The default JS string comparison uses UTF-16
123
+ * code units, which diverges from UTF-8 byte order for surrogate pairs.
124
+ * `Buffer.compare` on UTF-8 is well-defined and runtime-independent —
125
+ * the right primitive when the result feeds a hash.
126
126
  */
127
- export function computeEvalFingerprint(input) {
128
- const { filter, graderModel, mode, rootDir } = input;
129
- // -----------------------------------------------------------------------
130
- // 1. Collect context strings (non-file inputs)
131
- // -----------------------------------------------------------------------
132
- const context = [
133
- FINGERPRINT_VERSION,
134
- `mode:${mode}`,
135
- `grader:${graderModel}`,
136
- ];
137
- // Include filter flags so that scoped runs produce different fingerprints
138
- if (filter?.areas && filter.areas.length > 0) {
139
- context.push(`areas:${[...filter.areas].sort().join(",")}`);
140
- }
141
- if (filter?.taskIds && filter.taskIds.length > 0) {
142
- context.push(`tasks:${[...filter.taskIds].sort().join(",")}`);
127
+ function byteCompare(a, b) {
128
+ return Buffer.compare(Buffer.from(a, "utf8"), Buffer.from(b, "utf8"));
129
+ }
130
+ /**
131
+ * Recursively normalize `value` for stable serialization: object keys
132
+ * sorted by UTF-8 byte order; arrays preserve order (the caller decides
133
+ * whether to pre-sort).
134
+ */
135
+ function canonicalize(value) {
136
+ if (Array.isArray(value))
137
+ return value.map(canonicalize);
138
+ if (value !== null && typeof value === "object") {
139
+ const out = {};
140
+ const obj = value;
141
+ for (const k of Object.keys(obj).sort(byteCompare)) {
142
+ out[k] = canonicalize(obj[k]);
143
+ }
144
+ return out;
143
145
  }
144
- // -----------------------------------------------------------------------
145
- // 2. Collect input file paths (all files that affect eval output)
146
- // -----------------------------------------------------------------------
147
- const paths = collectFingerprintInputPaths(rootDir, filter);
148
- // -----------------------------------------------------------------------
149
- // 3. Hash everything together
150
- // -----------------------------------------------------------------------
151
- return hashFiles(paths, context);
146
+ return value;
147
+ }
148
+ /**
149
+ * Hash a task set in a way that's invariant under source ordering and
150
+ * optional-field-spread reorder.
151
+ *
152
+ * Each task is canonicalized once, then the array is sorted by
153
+ * `(id, canonical-json)`. The secondary sort key matters: the Content
154
+ * Lake currently has duplicate `ailf.task` documents that share the
155
+ * same `id.current` but differ in body (DOC-2096). With only the id as
156
+ * the sort key, two such duplicates compare equal and their relative
157
+ * order falls back to GROQ's input order — which is undefined for
158
+ * equal `(area, id)` rows, so the hash could shift between runs over
159
+ * the same dataset. Tiebreaking on the serialized content makes the
160
+ * hash deterministic even in the presence of dup-id rows.
161
+ */
162
+ function hashTaskSet(tasks) {
163
+ const serialized = tasks.map((t) => ({
164
+ id: t.id,
165
+ json: JSON.stringify(canonicalize(t)),
166
+ }));
167
+ serialized.sort((a, b) => byteCompare(a.id, b.id) || byteCompare(a.json, b.json));
168
+ const arrayJson = "[" + serialized.map((e) => e.json).join(",") + "]";
169
+ return createHash("sha256").update(arrayJson).digest("hex");
152
170
  }
153
171
  // ---------------------------------------------------------------------------
154
- // Helpers
172
+ // File walk
155
173
  // ---------------------------------------------------------------------------
156
174
  /**
157
175
  * Recursively collect all file paths under a directory.
158
176
  * Skips hidden files and directories (starting with '.').
159
177
  */
160
178
  function collectFilesRecursive(dir, paths) {
161
- const entries = readdirSync(dir);
179
+ const entries = readdirSync(dir).sort(byteCompare);
162
180
  for (const entry of entries) {
163
181
  if (entry.startsWith("."))
164
182
  continue;
@@ -286,7 +286,10 @@ export function generateReportId() {
286
286
  * metadata (_id, _type, _rev, etc.) that we strip.
287
287
  */
288
288
  function toReport(doc) {
289
+ const summary = doc.summary;
290
+ const artifactManifest = summary?.artifactManifest;
289
291
  return {
292
+ artifactManifest,
290
293
  comparison: doc.comparison,
291
294
  completedAt: doc.completedAt,
292
295
  durationMs: doc.durationMs,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "4.0.0",
3
+ "version": "4.0.1",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"