@sanity/ailf 3.9.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/config/models.ts +32 -4
  2. package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
  3. package/dist/_vendor/ailf-core/config-helpers.js +54 -1
  4. package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
  5. package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
  6. package/dist/_vendor/ailf-shared/index.d.ts +16 -10
  7. package/dist/_vendor/ailf-shared/index.js +13 -10
  8. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  9. package/dist/agent-observer/agentic-provider.js +28 -23
  10. package/dist/agent-observer/classifier.js +7 -2
  11. package/dist/agent-observer/proxy.d.ts +88 -3
  12. package/dist/agent-observer/proxy.js +174 -16
  13. package/dist/agent-observer/types.d.ts +23 -5
  14. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
  15. package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
  16. package/dist/cli-program.js +1 -1
  17. package/dist/commands/baseline.d.ts +3 -1
  18. package/dist/commands/baseline.js +29 -9
  19. package/dist/commands/cache.d.ts +5 -1
  20. package/dist/commands/cache.js +31 -15
  21. package/dist/commands/check-staleness.js +12 -4
  22. package/dist/commands/compare.js +11 -4
  23. package/dist/commands/explain-handler.js +2 -2
  24. package/dist/config/models.ts +32 -4
  25. package/dist/orchestration/steps/run-eval-step.js +39 -29
  26. package/dist/pipeline/baseline.d.ts +14 -3
  27. package/dist/pipeline/baseline.js +7 -13
  28. package/dist/pipeline/cache-hit-restore.d.ts +24 -0
  29. package/dist/pipeline/cache-hit-restore.js +32 -0
  30. package/dist/pipeline/calculate-scores.js +40 -1
  31. package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
  32. package/dist/pipeline/compiler/provider-assembler.js +37 -2
  33. package/dist/pipeline/eval-fingerprint.d.ts +33 -35
  34. package/dist/pipeline/eval-fingerprint.js +124 -106
  35. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  36. package/dist/report-store.js +3 -0
  37. package/package.json +2 -2
@@ -6,28 +6,27 @@
6
6
  * pipeline can query the Sanity Content Lake for a previous report with an
7
7
  * identical fingerprint and skip the expensive eval step.
8
8
  *
9
- * The fingerprint captures everything that would change evaluation results:
9
+ * The fingerprint captures:
10
10
  * - Evaluation mode (baseline, observed, agentic)
11
- * - Model configuration (which models, their settings)
12
11
  * - Grader model identity (different graders score differently)
13
- * - Prompt templates (different instructions → different outputs)
14
- * - Rubric templates (different criteria → different scores)
15
- * - Task definitions (what's being evaluated)
16
- * - Reference solutions (used by grader assertions)
17
- * - Documentation content (the docs being evaluated — the primary variable)
18
- * - Filter flags (which subset of tasks is included)
12
+ * - The task set that was actually loaded for this run, in its canonical
13
+ * shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
14
+ * Studio-authored task edits in the Content Lake are picked up — pre-v2
15
+ * the fingerprint walked `tasks/` on disk and missed them entirely).
16
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
17
+ * - Fetched canonical doc content (contexts/canonical/*.md).
19
18
  *
20
19
  * The fingerprint intentionally EXCLUDES:
21
- * - Source name/URL (content matters, not origin)
22
- * - Git metadata (informational, not eval-affecting)
23
- * - Trigger type (manual vs CI → same inputs → same results)
24
- * - Report tags (human labels)
20
+ * - Source name/URL (content matters, not origin).
21
+ * - Git metadata (informational, not eval-affecting).
22
+ * - Trigger type (manual vs CI → same inputs → same results).
23
+ * - Report tags (human labels).
25
24
  *
26
25
  * @see docs/design-docs/content-lake-eval-caching.md
27
26
  */
28
- import { existsSync, readdirSync, statSync } from "fs";
29
- import { join, resolve } from "path";
30
- import { hashFiles } from "./cache.js";
27
+ import { createHash } from "crypto";
28
+ import { existsSync, readdirSync, readFileSync, statSync } from "fs";
29
+ import { join, relative, resolve } from "path";
31
30
  // ---------------------------------------------------------------------------
32
31
  // Constants
33
32
  // ---------------------------------------------------------------------------
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
35
34
  * Version prefix for the fingerprint hash. Bumping this invalidates all
36
35
  * existing fingerprints in the Content Lake without needing to clear the
37
36
  * store. Change this when adding new inputs to the hash.
37
+ *
38
+ * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
39
+ * files), file paths normalized to rootDir-relative, grader passed
40
+ * through verbatim instead of the literal string "default".
38
41
  */
39
- const FINGERPRINT_VERSION = "eval-fingerprint-v1";
42
+ const FINGERPRINT_VERSION = "eval-fingerprint-v2";
40
43
  /**
41
- * Collect all file paths that contribute to the evaluation fingerprint.
44
+ * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
42
45
  *
43
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
44
- * comprehensive and explicitly designed for cross-environment cache keys:
46
+ * Identical inputs always produce the same fingerprint, regardless of the
47
+ * environment (local, CI, etc.). Cross-environment portability relies on
48
+ * (a) tasks coming from the same Content Lake source and (b) file paths
49
+ * being hashed as rootDir-relative.
45
50
  *
46
- * - Includes `config/prompts` and `config/rubrics` directly
47
- * (the local cache only includes them indirectly via generated configs)
48
- * - Includes `config/models` (model configuration)
49
- * - Includes task definitions and reference solutions
50
- * - Includes the actual documentation content (contexts/canonical/*.md)
51
- * - Respects filter flags to only include relevant files
51
+ * @returns SHA-256 hex string (64 characters)
52
52
  */
53
- export function collectFingerprintInputPaths(rootDir, filter) {
53
+ export function computeEvalFingerprint(input) {
54
+ const { graderModel, mode, rootDir, tasks } = input;
55
+ const hash = createHash("sha256");
56
+ hash.update(`version:${FINGERPRINT_VERSION}\n`);
57
+ hash.update(`mode:${mode}\n`);
58
+ hash.update(`grader:${graderModel}\n`);
59
+ hash.update(`tasks:${hashTaskSet(tasks)}\n`);
60
+ // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
61
+ // so a CI runner at /home/runner/... and a laptop at /Users/... produce
62
+ // the same hash for byte-identical content.
63
+ const filePaths = collectFingerprintFilePaths(rootDir);
64
+ for (const p of [...filePaths].sort(byteCompare)) {
65
+ hash.update(`path:${relative(rootDir, p)}\n`);
66
+ if (existsSync(p)) {
67
+ hash.update(readFileSync(p));
68
+ }
69
+ else {
70
+ hash.update("__missing__\n");
71
+ }
72
+ hash.update("\n---\n");
73
+ }
74
+ return hash.digest("hex");
75
+ }
76
+ /**
77
+ * Collect repo-tracked + fetched file paths that contribute to the
78
+ * fingerprint. Tasks are NOT collected here — they come from
79
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
80
+ * input on `computeEvalFingerprint`.
81
+ *
82
+ * Exported for the debug-fingerprint diagnostic script.
83
+ */
84
+ export function collectFingerprintFilePaths(rootDir) {
54
85
  const r = (rel) => resolve(rootDir, rel);
55
86
  const paths = [];
56
- // -----------------------------------------------------------------------
57
- // Config files — always included
58
- // -----------------------------------------------------------------------
59
- // Check all supported extensions in priority order
87
+ // Config files (any of the supported extensions)
60
88
  const configNames = ["models", "prompts", "rubrics"];
61
89
  const configExts = [".ts", ".js", ".yaml", ".yml", ".json"];
62
- const configFiles = configNames.flatMap((name) => configExts.map((ext) => `config/${name}${ext}`));
63
- for (const f of configFiles) {
64
- const p = r(f);
65
- if (existsSync(p))
66
- paths.push(p);
67
- }
68
- // -----------------------------------------------------------------------
69
- // Task files — filtered if --area is set
70
- // -----------------------------------------------------------------------
71
- const tasksDir = r("tasks");
72
- if (existsSync(tasksDir)) {
73
- const taskFiles = readdirSync(tasksDir)
74
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
75
- .filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
76
- for (const f of taskFiles) {
77
- // If area filter is set, only include matching task files
78
- if (filter?.areas && filter.areas.length > 0) {
79
- const stem = f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "");
80
- if (!filter.areas.includes(stem))
81
- continue;
82
- }
83
- paths.push(join(tasksDir, f));
90
+ for (const name of configNames) {
91
+ for (const ext of configExts) {
92
+ const p = r(`config/${name}${ext}`);
93
+ if (existsSync(p))
94
+ paths.push(p);
84
95
  }
85
96
  }
86
- // -----------------------------------------------------------------------
87
- // Reference solutions — all included (they're referenced by tasks)
88
- // -----------------------------------------------------------------------
97
+ // Reference solutions — recursive (mixed languages, nested by area)
89
98
  const refDir = r("canonical/reference-solutions");
90
- if (existsSync(refDir)) {
99
+ if (existsSync(refDir))
91
100
  collectFilesRecursive(refDir, paths);
92
- }
93
- // -----------------------------------------------------------------------
94
- // Canonical context files — the documentation content being evaluated
95
- // This is the KEY differentiator from the local cache (which doesn't
96
- // include Sanity document content in the fetch-docs cache key).
97
- // -----------------------------------------------------------------------
101
+ // Canonical context files — the fetched documentation content. These
102
+ // change whenever the Content Lake source shifts, so they capture
103
+ // doc-level edits that the task set itself wouldn't reflect.
98
104
  const canonicalDir = r("contexts/canonical");
99
105
  if (existsSync(canonicalDir)) {
100
106
  const contextFiles = readdirSync(canonicalDir)
101
107
  .filter((f) => f.endsWith(".md"))
102
- .sort();
103
- for (const f of contextFiles) {
104
- // If area or task filter is set, we include all context files anyway
105
- // because context filenames map to task IDs, and task-to-area mapping
106
- // requires reading the YAML. It's safer to include all — a superset
107
- // doesn't cause false cache hits, only potential false misses when
108
- // a non-matching context changes. This is acceptable: the filter
109
- // flags in the context strings differentiate the fingerprints.
108
+ .sort(byteCompare);
109
+ for (const f of contextFiles)
110
110
  paths.push(join(canonicalDir, f));
111
- }
112
111
  }
113
112
  return paths;
114
113
  }
114
+ // ---------------------------------------------------------------------------
115
+ // Canonical serialization — byte-stable across runtimes
116
+ // ---------------------------------------------------------------------------
115
117
  /**
116
- * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
117
- *
118
- * The fingerprint is content-addressed: identical inputs always produce
119
- * the same fingerprint, regardless of the environment (local, CI, etc.).
120
- *
121
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
122
- * and adds non-file context (mode, grader model, filter flags) as
123
- * additional context strings.
118
+ * Compare two strings by their UTF-8 byte representation.
124
119
  *
125
- * @returns SHA-256 hex string (64 characters)
120
+ * Why this and not `localeCompare` or default `<`: `localeCompare` calls
121
+ * into ICU, whose tables can vary between Node builds (small-icu vs
122
+ * full-icu) and platforms. The default JS string comparison uses UTF-16
123
+ * code units, which diverges from UTF-8 byte order for surrogate pairs.
124
+ * `Buffer.compare` on UTF-8 is well-defined and runtime-independent —
125
+ * the right primitive when the result feeds a hash.
126
126
  */
127
- export function computeEvalFingerprint(input) {
128
- const { filter, graderModel, mode, rootDir } = input;
129
- // -----------------------------------------------------------------------
130
- // 1. Collect context strings (non-file inputs)
131
- // -----------------------------------------------------------------------
132
- const context = [
133
- FINGERPRINT_VERSION,
134
- `mode:${mode}`,
135
- `grader:${graderModel}`,
136
- ];
137
- // Include filter flags so that scoped runs produce different fingerprints
138
- if (filter?.areas && filter.areas.length > 0) {
139
- context.push(`areas:${[...filter.areas].sort().join(",")}`);
140
- }
141
- if (filter?.taskIds && filter.taskIds.length > 0) {
142
- context.push(`tasks:${[...filter.taskIds].sort().join(",")}`);
127
+ function byteCompare(a, b) {
128
+ return Buffer.compare(Buffer.from(a, "utf8"), Buffer.from(b, "utf8"));
129
+ }
130
+ /**
131
+ * Recursively normalize `value` for stable serialization: object keys
132
+ * sorted by UTF-8 byte order; arrays preserve order (the caller decides
133
+ * whether to pre-sort).
134
+ */
135
+ function canonicalize(value) {
136
+ if (Array.isArray(value))
137
+ return value.map(canonicalize);
138
+ if (value !== null && typeof value === "object") {
139
+ const out = {};
140
+ const obj = value;
141
+ for (const k of Object.keys(obj).sort(byteCompare)) {
142
+ out[k] = canonicalize(obj[k]);
143
+ }
144
+ return out;
143
145
  }
144
- // -----------------------------------------------------------------------
145
- // 2. Collect input file paths (all files that affect eval output)
146
- // -----------------------------------------------------------------------
147
- const paths = collectFingerprintInputPaths(rootDir, filter);
148
- // -----------------------------------------------------------------------
149
- // 3. Hash everything together
150
- // -----------------------------------------------------------------------
151
- return hashFiles(paths, context);
146
+ return value;
147
+ }
148
+ /**
149
+ * Hash a task set in a way that's invariant under source ordering and
150
+ * optional-field-spread reorder.
151
+ *
152
+ * Each task is canonicalized once, then the array is sorted by
153
+ * `(id, canonical-json)`. The secondary sort key matters: the Content
154
+ * Lake currently has duplicate `ailf.task` documents that share the
155
+ * same `id.current` but differ in body (DOC-2096). With only the id as
156
+ * the sort key, two such duplicates compare equal and their relative
157
+ * order falls back to GROQ's input order — which is undefined for
158
+ * equal `(area, id)` rows, so the hash could shift between runs over
159
+ * the same dataset. Tiebreaking on the serialized content makes the
160
+ * hash deterministic even in the presence of dup-id rows.
161
+ */
162
+ function hashTaskSet(tasks) {
163
+ const serialized = tasks.map((t) => ({
164
+ id: t.id,
165
+ json: JSON.stringify(canonicalize(t)),
166
+ }));
167
+ serialized.sort((a, b) => byteCompare(a.id, b.id) || byteCompare(a.json, b.json));
168
+ const arrayJson = "[" + serialized.map((e) => e.json).join(",") + "]";
169
+ return createHash("sha256").update(arrayJson).digest("hex");
152
170
  }
153
171
  // ---------------------------------------------------------------------------
154
- // Helpers
172
+ // File walk
155
173
  // ---------------------------------------------------------------------------
156
174
  /**
157
175
  * Recursively collect all file paths under a directory.
158
176
  * Skips hidden files and directories (starting with '.').
159
177
  */
160
178
  function collectFilesRecursive(dir, paths) {
161
- const entries = readdirSync(dir);
179
+ const entries = readdirSync(dir).sort(byteCompare);
162
180
  for (const entry of entries) {
163
181
  if (entry.startsWith("."))
164
182
  continue;
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
107
107
  slugToDocId: Map<string, string>;
108
108
  }): {
109
109
  baseline?: {
110
- rubric?: "abbreviated" | "full" | "none" | undefined;
110
+ rubric?: "full" | "abbreviated" | "none" | undefined;
111
111
  enabled?: boolean | undefined;
112
112
  } | undefined;
113
113
  _id: string;
@@ -286,7 +286,10 @@ export function generateReportId() {
286
286
  * metadata (_id, _type, _rev, etc.) that we strip.
287
287
  */
288
288
  function toReport(doc) {
289
+ const summary = doc.summary;
290
+ const artifactManifest = summary?.artifactManifest;
289
291
  return {
292
+ artifactManifest,
290
293
  comparison: doc.comparison,
291
294
  completedAt: doc.completedAt,
292
295
  durationMs: doc.durationMs,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.9.0",
3
+ "version": "4.0.1",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -77,7 +77,7 @@
77
77
  "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
78
78
  "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
79
79
  "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
80
- "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
80
+ "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
81
81
  "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
82
82
  "pr-comment": "tsx src/cli.ts pr-comment",
83
83
  "coverage-audit": "tsx src/cli.ts report coverage",