@sanity/ailf 3.9.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.ts +32 -4
- package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
- package/dist/_vendor/ailf-core/config-helpers.js +54 -1
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
- package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -10
- package/dist/_vendor/ailf-shared/index.js +13 -10
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/check-staleness.js +12 -4
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/models.ts +32 -4
- package/dist/orchestration/steps/run-eval-step.js +39 -29
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/cache-hit-restore.d.ts +24 -0
- package/dist/pipeline/cache-hit-restore.js +32 -0
- package/dist/pipeline/calculate-scores.js +40 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/eval-fingerprint.d.ts +33 -35
- package/dist/pipeline/eval-fingerprint.js +124 -106
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/report-store.js +3 -0
- package/package.json +2 -2
|
@@ -6,28 +6,27 @@
|
|
|
6
6
|
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
7
|
* identical fingerprint and skip the expensive eval step.
|
|
8
8
|
*
|
|
9
|
-
* The fingerprint captures
|
|
9
|
+
* The fingerprint captures:
|
|
10
10
|
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
-
* - Model configuration (which models, their settings)
|
|
12
11
|
* - Grader model identity (different graders score differently)
|
|
13
|
-
* -
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
* -
|
|
18
|
-
* -
|
|
12
|
+
* - The task set that was actually loaded for this run, in its canonical
|
|
13
|
+
* shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
|
|
14
|
+
* Studio-authored task edits in the Content Lake are picked up — pre-v2
|
|
15
|
+
* the fingerprint walked `tasks/` on disk and missed them entirely).
|
|
16
|
+
* - Repo-tracked config (models, prompts, rubrics) and reference solutions.
|
|
17
|
+
* - Fetched canonical doc content (contexts/canonical/*.md).
|
|
19
18
|
*
|
|
20
19
|
* The fingerprint intentionally EXCLUDES:
|
|
21
|
-
* - Source name/URL (content matters, not origin)
|
|
22
|
-
* - Git metadata (informational, not eval-affecting)
|
|
23
|
-
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
-
* - Report tags (human labels)
|
|
20
|
+
* - Source name/URL (content matters, not origin).
|
|
21
|
+
* - Git metadata (informational, not eval-affecting).
|
|
22
|
+
* - Trigger type (manual vs CI → same inputs → same results).
|
|
23
|
+
* - Report tags (human labels).
|
|
25
24
|
*
|
|
26
25
|
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
26
|
*/
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
27
|
+
import { createHash } from "crypto";
|
|
28
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "fs";
|
|
29
|
+
import { join, relative, resolve } from "path";
|
|
31
30
|
// ---------------------------------------------------------------------------
|
|
32
31
|
// Constants
|
|
33
32
|
// ---------------------------------------------------------------------------
|
|
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
|
|
|
35
34
|
* Version prefix for the fingerprint hash. Bumping this invalidates all
|
|
36
35
|
* existing fingerprints in the Content Lake without needing to clear the
|
|
37
36
|
* store. Change this when adding new inputs to the hash.
|
|
37
|
+
*
|
|
38
|
+
* v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
|
|
39
|
+
* files), file paths normalized to rootDir-relative, grader passed
|
|
40
|
+
* through verbatim instead of the literal string "default".
|
|
38
41
|
*/
|
|
39
|
-
const FINGERPRINT_VERSION = "eval-fingerprint-
|
|
42
|
+
const FINGERPRINT_VERSION = "eval-fingerprint-v2";
|
|
40
43
|
/**
|
|
41
|
-
*
|
|
44
|
+
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
42
45
|
*
|
|
43
|
-
*
|
|
44
|
-
*
|
|
46
|
+
* Identical inputs always produce the same fingerprint, regardless of the
|
|
47
|
+
* environment (local, CI, etc.). Cross-environment portability relies on
|
|
48
|
+
* (a) tasks coming from the same Content Lake source and (b) file paths
|
|
49
|
+
* being hashed as rootDir-relative.
|
|
45
50
|
*
|
|
46
|
-
* -
|
|
47
|
-
* (the local cache only includes them indirectly via generated configs)
|
|
48
|
-
* - Includes `config/models` (model configuration)
|
|
49
|
-
* - Includes task definitions and reference solutions
|
|
50
|
-
* - Includes the actual documentation content (contexts/canonical/*.md)
|
|
51
|
-
* - Respects filter flags to only include relevant files
|
|
51
|
+
* @returns SHA-256 hex string (64 characters)
|
|
52
52
|
*/
|
|
53
|
-
export function
|
|
53
|
+
export function computeEvalFingerprint(input) {
|
|
54
|
+
const { graderModel, mode, rootDir, tasks } = input;
|
|
55
|
+
const hash = createHash("sha256");
|
|
56
|
+
hash.update(`version:${FINGERPRINT_VERSION}\n`);
|
|
57
|
+
hash.update(`mode:${mode}\n`);
|
|
58
|
+
hash.update(`grader:${graderModel}\n`);
|
|
59
|
+
hash.update(`tasks:${hashTaskSet(tasks)}\n`);
|
|
60
|
+
// Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
|
|
61
|
+
// so a CI runner at /home/runner/... and a laptop at /Users/... produce
|
|
62
|
+
// the same hash for byte-identical content.
|
|
63
|
+
const filePaths = collectFingerprintFilePaths(rootDir);
|
|
64
|
+
for (const p of [...filePaths].sort(byteCompare)) {
|
|
65
|
+
hash.update(`path:${relative(rootDir, p)}\n`);
|
|
66
|
+
if (existsSync(p)) {
|
|
67
|
+
hash.update(readFileSync(p));
|
|
68
|
+
}
|
|
69
|
+
else {
|
|
70
|
+
hash.update("__missing__\n");
|
|
71
|
+
}
|
|
72
|
+
hash.update("\n---\n");
|
|
73
|
+
}
|
|
74
|
+
return hash.digest("hex");
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Collect repo-tracked + fetched file paths that contribute to the
|
|
78
|
+
* fingerprint. Tasks are NOT collected here — they come from
|
|
79
|
+
* `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
|
|
80
|
+
* input on `computeEvalFingerprint`.
|
|
81
|
+
*
|
|
82
|
+
* Exported for the debug-fingerprint diagnostic script.
|
|
83
|
+
*/
|
|
84
|
+
export function collectFingerprintFilePaths(rootDir) {
|
|
54
85
|
const r = (rel) => resolve(rootDir, rel);
|
|
55
86
|
const paths = [];
|
|
56
|
-
//
|
|
57
|
-
// Config files — always included
|
|
58
|
-
// -----------------------------------------------------------------------
|
|
59
|
-
// Check all supported extensions in priority order
|
|
87
|
+
// Config files (any of the supported extensions)
|
|
60
88
|
const configNames = ["models", "prompts", "rubrics"];
|
|
61
89
|
const configExts = [".ts", ".js", ".yaml", ".yml", ".json"];
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
}
|
|
68
|
-
// -----------------------------------------------------------------------
|
|
69
|
-
// Task files — filtered if --area is set
|
|
70
|
-
// -----------------------------------------------------------------------
|
|
71
|
-
const tasksDir = r("tasks");
|
|
72
|
-
if (existsSync(tasksDir)) {
|
|
73
|
-
const taskFiles = readdirSync(tasksDir)
|
|
74
|
-
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
75
|
-
.filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
|
|
76
|
-
for (const f of taskFiles) {
|
|
77
|
-
// If area filter is set, only include matching task files
|
|
78
|
-
if (filter?.areas && filter.areas.length > 0) {
|
|
79
|
-
const stem = f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "");
|
|
80
|
-
if (!filter.areas.includes(stem))
|
|
81
|
-
continue;
|
|
82
|
-
}
|
|
83
|
-
paths.push(join(tasksDir, f));
|
|
90
|
+
for (const name of configNames) {
|
|
91
|
+
for (const ext of configExts) {
|
|
92
|
+
const p = r(`config/${name}${ext}`);
|
|
93
|
+
if (existsSync(p))
|
|
94
|
+
paths.push(p);
|
|
84
95
|
}
|
|
85
96
|
}
|
|
86
|
-
//
|
|
87
|
-
// Reference solutions — all included (they're referenced by tasks)
|
|
88
|
-
// -----------------------------------------------------------------------
|
|
97
|
+
// Reference solutions — recursive (mixed languages, nested by area)
|
|
89
98
|
const refDir = r("canonical/reference-solutions");
|
|
90
|
-
if (existsSync(refDir))
|
|
99
|
+
if (existsSync(refDir))
|
|
91
100
|
collectFilesRecursive(refDir, paths);
|
|
92
|
-
|
|
93
|
-
//
|
|
94
|
-
//
|
|
95
|
-
// This is the KEY differentiator from the local cache (which doesn't
|
|
96
|
-
// include Sanity document content in the fetch-docs cache key).
|
|
97
|
-
// -----------------------------------------------------------------------
|
|
101
|
+
// Canonical context files — the fetched documentation content. These
|
|
102
|
+
// change whenever the Content Lake source shifts, so they capture
|
|
103
|
+
// doc-level edits that the task set itself wouldn't reflect.
|
|
98
104
|
const canonicalDir = r("contexts/canonical");
|
|
99
105
|
if (existsSync(canonicalDir)) {
|
|
100
106
|
const contextFiles = readdirSync(canonicalDir)
|
|
101
107
|
.filter((f) => f.endsWith(".md"))
|
|
102
|
-
.sort();
|
|
103
|
-
for (const f of contextFiles)
|
|
104
|
-
// If area or task filter is set, we include all context files anyway
|
|
105
|
-
// because context filenames map to task IDs, and task-to-area mapping
|
|
106
|
-
// requires reading the YAML. It's safer to include all — a superset
|
|
107
|
-
// doesn't cause false cache hits, only potential false misses when
|
|
108
|
-
// a non-matching context changes. This is acceptable: the filter
|
|
109
|
-
// flags in the context strings differentiate the fingerprints.
|
|
108
|
+
.sort(byteCompare);
|
|
109
|
+
for (const f of contextFiles)
|
|
110
110
|
paths.push(join(canonicalDir, f));
|
|
111
|
-
}
|
|
112
111
|
}
|
|
113
112
|
return paths;
|
|
114
113
|
}
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
// Canonical serialization — byte-stable across runtimes
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
115
117
|
/**
|
|
116
|
-
*
|
|
117
|
-
*
|
|
118
|
-
* The fingerprint is content-addressed: identical inputs always produce
|
|
119
|
-
* the same fingerprint, regardless of the environment (local, CI, etc.).
|
|
120
|
-
*
|
|
121
|
-
* Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
|
|
122
|
-
* and adds non-file context (mode, grader model, filter flags) as
|
|
123
|
-
* additional context strings.
|
|
118
|
+
* Compare two strings by their UTF-8 byte representation.
|
|
124
119
|
*
|
|
125
|
-
*
|
|
120
|
+
* Why this and not `localeCompare` or default `<`: `localeCompare` calls
|
|
121
|
+
* into ICU, whose tables can vary between Node builds (small-icu vs
|
|
122
|
+
* full-icu) and platforms. The default JS string comparison uses UTF-16
|
|
123
|
+
* code units, which diverges from UTF-8 byte order for surrogate pairs.
|
|
124
|
+
* `Buffer.compare` on UTF-8 is well-defined and runtime-independent —
|
|
125
|
+
* the right primitive when the result feeds a hash.
|
|
126
126
|
*/
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
if (
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
127
|
+
function byteCompare(a, b) {
|
|
128
|
+
return Buffer.compare(Buffer.from(a, "utf8"), Buffer.from(b, "utf8"));
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Recursively normalize `value` for stable serialization: object keys
|
|
132
|
+
* sorted by UTF-8 byte order; arrays preserve order (the caller decides
|
|
133
|
+
* whether to pre-sort).
|
|
134
|
+
*/
|
|
135
|
+
function canonicalize(value) {
|
|
136
|
+
if (Array.isArray(value))
|
|
137
|
+
return value.map(canonicalize);
|
|
138
|
+
if (value !== null && typeof value === "object") {
|
|
139
|
+
const out = {};
|
|
140
|
+
const obj = value;
|
|
141
|
+
for (const k of Object.keys(obj).sort(byteCompare)) {
|
|
142
|
+
out[k] = canonicalize(obj[k]);
|
|
143
|
+
}
|
|
144
|
+
return out;
|
|
143
145
|
}
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
146
|
+
return value;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Hash a task set in a way that's invariant under source ordering and
|
|
150
|
+
* optional-field-spread reorder.
|
|
151
|
+
*
|
|
152
|
+
* Each task is canonicalized once, then the array is sorted by
|
|
153
|
+
* `(id, canonical-json)`. The secondary sort key matters: the Content
|
|
154
|
+
* Lake currently has duplicate `ailf.task` documents that share the
|
|
155
|
+
* same `id.current` but differ in body (DOC-2096). With only the id as
|
|
156
|
+
* the sort key, two such duplicates compare equal and their relative
|
|
157
|
+
* order falls back to GROQ's input order — which is undefined for
|
|
158
|
+
* equal `(area, id)` rows, so the hash could shift between runs over
|
|
159
|
+
* the same dataset. Tiebreaking on the serialized content makes the
|
|
160
|
+
* hash deterministic even in the presence of dup-id rows.
|
|
161
|
+
*/
|
|
162
|
+
function hashTaskSet(tasks) {
|
|
163
|
+
const serialized = tasks.map((t) => ({
|
|
164
|
+
id: t.id,
|
|
165
|
+
json: JSON.stringify(canonicalize(t)),
|
|
166
|
+
}));
|
|
167
|
+
serialized.sort((a, b) => byteCompare(a.id, b.id) || byteCompare(a.json, b.json));
|
|
168
|
+
const arrayJson = "[" + serialized.map((e) => e.json).join(",") + "]";
|
|
169
|
+
return createHash("sha256").update(arrayJson).digest("hex");
|
|
152
170
|
}
|
|
153
171
|
// ---------------------------------------------------------------------------
|
|
154
|
-
//
|
|
172
|
+
// File walk
|
|
155
173
|
// ---------------------------------------------------------------------------
|
|
156
174
|
/**
|
|
157
175
|
* Recursively collect all file paths under a directory.
|
|
158
176
|
* Skips hidden files and directories (starting with '.').
|
|
159
177
|
*/
|
|
160
178
|
function collectFilesRecursive(dir, paths) {
|
|
161
|
-
const entries = readdirSync(dir);
|
|
179
|
+
const entries = readdirSync(dir).sort(byteCompare);
|
|
162
180
|
for (const entry of entries) {
|
|
163
181
|
if (entry.startsWith("."))
|
|
164
182
|
continue;
|
|
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
|
|
|
107
107
|
slugToDocId: Map<string, string>;
|
|
108
108
|
}): {
|
|
109
109
|
baseline?: {
|
|
110
|
-
rubric?: "
|
|
110
|
+
rubric?: "full" | "abbreviated" | "none" | undefined;
|
|
111
111
|
enabled?: boolean | undefined;
|
|
112
112
|
} | undefined;
|
|
113
113
|
_id: string;
|
package/dist/report-store.js
CHANGED
|
@@ -286,7 +286,10 @@ export function generateReportId() {
|
|
|
286
286
|
* metadata (_id, _type, _rev, etc.) that we strip.
|
|
287
287
|
*/
|
|
288
288
|
function toReport(doc) {
|
|
289
|
+
const summary = doc.summary;
|
|
290
|
+
const artifactManifest = summary?.artifactManifest;
|
|
289
291
|
return {
|
|
292
|
+
artifactManifest,
|
|
290
293
|
comparison: doc.comparison,
|
|
291
294
|
completedAt: doc.completedAt,
|
|
292
295
|
durationMs: doc.durationMs,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "3.9.0",
|
|
3
|
+
"version": "4.0.1",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -77,7 +77,7 @@
|
|
|
77
77
|
"test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
78
78
|
"test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
|
|
79
79
|
"test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
|
|
80
|
-
"test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts
|
|
80
|
+
"test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
|
|
81
81
|
"test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
82
82
|
"pr-comment": "tsx src/cli.ts pr-comment",
|
|
83
83
|
"coverage-audit": "tsx src/cli.ts report coverage",
|