@sanity/ailf 7.1.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +4 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
- package/dist/_vendor/ailf-core/schemas/report.js +14 -0
- package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/user.js +23 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
- package/dist/_vendor/ailf-core/types/index.js +13 -0
- package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
- package/dist/_vendor/ailf-core/types/user.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
- package/dist/_vendor/ailf-shared/index.d.ts +1 -1
- package/dist/_vendor/ailf-shared/index.js +1 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +19 -6
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
- package/dist/adapters/task-sources/content-lake-task-source.js +12 -7
- package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
- package/dist/orchestration/steps/compute-attribution-step.js +17 -2
- package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
- package/dist/orchestration/steps/gap-analysis-step.js +29 -10
- package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
- package/dist/orchestration/steps/publish-report-step.js +63 -6
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +125 -22
- package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
- package/dist/pipeline/enrichment-preconditions.js +84 -0
- package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
- package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/extract-grader-judgments-resilient.ts
|
|
3
|
+
*
|
|
4
|
+
* Resilient grader-judgment extraction for the `calculate-scores` persist
|
|
5
|
+
* junction.
|
|
6
|
+
*
|
|
7
|
+
* Background: `calculateAndWriteScores` extracts grader judgments from the
|
|
8
|
+
* eval results file(s), then writes `grader-judgments.json` only when the
|
|
9
|
+
* array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
|
|
10
|
+
* observed where `extractGraderJudgments` returned 0 judgments for a results
|
|
11
|
+
* file that demonstrably contained classifiable llm-rubric components — the
|
|
12
|
+
* same file, read tens of milliseconds later by `extractStoredTestResults`,
|
|
13
|
+
* yielded the full set (entries with populated dimensions). The empty array
|
|
14
|
+
* silently skipped the write, so gap-analysis and compute-attribution skipped
|
|
15
|
+
* and the report shipped with a score but no tests.
|
|
16
|
+
*
|
|
17
|
+
* The committed code reads the file via a pure `readFileSync` with identical
|
|
18
|
+
* classification on both sides, so the divergence is not reproducible from the
|
|
19
|
+
* source + captured artifacts — it is a transient read anomaly at the live
|
|
20
|
+
* junction. This wrapper does not pretend to know the mechanism; it makes the
|
|
21
|
+
* junction observable and recovers from the transient:
|
|
22
|
+
*
|
|
23
|
+
* 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
|
|
24
|
+
* result count, and judgment count on every run (never silent on 0), so a
|
|
25
|
+
* future empty-judgments persist is diagnosable from the run log alone.
|
|
26
|
+
* 2. **Self-heals** — when extraction yields 0 judgments but a results file
|
|
27
|
+
* exists, it re-extracts with bounded retries. A later read that yields
|
|
28
|
+
* judgments proves the initial 0 was transient; the recovered judgments
|
|
29
|
+
* are returned. If every attempt yields 0, severity is decided by an
|
|
30
|
+
* independent classifiable-component count: a genuinely judgment-free run
|
|
31
|
+
* (all api-errors / no llm-rubric) logs a warning, while 0 judgments
|
|
32
|
+
* against present classifiable components logs an error (the downstream
|
|
33
|
+
* gap-analysis fail-loud guard is the backstop).
|
|
34
|
+
*
|
|
35
|
+
* The extractor and fs helpers are injected so the wrapper is unit-testable
|
|
36
|
+
* without importing the ~3000-line scoring module (which would be circular)
|
|
37
|
+
* or touching the real filesystem.
|
|
38
|
+
*/
|
|
39
|
+
import type { GraderJudgment, GraderReliability, Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
40
|
+
/**
 * Telemetry sink threaded into each extraction (shared reliability counters).
 *
 * The resilient wrapper forwards the same object, unchanged, to the injected
 * `extract` function for every results file and on every retry attempt.
 */
export interface ExtractionTelemetry {
    /** Shared grader-reliability accumulator (type imported from `ailf-core`). */
    reliability: GraderReliability;
    /**
     * Optional run identifier.
     * NOTE(review): presumably identifies the pipeline run this extraction
     * belongs to — confirm against the callers that populate it.
     */
    runId?: string;
}
|
|
45
|
+
/**
 * Cheap on-disk stat used for diagnostics and the retry gate.
 *
 * The fs-backed default reports `{ exists: false, mtimeMs: 0, size: 0 }` for a
 * path that is not present.
 */
export interface FileStat {
    /** Whether the path exists; paths reporting `false` are skipped by the extractor. */
    exists: boolean;
    /** Last-modification time in milliseconds, as reported by the stat implementation. */
    mtimeMs: number;
    /** File size in bytes (surfaced as `sizeBytes` in the junction diagnostics). */
    size: number;
}
|
|
51
|
+
/**
 * Injectable seams — defaults wire the real fs; tests substitute fakes.
 *
 * Only `extract` is required. Every other member is optional and either falls
 * back to a built-in implementation or disables the feature it feeds.
 */
export interface ResilientExtractionDeps {
    /**
     * The real `extractGraderJudgments`. Injected (rather than imported) to
     * avoid a circular dependency with `calculate-scores.ts`.
     *
     * Called once per present results file on every attempt, always with the
     * caller's telemetry object.
     */
    extract: (path: string, telemetry?: ExtractionTelemetry) => GraderJudgment[];
    /**
     * Parsed result-entry count for a path — diagnostics only. When omitted,
     * `resultCount` is simply left out of the logged diagnostics.
     */
    countResults?: (path: string) => number;
    /**
     * Count of classifiable llm-rubric components for a path. Used only to set
     * the severity of a persistent-empty extraction: a file with classifiable
     * components but 0 extracted judgments is an error; a file with none (all
     * api-errors / no llm-rubric) is a benign empty.
     *
     * When omitted the count is treated as unknown, and a persistent-empty
     * extraction is logged as an error.
     */
    countClassifiable?: (path: string) => number;
    /** On-disk stat (existence + size + mtime). Defaults to a `node:fs`-backed stat. */
    statFile?: (path: string) => FileStat;
    /** Backoff between self-heal attempts. Defaults to a `setTimeout`-based sleep. */
    sleep?: (ms: number) => Promise<void>;
}
|
|
72
|
+
/** Retry tuning plus the injected dependencies for `extractGraderJudgmentsResilient`. */
export interface ResilientExtractionOptions {
    /** Total extraction attempts when the first yields 0 (default 3, min 1). */
    maxAttempts?: number;
    /** Delay before each retry, in ms (default 200). */
    delayMs?: number;
    /** Injected extractor and fs/timing seams; only `deps.extract` is mandatory. */
    deps: ResilientExtractionDeps;
}
|
|
79
|
+
/**
 * Extract grader judgments across one or more results files, instrumented and
 * self-healing. See the module header for the rationale.
 *
 * @param resultsPaths One or more results files (e.g. baseline + agentic in
 *   literacy full mode). Missing paths are skipped.
 * @param telemetry Shared reliability sink threaded into every extraction.
 * @param log Pipeline logger — the junction is logged here on every run.
 * @param options Retry tuning (`maxAttempts`, `delayMs`) and the injected
 *   dependencies; `options.deps.extract` is required.
 * @returns The judgments from the first attempt that yields any; an empty
 *   array when no results file is present or every attempt stays empty.
 */
export declare function extractGraderJudgmentsResilient(resultsPaths: readonly string[], telemetry: ExtractionTelemetry | undefined, log: Logger, options: ResilientExtractionOptions): Promise<GraderJudgment[]>;
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/extract-grader-judgments-resilient.ts
|
|
3
|
+
*
|
|
4
|
+
* Resilient grader-judgment extraction for the `calculate-scores` persist
|
|
5
|
+
* junction.
|
|
6
|
+
*
|
|
7
|
+
* Background: `calculateAndWriteScores` extracts grader judgments from the
|
|
8
|
+
* eval results file(s), then writes `grader-judgments.json` only when the
|
|
9
|
+
* array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
|
|
10
|
+
* observed where `extractGraderJudgments` returned 0 judgments for a results
|
|
11
|
+
* file that demonstrably contained classifiable llm-rubric components — the
|
|
12
|
+
* same file, read tens of milliseconds later by `extractStoredTestResults`,
|
|
13
|
+
* yielded the full set (entries with populated dimensions). The empty array
|
|
14
|
+
* silently skipped the write, so gap-analysis and compute-attribution skipped
|
|
15
|
+
* and the report shipped with a score but no tests.
|
|
16
|
+
*
|
|
17
|
+
* The committed code reads the file via a pure `readFileSync` with identical
|
|
18
|
+
* classification on both sides, so the divergence is not reproducible from the
|
|
19
|
+
* source + captured artifacts — it is a transient read anomaly at the live
|
|
20
|
+
* junction. This wrapper does not pretend to know the mechanism; it makes the
|
|
21
|
+
* junction observable and recovers from the transient:
|
|
22
|
+
*
|
|
23
|
+
* 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
|
|
24
|
+
* result count, and judgment count on every run (never silent on 0), so a
|
|
25
|
+
* future empty-judgments persist is diagnosable from the run log alone.
|
|
26
|
+
* 2. **Self-heals** — when extraction yields 0 judgments but a results file
|
|
27
|
+
* exists, it re-extracts with bounded retries. A later read that yields
|
|
28
|
+
* judgments proves the initial 0 was transient; the recovered judgments
|
|
29
|
+
* are returned. If every attempt yields 0, severity is decided by an
|
|
30
|
+
* independent classifiable-component count: a genuinely judgment-free run
|
|
31
|
+
* (all api-errors / no llm-rubric) logs a warning, while 0 judgments
|
|
32
|
+
* against present classifiable components logs an error (the downstream
|
|
33
|
+
* gap-analysis fail-loud guard is the backstop).
|
|
34
|
+
*
|
|
35
|
+
* The extractor and fs helpers are injected so the wrapper is unit-testable
|
|
36
|
+
* without importing the ~3000-line scoring module (which would be circular)
|
|
37
|
+
* or touching the real filesystem.
|
|
38
|
+
*/
|
|
39
|
+
import { existsSync, statSync } from "node:fs";
|
|
40
|
+
/**
 * Default fs-backed stat used when no `statFile` seam is injected.
 *
 * Performs a single `statSync` call rather than an `existsSync` check followed
 * by `statSync`: the two-step form can throw if the file is removed between
 * the calls, which would turn a diagnostics helper into a pipeline failure.
 * Any stat failure (missing path, unreadable parent, broken symlink) is
 * reported as "not present", matching what `existsSync` reports for the same
 * conditions.
 *
 * @param path Path of the results file to inspect.
 * @returns `{ exists, mtimeMs, size }`; zeros with `exists: false` when the
 *   path cannot be stat'ed.
 */
const defaultStat = (path) => {
    try {
        const s = statSync(path);
        return { exists: true, mtimeMs: s.mtimeMs, size: s.size };
    }
    catch {
        return { exists: false, mtimeMs: 0, size: 0 };
    }
};
|
|
46
|
+
/**
 * Default backoff used when no `sleep` seam is injected: a promise that
 * settles once a `setTimeout` of `ms` milliseconds fires.
 */
const defaultSleep = (ms) => new Promise((wake) => {
    setTimeout(wake, ms);
});
|
|
47
|
+
/**
|
|
48
|
+
* Extract grader judgments across one or more results files, instrumented and
|
|
49
|
+
* self-healing. See the module header for the rationale.
|
|
50
|
+
*
|
|
51
|
+
* @param resultsPaths One or more results files (e.g. baseline + agentic in
|
|
52
|
+
* literacy full mode). Missing paths are skipped.
|
|
53
|
+
* @param telemetry Shared reliability sink threaded into every extraction.
|
|
54
|
+
* @param log Pipeline logger — the junction is logged here on every run.
|
|
55
|
+
*/
|
|
56
|
+
export async function extractGraderJudgmentsResilient(resultsPaths, telemetry, log, options) {
|
|
57
|
+
const { extract } = options.deps;
|
|
58
|
+
const statFile = options.deps.statFile ?? defaultStat;
|
|
59
|
+
const sleep = options.deps.sleep ?? defaultSleep;
|
|
60
|
+
const { countResults, countClassifiable } = options.deps;
|
|
61
|
+
const maxAttempts = Math.max(1, options.maxAttempts ?? 3);
|
|
62
|
+
const delayMs = options.delayMs ?? 200;
|
|
63
|
+
const present = resultsPaths.filter((p) => statFile(p).exists);
|
|
64
|
+
const extractAll = () => {
|
|
65
|
+
const all = [];
|
|
66
|
+
for (const p of present) {
|
|
67
|
+
all.push(...extract(p, telemetry));
|
|
68
|
+
}
|
|
69
|
+
return all;
|
|
70
|
+
};
|
|
71
|
+
const diag = (path) => {
|
|
72
|
+
const st = statFile(path);
|
|
73
|
+
return {
|
|
74
|
+
mtimeMs: st.mtimeMs,
|
|
75
|
+
path,
|
|
76
|
+
sizeBytes: st.size,
|
|
77
|
+
...(countResults ? { resultCount: countResults(path) } : {}),
|
|
78
|
+
};
|
|
79
|
+
};
|
|
80
|
+
// Attempt 1 — always instrument the junction so 0 is never silent.
|
|
81
|
+
let judgments = extractAll();
|
|
82
|
+
for (const p of present) {
|
|
83
|
+
log.info("Grader judgments — persist junction read", diag(p));
|
|
84
|
+
}
|
|
85
|
+
log.info(`Grader judgments extracted: ${judgments.length} total`, {
|
|
86
|
+
judgmentCount: judgments.length,
|
|
87
|
+
paths: present,
|
|
88
|
+
});
|
|
89
|
+
if (judgments.length > 0)
|
|
90
|
+
return judgments;
|
|
91
|
+
// 0 judgments and no results file present → genuinely nothing to grade.
|
|
92
|
+
// A missing file cannot become non-empty within the retry window.
|
|
93
|
+
if (present.length === 0) {
|
|
94
|
+
log.info("No grader judgments — no results file present (nothing to grade)");
|
|
95
|
+
return judgments;
|
|
96
|
+
}
|
|
97
|
+
// Results file(s) exist but extraction yielded 0 judgments — a suspected
|
|
98
|
+
// transient read anomaly. Loud diagnostic, then bounded self-heal retries;
|
|
99
|
+
// the same file read tens of ms later has been observed to yield the full set.
|
|
100
|
+
log.warn("Grader extraction returned 0 judgments despite present results file(s) — suspected transient read anomaly; attempting self-heal", { paths: present.map(diag) });
|
|
101
|
+
for (let attempt = 2; attempt <= maxAttempts; attempt++) {
|
|
102
|
+
await sleep(delayMs);
|
|
103
|
+
judgments = extractAll();
|
|
104
|
+
log.warn(`Grader self-heal attempt ${attempt}/${maxAttempts}: ${judgments.length} judgment(s)`);
|
|
105
|
+
if (judgments.length > 0) {
|
|
106
|
+
log.warn(`Grader self-heal recovered ${judgments.length} grader judgment(s) on attempt ${attempt} — the initial empty extraction was a transient read anomaly`);
|
|
107
|
+
return judgments;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
// Still empty after every attempt. Severity depends on whether the files
|
|
111
|
+
// actually contain classifiable components.
|
|
112
|
+
const classifiable = countClassifiable
|
|
113
|
+
? present.reduce((n, p) => n + countClassifiable(p), 0)
|
|
114
|
+
: undefined;
|
|
115
|
+
if (classifiable === 0) {
|
|
116
|
+
log.warn(`No grader judgments after ${maxAttempts} attempt(s) — results contain no classifiable llm-rubric components (e.g. all api-errors); nothing to persist`);
|
|
117
|
+
}
|
|
118
|
+
else {
|
|
119
|
+
log.error(`Grader judgments empty after ${maxAttempts} attempt(s) but ${classifiable ?? "an unknown number of"} classifiable component(s) present in the results file(s) — persisting none; downstream gap-analysis/attribution will fail loud`, { paths: present.map(diag) });
|
|
120
|
+
}
|
|
121
|
+
return judgments;
|
|
122
|
+
}
|
package/dist/report-store.d.ts
CHANGED
|
@@ -216,6 +216,7 @@ export interface SanityReportDoc {
|
|
|
216
216
|
_type: string;
|
|
217
217
|
comparison: null | Omit<ComparisonReport, "baseline" | "experiment">;
|
|
218
218
|
completedAt: string;
|
|
219
|
+
degraded?: Report["degraded"];
|
|
219
220
|
durationMs: number;
|
|
220
221
|
provenance: Report["provenance"];
|
|
221
222
|
reportId: ReportId;
|
package/dist/report-store.js
CHANGED
|
@@ -477,6 +477,7 @@ export function toSanityReportDoc(report) {
|
|
|
477
477
|
_type: REPORT_TYPE,
|
|
478
478
|
comparison,
|
|
479
479
|
completedAt: report.completedAt,
|
|
480
|
+
...(report.degraded ? { degraded: report.degraded } : {}),
|
|
480
481
|
durationMs: report.durationMs,
|
|
481
482
|
provenance: report.provenance,
|
|
482
483
|
reportId: report.id,
|
|
@@ -526,6 +527,7 @@ export function toReport(doc) {
|
|
|
526
527
|
artifactManifest,
|
|
527
528
|
comparison: doc.comparison,
|
|
528
529
|
completedAt: doc.completedAt,
|
|
530
|
+
degraded: doc.degraded,
|
|
529
531
|
durationMs: doc.durationMs,
|
|
530
532
|
id: doc.reportId,
|
|
531
533
|
provenance: doc.provenance,
|
package/dist/sanity/queries.d.ts
CHANGED
|
@@ -69,7 +69,7 @@ export declare const ALL_ARTICLES_QUERY = "\n *[_type == \"article\"\n && !(
|
|
|
69
69
|
*
|
|
70
70
|
* Usage: client.fetch(ARTICLES_METADATA_BY_SLUGS_QUERY, { slugs: ["slug-a", "slug-b"] })
|
|
71
71
|
*/
|
|
72
|
-
export declare const ARTICLES_METADATA_BY_SLUGS_QUERY = "\n *[_type == \"article\"\n && slug.current in $slugs\n && !(_id in path(\"drafts.**\"))\n ] {\n \"slug\": slug.current,\n _id,\n _rev,\n title\n }\n";
|
|
72
|
+
export declare const ARTICLES_METADATA_BY_SLUGS_QUERY = "\n *[_type == \"article\"\n && slug.current in $slugs\n && !(_id in path(\"drafts.**\"))\n ] {\n \"slug\": slug.current,\n \"sectionSlug\": primarySection->slug.current,\n _id,\n _rev,\n title\n }\n";
|
|
73
73
|
/**
|
|
74
74
|
* Fetch a single article by its slug — identical to ARTICLE_BY_SLUG_QUERY
|
|
75
75
|
* but designed to be called with a perspective-enabled client.
|
package/dist/sanity/queries.js
CHANGED
package/dist/sources.js
CHANGED
|
@@ -37,6 +37,44 @@ const DEFAULT_SOURCE = {
|
|
|
37
37
|
studioOrigin: "https://admin.sanity.io",
|
|
38
38
|
urls: [],
|
|
39
39
|
};
|
|
40
|
+
/**
 * Build the effective source for the "no usable `config/sources`" case by
 * layering `SourceOverrides` and env-var fallbacks on top of `DEFAULT_SOURCE`.
 *
 * Why this exists: `loadSource` takes its DEFAULT_SOURCE early-return
 * branches whenever `config/sources` is missing or empty — which is the
 * production state, because the named source definitions actually live in
 * the `sanity-literacy` preset's `sourceDefs` (which `loadSource` doesn't
 * consult). Returning `DEFAULT_SOURCE` verbatim dropped every override the
 * caller passed in, including `perspective` — observed live as
 * production-source release evals fetching the published doc revision
 * (W0295).
 *
 * For each overridable field an explicit override wins; otherwise the
 * env-derived value (a `parse*Env` helper or `process.env`) is used, with a
 * built-in literal as the last resort where one is given.
 *
 * The merge order mirrors the priority-1 (env-baseUrl) branch. The listed
 * differences from that branch are intentional:
 *   (a) `baseUrl` / `llmsTxt` / `name` / `priorityDomain` stay pinned to
 *       `DEFAULT_SOURCE`;
 *   (b) `documentIds` falls back to `[]` (the prior `DEFAULT_SOURCE` shape)
 *       where priority-1 would return `undefined` — both fall through the
 *       same `length > 0` consumer check, so behaviorally equivalent.
 */
function applyOverridesToDefault(overrides) {
    const originsOverride = overrides?.allowedOrigins ?? parseAllowedOriginsEnv();
    const headersOverride = overrides?.headers ?? parseHeadersEnv();
    // Start from a shallow copy of the built-in default, then overwrite field by field.
    const merged = { ...DEFAULT_SOURCE };
    if (originsOverride) {
        merged.allowedOrigins = originsOverride;
    }
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string env var should fall back
    merged.dataset = overrides?.dataset ?? (process.env.SANITY_DATASET || "next");
    merged.documentIds = overrides?.documentIds ?? parseDocumentIdsEnv() ?? [];
    if (headersOverride) {
        merged.headers = headersOverride;
    }
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string env var should fall back
    merged.perspective = overrides?.perspective ?? (process.env.SANITY_PERSPECTIVE || undefined);
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string env var should fall back
    merged.projectId = overrides?.projectId ?? (process.env.SANITY_PROJECT_ID || "3do82whm");
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string env var should fall back
    merged.studioOrigin = overrides?.studioOrigin ??
        (process.env.SANITY_STUDIO_ORIGIN || "https://admin.sanity.io");
    merged.urls = overrides?.directUrls ?? parseDirectUrlsEnv();
    return merged;
}
|
|
40
78
|
// ---------------------------------------------------------------------------
|
|
41
79
|
// Validation
|
|
42
80
|
// ---------------------------------------------------------------------------
|
|
@@ -117,12 +155,12 @@ export function loadSource(name, overrides, logger) {
|
|
|
117
155
|
defaultBaseUrl: DEFAULT_SOURCE.baseUrl,
|
|
118
156
|
});
|
|
119
157
|
console.log(" No config/sources found, using built-in default (sanity.io production)");
|
|
120
|
-
return
|
|
158
|
+
return applyOverridesToDefault(overrides);
|
|
121
159
|
}
|
|
122
160
|
if (!rawFile?.sources || Object.keys(rawFile.sources).length === 0) {
|
|
123
161
|
log.debug("config/sources is empty, falling back to built-in default");
|
|
124
162
|
console.log(" config/sources is empty, using built-in default");
|
|
125
|
-
return
|
|
163
|
+
return applyOverridesToDefault(overrides);
|
|
126
164
|
}
|
|
127
165
|
// Resolve which source to use
|
|
128
166
|
const sourceName =
|