@sanity/ailf 7.2.2 → 7.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +38 -0
- package/config/bigquery/README.md +39 -7
- package/config/bigquery/views/reports.sql +6 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +22 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/report.d.ts +30 -0
- package/dist/_vendor/ailf-core/schemas/report.js +21 -2
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +14 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +4 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.d.ts +116 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.js +128 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +19 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/report-validity.d.ts +60 -0
- package/dist/_vendor/ailf-core/types/report-validity.js +42 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +4 -3
- package/dist/_vendor/ailf-shared/glossary.d.ts +32 -0
- package/dist/_vendor/ailf-shared/glossary.js +35 -0
- package/dist/_vendor/ailf-shared/index.d.ts +2 -1
- package/dist/_vendor/ailf-shared/index.js +2 -1
- package/dist/_vendor/ailf-shared/run-classification.d.ts +53 -0
- package/dist/_vendor/ailf-shared/run-classification.js +111 -0
- package/dist/_vendor/ailf-shared/trustworthiness.d.ts +97 -0
- package/dist/_vendor/ailf-shared/trustworthiness.js +86 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +8 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +10 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +12 -2
- package/dist/artifact-capture/gcs-artifact-writer.js +18 -0
- package/dist/commands/publish.js +9 -2
- package/dist/orchestration/steps/publish-report-step.js +11 -3
- package/dist/orchestration/steps/run-eval-step.js +56 -3
- package/dist/pipeline/cache-hit-restore.d.ts +37 -1
- package/dist/pipeline/cache-hit-restore.js +108 -1
- package/dist/pipeline/report-validity.d.ts +32 -0
- package/dist/pipeline/report-validity.js +43 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/package.json +1 -1
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trustworthiness.ts — The single trust gate for reports (D0059).
|
|
3
|
+
*
|
|
4
|
+
* `includeInDefaultTrends` is the one definition of "show this report by
|
|
5
|
+
* default." Every surface (dashboard analytics, Studio presets, the BigQuery
|
|
6
|
+
* `reports.sql` view) references this predicate so the gate cannot drift
|
|
7
|
+
* between consumers.
|
|
8
|
+
*
|
|
9
|
+
* Two orthogonal axes decide inclusion:
|
|
10
|
+
*
|
|
11
|
+
* - **Validity (data health, D0059)** — the *primary* gate. A report is
|
|
12
|
+
* included only when its `validity.status` is `ok` OR validity is absent
|
|
13
|
+
* (pre-stamp reads are trusted until backfilled — the rollout is additive
|
|
14
|
+
* and nullable). Any non-`ok` status (`degraded` / `incomplete` /
|
|
15
|
+
* `suspect`) excludes the report regardless of intent.
|
|
16
|
+
* - **Intent (run classification, D0037)** — a *secondary* exclusion. The
|
|
17
|
+
* explicit `test` and `experimental` classifications are dropped;
|
|
18
|
+
* `adhoc` / `official` / `external` (and a missing classification) are kept.
|
|
19
|
+
* `adhoc` is intentionally included — it holds real production one-offs;
|
|
20
|
+
* the validity gate, not the intent gate, removes the bad ones inside it.
|
|
21
|
+
*
|
|
22
|
+
* We model a slim subset of the core `Report` shape (the two read axes) rather
|
|
23
|
+
* than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
|
|
24
|
+
* package is the dependency-graph leaf and imports nothing from core. A full
|
|
25
|
+
* core `Report` is structurally assignable to {@link TrustGateReport}.
|
|
26
|
+
*
|
|
27
|
+
* The predicate is total — it never throws — and is kept trivially
|
|
28
|
+
* translatable to the two query-language forms it is materialized as on the
|
|
29
|
+
* other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
|
|
30
|
+
* Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
|
|
31
|
+
* SQL boolean in the BigQuery `reports.sql` view
|
|
32
|
+
* ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
|
|
33
|
+
* function, so the one gate cannot drift between consumers; a cross-check test
|
|
34
|
+
* asserts all three forms agree across the full truth table.
|
|
35
|
+
*
|
|
36
|
+
* Note the SQL form is NULL-safe on *both* axes: a bare
|
|
37
|
+
* `classification NOT IN ('test','experimental')` would evaluate to `NULL`
|
|
38
|
+
* (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
|
|
39
|
+
* excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
|
|
40
|
+
* `classification IS NULL OR …`.
|
|
41
|
+
*
|
|
42
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
43
|
+
* @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
|
|
44
|
+
*/
|
|
45
|
+
/**
 * Whether a report should appear in default trend views.
 *
 * Two independent axes are read from the report:
 *
 * - Validity (primary gate): passes when `validity.status` is `"ok"` or is
 *   absent (`null` / `undefined`, including a missing `validity` object).
 *   Any other status excludes the report.
 * - Intent (secondary exclusion): only the explicit `"test"` and
 *   `"experimental"` values of `provenance.classification` exclude the
 *   report; every other value, and a missing classification, is kept.
 *
 * The predicate is total: it never throws. A nullish `report` is not a
 * report at all, so there is nothing to show and the result is `false`.
 *
 * @param report - Object exposing the optional `validity.status` and
 *   `provenance.classification` fields; may be `null` / `undefined`.
 * @returns `true` when the report is trustworthy enough to show by default.
 */
export function includeInDefaultTrends(report) {
    // Guard the one dereference optional chaining below does not cover, so the
    // "never throws" contract also holds for a missing report.
    if (report == null) {
        return false;
    }
    const status = report.validity?.status;
    // Primary gate: trustworthy when explicitly `ok` or not yet assessed.
    // `== null` intentionally matches both `null` and `undefined`.
    const validityOk = status == null || status === "ok";
    const classification = report.provenance?.classification;
    // Secondary exclusion: drop explicit test/experimental intent only.
    const intentIncluded = classification !== "test" && classification !== "experimental";
    return validityOk && intentIncluded;
}
|
|
62
|
+
/**
|
|
63
|
+
* GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
|
|
64
|
+
* `ailf.report` document. Drop it into a Studio structure filter with the
|
|
65
|
+
* document-type guard, e.g.
|
|
66
|
+
* `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
|
|
67
|
+
*
|
|
68
|
+
* GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
|
|
69
|
+
* unclassified report passes the intent clause without an explicit
|
|
70
|
+
* `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
|
|
71
|
+
* The `!defined(validity.status)` disjunct makes the absent-validity case trusted.
|
|
72
|
+
*/
|
|
73
|
+
export const INCLUDE_IN_DEFAULT_TRENDS_GROQ = '(!defined(validity.status) || validity.status == "ok") && !(provenance.classification in ["test", "experimental"])';
|
|
74
|
+
/**
|
|
75
|
+
* SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
|
|
76
|
+
* flattened `ailf.reports` BigQuery row (columns `validity_status`,
|
|
77
|
+
* `classification`). Materialized verbatim as the `include_in_default_trends`
|
|
78
|
+
* column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
|
|
79
|
+
* asserts the view embeds this exact string.
|
|
80
|
+
*
|
|
81
|
+
* Both axes are NULL-safe so the column matches the TS predicate row-for-row:
|
|
82
|
+
* `classification NOT IN (...)` alone is `NULL` for an unclassified row under
|
|
83
|
+
* SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
|
|
84
|
+
* silently dropping pre-taxonomy reports the TS predicate keeps.
|
|
85
|
+
*/
|
|
86
|
+
export const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
|
|
@@ -1564,8 +1564,8 @@ export declare const RepoConfigSchema: z.ZodObject<{
|
|
|
1564
1564
|
summary: z.ZodOptional<z.ZodObject<{
|
|
1565
1565
|
onRun: z.ZodOptional<z.ZodEnum<{
|
|
1566
1566
|
never: "never";
|
|
1567
|
-
always: "always";
|
|
1568
1567
|
auto: "auto";
|
|
1568
|
+
always: "always";
|
|
1569
1569
|
}>>;
|
|
1570
1570
|
}, z.core.$strip>>;
|
|
1571
1571
|
taskSource: z.ZodOptional<z.ZodObject<{
|
|
@@ -38,6 +38,14 @@ export declare class FanoutArtifactWriter implements ArtifactWriter {
|
|
|
38
38
|
private readonly writers;
|
|
39
39
|
private readonly progress?;
|
|
40
40
|
constructor(writers: readonly ArtifactWriter[], options?: FanoutArtifactWriterOptions);
|
|
41
|
+
/**
|
|
42
|
+
* The delegate writers in declaration order. Exposed read-only so callers
|
|
43
|
+
* walking the writer chain (e.g. to feature-detect an
|
|
44
|
+
* `ArtifactObjectChecker`) can descend into the fanout without it having to
|
|
45
|
+
* re-implement every optional capability. Mirrors the decorators' readonly
|
|
46
|
+
* `inner` accessor.
|
|
47
|
+
*/
|
|
48
|
+
get delegates(): readonly ArtifactWriter[];
|
|
41
49
|
private reportProgress;
|
|
42
50
|
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
51
|
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
@@ -33,6 +33,16 @@ export class FanoutArtifactWriter {
|
|
|
33
33
|
this.writers = writers;
|
|
34
34
|
this.progress = options.progress;
|
|
35
35
|
}
|
|
36
|
+
/**
|
|
37
|
+
* The delegate writers in declaration order. Exposed read-only so callers
|
|
38
|
+
* walking the writer chain (e.g. to feature-detect an
|
|
39
|
+
* `ArtifactObjectChecker`) can descend into the fanout without it having to
|
|
40
|
+
* re-implement every optional capability. Mirrors the decorators' readonly
|
|
41
|
+
* `inner` accessor.
|
|
42
|
+
*/
|
|
43
|
+
get delegates() {
|
|
44
|
+
return this.writers;
|
|
45
|
+
}
|
|
36
46
|
reportProgress(ref) {
|
|
37
47
|
if (!this.progress)
|
|
38
48
|
return;
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
29
29
|
*/
|
|
30
30
|
import { Storage } from "@google-cloud/storage";
|
|
31
|
-
import { type ArtifactEntry, type ArtifactRef, type ArtifactType, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssociationValues, type RunId, type RunManifest, type WriteSource } from "../_vendor/ailf-core/index.d.ts";
|
|
31
|
+
import { type ArtifactEntry, type ArtifactObjectChecker, type ArtifactRef, type ArtifactType, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssociationValues, type RunId, type RunManifest, type WriteSource } from "../_vendor/ailf-core/index.d.ts";
|
|
32
32
|
import { type UploadMetricsSink } from "./upload-metrics.js";
|
|
33
33
|
export interface GcsArtifactWriterOptions {
|
|
34
34
|
/** GCS bucket name (e.g., "ailf-artifacts") */
|
|
@@ -61,7 +61,7 @@ export interface GcsArtifactWriterOptions {
|
|
|
61
61
|
*/
|
|
62
62
|
writerSource?: WriteSource;
|
|
63
63
|
}
|
|
64
|
-
export declare class GcsArtifactWriter implements ArtifactWriter {
|
|
64
|
+
export declare class GcsArtifactWriter implements ArtifactWriter, ArtifactObjectChecker {
|
|
65
65
|
private client;
|
|
66
66
|
private readonly options;
|
|
67
67
|
private readonly ndjsonStreams;
|
|
@@ -83,6 +83,16 @@ export declare class GcsArtifactWriter implements ArtifactWriter {
|
|
|
83
83
|
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
84
84
|
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
85
85
|
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
86
|
+
/**
|
|
87
|
+
* Existence probe used by the cache-hit restore prune (D0057). Unlike the
|
|
88
|
+
* write methods (P5 non-blocking — swallow errors, return null), this
|
|
89
|
+
* resolves `false` ONLY for a definitively-absent object and **throws** on
|
|
90
|
+
* any other failure (auth / network / quota) so the caller can fail open
|
|
91
|
+
* and keep the ref rather than dropping a real artifact on a transient
|
|
92
|
+
* blip. `file.exists()` rejects only on real errors; a missing object
|
|
93
|
+
* resolves `[false]`.
|
|
94
|
+
*/
|
|
95
|
+
objectExists(path: string): Promise<boolean>;
|
|
86
96
|
/** @deprecated Use `emit()` instead. Routes through the same GCS I/O. */
|
|
87
97
|
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
88
98
|
/** @deprecated Use `emit()` per entry instead. */
|
|
@@ -223,6 +223,24 @@ export class GcsArtifactWriter {
|
|
|
223
223
|
this.reportProgress(ref);
|
|
224
224
|
return ref;
|
|
225
225
|
}
|
|
226
|
+
// ---- ArtifactObjectChecker (D0057) --------------------------------------
|
|
227
|
+
/**
|
|
228
|
+
* Existence probe used by the cache-hit restore prune (D0057). Unlike the
|
|
229
|
+
* write methods (P5 non-blocking — swallow errors, return null), this
|
|
230
|
+
* resolves `false` ONLY for a definitively-absent object and **throws** on
|
|
231
|
+
* any other failure (auth / network / quota) so the caller can fail open
|
|
232
|
+
* and keep the ref rather than dropping a real artifact on a transient
|
|
233
|
+
* blip. `file.exists()` rejects only on real errors; a missing object
|
|
234
|
+
* resolves `[false]`.
|
|
235
|
+
*/
|
|
236
|
+
async objectExists(path) {
|
|
237
|
+
const storage = this.getClient();
|
|
238
|
+
const [exists] = await storage
|
|
239
|
+
.bucket(this.options.bucket)
|
|
240
|
+
.file(path)
|
|
241
|
+
.exists();
|
|
242
|
+
return exists;
|
|
243
|
+
}
|
|
226
244
|
// ---- Deprecated legacy surface (W0052) ----------------------------------
|
|
227
245
|
/** @deprecated Use `emit()` instead. Routes through the same GCS I/O. */
|
|
228
246
|
async writeBulk(type, runId, data) {
|
package/dist/commands/publish.js
CHANGED
|
@@ -27,6 +27,7 @@ import { addOutputDirOption } from "./shared/options.js";
|
|
|
27
27
|
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
28
28
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
29
29
|
import { generateReportTitle } from "../pipeline/report-title.js";
|
|
30
|
+
import { stampReportValidity } from "../pipeline/report-validity.js";
|
|
30
31
|
import { buildSlimReportSummary } from "../_vendor/ailf-core/index.js";
|
|
31
32
|
import { generateReportId, } from "../report-store.js";
|
|
32
33
|
import { withRetry } from "../sinks/retry.js";
|
|
@@ -214,8 +215,14 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
|
214
215
|
// -----------------------------------------------------------------------
|
|
215
216
|
// 5. Write to Sanity (system of record)
|
|
216
217
|
// -----------------------------------------------------------------------
|
|
218
|
+
// Stamp the data-health validity axis + normalize classification (D0059)
|
|
219
|
+
// — the same server-computed forward guarantee the pipeline write path
|
|
220
|
+
// applies, so reports published via this command carry validity too.
|
|
221
|
+
const stampedReport = stampReportValidity(report, now);
|
|
217
222
|
console.log(" Writing to Sanity Content Lake...");
|
|
218
|
-
const sanityResult = store
|
|
223
|
+
const sanityResult = store
|
|
224
|
+
? await store.write(stampedReport)
|
|
225
|
+
: null;
|
|
219
226
|
if (sanityResult) {
|
|
220
227
|
console.log(` ✅ Report written: ${sanityResult}`);
|
|
221
228
|
}
|
|
@@ -237,7 +244,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
|
237
244
|
console.log();
|
|
238
245
|
console.log(` Delivering to ${sinks.length} sink(s)...`);
|
|
239
246
|
const settled = await Promise.allSettled(sinks.map(async (sink) => {
|
|
240
|
-
const result = await withRetry(() => sink.publish(
|
|
247
|
+
const result = await withRetry(() => sink.publish(stampedReport));
|
|
241
248
|
return { name: sink.name, result };
|
|
242
249
|
}));
|
|
243
250
|
for (const outcome of settled) {
|
|
@@ -16,6 +16,7 @@ import { assoc, buildSlimReportSummary, } from "../../_vendor/ailf-core/index.js
|
|
|
16
16
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
17
17
|
import { buildProvenance, } from "../../pipeline/provenance.js";
|
|
18
18
|
import { generateReportTitle } from "../../pipeline/report-title.js";
|
|
19
|
+
import { stampReportValidity } from "../../pipeline/report-validity.js";
|
|
19
20
|
import { generateReportId } from "../../report-store.js";
|
|
20
21
|
import { withRetry } from "../../sinks/retry.js";
|
|
21
22
|
export class PublishReportStep {
|
|
@@ -145,21 +146,28 @@ export class PublishReportStep {
|
|
|
145
146
|
testResults: slimSummary.testResults.map(slimTestResult),
|
|
146
147
|
};
|
|
147
148
|
}
|
|
149
|
+
// Stamp the data-health `validity` axis (D0059) and normalize
|
|
150
|
+
// `provenance.classification` on the report now that it is fully assembled
|
|
151
|
+
// (degradation + slim summary settled). The verdict is server-computed
|
|
152
|
+
// from the report's own data — never the caller envelope (D0037) — and
|
|
153
|
+
// assessed at the report's completion time. From here on, the stamped
|
|
154
|
+
// report is what reaches the snapshot artifact, the store, and the sinks.
|
|
155
|
+
const stampedReport = stampReportValidity(report, now);
|
|
148
156
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
149
157
|
state.reportId = reportId;
|
|
150
158
|
// W0050 — migrated from ctx.collector.capture to the unified writer.
|
|
151
159
|
// reportSnapshot: full Report JSON for replay (run-scoped, bulk).
|
|
152
|
-
await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx),
|
|
160
|
+
await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), stampedReport);
|
|
153
161
|
// autoComparison: delta vs baseline (run-scoped, bulk, optional).
|
|
154
162
|
if (comparison) {
|
|
155
163
|
await ctx.artifactWriter.emit("autoComparison", assoc(ctx), comparison);
|
|
156
164
|
}
|
|
157
165
|
// Write to store (system of record — best-effort, P5)
|
|
158
166
|
const sanityResult = ctx.reportStore
|
|
159
|
-
? await ctx.reportStore.write(
|
|
167
|
+
? await ctx.reportStore.write(stampedReport)
|
|
160
168
|
: null;
|
|
161
169
|
// Run sinks (fire-and-forget, P6)
|
|
162
|
-
const publishResult = await runSinks(
|
|
170
|
+
const publishResult = await runSinks(stampedReport, ctx);
|
|
163
171
|
// sinkResults: per-sink outcome (run-scoped, per-entry keyed by sink name).
|
|
164
172
|
for (const r of publishResult.sinkResults) {
|
|
165
173
|
await ctx.artifactWriter.emit("sinkResults", assoc(ctx, { name: r.name }), {
|
|
@@ -11,9 +11,11 @@ import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
|
11
11
|
import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
|
|
12
12
|
import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
|
|
13
13
|
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
14
|
+
import { FanoutArtifactWriter } from "../../artifact-capture/fanout-artifact-writer.js";
|
|
15
|
+
import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
|
|
14
16
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
15
17
|
import { buildCacheContext } from "../cache-context.js";
|
|
16
|
-
import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
|
|
18
|
+
import { pruneToResolvableRefs, remapToCacheHitRefs, } from "../../pipeline/cache-hit-restore.js";
|
|
17
19
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
18
20
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
19
21
|
import { loadGraderModel } from "../../pipeline/grader-api.js";
|
|
@@ -147,11 +149,29 @@ export class RunEvalStep {
|
|
|
147
149
|
remoteCacheResult.sourceRunId &&
|
|
148
150
|
ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
|
|
149
151
|
const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
|
|
150
|
-
|
|
151
|
-
|
|
152
|
+
// W0350 / D0057 — a degraded source run can advertise per-entry
|
|
153
|
+
// artifacts (e.g. rawResults) whose objects were never written under
|
|
154
|
+
// its prefix. Drop those over-claims here, at the restore boundary,
|
|
155
|
+
// so the new run's manifest advertises only artifacts that resolve —
|
|
156
|
+
// rather than pushing per-object HEAD checks onto the read side's hot
|
|
157
|
+
// signing path (AC 3). When no object checker is reachable in the
|
|
158
|
+
// writer chain (local-only / NoOp / gateway backends), skip the prune
|
|
159
|
+
// and restore verbatim, preserving prior behavior.
|
|
160
|
+
const checker = findObjectChecker(ctx.artifactWriter);
|
|
161
|
+
const { manifest: resolvable, droppedEntries, droppedRefs, } = checker
|
|
162
|
+
? await pruneToResolvableRefs(restored, checker)
|
|
163
|
+
: { manifest: restored, droppedEntries: 0, droppedRefs: 0 };
|
|
164
|
+
ctx.artifactWriter.injectAccumulated(resolvable);
|
|
165
|
+
const count = Object.keys(resolvable).length;
|
|
152
166
|
if (count > 0) {
|
|
153
167
|
console.log(` ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
|
|
154
168
|
}
|
|
169
|
+
if (droppedEntries > 0 || droppedRefs > 0) {
|
|
170
|
+
const refsNote = droppedRefs > 0
|
|
171
|
+
? ` and ${droppedRefs} ref${droppedRefs === 1 ? "" : "s"}`
|
|
172
|
+
: "";
|
|
173
|
+
console.log(` ⚠️ Dropped ${droppedEntries} unresolvable artifact entr${droppedEntries === 1 ? "y" : "ies"}${refsNote} over-claimed by cache parent ${remoteCacheResult.sourceRunId}`);
|
|
174
|
+
}
|
|
155
175
|
}
|
|
156
176
|
return {
|
|
157
177
|
durationMs: Date.now() - start,
|
|
@@ -275,6 +295,39 @@ export class RunEvalStep {
|
|
|
275
295
|
}
|
|
276
296
|
}
|
|
277
297
|
// ---------------------------------------------------------------------------
|
|
298
|
+
// Object-checker discovery (D0057 / W0350)
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
const FIND_CHECKER_MAX_STEPS = 16;
|
|
301
|
+
/**
|
|
302
|
+
* Walk a writer's decorator/fanout chain to feature-detect an
|
|
303
|
+
* `ArtifactObjectChecker`. The composition root wraps the backend in
|
|
304
|
+
* `AccumulatingArtifactWriter` (and optionally `InstrumentedArtifactWriter`)
|
|
305
|
+
* and layers GCS over local via `FanoutArtifactWriter`. Only
|
|
306
|
+
* `GcsArtifactWriter` implements `objectExists`, so on local-only / NoOp /
|
|
307
|
+
* gateway chains this returns null and the cache-hit restore skips pruning.
|
|
308
|
+
* `FIND_CHECKER_MAX_STEPS` is a cycle guard against a future decorator self-reference.
|
|
309
|
+
*/
|
|
310
|
+
function findObjectChecker(writer) {
|
|
311
|
+
const stack = [writer];
|
|
312
|
+
for (let steps = 0; stack.length > 0 && steps < FIND_CHECKER_MAX_STEPS; steps++) {
|
|
313
|
+
const cursor = stack.pop();
|
|
314
|
+
if (!cursor)
|
|
315
|
+
continue;
|
|
316
|
+
if (hasObjectExists(cursor))
|
|
317
|
+
return cursor;
|
|
318
|
+
if (cursor instanceof AccumulatingArtifactWriter)
|
|
319
|
+
stack.push(cursor.inner);
|
|
320
|
+
else if (cursor instanceof InstrumentedArtifactWriter)
|
|
321
|
+
stack.push(cursor.inner);
|
|
322
|
+
else if (cursor instanceof FanoutArtifactWriter)
|
|
323
|
+
stack.push(...cursor.delegates);
|
|
324
|
+
}
|
|
325
|
+
return null;
|
|
326
|
+
}
|
|
327
|
+
function hasObjectExists(w) {
|
|
328
|
+
return (typeof w.objectExists === "function");
|
|
329
|
+
}
|
|
330
|
+
// ---------------------------------------------------------------------------
|
|
278
331
|
// Remote cache helpers
|
|
279
332
|
// ---------------------------------------------------------------------------
|
|
280
333
|
async function checkRemoteCache(fingerprint, reportStore, rootDir) {
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
-
import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
11
|
+
import { type ArtifactManifest, type ArtifactObjectChecker, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
12
12
|
/**
|
|
13
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
14
14
|
* that doesn't already carry one.
|
|
@@ -47,3 +47,39 @@ import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.
|
|
|
47
47
|
export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
|
|
48
48
|
sourceRunId: RunId;
|
|
49
49
|
}): ArtifactManifest;
|
|
50
|
+
/** Outcome of `pruneToResolvableRefs`. */
|
|
51
|
+
export interface PruneResult {
|
|
52
|
+
/** Manifest with over-claimed entries/refs removed. */
|
|
53
|
+
readonly manifest: ArtifactManifest;
|
|
54
|
+
/** Per-entry entries dropped because their object did not exist. */
|
|
55
|
+
readonly droppedEntries: number;
|
|
56
|
+
/** Refs dropped entirely (bulk missing, or per-entry left with no entries). */
|
|
57
|
+
readonly droppedRefs: number;
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Drop artifact refs (and per-entry entries) a cached report over-claimed —
|
|
61
|
+
* entries whose backing object was never written under the source run's
|
|
62
|
+
* storage prefix (D0040 / D0057, W0350).
|
|
63
|
+
*
|
|
64
|
+
* A degraded source run can publish a manifest that lists `rawResults`
|
|
65
|
+
* entries with no GCS object behind them; `remapToCacheHitRefs` copies those
|
|
66
|
+
* phantom entries forward into the new run's manifest, and the read side then
|
|
67
|
+
* signs URLs that 404 ("the specified key does not exist"). Pruning here, at
|
|
68
|
+
* the cache-hit restore boundary, removes the over-claim at the source so the
|
|
69
|
+
* written manifest's `entryCount` / `entries[]` reflect only artifacts that
|
|
70
|
+
* actually resolve — instead of pushing per-object HEAD checks onto the hot
|
|
71
|
+
* signing path (W0350 AC 3).
|
|
72
|
+
*
|
|
73
|
+
* Resolution mirrors the gateway: a per-entry object lives at
|
|
74
|
+
* `descriptor.objectPath(sourceRunId, entry.key)`, where `sourceRunId` is the
|
|
75
|
+
* runId encoded in `ref.path` (preferred — structurally tied to where bytes
|
|
76
|
+
* physically live) falling back to `ref.sourceRunId` (the lineage hint).
|
|
77
|
+
*
|
|
78
|
+
* **Fail open.** `checker.objectExists` throws when existence can't be
|
|
79
|
+
* determined (auth / network / quota). A throw KEEPS the ref/entry — we never
|
|
80
|
+
* drop a real artifact on a transient blip; the read side already tolerates a
|
|
81
|
+
* rare residual 404 (W0349). Only a definitive `false` drops an entry.
|
|
82
|
+
*
|
|
83
|
+
* Pure w.r.t. its inputs: returns a fresh manifest, never mutates `source`.
|
|
84
|
+
*/
|
|
85
|
+
export declare function pruneToResolvableRefs(source: ArtifactManifest, checker: ArtifactObjectChecker): Promise<PruneResult>;
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
-
import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
|
|
11
|
+
import { ARTIFACT_REGISTRY, runId as parseRunId, } from "../_vendor/ailf-core/index.js";
|
|
12
12
|
/**
|
|
13
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
14
14
|
* that doesn't already carry one.
|
|
@@ -60,3 +60,110 @@ export function remapToCacheHitRefs(source, opts) {
|
|
|
60
60
|
}
|
|
61
61
|
return out;
|
|
62
62
|
}
|
|
63
|
+
/**
|
|
64
|
+
* Drop artifact refs (and per-entry entries) a cached report over-claimed —
|
|
65
|
+
* entries whose backing object was never written under the source run's
|
|
66
|
+
* storage prefix (D0040 / D0057, W0350).
|
|
67
|
+
*
|
|
68
|
+
* A degraded source run can publish a manifest that lists `rawResults`
|
|
69
|
+
* entries with no GCS object behind them; `remapToCacheHitRefs` copies those
|
|
70
|
+
* phantom entries forward into the new run's manifest, and the read side then
|
|
71
|
+
* signs URLs that 404 ("the specified key does not exist"). Pruning here, at
|
|
72
|
+
* the cache-hit restore boundary, removes the over-claim at the source so the
|
|
73
|
+
* written manifest's `entryCount` / `entries[]` reflect only artifacts that
|
|
74
|
+
* actually resolve — instead of pushing per-object HEAD checks onto the hot
|
|
75
|
+
* signing path (W0350 AC 3).
|
|
76
|
+
*
|
|
77
|
+
* Resolution mirrors the gateway: a per-entry object lives at
|
|
78
|
+
* `descriptor.objectPath(sourceRunId, entry.key)`, where `sourceRunId` is the
|
|
79
|
+
* runId encoded in `ref.path` (preferred — structurally tied to where bytes
|
|
80
|
+
* physically live) falling back to `ref.sourceRunId` (the lineage hint).
|
|
81
|
+
*
|
|
82
|
+
* **Fail open.** `checker.objectExists` throws when existence can't be
|
|
83
|
+
* determined (auth / network / quota). A throw KEEPS the ref/entry — we never
|
|
84
|
+
* drop a real artifact on a transient blip; the read side already tolerates a
|
|
85
|
+
* rare residual 404 (W0349). Only a definitive `false` drops an entry.
|
|
86
|
+
*
|
|
87
|
+
* Pure w.r.t. its inputs: returns a fresh manifest, never mutates `source`.
|
|
88
|
+
*/
|
|
89
|
+
export async function pruneToResolvableRefs(source, checker) {
    const manifest = {};
    let droppedEntries = 0;
    let droppedRefs = 0;
    for (const [artifactType, ref] of Object.entries(source)) {
        if (!ref) {
            continue;
        }
        const descriptor = ARTIFACT_REGISTRY[artifactType];
        if (ref.layout === "bulk") {
            // Bulk layout: the whole artifact is one object stored at ref.path.
            const present = await existsOrKeep(checker, ref.path);
            if (present) {
                manifest[artifactType] = ref;
            }
            else {
                droppedRefs += 1;
            }
            continue;
        }
        // Per-entry layout: every entry is a separate object whose path is
        // derived from the source run id and the entry key.
        const sourceRunId = resolveSourceRunId(ref);
        const entries = ref.entries ?? [];
        const canProbe = descriptor && sourceRunId !== undefined && entries.length !== 0;
        if (!canProbe) {
            // No way to build per-entry object paths — fail open, keep verbatim.
            manifest[artifactType] = ref;
            continue;
        }
        const probeEntry = async (entry) => {
            let objectPath;
            try {
                objectPath = descriptor.objectPath(sourceRunId, entry.key);
            }
            catch {
                // Malformed key: keep the entry rather than drop it.
                return true;
            }
            return existsOrKeep(checker, objectPath);
        };
        // Entries of one ref are probed concurrently; refs are handled one at a time.
        const verdicts = await Promise.all(entries.map((entry) => probeEntry(entry)));
        const survivors = entries.filter((_, index) => verdicts[index]);
        const lostCount = entries.length - survivors.length;
        if (lostCount === 0) {
            // Nothing was pruned, so the original ref object is reused as-is.
            manifest[artifactType] = ref;
            continue;
        }
        droppedEntries += lostCount;
        if (survivors.length === 0) {
            // Not a single entry resolved — the whole ref is dropped.
            droppedRefs += 1;
            continue;
        }
        // Rebuild the ref so its count and byte total describe only the survivors.
        let survivingBytes = 0;
        for (const entry of survivors) {
            survivingBytes += entry.bytes ?? 0;
        }
        manifest[artifactType] = {
            ...ref,
            entries: survivors,
            entryCount: survivors.length,
            bytes: survivingBytes,
        };
    }
    return { manifest, droppedEntries, droppedRefs };
}
|
|
143
|
+
/**
 * Work out which run's storage prefix holds a ref's bytes.
 *
 * The run id embedded in `ref.path` (the segment after a leading `runs/`)
 * wins, but only if `parseRunId` accepts it — a malformed manifest path must
 * not leak into a synthesized object name. Otherwise the `ref.sourceRunId`
 * lineage hint is returned, which may be `undefined`.
 */
function resolveSourceRunId(ref) {
    const [, candidate] = /^runs\/([^/]+)/.exec(ref.path) ?? [];
    if (!candidate) {
        // Path does not start with `runs/<segment>` — only the hint is left.
        return ref.sourceRunId;
    }
    const parsed = parseRunId(candidate);
    return parsed.ok ? parsed.value : ref.sourceRunId;
}
|
|
158
|
+
/**
 * Fail-open existence probe around `checker.objectExists`.
 *
 * Resolves to `false` only when the checker definitively answers `false`.
 * Every other outcome resolves to `true` so the caller keeps the artifact:
 * a `true` answer, a thrown or rejected call (existence could not be
 * determined), or a non-boolean answer from a misbehaving checker.
 *
 * @param {object} checker - Object exposing `objectExists(path)`.
 * @param {string} path - Object path to probe.
 * @returns {Promise<boolean>} `false` to drop the artifact, `true` to keep it.
 */
async function existsOrKeep(checker, path) {
    try {
        // Compare strictly against `false`: callers branch on truthiness, so a
        // stray `undefined`/`null` answer would otherwise drop a real artifact.
        return (await checker.objectExists(path)) !== false;
    }
    catch {
        // Indeterminate result — deliberately swallowed so the artifact is kept.
        return true;
    }
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stampReportValidity — apply the report-trustworthiness detector at write time.
|
|
3
|
+
*
|
|
4
|
+
* The eval write path's forward guarantee (D0059): every newly written report
|
|
5
|
+
* carries a top-level `validity` data-health stamp so the trustworthiness gap
|
|
6
|
+
* cannot recur on new reports. Lives in `pipeline/` (not the orchestration
|
|
7
|
+
* step) so both report-write paths — `PublishReportStep` and the standalone
|
|
8
|
+
* `publish` command — import it without a command→orchestration-step coupling.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
11
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
12
|
+
*/
|
|
13
|
+
import { type Report } from "../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
/**
 * Attach the data-health `validity` stamp (D0059) to a report and bring its
 * `provenance.classification` to the canonical spelling.
 *
 * The assembled report is passed straight to the pure detector
 * (`assessReportValidity`); `Report` already structurally satisfies the
 * detector's `ReportValidityInput`, so no adapter sits in between. The
 * verdict is server-computed from the report's own data (D0037): the only
 * caller-supplied value is `assessedAt`, and nothing is read from the caller
 * envelope.
 *
 * `classification` is overwritten only when the detector returns one. The
 * detector returns `undefined` when the stored value is already canonical and
 * no Tier-1 rule fired, so re-stamping is idempotent and leaves a correct (or
 * human-corrected) value alone. Tier-2 review flags are not persisted here;
 * the one-shot backfill consumes them.
 *
 * @param report - The assembled report; it is not mutated.
 * @param assessedAt - Timestamp recorded on the stamp (the report's completion time).
 * @returns A new report carrying `validity`.
 */
export declare function stampReportValidity(report: Report, assessedAt: string): Report;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stampReportValidity — apply the report-trustworthiness detector at write time.
|
|
3
|
+
*
|
|
4
|
+
* The eval write path's forward guarantee (D0059): every newly written report
|
|
5
|
+
* carries a top-level `validity` data-health stamp so the trustworthiness gap
|
|
6
|
+
* cannot recur on new reports. Lives in `pipeline/` (not the orchestration
|
|
7
|
+
* step) so both report-write paths — `PublishReportStep` and the standalone
|
|
8
|
+
* `publish` command — import it without a command→orchestration-step coupling.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
11
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
12
|
+
*/
|
|
13
|
+
import { assessReportValidity } from "../_vendor/ailf-core/index.js";
|
|
14
|
+
/**
 * Attach the data-health `validity` stamp (D0059) to a report and bring its
 * `provenance.classification` to the canonical spelling.
 *
 * The assembled report is passed straight to the pure detector
 * (`assessReportValidity`) together with the caller-supplied `assessedAt`
 * (the report's completion time). Nothing else from the caller is consulted
 * (D0037).
 *
 * `provenance.classification` is replaced only when the detector returns a
 * classification; otherwise the original `provenance` object is carried over
 * untouched, so re-stamping never clobbers a correct (or human-corrected)
 * value. Tier-2 review flags are not persisted here.
 *
 * @param report - The assembled report; it is not mutated.
 * @param assessedAt - Timestamp handed to the detector.
 * @returns A new report object carrying `validity`.
 */
export function stampReportValidity(report, assessedAt) {
    const verdict = assessReportValidity(report, { assessedAt });
    const stamped = { ...report };
    if (verdict.classification) {
        // Detector supplied a classification: patch it onto a copy of provenance.
        stamped.provenance = {
            ...report.provenance,
            classification: verdict.classification,
        };
    }
    else {
        stamped.provenance = report.provenance;
    }
    stamped.validity = verdict.validity;
    return stamped;
}
|
package/dist/report-store.d.ts
CHANGED
package/dist/report-store.js
CHANGED
|
@@ -491,6 +491,7 @@ export function toSanityReportDoc(report) {
|
|
|
491
491
|
},
|
|
492
492
|
tag: report.tag ?? null,
|
|
493
493
|
title: report.title ?? null,
|
|
494
|
+
...(report.validity ? { validity: report.validity } : {}),
|
|
494
495
|
};
|
|
495
496
|
}
|
|
496
497
|
/**
|
|
@@ -534,6 +535,7 @@ export function toReport(doc) {
|
|
|
534
535
|
summary: doc.summary,
|
|
535
536
|
tag: doc.tag,
|
|
536
537
|
title: doc.title,
|
|
538
|
+
validity: doc.validity,
|
|
537
539
|
};
|
|
538
540
|
}
|
|
539
541
|
/**
|