@sanity/ailf 7.2.3 → 7.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +38 -0
- package/config/bigquery/README.md +39 -7
- package/config/bigquery/views/reports.sql +6 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +30 -0
- package/dist/_vendor/ailf-core/schemas/report.js +21 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +4 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.d.ts +116 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.js +128 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +19 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/report-validity.d.ts +60 -0
- package/dist/_vendor/ailf-core/types/report-validity.js +42 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +3 -2
- package/dist/_vendor/ailf-shared/index.d.ts +2 -1
- package/dist/_vendor/ailf-shared/index.js +2 -1
- package/dist/_vendor/ailf-shared/run-classification.d.ts +53 -0
- package/dist/_vendor/ailf-shared/run-classification.js +111 -0
- package/dist/_vendor/ailf-shared/trustworthiness.d.ts +97 -0
- package/dist/_vendor/ailf-shared/trustworthiness.js +86 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/commands/publish.js +9 -2
- package/dist/orchestration/steps/publish-report-step.js +11 -3
- package/dist/pipeline/calculate-scores.js +8 -2
- package/dist/pipeline/report-validity.d.ts +32 -0
- package/dist/pipeline/report-validity.js +43 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/package.json +1 -1
|
@@ -104,6 +104,11 @@ definitions:
|
|
|
104
104
|
"host_platform": provenance.host.platform,
|
|
105
105
|
"host_arch": provenance.host.arch,
|
|
106
106
|
"host_ci": provenance.host.ci,
|
|
107
|
+
"validity_status": validity.status,
|
|
108
|
+
"validity_method": validity.method,
|
|
109
|
+
"validity_ruleset_version": validity.rulesetVersion,
|
|
110
|
+
"validity_assessed_at": validity.assessedAt,
|
|
111
|
+
"validity_reasons": validity.reasons,
|
|
107
112
|
_createdAt
|
|
108
113
|
}
|
|
109
114
|
record_selector:
|
|
@@ -724,6 +729,39 @@ schemas:
|
|
|
724
729
|
- string
|
|
725
730
|
- "null"
|
|
726
731
|
description: CI provider when running under one (e.g., github-actions).
|
|
732
|
+
# ----------------------------------------------------------------
|
|
733
|
+
# D0059 — report validity (data-health axis, orthogonal to intent)
|
|
734
|
+
# ----------------------------------------------------------------
|
|
735
|
+
validity_status:
|
|
736
|
+
type:
|
|
737
|
+
- string
|
|
738
|
+
- "null"
|
|
739
|
+
description:
|
|
740
|
+
"Data-health verdict (D0059): ok | degraded | incomplete | suspect.
|
|
741
|
+
NULL for reports predating the validity stamp (treated as trusted)."
|
|
742
|
+
validity_method:
|
|
743
|
+
type:
|
|
744
|
+
- string
|
|
745
|
+
- "null"
|
|
746
|
+
description: '"auto" | "manual" — how the validity verdict was reached.'
|
|
747
|
+
validity_ruleset_version:
|
|
748
|
+
type:
|
|
749
|
+
- string
|
|
750
|
+
- "null"
|
|
751
|
+
description: Detector ruleset version, so re-assessments are comparable.
|
|
752
|
+
validity_assessed_at:
|
|
753
|
+
type:
|
|
754
|
+
- string
|
|
755
|
+
- "null"
|
|
756
|
+
description:
|
|
757
|
+
ISO 8601 UTC timestamp when the validity verdict was produced.
|
|
758
|
+
validity_reasons:
|
|
759
|
+
type:
|
|
760
|
+
- array
|
|
761
|
+
- "null"
|
|
762
|
+
items:
|
|
763
|
+
type: string
|
|
764
|
+
description: Which detector rules fired — the audit trail behind status.
|
|
727
765
|
_createdAt:
|
|
728
766
|
type:
|
|
729
767
|
- string
|
|
@@ -22,13 +22,13 @@ BigQuery views (this directory)
|
|
|
22
22
|
|
|
23
23
|
## Files
|
|
24
24
|
|
|
25
|
-
| File | Purpose
|
|
26
|
-
| -------------------------------- |
|
|
27
|
-
| `views/area_scores.sql` | Flattens nested `model_scores` array into one row per area per model per report
|
|
28
|
-
| `views/reports.sql` | Clean passthrough view with correct types and column ordering
|
|
29
|
-
| `views/official_runs.sql` | Canonical trend series (D0037): `classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'`
|
|
30
|
-
| `views/official_area_scores.sql` | `area_scores` joined to `official_runs` — inherits the official-run predicate for area-level dashboards
|
|
31
|
-
| `views/team_runs_template.sql` | Recipe/template for instantiating per-team filtered views
|
|
25
|
+
| File | Purpose |
|
|
26
|
+
| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
27
|
+
| `views/area_scores.sql` | Flattens nested `model_scores` array into one row per area per model per report |
|
|
28
|
+
| `views/reports.sql` | Clean passthrough view with correct types and column ordering; also materializes `validity_*` columns and the `include_in_default_trends` trust gate (D0059) |
|
|
29
|
+
| `views/official_runs.sql` | Canonical trend series (D0037): `classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'` |
|
|
30
|
+
| `views/official_area_scores.sql` | `area_scores` joined to `official_runs` — inherits the official-run predicate for area-level dashboards |
|
|
31
|
+
| `views/team_runs_template.sql` | Recipe/template for instantiating per-team filtered views |
|
|
32
32
|
|
|
33
33
|
## Setup
|
|
34
34
|
|
|
@@ -88,6 +88,38 @@ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false
|
|
|
88
88
|
> run `bq query` from this repo regularly, consider setting the default with
|
|
89
89
|
> `gcloud config set project data-platform-302218`.
|
|
90
90
|
|
|
91
|
+
## Querying default (trustworthy) reports
|
|
92
|
+
|
|
93
|
+
`ailf.reports` exposes every synced report, including development runs and
|
|
94
|
+
structurally incomplete ones. BI dashboards and ad-hoc analysis should gate on
|
|
95
|
+
the **`include_in_default_trends`** boolean, which materializes the single
|
|
96
|
+
trustworthiness predicate (D0059) shared with the dashboard and Studio — so all
|
|
97
|
+
three surfaces show the same default population:
|
|
98
|
+
|
|
99
|
+
```sql
|
|
100
|
+
SELECT *
|
|
101
|
+
FROM `data-platform-302218.ailf.reports`
|
|
102
|
+
WHERE include_in_default_trends = TRUE;
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
`include_in_default_trends` is the SQL form of the `includeInDefaultTrends`
|
|
106
|
+
predicate (`INCLUDE_IN_DEFAULT_TRENDS_SQL` in `@sanity/ailf-shared`):
|
|
107
|
+
|
|
108
|
+
```text
|
|
109
|
+
(validity_status IS NULL OR validity_status = 'ok')
|
|
110
|
+
AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
To audit the _excluded_ (untrusted) reports — the inverse population, useful for
|
|
114
|
+
the Tier-2 review pass — flip the predicate to `= FALSE`. The `validity_status`
|
|
115
|
+
/ `validity_reasons` columns explain why each was excluded.
|
|
116
|
+
|
|
117
|
+
> The `official_runs` view (above) is a **stricter** canonical-series gate
|
|
118
|
+
> (`classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'`);
|
|
119
|
+
> `include_in_default_trends` is the **broader** "trustworthy enough to show by
|
|
120
|
+
> default" gate. Use `official_runs` for the tracked trend line,
|
|
121
|
+
> `include_in_default_trends` for general default views.
|
|
122
|
+
|
|
91
123
|
## Naming conventions
|
|
92
124
|
|
|
93
125
|
- **`ailf_raw.*`** — raw Airbyte-loaded tables (nested JSON, Airbyte metadata
|
|
@@ -64,6 +64,12 @@ SELECT
|
|
|
64
64
|
host_platform,
|
|
65
65
|
host_arch,
|
|
66
66
|
host_ci,
|
|
67
|
+
validity_status,
|
|
68
|
+
validity_method,
|
|
69
|
+
validity_ruleset_version,
|
|
70
|
+
TIMESTAMP(validity_assessed_at) AS validity_assessed_at,
|
|
71
|
+
validity_reasons,
|
|
72
|
+
(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental')) AS include_in_default_trends,
|
|
67
73
|
TIMESTAMP(_createdAt) AS synced_at
|
|
68
74
|
FROM
|
|
69
75
|
`data-platform-302218.ailf_raw.reports`;
|
|
@@ -139,6 +139,21 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
|
|
|
139
139
|
runId: z.ZodString;
|
|
140
140
|
targetDocuments: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
141
141
|
}, z.core.$strict>;
|
|
142
|
+
export declare const ReportValiditySchema: z.ZodObject<{
|
|
143
|
+
status: z.ZodEnum<{
|
|
144
|
+
ok: "ok";
|
|
145
|
+
degraded: "degraded";
|
|
146
|
+
incomplete: "incomplete";
|
|
147
|
+
suspect: "suspect";
|
|
148
|
+
}>;
|
|
149
|
+
reasons: z.ZodArray<z.ZodString>;
|
|
150
|
+
method: z.ZodEnum<{
|
|
151
|
+
auto: "auto";
|
|
152
|
+
manual: "manual";
|
|
153
|
+
}>;
|
|
154
|
+
rulesetVersion: z.ZodString;
|
|
155
|
+
assessedAt: z.ZodISODateTime;
|
|
156
|
+
}, z.core.$strict>;
|
|
142
157
|
export declare const ReportSchema: z.ZodObject<{
|
|
143
158
|
id: z.ZodString;
|
|
144
159
|
completedAt: z.ZodISODateTime;
|
|
@@ -269,6 +284,21 @@ export declare const ReportSchema: z.ZodObject<{
|
|
|
269
284
|
}>>;
|
|
270
285
|
detail: z.ZodString;
|
|
271
286
|
}, z.core.$strict>>;
|
|
287
|
+
validity: z.ZodOptional<z.ZodObject<{
|
|
288
|
+
status: z.ZodEnum<{
|
|
289
|
+
ok: "ok";
|
|
290
|
+
degraded: "degraded";
|
|
291
|
+
incomplete: "incomplete";
|
|
292
|
+
suspect: "suspect";
|
|
293
|
+
}>;
|
|
294
|
+
reasons: z.ZodArray<z.ZodString>;
|
|
295
|
+
method: z.ZodEnum<{
|
|
296
|
+
auto: "auto";
|
|
297
|
+
manual: "manual";
|
|
298
|
+
}>;
|
|
299
|
+
rulesetVersion: z.ZodString;
|
|
300
|
+
assessedAt: z.ZodISODateTime;
|
|
301
|
+
}, z.core.$strict>>;
|
|
272
302
|
}, z.core.$loose>;
|
|
273
303
|
export type ReportSchemaInput = z.input<typeof ReportSchema>;
|
|
274
304
|
export type ReportSchemaOutput = z.infer<typeof ReportSchema>;
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
*/
|
|
26
26
|
import { z } from "zod";
|
|
27
27
|
import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
|
|
28
|
-
import { DEGRADED_ENRICHMENT_FIELDS } from "../types/index.js";
|
|
28
|
+
import { DEGRADED_ENRICHMENT_FIELDS, REPORT_VALIDITY_STATUSES, } from "../types/index.js";
|
|
29
29
|
// ---------------------------------------------------------------------------
|
|
30
30
|
// RunContext building blocks (mirrors packages/shared/src/run-context.ts)
|
|
31
31
|
// ---------------------------------------------------------------------------
|
|
@@ -217,6 +217,21 @@ export const ReportProvenanceSchema = z
|
|
|
217
217
|
// (ScoreSummary, FeatureScore, the W0051 slim types) are out of Scope A.
|
|
218
218
|
// ---------------------------------------------------------------------------
|
|
219
219
|
const RecordPassthroughSchema = z.record(z.string(), z.unknown());
|
|
220
|
+
// ---------------------------------------------------------------------------
|
|
221
|
+
// ReportValidity — top-level data-health axis (D0059), orthogonal to
|
|
222
|
+
// `provenance.classification`. `satisfies z.ZodType<ReportValidity>` (D0045):
|
|
223
|
+
// the domain type is authored independently in ../types/report-validity.ts,
|
|
224
|
+
// so schema/type drift is a build error. Strict — unknown keys signal drift.
|
|
225
|
+
// ---------------------------------------------------------------------------
|
|
226
|
+
export const ReportValiditySchema = z
|
|
227
|
+
.object({
|
|
228
|
+
status: z.enum(REPORT_VALIDITY_STATUSES),
|
|
229
|
+
reasons: z.array(z.string()),
|
|
230
|
+
method: z.enum(["auto", "manual"]),
|
|
231
|
+
rulesetVersion: z.string().min(1),
|
|
232
|
+
assessedAt: z.iso.datetime({ offset: true }),
|
|
233
|
+
})
|
|
234
|
+
.strict();
|
|
220
235
|
export const ReportSchema = z
|
|
221
236
|
.object({
|
|
222
237
|
id: z.string().min(1),
|
|
@@ -236,7 +251,8 @@ export const ReportSchema = z
|
|
|
236
251
|
title: z.string().nullable().optional(),
|
|
237
252
|
// Degraded marker (mirrors `ReportDegradation`): present only when a full
|
|
238
253
|
// eval scored tests but enrichment did not complete. Strict — unknown
|
|
239
|
-
// keys here signal real drift.
|
|
254
|
+
// keys here signal real drift. Superseded by `validity` (D0059); retained
|
|
255
|
+
// for back-compat reads until the backfill migrates it.
|
|
240
256
|
degraded: z
|
|
241
257
|
.object({
|
|
242
258
|
reason: z.literal("enrichment-missing"),
|
|
@@ -247,5 +263,8 @@ export const ReportSchema = z
|
|
|
247
263
|
})
|
|
248
264
|
.strict()
|
|
249
265
|
.optional(),
|
|
266
|
+
// Data-health axis (D0059). Additive + nullable: pre-stamp reads have no
|
|
267
|
+
// `validity` and are treated as trustworthy until backfilled.
|
|
268
|
+
validity: ReportValiditySchema.optional(),
|
|
250
269
|
})
|
|
251
270
|
.passthrough();
|
|
@@ -13,6 +13,7 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
|
|
16
|
+
export { assessReportValidity, REPORT_VALIDITY_RULESET_VERSION, type AssessReportValidityOptions, type ReportValidityAssessment, type ReportValidityInput, type ReportValidityReviewFlag, type ReportValidityReviewReason, } from "./report-validity-detector.js";
|
|
16
17
|
export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
|
|
17
18
|
export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
|
|
18
19
|
export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
|
|
@@ -14,6 +14,10 @@ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resol
|
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, } from "./report-to-markdown.js";
|
|
16
16
|
// ---------------------------------------------------------------------------
|
|
17
|
+
// Report-validity detector (D0059) — pure, versioned trustworthiness engine
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
export { assessReportValidity, REPORT_VALIDITY_RULESET_VERSION, } from "./report-validity-detector.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
17
21
|
// Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
|
|
18
22
|
// ---------------------------------------------------------------------------
|
|
19
23
|
export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report-validity detector (D0059)
|
|
3
|
+
*
|
|
4
|
+
* The detection brain for the report trustworthiness model. A pure, versioned,
|
|
5
|
+
* I/O-free rules engine that computes both trustworthiness axes for a report —
|
|
6
|
+
* intent (`classification`, D0037) and data health (`validity`) — with a
|
|
7
|
+
* confidence tier so it never mislabels ambiguous data.
|
|
8
|
+
*
|
|
9
|
+
* Reused unchanged by the eval write path (`W-stamp-validity-write-path`,
|
|
10
|
+
* stamps every new report) and the one-shot backfill
|
|
11
|
+
* (`W-backfill-report-validity`, labels the historical store).
|
|
12
|
+
*
|
|
13
|
+
* **Confidence tiers (D0059 §2):**
|
|
14
|
+
*
|
|
15
|
+
* - **Tier 1 — auto.** High-confidence, unambiguous rules. Intent:
|
|
16
|
+
* `scheduled + github-actions → official`; `testing-ground` team /
|
|
17
|
+
* `env-override` source / generated-id executor → `test`/`experimental`;
|
|
18
|
+
* de-drift `ad-hoc → adhoc`. Validity: `degraded → "degraded"`.
|
|
19
|
+
* - **Tier 2 — review list.** Anything resting on shape-sensitive content
|
|
20
|
+
* signals (empty `testResults`, `testCount` mismatch) is *emitted to a
|
|
21
|
+
* review list rather than guessed* — 34%/66% of the historical store hits
|
|
22
|
+
* these and is dominated by report-shape evolution (GCS-externalized
|
|
23
|
+
* per-test data, per-model granularity), NOT defects. The engine never
|
|
24
|
+
* auto-flags them invalid; a human confirms in batch and the verdict is
|
|
25
|
+
* written `method:"manual"`.
|
|
26
|
+
*
|
|
27
|
+
* A `method:"manual"` verdict is authoritative — re-running the detector never
|
|
28
|
+
* overwrites it.
|
|
29
|
+
*
|
|
30
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
31
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
32
|
+
*/
|
|
33
|
+
import { type RunClassification, type RunExecutor, type RunOwner, type RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
34
|
+
import type { ReportDegradation, ReportValidity } from "../types/index.js";
|
|
35
|
+
/**
|
|
36
|
+
* Detector ruleset version. Bump on any rule or threshold recalibration so a
|
|
37
|
+
* later re-assessment is a comparable, auditable re-run (D0059 §2). Stamped
|
|
38
|
+
* onto every `validity` verdict the engine produces.
|
|
39
|
+
*/
|
|
40
|
+
export declare const REPORT_VALIDITY_RULESET_VERSION = "1.0.0";
|
|
41
|
+
/**
|
|
42
|
+
* The minimal report projection the detector reads.
|
|
43
|
+
*
|
|
44
|
+
* Authored as a raw-tolerant *consumer view* (not `Pick<Report, …>`) on
|
|
45
|
+
* purpose: the engine runs at the raw-document boundary during the backfill,
|
|
46
|
+
* where legacy drift like `ad-hoc` still exists and `provenance.classification`
|
|
47
|
+
* is therefore typed `string`, not `RunClassification`. A fully-typed `Report`
|
|
48
|
+
* is structurally assignable to this shape, so the write path can pass one
|
|
49
|
+
* directly. Only the seven fields the rules actually consult appear here.
|
|
50
|
+
*/
|
|
51
|
+
export interface ReportValidityInput {
|
|
52
|
+
provenance: {
|
|
53
|
+
/** Raw classification — may carry legacy drift (`ad-hoc`); normalized here. */
|
|
54
|
+
classification?: string;
|
|
55
|
+
trigger: RunTrigger;
|
|
56
|
+
executor: RunExecutor;
|
|
57
|
+
owner: RunOwner;
|
|
58
|
+
source: {
|
|
59
|
+
name: string;
|
|
60
|
+
};
|
|
61
|
+
};
|
|
62
|
+
summary: {
|
|
63
|
+
/** Per-feature scores; only `testCount` is read (summed across features). */
|
|
64
|
+
scores?: ReadonlyArray<{
|
|
65
|
+
testCount?: number;
|
|
66
|
+
}>;
|
|
67
|
+
/** Per-test×model results; only the length is read. */
|
|
68
|
+
testResults?: ReadonlyArray<unknown>;
|
|
69
|
+
/** Low-scoring grader judgments; not read by the 1.0.0 ruleset. */
|
|
70
|
+
lowScoringJudgments?: ReadonlyArray<unknown>;
|
|
71
|
+
};
|
|
72
|
+
/** Legacy degradation marker (D0059 subsumes it into `validity.status`). */
|
|
73
|
+
degraded?: ReportDegradation;
|
|
74
|
+
/** Existing verdict — a `method:"manual"` value is preserved on re-run. */
|
|
75
|
+
validity?: ReportValidity;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Why a report was routed to Tier-2 human review instead of being
|
|
79
|
+
* auto-decided. Each corresponds to a shape-sensitive signal the engine
|
|
80
|
+
* deliberately refuses to interpret automatically (D0059 §caveat).
|
|
81
|
+
*/
|
|
82
|
+
export type ReportValidityReviewReason = "empty-test-results" | "test-count-mismatch";
|
|
83
|
+
/** A single Tier-2 review-list entry. */
|
|
84
|
+
export interface ReportValidityReviewFlag {
|
|
85
|
+
reason: ReportValidityReviewReason;
|
|
86
|
+
/** Human-readable explanation for the batch-review UI / backfill report. */
|
|
87
|
+
detail: string;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* The detector's verdict for one report.
|
|
91
|
+
*
|
|
92
|
+
* `classification` is **optional**: present only when the engine has a
|
|
93
|
+
* positive opinion (a Tier-1 intent rule fired, or de-drift changed the
|
|
94
|
+
* value). It is `undefined` when the existing classification is already
|
|
95
|
+
* canonical and no rule applies — so callers patch idempotently and a re-run
|
|
96
|
+
* never clobbers an already-correct or human-corrected value.
|
|
97
|
+
*/
|
|
98
|
+
export interface ReportValidityAssessment {
|
|
99
|
+
classification?: RunClassification;
|
|
100
|
+
validity: ReportValidity;
|
|
101
|
+
reviewFlags: ReportValidityReviewFlag[];
|
|
102
|
+
}
|
|
103
|
+
export interface AssessReportValidityOptions {
|
|
104
|
+
/**
|
|
105
|
+
* ISO 8601 UTC timestamp stamped onto the verdict's `assessedAt`. Injected
|
|
106
|
+
* by the caller so the engine stays pure and deterministic (no clock read).
|
|
107
|
+
*/
|
|
108
|
+
assessedAt: string;
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* Assess a report's trustworthiness — its intent (`classification`) and data
|
|
112
|
+
* health (`validity`) — plus any Tier-2 signals that need human review.
|
|
113
|
+
*
|
|
114
|
+
* Pure and deterministic: same input + `assessedAt` → identical output.
|
|
115
|
+
*/
|
|
116
|
+
export declare function assessReportValidity(report: ReportValidityInput, options: AssessReportValidityOptions): ReportValidityAssessment;
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report-validity detector (D0059)
|
|
3
|
+
*
|
|
4
|
+
* The detection brain for the report trustworthiness model. A pure, versioned,
|
|
5
|
+
* I/O-free rules engine that computes both trustworthiness axes for a report —
|
|
6
|
+
* intent (`classification`, D0037) and data health (`validity`) — with a
|
|
7
|
+
* confidence tier so it never mislabels ambiguous data.
|
|
8
|
+
*
|
|
9
|
+
* Reused unchanged by the eval write path (`W-stamp-validity-write-path`,
|
|
10
|
+
* stamps every new report) and the one-shot backfill
|
|
11
|
+
* (`W-backfill-report-validity`, labels the historical store).
|
|
12
|
+
*
|
|
13
|
+
* **Confidence tiers (D0059 §2):**
|
|
14
|
+
*
|
|
15
|
+
* - **Tier 1 — auto.** High-confidence, unambiguous rules. Intent:
|
|
16
|
+
* `scheduled + github-actions → official`; `testing-ground` team /
|
|
17
|
+
* `env-override` source / generated-id executor → `test`/`experimental`;
|
|
18
|
+
* de-drift `ad-hoc → adhoc`. Validity: `degraded → "degraded"`.
|
|
19
|
+
* - **Tier 2 — review list.** Anything resting on shape-sensitive content
|
|
20
|
+
* signals (empty `testResults`, `testCount` mismatch) is *emitted to a
|
|
21
|
+
* review list rather than guessed* — 34%/66% of the historical store hits
|
|
22
|
+
* these and is dominated by report-shape evolution (GCS-externalized
|
|
23
|
+
* per-test data, per-model granularity), NOT defects. The engine never
|
|
24
|
+
* auto-flags them invalid; a human confirms in batch and the verdict is
|
|
25
|
+
* written `method:"manual"`.
|
|
26
|
+
*
|
|
27
|
+
* A `method:"manual"` verdict is authoritative — re-running the detector never
|
|
28
|
+
* overwrites it.
|
|
29
|
+
*
|
|
30
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
31
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
32
|
+
*/
|
|
33
|
+
import { isKnownExecutorIdentity, looksLikeGeneratedExecutorId, normalizeOwnerTeam, normalizeRunClassification, } from "../../ailf-shared/index.js";
|
|
34
|
+
/**
|
|
35
|
+
* Detector ruleset version. Bump on any rule or threshold recalibration so a
|
|
36
|
+
* later re-assessment is a comparable, auditable re-run (D0059 §2). Stamped
|
|
37
|
+
* onto every `validity` verdict the engine produces.
|
|
38
|
+
*/
|
|
39
|
+
export const REPORT_VALIDITY_RULESET_VERSION = "1.0.0";
|
|
40
|
+
/**
 * Tier-1 intent detection.
 *
 * Walks the high-confidence intent rules in priority order and yields the
 * classification of the first rule that matches. When none match, the stored
 * classification is run through `normalizeRunClassification` and reported only
 * if normalization produced a different value.
 *
 * @param provenance - Report provenance; `classification`, `trigger`,
 *   `executor`, `owner` and `source` are read.
 * @returns The classification the engine has a positive opinion about, or
 *   `undefined` to signal "leave the existing value".
 */
function detectIntent(provenance) {
    const { trigger, executor, owner, source, classification: existing } = provenance;
    // Rule 1 (highest confidence): a scheduled run executed by the
    // `github-actions` system executor belongs to the canonical official series.
    const isScheduled = trigger.type === "scheduled";
    if (isScheduled && executor.type === "system" && executor.name === "github-actions") {
        return "official";
    }
    // Rule 2: runs owned by the testing-ground team are test runs.
    const team = normalizeOwnerTeam(owner.team);
    if (team === "testing-ground") {
        return "test";
    }
    // Rules 3 and 4 share a verdict: an `env-override` source, or a user
    // executor whose name is not a known identity and looks generated.
    // Short-circuiting keeps the executor helpers from being called unless the
    // earlier checks require them.
    if (source.name === "env-override" ||
        (executor.type === "user" &&
            !isKnownExecutorIdentity(executor.name) &&
            looksLikeGeneratedExecutorId(executor.name))) {
        return "experimental";
    }
    // No rule fired — de-drift the existing value and surface it only when the
    // normalized form differs from what is stored.
    const canonical = normalizeRunClassification(existing);
    if (canonical === existing) {
        return undefined;
    }
    return canonical;
}
|
|
68
|
+
/**
 * Tier-2 review flags. Surfaces shape-sensitive signals for human batch-review
 * — it never decides validity itself. The caller skips this entirely for
 * already-`degraded` reports, where missing/empty test data is already
 * explained.
 *
 * Hardened for raw documents: a missing `summary`, non-array `scores` /
 * `testResults`, `null` score entries, and non-numeric or non-finite
 * `testCount` values are tolerated instead of throwing or corrupting the sum
 * (a string `testCount` would otherwise turn the addition into string
 * concatenation and raise a false mismatch flag).
 *
 * @param summary - Report summary; only `scores[].testCount` and the length of
 *   `testResults` are read. May be `null`/`undefined`.
 * @returns A fresh array holding at most one review flag:
 *   `"empty-test-results"` when tests were scored but `testResults` is empty,
 *   `"test-count-mismatch"` when both are non-empty and the counts differ.
 */
function collectReviewFlags(summary) {
    const testResults = summary?.testResults;
    const testResultCount = Array.isArray(testResults) ? testResults.length : 0;
    const scores = Array.isArray(summary?.scores) ? summary.scores : [];
    let summedTestCount = 0;
    for (const score of scores) {
        const count = score?.testCount;
        // Only well-formed counts contribute; anything else is treated as 0.
        if (typeof count === "number" && Number.isFinite(count)) {
            summedTestCount += count;
        }
    }
    // Both signals require at least one scored test.
    if (summedTestCount <= 0) {
        return [];
    }
    if (testResultCount === 0) {
        return [
            {
                reason: "empty-test-results",
                detail: `${summedTestCount} scored tests but testResults is empty — may be GCS-externalized (W0329) rather than a defect; needs human review.`,
            },
        ];
    }
    if (testResultCount !== summedTestCount) {
        return [
            {
                reason: "test-count-mismatch",
                detail: `testResults length (${testResultCount}) ≠ summed testCount (${summedTestCount}) — per-model/sampling granularity difference, not necessarily a defect.`,
            },
        ];
    }
    return [];
}
|
|
93
|
+
/**
 * Assess a report's trustworthiness: its intent (`classification`), its data
 * health (`validity`), and any Tier-2 signals that need human review.
 *
 * Performs no clock read — `assessedAt` is taken from `options` — so the same
 * input and `assessedAt` produce the same output.
 *
 * @param report - The report projection: `provenance`, `summary`, and the
 *   optional `degraded` marker and existing `validity` verdict.
 * @param options - Carries the `assessedAt` timestamp stamped onto the verdict.
 * @returns `{ classification, validity, reviewFlags }`. An existing
 *   `method:"manual"` verdict is returned untouched with no review flags.
 */
export function assessReportValidity(report, options) {
    // Intent is computed first and surfaced on every path, including the
    // manual short-circuit below: it is orthogonal to the data-health verdict.
    const classification = detectIntent(report.provenance);
    const existing = report.validity;
    // A human verdict is authoritative — a re-run never overwrites it.
    if (existing?.method === "manual") {
        return { classification, validity: existing, reviewFlags: [] };
    }
    // Tier-1 validity: `degraded` is the only non-`ok` status assigned
    // automatically here; it mirrors the legacy `degraded` marker.
    const isDegraded = Boolean(report.degraded);
    const validity = {
        status: isDegraded ? "degraded" : "ok",
        reasons: isDegraded ? [`degraded:${report.degraded.reason}`] : [],
        method: "auto",
        rulesetVersion: REPORT_VALIDITY_RULESET_VERSION,
        assessedAt: options.assessedAt,
    };
    // Degraded reports skip the Tier-2 pass; their missing test data is
    // already explained by the degradation itself.
    const reviewFlags = isDegraded ? [] : collectReviewFlags(report.summary);
    return { classification, validity, reviewFlags };
}
|
|
@@ -14,6 +14,7 @@ import type { ArtifactType } from "../artifact-registry.js";
|
|
|
14
14
|
import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
|
|
15
15
|
import type { AssociationValues, RunId } from "./branded-ids.js";
|
|
16
16
|
import type { GraderJudgment } from "./grader-judgment.js";
|
|
17
|
+
import type { ReportValidity } from "./report-validity.js";
|
|
17
18
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
18
19
|
export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
19
20
|
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
@@ -43,6 +44,7 @@ export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJ
|
|
|
43
44
|
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
44
45
|
export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
|
|
45
46
|
export type { AilfUser, AilfUserPreferences, TeamReference } from "./user.js";
|
|
47
|
+
export { isReportValidityStatus, REPORT_VALIDITY_STATUSES, type ReportValidity, type ReportValidityMethod, type ReportValidityStatus, } from "./report-validity.js";
|
|
46
48
|
type DocumentRef = _DocumentRef;
|
|
47
49
|
/** Aggregated retrieval metrics for a feature area */
|
|
48
50
|
export interface AreaRetrievalMetrics {
|
|
@@ -1503,6 +1505,12 @@ export type DegradedEnrichmentField = (typeof DEGRADED_ENRICHMENT_FIELDS)[number
|
|
|
1503
1505
|
* because `grader-judgments.json` was missing). Present so the dashboard and
|
|
1504
1506
|
* Studio can show "enrichment failed" rather than a misleading empty
|
|
1505
1507
|
* "no tests" state on a report that still has a score.
|
|
1508
|
+
*
|
|
1509
|
+
* @deprecated Superseded by {@link ReportValidity} (D0059). The data-health
|
|
1510
|
+
* axis subsumes this flag: a degraded report is `validity.status:"degraded"`.
|
|
1511
|
+
* Retained for back-compat reads; the backfill migrates `degraded:true →
|
|
1512
|
+
* validity.status:"degraded"` (`W-backfill-report-validity`). Do not set it on
|
|
1513
|
+
* new reports — stamp `validity` instead.
|
|
1506
1514
|
*/
|
|
1507
1515
|
export interface ReportDegradation {
|
|
1508
1516
|
/** Why the report is degraded. Single-variant union, widen as needed. */
|
|
@@ -1517,8 +1525,19 @@ export interface Report {
|
|
|
1517
1525
|
/**
|
|
1518
1526
|
* Set when the report is published in a degraded state — a full eval
|
|
1519
1527
|
* scored tests but enrichment did not complete. Absent on healthy reports.
|
|
1528
|
+
*
|
|
1529
|
+
* @deprecated Superseded by {@link Report.validity} (D0059). New write paths
|
|
1530
|
+
* stamp `validity` (with `status:"degraded"` in this case) instead.
|
|
1520
1531
|
*/
|
|
1521
1532
|
degraded?: ReportDegradation;
|
|
1533
|
+
/**
|
|
1534
|
+
* Post-hoc data-health assessment (D0059) — orthogonal to
|
|
1535
|
+
* `provenance.classification` (run intent). Top-level because it judges the
|
|
1536
|
+
* report's *data*, not the run. Additive + nullable: absent on pre-stamp
|
|
1537
|
+
* reports, which are treated as trustworthy until backfilled. Subsumes the
|
|
1538
|
+
* legacy {@link Report.degraded} flag.
|
|
1539
|
+
*/
|
|
1540
|
+
validity?: ReportValidity;
|
|
1522
1541
|
/**
|
|
1523
1542
|
* Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
|
|
1524
1543
|
* The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this
|
|
@@ -19,6 +19,7 @@ export { evalModeType } from "./eval-mode-config.js";
|
|
|
19
19
|
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
|
|
20
20
|
export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
21
21
|
export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
22
|
+
export { isReportValidityStatus, REPORT_VALIDITY_STATUSES, } from "./report-validity.js";
|
|
22
23
|
/** Set of canonical legacy modes — exported for report-formatter use. */
|
|
23
24
|
export const LEGACY_FAILURE_MODES = [
|
|
24
25
|
"api-error",
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report validity (data-health axis)
|
|
3
|
+
*
|
|
4
|
+
* `ReportValidity` is the post-hoc data-health assessment of a published
|
|
5
|
+
* report, orthogonal to `provenance.classification` (run intent, D0037).
|
|
6
|
+
* It is a top-level sibling of the legacy `ReportDegradation` flag, which it
|
|
7
|
+
* subsumes (`degraded:true → validity.status:"degraded"`).
|
|
8
|
+
*
|
|
9
|
+
* Authored independently of the Zod schema so the schema can assert
|
|
10
|
+
* `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
|
|
11
|
+
* (D0045). Populated by the confidence-tiered detector
|
|
12
|
+
* (`W-report-validity-detector`) and the eval write path
|
|
13
|
+
* (`W-stamp-validity-write-path`); gated everywhere by the shared
|
|
14
|
+
* `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
17
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* The validity-status vocabulary. Single `as const` tuple so the runtime Zod
|
|
21
|
+
* `z.enum(...)` and the `ReportValidityStatus` type derive from one source —
|
|
22
|
+
* the same drift-proofing the `DEGRADED_ENRICHMENT_FIELDS` tuple uses.
|
|
23
|
+
*
|
|
24
|
+
* - `ok` — trustworthy; included in default trends.
|
|
25
|
+
* - `degraded` — enrichment/grading failed (subsumes the legacy `degraded`
|
|
26
|
+
* flag).
|
|
27
|
+
* - `incomplete` — expected grains genuinely missing, after the report-shape
|
|
28
|
+
* caveat is accounted for (see design doc).
|
|
29
|
+
* - `suspect` — passed structural checks but flagged for review (anomaly or
|
|
30
|
+
* ambiguous heuristic).
|
|
31
|
+
*/
|
|
32
|
+
export declare const REPORT_VALIDITY_STATUSES: readonly ["ok", "degraded", "incomplete", "suspect"];
|
|
33
|
+
export type ReportValidityStatus = (typeof REPORT_VALIDITY_STATUSES)[number];
|
|
34
|
+
/**
|
|
35
|
+
* How the validity verdict was reached. A `"manual"` verdict is authoritative
|
|
36
|
+
* — re-running the detector never overwrites it (the detector re-assesses
|
|
37
|
+
* only `"auto"` verdicts).
|
|
38
|
+
*/
|
|
39
|
+
export type ReportValidityMethod = "auto" | "manual";
|
|
40
|
+
/**
|
|
41
|
+
* Post-hoc data-health assessment of a published report.
|
|
42
|
+
*
|
|
43
|
+
* Top-level on the `Report` (a judgment about the report's *data*), NOT under
|
|
44
|
+
* `provenance` (which records run *intent*). Additive and nullable: pre-stamp
|
|
45
|
+
* reads have no `validity` and are treated as trustworthy until backfilled.
|
|
46
|
+
*/
|
|
47
|
+
export interface ReportValidity {
|
|
48
|
+
/** Data-health verdict. */
|
|
49
|
+
status: ReportValidityStatus;
|
|
50
|
+
/** Which detector rules fired — the audit trail behind `status`. */
|
|
51
|
+
reasons: string[];
|
|
52
|
+
/** Whether an automated rule or a human produced this verdict. */
|
|
53
|
+
method: ReportValidityMethod;
|
|
54
|
+
/** Detector ruleset version, so re-assessments are comparable. */
|
|
55
|
+
rulesetVersion: string;
|
|
56
|
+
/** When the verdict was produced (ISO 8601 UTC). */
|
|
57
|
+
assessedAt: string;
|
|
58
|
+
}
|
|
59
|
+
/** Type guard for {@link ReportValidityStatus}. */
|
|
60
|
+
export declare function isReportValidityStatus(value: unknown): value is ReportValidityStatus;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report validity (data-health axis)
|
|
3
|
+
*
|
|
4
|
+
* `ReportValidity` is the post-hoc data-health assessment of a published
|
|
5
|
+
* report, orthogonal to `provenance.classification` (run intent, D0037).
|
|
6
|
+
* It is a top-level sibling of the legacy `ReportDegradation` flag, which it
|
|
7
|
+
* subsumes (`degraded:true → validity.status:"degraded"`).
|
|
8
|
+
*
|
|
9
|
+
* Authored independently of the Zod schema so the schema can assert
|
|
10
|
+
* `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
|
|
11
|
+
* (D0045). Populated by the confidence-tiered detector
|
|
12
|
+
* (`W-report-validity-detector`) and the eval write path
|
|
13
|
+
* (`W-stamp-validity-write-path`); gated everywhere by the shared
|
|
14
|
+
* `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
17
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* The validity-status vocabulary. Single `as const` tuple so the runtime Zod
|
|
21
|
+
* `z.enum(...)` and the `ReportValidityStatus` type derive from one source —
|
|
22
|
+
* the same drift-proofing the `DEGRADED_ENRICHMENT_FIELDS` tuple uses.
|
|
23
|
+
*
|
|
24
|
+
* - `ok` — trustworthy; included in default trends.
|
|
25
|
+
* - `degraded` — enrichment/grading failed (subsumes the legacy `degraded`
|
|
26
|
+
* flag).
|
|
27
|
+
* - `incomplete` — expected grains genuinely missing, after the report-shape
|
|
28
|
+
* caveat is accounted for (see design doc).
|
|
29
|
+
* - `suspect` — passed structural checks but flagged for review (anomaly or
|
|
30
|
+
* ambiguous heuristic).
|
|
31
|
+
*/
|
|
32
|
+
// Frozen so the runtime value honors the `readonly` tuple type published in
// the accompanying declaration file: an importer cannot push/splice entries
// and silently change which strings `isReportValidityStatus` accepts.
export const REPORT_VALIDITY_STATUSES = Object.freeze([
    "ok",
    "degraded",
    "incomplete",
    "suspect",
]);
|
|
38
|
+
/** Type guard for {@link ReportValidityStatus}. */
|
|
39
|
+
export function isReportValidityStatus(value) {
    // Only strings can be members of the status vocabulary; reject everything
    // else before consulting the tuple.
    if (typeof value !== "string") {
        return false;
    }
    // Membership in the canonical tuple is the sole acceptance criterion.
    return REPORT_VALIDITY_STATUSES.includes(value);
}
|