@sanity/ailf 7.2.3 → 7.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +38 -0
  2. package/config/bigquery/README.md +39 -7
  3. package/config/bigquery/views/reports.sql +6 -0
  4. package/dist/_vendor/ailf-core/schemas/report.d.ts +30 -0
  5. package/dist/_vendor/ailf-core/schemas/report.js +21 -2
  6. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  7. package/dist/_vendor/ailf-core/services/index.js +4 -0
  8. package/dist/_vendor/ailf-core/services/report-validity-detector.d.ts +116 -0
  9. package/dist/_vendor/ailf-core/services/report-validity-detector.js +128 -0
  10. package/dist/_vendor/ailf-core/types/index.d.ts +19 -0
  11. package/dist/_vendor/ailf-core/types/index.js +1 -0
  12. package/dist/_vendor/ailf-core/types/report-validity.d.ts +60 -0
  13. package/dist/_vendor/ailf-core/types/report-validity.js +42 -0
  14. package/dist/_vendor/ailf-shared/generated/help-content.js +3 -2
  15. package/dist/_vendor/ailf-shared/index.d.ts +2 -1
  16. package/dist/_vendor/ailf-shared/index.js +2 -1
  17. package/dist/_vendor/ailf-shared/run-classification.d.ts +53 -0
  18. package/dist/_vendor/ailf-shared/run-classification.js +111 -0
  19. package/dist/_vendor/ailf-shared/trustworthiness.d.ts +97 -0
  20. package/dist/_vendor/ailf-shared/trustworthiness.js +86 -0
  21. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  22. package/dist/commands/publish.js +9 -2
  23. package/dist/orchestration/steps/publish-report-step.js +11 -3
  24. package/dist/pipeline/report-validity.d.ts +32 -0
  25. package/dist/pipeline/report-validity.js +43 -0
  26. package/dist/report-store.d.ts +1 -0
  27. package/dist/report-store.js +2 -0
  28. package/package.json +1 -1
@@ -104,6 +104,11 @@ definitions:
104
104
  "host_platform": provenance.host.platform,
105
105
  "host_arch": provenance.host.arch,
106
106
  "host_ci": provenance.host.ci,
107
+ "validity_status": validity.status,
108
+ "validity_method": validity.method,
109
+ "validity_ruleset_version": validity.rulesetVersion,
110
+ "validity_assessed_at": validity.assessedAt,
111
+ "validity_reasons": validity.reasons,
107
112
  _createdAt
108
113
  }
109
114
  record_selector:
@@ -724,6 +729,39 @@ schemas:
724
729
  - string
725
730
  - "null"
726
731
  description: CI provider when running under one (e.g., github-actions).
732
+ # ----------------------------------------------------------------
733
+ # D0059 — report validity (data-health axis, orthogonal to intent)
734
+ # ----------------------------------------------------------------
735
+ validity_status:
736
+ type:
737
+ - string
738
+ - "null"
739
+ description:
740
+ "Data-health verdict (D0059): ok | degraded | incomplete | suspect.
741
+ NULL for reports predating the validity stamp (treated as trusted)."
742
+ validity_method:
743
+ type:
744
+ - string
745
+ - "null"
746
+ description: '"auto" | "manual" — how the validity verdict was reached.'
747
+ validity_ruleset_version:
748
+ type:
749
+ - string
750
+ - "null"
751
+ description: Detector ruleset version, so re-assessments are comparable.
752
+ validity_assessed_at:
753
+ type:
754
+ - string
755
+ - "null"
756
+ description:
757
+ ISO 8601 UTC timestamp when the validity verdict was produced.
758
+ validity_reasons:
759
+ type:
760
+ - array
761
+ - "null"
762
+ items:
763
+ type: string
764
+ description: Which detector rules fired — the audit trail behind status.
727
765
  _createdAt:
728
766
  type:
729
767
  - string
@@ -22,13 +22,13 @@ BigQuery views (this directory)
22
22
 
23
23
  ## Files
24
24
 
25
- | File | Purpose |
26
- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
27
- | `views/area_scores.sql` | Flattens nested `model_scores` array into one row per area per model per report |
28
- | `views/reports.sql` | Clean passthrough view with correct types and column ordering |
29
- | `views/official_runs.sql` | Canonical trend series (D0037): `classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'` |
30
- | `views/official_area_scores.sql` | `area_scores` joined to `official_runs` — inherits the official-run predicate for area-level dashboards |
31
- | `views/team_runs_template.sql` | Recipe/template for instantiating per-team filtered views |
25
+ | File | Purpose |
26
+ | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
27
+ | `views/area_scores.sql` | Flattens nested `model_scores` array into one row per area per model per report |
28
+ | `views/reports.sql` | Clean passthrough view with correct types and column ordering; also materializes `validity_*` columns and the `include_in_default_trends` trust gate (D0059) |
29
+ | `views/official_runs.sql` | Canonical trend series (D0037): `classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'` |
30
+ | `views/official_area_scores.sql` | `area_scores` joined to `official_runs` — inherits the official-run predicate for area-level dashboards |
31
+ | `views/team_runs_template.sql` | Recipe/template for instantiating per-team filtered views |
32
32
 
33
33
  ## Setup
34
34
 
@@ -88,6 +88,38 @@ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false
88
88
  > run `bq query` from this repo regularly, consider setting the default with
89
89
  > `gcloud config set project data-platform-302218`.
90
90
 
91
+ ## Querying default (trustworthy) reports
92
+
93
+ `ailf.reports` exposes every synced report, including development runs and
94
+ structurally incomplete ones. BI dashboards and ad-hoc analysis should gate on
95
+ the **`include_in_default_trends`** boolean, which materializes the single
96
+ trustworthiness predicate (D0059) shared with the dashboard and Studio — so all
97
+ three surfaces show the same default population:
98
+
99
+ ```sql
100
+ SELECT *
101
+ FROM `data-platform-302218.ailf.reports`
102
+ WHERE include_in_default_trends = TRUE;
103
+ ```
104
+
105
+ `include_in_default_trends` is the SQL form of the `includeInDefaultTrends`
106
+ predicate (`INCLUDE_IN_DEFAULT_TRENDS_SQL` in `@sanity/ailf-shared`):
107
+
108
+ ```text
109
+ (validity_status IS NULL OR validity_status = 'ok')
110
+ AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))
111
+ ```
112
+
113
+ To audit the _excluded_ (untrusted) reports — the inverse population, useful for
114
+ the Tier-2 review pass — flip the predicate to `= FALSE`. The `validity_status`
115
+ / `validity_reasons` columns explain why each was excluded.
116
+
117
+ > The `official_runs` view (above) is a **stricter** canonical-series gate
118
+ > (`classification='official' AND trigger_type='scheduled' AND owner_team='core-docs'`);
119
+ > `include_in_default_trends` is the **broader** "trustworthy enough to show by
120
+ > default" gate. Use `official_runs` for the tracked trend line,
121
+ > `include_in_default_trends` for general default views.
122
+
91
123
  ## Naming conventions
92
124
 
93
125
  - **`ailf_raw.*`** — raw Airbyte-loaded tables (nested JSON, Airbyte metadata
@@ -64,6 +64,12 @@ SELECT
64
64
  host_platform,
65
65
  host_arch,
66
66
  host_ci,
67
+ validity_status,
68
+ validity_method,
69
+ validity_ruleset_version,
70
+ TIMESTAMP(validity_assessed_at) AS validity_assessed_at,
71
+ validity_reasons,
72
+ (validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental')) AS include_in_default_trends,
67
73
  TIMESTAMP(_createdAt) AS synced_at
68
74
  FROM
69
75
  `data-platform-302218.ailf_raw.reports`;
@@ -139,6 +139,21 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
139
139
  runId: z.ZodString;
140
140
  targetDocuments: z.ZodOptional<z.ZodArray<z.ZodString>>;
141
141
  }, z.core.$strict>;
142
+ export declare const ReportValiditySchema: z.ZodObject<{
143
+ status: z.ZodEnum<{
144
+ ok: "ok";
145
+ degraded: "degraded";
146
+ incomplete: "incomplete";
147
+ suspect: "suspect";
148
+ }>;
149
+ reasons: z.ZodArray<z.ZodString>;
150
+ method: z.ZodEnum<{
151
+ auto: "auto";
152
+ manual: "manual";
153
+ }>;
154
+ rulesetVersion: z.ZodString;
155
+ assessedAt: z.ZodISODateTime;
156
+ }, z.core.$strict>;
142
157
  export declare const ReportSchema: z.ZodObject<{
143
158
  id: z.ZodString;
144
159
  completedAt: z.ZodISODateTime;
@@ -269,6 +284,21 @@ export declare const ReportSchema: z.ZodObject<{
269
284
  }>>;
270
285
  detail: z.ZodString;
271
286
  }, z.core.$strict>>;
287
+ validity: z.ZodOptional<z.ZodObject<{
288
+ status: z.ZodEnum<{
289
+ ok: "ok";
290
+ degraded: "degraded";
291
+ incomplete: "incomplete";
292
+ suspect: "suspect";
293
+ }>;
294
+ reasons: z.ZodArray<z.ZodString>;
295
+ method: z.ZodEnum<{
296
+ auto: "auto";
297
+ manual: "manual";
298
+ }>;
299
+ rulesetVersion: z.ZodString;
300
+ assessedAt: z.ZodISODateTime;
301
+ }, z.core.$strict>>;
272
302
  }, z.core.$loose>;
273
303
  export type ReportSchemaInput = z.input<typeof ReportSchema>;
274
304
  export type ReportSchemaOutput = z.infer<typeof ReportSchema>;
@@ -25,7 +25,7 @@
25
25
  */
26
26
  import { z } from "zod";
27
27
  import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
28
- import { DEGRADED_ENRICHMENT_FIELDS } from "../types/index.js";
28
+ import { DEGRADED_ENRICHMENT_FIELDS, REPORT_VALIDITY_STATUSES, } from "../types/index.js";
29
29
  // ---------------------------------------------------------------------------
30
30
  // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
31
31
  // ---------------------------------------------------------------------------
@@ -217,6 +217,21 @@ export const ReportProvenanceSchema = z
217
217
  // (ScoreSummary, FeatureScore, the W0051 slim types) are out of Scope A.
218
218
  // ---------------------------------------------------------------------------
219
219
  const RecordPassthroughSchema = z.record(z.string(), z.unknown());
220
+ // ---------------------------------------------------------------------------
221
+ // ReportValidity — top-level data-health axis (D0059), orthogonal to
222
+ // `provenance.classification`. `satisfies z.ZodType<ReportValidity>` (D0045):
223
+ // the domain type is authored independently in ../types/report-validity.ts,
224
+ // so schema/type drift is a build error. Strict — unknown keys signal drift.
225
+ // ---------------------------------------------------------------------------
226
+ export const ReportValiditySchema = z
227
+ .object({
228
+ status: z.enum(REPORT_VALIDITY_STATUSES),
229
+ reasons: z.array(z.string()),
230
+ method: z.enum(["auto", "manual"]),
231
+ rulesetVersion: z.string().min(1),
232
+ assessedAt: z.iso.datetime({ offset: true }),
233
+ })
234
+ .strict();
220
235
  export const ReportSchema = z
221
236
  .object({
222
237
  id: z.string().min(1),
@@ -236,7 +251,8 @@ export const ReportSchema = z
236
251
  title: z.string().nullable().optional(),
237
252
  // Degraded marker (mirrors `ReportDegradation`): present only when a full
238
253
  // eval scored tests but enrichment did not complete. Strict — unknown
239
- // keys here signal real drift.
254
+ // keys here signal real drift. Superseded by `validity` (D0059); retained
255
+ // for back-compat reads until the backfill migrates it.
240
256
  degraded: z
241
257
  .object({
242
258
  reason: z.literal("enrichment-missing"),
@@ -247,5 +263,8 @@ export const ReportSchema = z
247
263
  })
248
264
  .strict()
249
265
  .optional(),
266
+ // Data-health axis (D0059). Additive + nullable: pre-stamp reads have no
267
+ // `validity` and are treated as trustworthy until backfilled.
268
+ validity: ReportValiditySchema.optional(),
250
269
  })
251
270
  .passthrough();
@@ -13,6 +13,7 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
16
+ export { assessReportValidity, REPORT_VALIDITY_RULESET_VERSION, type AssessReportValidityOptions, type ReportValidityAssessment, type ReportValidityInput, type ReportValidityReviewFlag, type ReportValidityReviewReason, } from "./report-validity-detector.js";
16
17
  export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
17
18
  export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
18
19
  export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
@@ -14,6 +14,10 @@ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resol
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, } from "./report-to-markdown.js";
16
16
  // ---------------------------------------------------------------------------
17
+ // Report-validity detector (D0059) — pure, versioned trustworthiness engine
18
+ // ---------------------------------------------------------------------------
19
+ export { assessReportValidity, REPORT_VALIDITY_RULESET_VERSION, } from "./report-validity-detector.js";
20
+ // ---------------------------------------------------------------------------
17
21
  // Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
18
22
  // ---------------------------------------------------------------------------
19
23
  export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
@@ -0,0 +1,116 @@
1
+ /**
2
+ * @sanity/ailf-core — Report-validity detector (D0059)
3
+ *
4
+ * The detection brain for the report trustworthiness model. A pure, versioned,
5
+ * I/O-free rules engine that computes both trustworthiness axes for a report —
6
+ * intent (`classification`, D0037) and data health (`validity`) — with a
7
+ * confidence tier so it never mislabels ambiguous data.
8
+ *
9
+ * Reused unchanged by the eval write path (`W-stamp-validity-write-path`,
10
+ * stamps every new report) and the one-shot backfill
11
+ * (`W-backfill-report-validity`, labels the historical store).
12
+ *
13
+ * **Confidence tiers (D0059 §2):**
14
+ *
15
+ * - **Tier 1 — auto.** High-confidence, unambiguous rules. Intent:
16
+ * `scheduled + github-actions → official`; `testing-ground` team /
17
+ * `env-override` source / generated-id executor → `test`/`experimental`;
18
+ * de-drift `ad-hoc → adhoc`. Validity: `degraded → "degraded"`.
19
+ * - **Tier 2 — review list.** Anything resting on shape-sensitive content
20
+ * signals (empty `testResults`, `testCount` mismatch) is *emitted to a
21
+ * review list rather than guessed* — 34%/66% of the historical store hits
22
+ * these and is dominated by report-shape evolution (GCS-externalized
23
+ * per-test data, per-model granularity), NOT defects. The engine never
24
+ * auto-flags them invalid; a human confirms in batch and the verdict is
25
+ * written `method:"manual"`.
26
+ *
27
+ * A `method:"manual"` verdict is authoritative — re-running the detector never
28
+ * overwrites it.
29
+ *
30
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
31
+ * @see docs/design-docs/report-trustworthiness-model.md
32
+ */
33
+ import { type RunClassification, type RunExecutor, type RunOwner, type RunTrigger } from "../../ailf-shared/index.d.ts";
34
+ import type { ReportDegradation, ReportValidity } from "../types/index.js";
35
+ /**
36
+ * Detector ruleset version. Bump on any rule or threshold recalibration so a
37
+ * later re-assessment is a comparable, auditable re-run (D0059 §2). Stamped
38
+ * onto every `validity` verdict the engine produces.
39
+ */
40
+ export declare const REPORT_VALIDITY_RULESET_VERSION = "1.0.0";
41
+ /**
42
+ * The minimal report projection the detector reads.
43
+ *
44
+ * Authored as a raw-tolerant *consumer view* (not `Pick<Report, …>`) on
45
+ * purpose: the engine runs at the raw-document boundary during the backfill,
46
+ * where legacy drift like `ad-hoc` still exists and `provenance.classification`
47
+ * is therefore typed `string`, not `RunClassification`. A fully-typed `Report`
48
+ * is structurally assignable to this shape, so the write path can pass one
49
+ * directly. Only the seven fields the rules actually consult appear here.
50
+ */
51
+ export interface ReportValidityInput {
52
+ provenance: {
53
+ /** Raw classification — may carry legacy drift (`ad-hoc`); normalized here. */
54
+ classification?: string;
55
+ trigger: RunTrigger;
56
+ executor: RunExecutor;
57
+ owner: RunOwner;
58
+ source: {
59
+ name: string;
60
+ };
61
+ };
62
+ summary: {
63
+ /** Per-feature scores; only `testCount` is read (summed across features). */
64
+ scores?: ReadonlyArray<{
65
+ testCount?: number;
66
+ }>;
67
+ /** Per-test×model results; only the length is read. */
68
+ testResults?: ReadonlyArray<unknown>;
69
+ /** Low-scoring grader judgments; only the length is read. */
70
+ lowScoringJudgments?: ReadonlyArray<unknown>;
71
+ };
72
+ /** Legacy degradation marker (D0059 subsumes it into `validity.status`). */
73
+ degraded?: ReportDegradation;
74
+ /** Existing verdict — a `method:"manual"` value is preserved on re-run. */
75
+ validity?: ReportValidity;
76
+ }
77
+ /**
78
+ * Why a report was routed to Tier-2 human review instead of being
79
+ * auto-decided. Each corresponds to a shape-sensitive signal the engine
80
+ * deliberately refuses to interpret automatically (D0059 §caveat).
81
+ */
82
+ export type ReportValidityReviewReason = "empty-test-results" | "test-count-mismatch";
83
+ /** A single Tier-2 review-list entry. */
84
+ export interface ReportValidityReviewFlag {
85
+ reason: ReportValidityReviewReason;
86
+ /** Human-readable explanation for the batch-review UI / backfill report. */
87
+ detail: string;
88
+ }
89
+ /**
90
+ * The detector's verdict for one report.
91
+ *
92
+ * `classification` is **optional**: present only when the engine has a
93
+ * positive opinion (a Tier-1 intent rule fired, or de-drift changed the
94
+ * value). It is `undefined` when the existing classification is already
95
+ * canonical and no rule applies — so callers patch idempotently and a re-run
96
+ * never clobbers an already-correct or human-corrected value.
97
+ */
98
+ export interface ReportValidityAssessment {
99
+ classification?: RunClassification;
100
+ validity: ReportValidity;
101
+ reviewFlags: ReportValidityReviewFlag[];
102
+ }
103
+ export interface AssessReportValidityOptions {
104
+ /**
105
+ * ISO 8601 UTC timestamp stamped onto the verdict's `assessedAt`. Injected
106
+ * by the caller so the engine stays pure and deterministic (no clock read).
107
+ */
108
+ assessedAt: string;
109
+ }
110
+ /**
111
+ * Assess a report's trustworthiness — its intent (`classification`) and data
112
+ * health (`validity`) — plus any Tier-2 signals that need human review.
113
+ *
114
+ * Pure and deterministic: same input + `assessedAt` → identical output.
115
+ */
116
+ export declare function assessReportValidity(report: ReportValidityInput, options: AssessReportValidityOptions): ReportValidityAssessment;
@@ -0,0 +1,128 @@
1
+ /**
2
+ * @sanity/ailf-core — Report-validity detector (D0059)
3
+ *
4
+ * The detection brain for the report trustworthiness model. A pure, versioned,
5
+ * I/O-free rules engine that computes both trustworthiness axes for a report —
6
+ * intent (`classification`, D0037) and data health (`validity`) — with a
7
+ * confidence tier so it never mislabels ambiguous data.
8
+ *
9
+ * Reused unchanged by the eval write path (`W-stamp-validity-write-path`,
10
+ * stamps every new report) and the one-shot backfill
11
+ * (`W-backfill-report-validity`, labels the historical store).
12
+ *
13
+ * **Confidence tiers (D0059 §2):**
14
+ *
15
+ * - **Tier 1 — auto.** High-confidence, unambiguous rules. Intent:
16
+ * `scheduled + github-actions → official`; `testing-ground` team /
17
+ * `env-override` source / generated-id executor → `test`/`experimental`;
18
+ * de-drift `ad-hoc → adhoc`. Validity: `degraded → "degraded"`.
19
+ * - **Tier 2 — review list.** Anything resting on shape-sensitive content
20
+ * signals (empty `testResults`, `testCount` mismatch) is *emitted to a
21
+ * review list rather than guessed* — 34%/66% of the historical store hits
22
+ * these and is dominated by report-shape evolution (GCS-externalized
23
+ * per-test data, per-model granularity), NOT defects. The engine never
24
+ * auto-flags them invalid; a human confirms in batch and the verdict is
25
+ * written `method:"manual"`.
26
+ *
27
+ * A `method:"manual"` verdict is authoritative — re-running the detector never
28
+ * overwrites it.
29
+ *
30
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
31
+ * @see docs/design-docs/report-trustworthiness-model.md
32
+ */
33
+ import { isKnownExecutorIdentity, looksLikeGeneratedExecutorId, normalizeOwnerTeam, normalizeRunClassification, } from "../../ailf-shared/index.js";
34
+ /**
35
+ * Detector ruleset version. Bump on any rule or threshold recalibration so a
36
+ * later re-assessment is a comparable, auditable re-run (D0059 §2). Stamped
37
+ * onto every `validity` verdict the engine produces.
38
+ */
39
+ export const REPORT_VALIDITY_RULESET_VERSION = "1.0.0";
40
+ /**
41
+ * Tier-1 intent detection. Returns a `RunClassification` only when the engine
42
+ * has a positive opinion; `undefined` means "leave the existing value".
43
+ */
44
+ function detectIntent(provenance) {
45
+ const { classification, executor, owner, source, trigger } = provenance;
46
+ // Highest-confidence positive: a scheduled run executed by GitHub Actions is
47
+ // the canonical official series.
48
+ if (trigger.type === "scheduled" &&
49
+ executor.type === "system" &&
50
+ executor.name === "github-actions") {
51
+ return "official";
52
+ }
53
+ // Explicit non-canonical contexts.
54
+ if (normalizeOwnerTeam(owner.team) === "testing-ground")
55
+ return "test";
56
+ if (source.name === "env-override")
57
+ return "experimental";
58
+ if (executor.type === "user" &&
59
+ !isKnownExecutorIdentity(executor.name) &&
60
+ looksLikeGeneratedExecutorId(executor.name)) {
61
+ return "experimental";
62
+ }
63
+ // No rule fired — de-drift the existing value, returning it only when it
64
+ // actually changes.
65
+ const normalized = normalizeRunClassification(classification);
66
+ return normalized === classification ? undefined : normalized;
67
+ }
68
/**
 * Collects Tier-2 review flags from a report summary.
 *
 * Compares the number of `testResults` rows with the sum of `testCount`
 * across `scores` and emits at most one flag:
 *   - `empty-test-results`  — tests were scored but `testResults` is empty;
 *   - `test-count-mismatch` — both counts are positive but differ.
 *
 * The function only reports the signal; it does not assign a validity status.
 *
 * @param summary Report summary; `scores` and `testResults` may be absent.
 * @returns An array holding zero or one review flag.
 */
function collectReviewFlags(summary) {
    const resultRows = summary.testResults?.length ?? 0;
    let scoredTests = 0;
    (summary.scores ?? []).forEach((featureScore) => {
        scoredTests = scoredTests + (featureScore.testCount ?? 0);
    });
    // Both signals require at least one scored test.
    if (!(scoredTests > 0)) {
        return [];
    }
    if (resultRows === 0) {
        return [
            {
                reason: "empty-test-results",
                detail: `${scoredTests} scored tests but testResults is empty — may be GCS-externalized (W0329) rather than a defect; needs human review.`,
            },
        ];
    }
    if (resultRows > 0 && resultRows !== scoredTests) {
        return [
            {
                reason: "test-count-mismatch",
                detail: `testResults length (${resultRows}) ≠ summed testCount (${scoredTests}) — per-model/sampling granularity difference, not necessarily a defect.`,
            },
        ];
    }
    return [];
}
93
/**
 * Assesses one report: its intent (`classification`), its data health
 * (`validity`), and any Tier-2 signals that need human review.
 *
 * No clock is read here: `assessedAt` is taken from `options`.
 *
 * @param report Report projection: `provenance`, `summary`, and the optional
 *   `degraded` marker and existing `validity` verdict.
 * @param options Carries the `assessedAt` timestamp stamped onto a newly
 *   produced verdict.
 * @returns `{ classification, validity, reviewFlags }`. `classification` is
 *   whatever `detectIntent` returns (possibly `undefined`).
 */
export function assessReportValidity(report, options) {
    const classification = detectIntent(report.provenance);
    // An existing manual verdict is returned as-is, with no review flags.
    // The intent result is still reported alongside it.
    const existingVerdict = report.validity;
    if (existingVerdict?.method === "manual") {
        return { classification, validity: existingVerdict, reviewFlags: [] };
    }
    // The only non-"ok" status assigned automatically is "degraded", driven
    // by the legacy `degraded` marker.
    const legacyDegradation = report.degraded;
    const isDegraded = Boolean(legacyDegradation);
    const validity = {
        status: isDegraded ? "degraded" : "ok",
        reasons: isDegraded ? [`degraded:${legacyDegradation.reason}`] : [],
        method: "auto",
        rulesetVersion: REPORT_VALIDITY_RULESET_VERSION,
        assessedAt: options.assessedAt,
    };
    // Degraded reports skip review-flag collection entirely.
    const reviewFlags = isDegraded ? [] : collectReviewFlags(report.summary);
    return { classification, validity, reviewFlags };
}
@@ -14,6 +14,7 @@ import type { ArtifactType } from "../artifact-registry.js";
14
14
  import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
15
15
  import type { AssociationValues, RunId } from "./branded-ids.js";
16
16
  import type { GraderJudgment } from "./grader-judgment.js";
17
+ import type { ReportValidity } from "./report-validity.js";
17
18
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
18
19
  export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
19
20
  export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
@@ -43,6 +44,7 @@ export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJ
43
44
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
44
45
  export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
45
46
  export type { AilfUser, AilfUserPreferences, TeamReference } from "./user.js";
47
+ export { isReportValidityStatus, REPORT_VALIDITY_STATUSES, type ReportValidity, type ReportValidityMethod, type ReportValidityStatus, } from "./report-validity.js";
46
48
  type DocumentRef = _DocumentRef;
47
49
  /** Aggregated retrieval metrics for a feature area */
48
50
  export interface AreaRetrievalMetrics {
@@ -1503,6 +1505,12 @@ export type DegradedEnrichmentField = (typeof DEGRADED_ENRICHMENT_FIELDS)[number
1503
1505
  * because `grader-judgments.json` was missing). Present so the dashboard and
1504
1506
  * Studio can show "enrichment failed" rather than a misleading empty
1505
1507
  * "no tests" state on a report that still has a score.
1508
+ *
1509
+ * @deprecated Superseded by {@link ReportValidity} (D0059). The data-health
1510
+ * axis subsumes this flag: a degraded report is `validity.status:"degraded"`.
1511
+ * Retained for back-compat reads; the backfill migrates `degraded:true →
1512
+ * validity.status:"degraded"` (`W-backfill-report-validity`). Do not set it on
1513
+ * new reports — stamp `validity` instead.
1506
1514
  */
1507
1515
  export interface ReportDegradation {
1508
1516
  /** Why the report is degraded. Single-variant union, widen as needed. */
@@ -1517,8 +1525,19 @@ export interface Report {
1517
1525
  /**
1518
1526
  * Set when the report is published in a degraded state — a full eval
1519
1527
  * scored tests but enrichment did not complete. Absent on healthy reports.
1528
+ *
1529
+ * @deprecated Superseded by {@link Report.validity} (D0059). New write paths
1530
+ * stamp `validity` (with `status:"degraded"` in this case) instead.
1520
1531
  */
1521
1532
  degraded?: ReportDegradation;
1533
+ /**
1534
+ * Post-hoc data-health assessment (D0059) — orthogonal to
1535
+ * `provenance.classification` (run intent). Top-level because it judges the
1536
+ * report's *data*, not the run. Additive + nullable: absent on pre-stamp
1537
+ * reports, which are treated as trustworthy until backfilled. Subsumes the
1538
+ * legacy {@link Report.degraded} flag.
1539
+ */
1540
+ validity?: ReportValidity;
1522
1541
  /**
1523
1542
  * Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
1524
1543
  * The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this
@@ -19,6 +19,7 @@ export { evalModeType } from "./eval-mode-config.js";
19
19
  export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
20
20
  export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
21
21
  export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
22
+ export { isReportValidityStatus, REPORT_VALIDITY_STATUSES, } from "./report-validity.js";
22
23
  /** Set of canonical legacy modes — exported for report-formatter use. */
23
24
  export const LEGACY_FAILURE_MODES = [
24
25
  "api-error",
@@ -0,0 +1,60 @@
1
+ /**
2
+ * @sanity/ailf-core — Report validity (data-health axis)
3
+ *
4
+ * `ReportValidity` is the post-hoc data-health assessment of a published
5
+ * report, orthogonal to `provenance.classification` (run intent, D0037).
6
+ * It is a top-level sibling of the legacy `ReportDegradation` flag, which it
7
+ * subsumes (`degraded:true → validity.status:"degraded"`).
8
+ *
9
+ * Authored independently of the Zod schema so the schema can assert
10
+ * `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
11
+ * (D0045). Populated by the confidence-tiered detector
12
+ * (`W-report-validity-detector`) and the eval write path
13
+ * (`W-stamp-validity-write-path`); gated everywhere by the shared
14
+ * `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
15
+ *
16
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
17
+ * @see docs/design-docs/report-trustworthiness-model.md
18
+ */
19
+ /**
20
+ * The validity-status vocabulary. Single `as const` tuple so the runtime Zod
21
+ * `z.enum(...)` and the `ReportValidityStatus` type derive from one source —
22
+ * the same drift-proofing the `DEGRADED_ENRICHMENT_FIELDS` tuple uses.
23
+ *
24
+ * - `ok` — trustworthy; included in default trends.
25
+ * - `degraded` — enrichment/grading failed (subsumes the legacy `degraded`
26
+ * flag).
27
+ * - `incomplete` — expected grains genuinely missing, after the report-shape
28
+ * caveat is accounted for (see design doc).
29
+ * - `suspect` — passed structural checks but flagged for review (anomaly or
30
+ * ambiguous heuristic).
31
+ */
32
+ export declare const REPORT_VALIDITY_STATUSES: readonly ["ok", "degraded", "incomplete", "suspect"];
33
+ export type ReportValidityStatus = (typeof REPORT_VALIDITY_STATUSES)[number];
34
+ /**
35
+ * How the validity verdict was reached. A `"manual"` verdict is authoritative
36
+ * — re-running the detector never overwrites it (the detector emits a re-run
37
+ * only over `"auto"` verdicts).
38
+ */
39
+ export type ReportValidityMethod = "auto" | "manual";
40
+ /**
41
+ * Post-hoc data-health assessment of a published report.
42
+ *
43
+ * Top-level on the `Report` (a judgment about the report's *data*), NOT under
44
+ * `provenance` (which records run *intent*). Additive and nullable: pre-stamp
45
+ * reads have no `validity` and are treated as trustworthy until backfilled.
46
+ */
47
+ export interface ReportValidity {
48
+ /** Data-health verdict. */
49
+ status: ReportValidityStatus;
50
+ /** Which detector rules fired — the audit trail behind `status`. */
51
+ reasons: string[];
52
+ /** Whether an automated rule or a human produced this verdict. */
53
+ method: ReportValidityMethod;
54
+ /** Detector ruleset version, so re-assessments are comparable. */
55
+ rulesetVersion: string;
56
+ /** When the verdict was produced (ISO 8601 UTC). */
57
+ assessedAt: string;
58
+ }
59
+ /** Type guard for {@link ReportValidityStatus}. */
60
+ export declare function isReportValidityStatus(value: unknown): value is ReportValidityStatus;
@@ -0,0 +1,42 @@
1
+ /**
2
+ * @sanity/ailf-core — Report validity (data-health axis)
3
+ *
4
+ * `ReportValidity` is the post-hoc data-health assessment of a published
5
+ * report, orthogonal to `provenance.classification` (run intent, D0037).
6
+ * It is a top-level sibling of the legacy `ReportDegradation` flag, which it
7
+ * subsumes (`degraded:true → validity.status:"degraded"`).
8
+ *
9
+ * Authored independently of the Zod schema so the schema can assert
10
+ * `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
11
+ * (D0045). Populated by the confidence-tiered detector
12
+ * (`W-report-validity-detector`) and the eval write path
13
+ * (`W-stamp-validity-write-path`); gated everywhere by the shared
14
+ * `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
15
+ *
16
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
17
+ * @see docs/design-docs/report-trustworthiness-model.md
18
+ */
19
/**
 * Canonical list of report validity statuses. The runtime Zod enum and the
 * `ReportValidityStatus` type are both meant to derive from this one tuple so
 * the vocabulary cannot drift between them.
 *
 * - `ok`         — trustworthy; included in default trends.
 * - `degraded`   — enrichment/grading failed (subsumes the legacy `degraded`
 *                  flag).
 * - `incomplete` — expected grains genuinely missing, after the report-shape
 *                  caveat is accounted for (see design doc).
 * - `suspect`    — passed structural checks but flagged for review (anomaly
 *                  or ambiguous heuristic).
 */
export const REPORT_VALIDITY_STATUSES = ["ok", "degraded", "incomplete", "suspect"];
/**
 * Type guard for {@link ReportValidityStatus}.
 *
 * @param value - Any value, typically untrusted input.
 * @returns `true` only when `value` is a string that exactly matches one of
 *   the entries in {@link REPORT_VALIDITY_STATUSES} (case-sensitive).
 */
export function isReportValidityStatus(value) {
    if (typeof value !== "string") {
        return false;
    }
    return REPORT_VALIDITY_STATUSES.some((status) => status === value);
}
@@ -78,11 +78,12 @@ export const HELP_TOPICS = [
78
78
  },
79
79
  {
80
80
  "id": "reading-score-trends",
81
- "title": "Reading Score Trends",
82
- "body": "## What the timeline shows\n\nThe Score Timeline view plots your AI Literacy Score over time. Each point is an\nevaluation run a snapshot of how well your docs support AI agents at that\nmoment.\n\n## What to look for\n\n**Upward trends** after doc changes confirm that your improvements are working.\nIf you rewrote a GROQ guide and the GROQ area score climbs in the next run,\nthat's direct evidence of impact.\n\n**Sudden drops** usually mean something changed: a doc was deleted, an API\nchanged without a doc update, or a new task was added that exposes a gap.\n\n**Flat lines** mean stability neither improving nor regressing. This is fine\nfor mature areas but concerning for areas you're actively working on.\n\n## Meaningful change vs. noise\n\nSmall fluctuations (±2–3 points) between runs are normal — they come from LLM\nnon-determinism and grader variance. Focus on changes of **5+ points** sustained\nacross multiple runs. The comparison view applies a noise threshold to help\ndistinguish real changes from statistical noise.\n\n## Filtering the timeline\n\nUse the filters to focus on specific evaluation modes (baseline vs. full),\nspecific doc sources (production vs. branch), or specific feature areas.\nComparing the same area across modes reveals whether a problem is in the docs\nthemselves (baseline score) or in how agents find them (agentic score).",
81
+ "title": "Reading the Analytics View",
82
+ "body": "## What this view answers\n\nThe Analytics view is built around one question: **did your doc changes move the\nscore, and why?** Rather than open on a chart and leave you to find the story,\nit leads with the answer — a plain-language verdict and the areas that moved\nmost — then lets you drill down into the evidence.\n\n## The control bar\n\nThe top row picks what you're looking at:\n\n- **Metric** — which number to track (composite score, doc lift, retrieval gap,\n and so on).\n- **Break down by** — how to split it (feature area, team, model, source).\n- **Bucket** — how to group runs over time (per run, per day).\n- **Range** — how far back to look (for example, the last 30 days).\n\nThe second row holds the active **filter chips** use _Add filter_ to scope to\na team, source, or mode — and a scope hint (reports in scope vs. total). Every\nknob and filter is saved in the URL, so a shared link reproduces exactly what\nyou see. Use **Copy link** to grab it.\n\n## Overall — the read\n\nThe **verdict strip** is the headline. In plain language it says whether docs\nare pulling ahead or slipping, and shows the headline metric with its change (Δ)\nsince the start of the range, a model → agent → docs decomposition bar, and a\ncoverage cell (how many reports and high-confidence groups are in scope).\n\n## Movers\n\nThe **movers board** leads with the top **Improved** and **Regressed** areas as\ncards — not the average. Each card shows the area, its value and Δ, a\ndecomposition bar, the release that most likely caused the move, and a\nconfidence read. A low-confidence **watch** callout flags big swings backed by\ntoo few runs: watch them, don't celebrate them yet.\n\nClick a mover card to reveal and decompose that series in the evidence chart.\n\n## The evidence\n\nThe **focus chart** has two modes:\n\n- **Compare** plots the selected series over time. 
It defaults to a focused set\n (the movers plus the highest-volume areas) with a _show all_ expansion, and\n draws release markers inline.\n- **Decompose** shows the ceiling / floor / actual band for a single series,\n with causal story cards anchored to each release marker (for example, _\"Docs\n +3 ~5 −1 → doc-lift +8 measured around this release\"_).\n\nDecompose is offered for the composite metric broken down by feature area — the\ncase where the model → agent → docs story is meaningful.\n\n## The breakdown table\n\nOne row per area (or per whatever you broke down by), each with an inline\ndecomposition bar, a sparkline, confidence, Δ, \"docs add,\" and a report count.\nSort any column, and click a row to cross-highlight it in the chart. Export the\ntable to CSV.\n\n## Meaningful change vs. noise\n\nSmall movements between runs are normal — they come from model non-determinism\nand grader variance. This view leans on **confidence** (how many runs back a\nnumber) and the **movers ranking** rather than a single ±point threshold: trust\na sustained move in a high-confidence area over a large swing in a\nlow-confidence one. The low-confidence watch exists precisely to stop you\nover-reading thin data.\n\n## Measured, not invented\n\nThe causal story is computed from real data, never fabricated. Release markers\ncome from the doc-change counts already recorded in each report, and the\n\"measured around this release\" doc-lift effect is derived from the real ceiling\n− floor series around the marker. Per-area prose (\"the editor API changed\") is\nintentionally not shown the data carries change counts, not hand-written\nexplanations.",
83
83
  "source": "docs/help/reading-score-trends.md",
84
84
  "related": [
85
85
  "scoring-model",
86
+ "doc-lift",
86
87
  "comparing-runs"
87
88
  ]
88
89
  },
@@ -31,6 +31,7 @@ export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.j
31
31
  export { NOISE_THRESHOLD } from "./noise-threshold.js";
32
32
  export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
33
33
  export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, type SlugLike, } from "./owner-teams.js";
34
- export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
34
+ export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
35
35
  export { type RunTrigger } from "./run-trigger.js";
36
36
  export { type RunContext } from "./run-context.js";
37
+ export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, type TrustGateReport, } from "./trustworthiness.js";
@@ -30,4 +30,5 @@ export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
30
30
  export { NOISE_THRESHOLD } from "./noise-threshold.js";
31
31
  export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
32
32
  export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, } from "./owner-teams.js";
33
- export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
33
+ export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
34
+ export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, } from "./trustworthiness.js";
@@ -19,6 +19,59 @@
19
19
  export type RunClassification = "official" | "adhoc" | "experimental" | "test" | "external";
20
20
  export declare const RUN_CLASSIFICATIONS: readonly RunClassification[];
21
21
  export declare function isRunClassification(value: unknown): value is RunClassification;
22
+ /**
23
+ * Normalize a free-form classification value to a canonical
24
+ * {@link RunClassification}.
25
+ *
26
+ * - Trims and lowercases.
27
+ * - Maps the legacy `ad-hoc` spelling onto canonical `adhoc`.
28
+ * - Defaults empty / unknown input to `adhoc` — D0037's documented
29
+ * default bucket, biased away from the canonical `official` series.
30
+ *
31
+ * Pure and deterministic — reused by the detector (`W-report-validity-detector`)
32
+ * and the backfill (`W-backfill-report-validity`).
33
+ *
34
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
35
+ */
36
+ export declare function normalizeRunClassification(value: string | undefined | null): RunClassification;
37
+ /**
38
+ * Collapse a free-form executor name onto its canonical identity slug.
39
+ *
40
+ * - Trims and lowercases.
41
+ * - Maps known spellings (the module's internal alias table) to one identity.
42
+ * - Passes unknown names through (trimmed + lowercased).
43
+ * - Returns `undefined` for empty / nullish input.
44
+ *
45
+ * Pure and deterministic — used by the validity detector
46
+ * (`W-report-validity-detector`) to recognize a known human before the
47
+ * generated-id heuristic runs, and by the backfill to de-drift
48
+ * `provenance.executor.name`.
49
+ *
50
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
51
+ */
52
+ export declare function canonicalizeExecutorIdentity(name: string | undefined | null): string | undefined;
53
+ /** Whether an executor name collapses to a recognized human identity. */
54
+ export declare function isKnownExecutorIdentity(name: string | undefined | null): boolean;
55
+ /**
56
+ * Heuristic: does an executor name look like a *generated* handle/id rather
57
+ * than a human name? (D0059 §Context flagged ids like `gDVzuuHam`,
58
+ * `gL78msEDh` in the report store.)
59
+ *
60
+ * Deterministic and deliberately conservative — it judges *shape* only:
61
+ * a single token (no whitespace) of length 7–12, alphanumeric, mixing
62
+ * upper- and lower-case, and either containing a digit or showing ≥4
63
+ * upper/lower transitions. The transition floor is calibrated against the
64
+ * observed sample so two-word PascalCase names ("JohnSmith" — 3
65
+ * transitions) are NOT flagged; the generated ids (≥4 transitions or a
66
+ * digit) are. Known identities are excluded by the caller
67
+ * ({@link isKnownExecutorIdentity}) before this runs, so collapsed
68
+ * spellings like `GabeStah` never reach it as a positive.
69
+ *
70
+ * False positives are tolerable: the detector only uses this to propose an
71
+ * `experimental` classification, which is reversible (label-and-exclude,
72
+ * never delete) and surfaced for human review during the backfill.
73
+ */
74
+ export declare function looksLikeGeneratedExecutorId(name: string | undefined | null): boolean;
22
75
  /**
23
76
  * Attribution — which team and (optionally) individual the run *belongs to*.
24
77
  *
@@ -21,6 +21,117 @@ export function isRunClassification(value) {
21
21
  return (typeof value === "string" &&
22
22
  RUN_CLASSIFICATIONS.includes(value));
23
23
  }
24
+ /**
25
+ * Lowercase legacy spelling → canonical classification. The `RunClassification`
26
+ * type has long been canonical `adhoc`, but historical report data carries the
27
+ * hyphenated `ad-hoc` spelling (D0059 §Context). Only observed drift belongs
28
+ * here.
29
+ */
30
+ const RUN_CLASSIFICATION_ALIASES = {
31
+ "ad-hoc": "adhoc",
32
+ };
33
/**
 * Coerce a free-form classification value into a canonical
 * {@link RunClassification}.
 *
 * Steps, in order:
 * 1. Nullish / empty / whitespace-only input falls back to `"adhoc"`.
 * 2. The value is trimmed and lowercased.
 * 3. A legacy spelling listed in `RUN_CLASSIFICATION_ALIASES` (e.g. `ad-hoc`)
 *    is swapped for its canonical form.
 * 4. Anything that still is not a recognized classification falls back to
 *    `"adhoc"` — D0037's documented default bucket, biased away from the
 *    canonical `official` series.
 *
 * Pure and deterministic — reused by the detector (`W-report-validity-detector`)
 * and the backfill (`W-backfill-report-validity`).
 *
 * @param value - Raw classification string, possibly nullish.
 * @returns A canonical classification; never throws.
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function normalizeRunClassification(value) {
    const cleaned = value ? value.trim().toLowerCase() : "";
    if (cleaned === "") {
        return "adhoc";
    }
    const candidate = RUN_CLASSIFICATION_ALIASES[cleaned] ?? cleaned;
    if (isRunClassification(candidate)) {
        return candidate;
    }
    return "adhoc";
}
56
/**
 * Lowercased executor-name spelling → canonical identity slug. One human
 * appears under several spellings in the historical report store
 * (D0059 §Context: `Gabe Wyatt` / `GabeStah` / `gabewyatt`); collapsing
 * them lets attribution and `classification` queries treat them as one
 * person. Only observed drift belongs here — unknown names pass through.
 */
const EXECUTOR_IDENTITY_ALIASES = {
    "gabe wyatt": "gabe-wyatt",
    gabestah: "gabe-wyatt",
    gabewyatt: "gabe-wyatt",
};
/**
 * Own-property lookup into `EXECUTOR_IDENTITY_ALIASES`.
 *
 * The alias table is a plain object literal, so a bare `table[key]` or
 * `key in table` also matches names inherited from `Object.prototype`
 * (`constructor`, `__proto__`). Restricting the lookup to own properties keeps
 * such executor names from being mistaken for known identities.
 *
 * @param key - Already trimmed and lowercased executor name.
 * @returns The canonical slug, or `undefined` when `key` is not an alias.
 */
function lookupExecutorAlias(key) {
    return Object.prototype.hasOwnProperty.call(EXECUTOR_IDENTITY_ALIASES, key)
        ? EXECUTOR_IDENTITY_ALIASES[key]
        : undefined;
}
/**
 * Collapse a free-form executor name onto its canonical identity slug.
 *
 * - Trims and lowercases.
 * - Maps known spellings (the `EXECUTOR_IDENTITY_ALIASES` table) to one
 *   identity.
 * - Passes unknown names through (trimmed + lowercased).
 * - Returns `undefined` for empty / nullish input.
 *
 * Pure and deterministic — used by the validity detector
 * (`W-report-validity-detector`) to recognize a known human before the
 * generated-id heuristic runs, and by the backfill to de-drift
 * `provenance.executor.name`.
 *
 * @param name - Raw executor name, possibly nullish.
 * @returns The canonical slug or normalized name; always a string or
 *   `undefined`.
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function canonicalizeExecutorIdentity(name) {
    if (!name)
        return undefined;
    const trimmed = name.trim().toLowerCase();
    if (!trimmed)
        return undefined;
    return lookupExecutorAlias(trimmed) ?? trimmed;
}
/**
 * Whether an executor name collapses to a recognized human identity.
 *
 * @param name - Raw executor name, possibly nullish.
 * @returns `true` only when the trimmed, lowercased name is an own key of the
 *   alias table.
 */
export function isKnownExecutorIdentity(name) {
    if (!name)
        return false;
    return lookupExecutorAlias(name.trim().toLowerCase()) !== undefined;
}
97
/**
 * Shape-only heuristic: does an executor name look like a *generated*
 * handle/id rather than a human name? (D0059 §Context flagged ids like
 * `gDVzuuHam`, `gL78msEDh` in the report store.)
 *
 * A name is flagged when, after trimming, it is:
 * - a single token of 7–12 characters,
 * - made only of ASCII letters and digits,
 * - containing at least one uppercase and one lowercase letter, and
 * - either containing a digit, or switching between uppercase and lowercase
 *   at least 4 times.
 *
 * The floor of 4 case switches keeps two-word PascalCase names ("JohnSmith"
 * has 3) from being flagged. Known identities are expected to be excluded by
 * the caller ({@link isKnownExecutorIdentity}) before this runs.
 *
 * False positives are tolerable: the detector only uses this to propose an
 * `experimental` classification, which is reversible (label-and-exclude,
 * never delete) and surfaced for human review during the backfill.
 *
 * @param name - Raw executor name, possibly nullish.
 * @returns `true` when the name has the shape of a generated id.
 */
export function looksLikeGeneratedExecutorId(name) {
    if (!name) {
        return false;
    }
    const candidate = name.trim();
    const lengthInRange = candidate.length >= 7 && candidate.length <= 12;
    if (!lengthInRange || !/^[A-Za-z0-9]+$/.test(candidate)) {
        return false;
    }
    const hasUpper = /[A-Z]/.test(candidate);
    const hasLower = /[a-z]/.test(candidate);
    if (!(hasUpper && hasLower)) {
        return false;
    }
    if (/[0-9]/.test(candidate)) {
        return true;
    }
    // From here the token is letters only; count uppercase/lowercase switches
    // between neighbouring characters.
    const isUpper = (ch) => ch >= "A" && ch <= "Z";
    let caseFlips = 0;
    let previousUpper = isUpper(candidate[0]);
    for (const ch of candidate.slice(1)) {
        const currentUpper = isUpper(ch);
        if (currentUpper !== previousUpper) {
            caseFlips += 1;
        }
        previousUpper = currentUpper;
    }
    return caseFlips >= 4;
}
24
135
  export const RUN_EXECUTOR_SURFACES = [
25
136
  "cli",
26
137
  "studio",
@@ -0,0 +1,97 @@
1
+ /**
2
+ * trustworthiness.ts — The single trust gate for reports (D0059).
3
+ *
4
+ * `includeInDefaultTrends` is the one definition of "show this report by
5
+ * default." Every surface (dashboard analytics, Studio presets, the BigQuery
6
+ * `reports.sql` view) references this predicate so the gate cannot drift
7
+ * between consumers.
8
+ *
9
+ * Two orthogonal axes decide inclusion:
10
+ *
11
+ * - **Validity (data health, D0059)** — the *primary* gate. A report is
12
+ * included only when its `validity.status` is `ok` OR validity is absent
13
+ * (pre-stamp reads are trusted until backfilled — the rollout is additive
14
+ * and nullable). Any non-`ok` status (`degraded` / `incomplete` /
15
+ * `suspect`) excludes the report regardless of intent.
16
+ * - **Intent (run classification, D0037)** — a *secondary* exclusion. The
17
+ * explicit `test` and `experimental` classifications are dropped;
18
+ * `adhoc` / `official` / `external` (and a missing classification) are kept.
19
+ * `adhoc` is intentionally included — it holds real production one-offs;
20
+ * the validity gate, not the intent gate, removes the bad ones inside it.
21
+ *
22
+ * We model a slim subset of the core `Report` shape (the two read axes) rather
23
+ * than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
24
+ * package is the dependency-graph leaf and imports nothing from core. A full
25
+ * core `Report` is structurally assignable to {@link TrustGateReport}.
26
+ *
27
+ * The predicate is total — it never throws — and is kept trivially
28
+ * translatable to the two query-language forms it is materialized as on the
29
+ * other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
30
+ * Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
31
+ * SQL boolean in the BigQuery `reports.sql` view
32
+ * ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
33
+ * function, so the one gate cannot drift between consumers; a cross-check test
34
+ * asserts all three forms agree across the full truth table.
35
+ *
36
+ * Note the SQL form is NULL-safe on *both* axes: a bare
37
+ * `classification NOT IN ('test','experimental')` would evaluate to `NULL`
38
+ * (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
39
+ * excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
40
+ * `classification IS NULL OR …`.
41
+ *
42
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
43
+ * @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
44
+ */
45
+ import type { RunClassification } from "./run-classification.js";
46
+ /**
47
+ * Slim subset of a core `Report` — only the two axes the trust gate reads.
48
+ *
49
+ * `validity.status` is typed as a bare `string` (not core's
50
+ * `ReportValidityStatus`) so this leaf package imports nothing from
51
+ * `@sanity/ailf-core`; the predicate only distinguishes `"ok"` from
52
+ * everything else. `validity` absent/`null` ⇒ pre-stamp read ⇒ trusted.
53
+ */
54
+ export interface TrustGateReport {
55
+ /** Data-health axis (D0059), top-level on the report. */
56
+ validity?: {
57
+ status: string;
58
+ } | null;
59
+ /** Run-intent axis (D0037), under provenance. */
60
+ provenance?: {
61
+ classification?: RunClassification | null;
62
+ } | null;
63
+ }
64
+ /**
65
+ * Whether a report should appear in default trend views.
66
+ *
67
+ * Validity is the primary gate; intent is a secondary exclusion. See the
68
+ * module header for the full rationale and the equivalent SQL.
69
+ *
70
+ * @returns `true` when the report is trustworthy enough to show by default.
71
+ */
72
+ export declare function includeInDefaultTrends(report: TrustGateReport): boolean;
73
+ /**
74
+ * GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
75
+ * `ailf.report` document. Drop it into a Studio structure filter with the
76
+ * document-type guard, e.g.
77
+ * `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
78
+ *
79
+ * GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
80
+ * unclassified report passes the intent clause without an explicit
81
+ * `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
82
+ * The `!defined(validity.status)` clause makes the absent-validity case trusted.
83
+ */
84
+ export declare const INCLUDE_IN_DEFAULT_TRENDS_GROQ = "(!defined(validity.status) || validity.status == \"ok\") && !(provenance.classification in [\"test\", \"experimental\"])";
85
+ /**
86
+ * SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
87
+ * flattened `ailf.reports` BigQuery row (columns `validity_status`,
88
+ * `classification`). Materialized verbatim as the `include_in_default_trends`
89
+ * column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
90
+ * asserts the view embeds this exact string.
91
+ *
92
+ * Both axes are NULL-safe so the column matches the TS predicate row-for-row:
93
+ * `classification NOT IN (...)` alone is `NULL` for an unclassified row under
94
+ * SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
95
+ * silently dropping pre-taxonomy reports the TS predicate keeps.
96
+ */
97
+ export declare const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
@@ -0,0 +1,86 @@
1
+ /**
2
+ * trustworthiness.ts — The single trust gate for reports (D0059).
3
+ *
4
+ * `includeInDefaultTrends` is the one definition of "show this report by
5
+ * default." Every surface (dashboard analytics, Studio presets, the BigQuery
6
+ * `reports.sql` view) references this predicate so the gate cannot drift
7
+ * between consumers.
8
+ *
9
+ * Two orthogonal axes decide inclusion:
10
+ *
11
+ * - **Validity (data health, D0059)** — the *primary* gate. A report is
12
+ * included only when its `validity.status` is `ok` OR validity is absent
13
+ * (pre-stamp reads are trusted until backfilled — the rollout is additive
14
+ * and nullable). Any non-`ok` status (`degraded` / `incomplete` /
15
+ * `suspect`) excludes the report regardless of intent.
16
+ * - **Intent (run classification, D0037)** — a *secondary* exclusion. The
17
+ * explicit `test` and `experimental` classifications are dropped;
18
+ * `adhoc` / `official` / `external` (and a missing classification) are kept.
19
+ * `adhoc` is intentionally included — it holds real production one-offs;
20
+ * the validity gate, not the intent gate, removes the bad ones inside it.
21
+ *
22
+ * We model a slim subset of the core `Report` shape (the two read axes) rather
23
+ * than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
24
+ * package is the dependency-graph leaf and imports nothing from core. A full
25
+ * core `Report` is structurally assignable to {@link TrustGateReport}.
26
+ *
27
+ * The predicate is total — it never throws — and is kept trivially
28
+ * translatable to the two query-language forms it is materialized as on the
29
+ * other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
30
+ * Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
31
+ * SQL boolean in the BigQuery `reports.sql` view
32
+ * ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
33
+ * function, so the one gate cannot drift between consumers; a cross-check test
34
+ * asserts all three forms agree across the full truth table.
35
+ *
36
+ * Note the SQL form is NULL-safe on *both* axes: a bare
37
+ * `classification NOT IN ('test','experimental')` would evaluate to `NULL`
38
+ * (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
39
+ * excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
40
+ * `classification IS NULL OR …`.
41
+ *
42
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
43
+ * @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
44
+ */
45
/**
 * Decide whether a report belongs in default trend views.
 *
 * Two independent checks must both pass:
 * - Validity (primary gate): `validity.status` is `"ok"`, or there is no
 *   status at all (`validity` absent/`null`, or `status` nullish) — an
 *   unassessed report is treated as trustworthy.
 * - Intent (secondary exclusion): `provenance.classification` is neither
 *   `"test"` nor `"experimental"`; a missing classification is kept.
 *
 * @param report - Object exposing the optional `validity` and `provenance`
 *   axes.
 * @returns `true` when the report is trustworthy enough to show by default.
 */
export function includeInDefaultTrends(report) {
    const status = report.validity?.status;
    const intent = report.provenance?.classification;
    // Healthy when explicitly `ok` or not yet assessed.
    const healthy = status === "ok" || status === undefined || status === null;
    // Only the explicit test/experimental intents are excluded.
    const excludedIntent = intent === "test" || intent === "experimental";
    return healthy && !excludedIntent;
}
62
+ /**
63
+ * GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
64
+ * `ailf.report` document. Drop it into a Studio structure filter with the
65
+ * document-type guard, e.g.
66
+ * `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
67
+ *
68
+ * GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
69
+ * unclassified report passes the intent clause without an explicit
70
+ * `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
71
+ * The `!defined(validity.status)` clause makes the absent-validity case trusted.
72
+ */
73
+ export const INCLUDE_IN_DEFAULT_TRENDS_GROQ = '(!defined(validity.status) || validity.status == "ok") && !(provenance.classification in ["test", "experimental"])';
74
+ /**
75
+ * SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
76
+ * flattened `ailf.reports` BigQuery row (columns `validity_status`,
77
+ * `classification`). Materialized verbatim as the `include_in_default_trends`
78
+ * column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
79
+ * asserts the view embeds this exact string.
80
+ *
81
+ * Both axes are NULL-safe so the column matches the TS predicate row-for-row:
82
+ * `classification NOT IN (...)` alone is `NULL` for an unclassified row under
83
+ * SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
84
+ * silently dropping pre-taxonomy reports the TS predicate keeps.
85
+ */
86
+ export const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
@@ -1564,8 +1564,8 @@ export declare const RepoConfigSchema: z.ZodObject<{
1564
1564
  summary: z.ZodOptional<z.ZodObject<{
1565
1565
  onRun: z.ZodOptional<z.ZodEnum<{
1566
1566
  never: "never";
1567
- always: "always";
1568
1567
  auto: "auto";
1568
+ always: "always";
1569
1569
  }>>;
1570
1570
  }, z.core.$strip>>;
1571
1571
  taskSource: z.ZodOptional<z.ZodObject<{
@@ -27,6 +27,7 @@ import { addOutputDirOption } from "./shared/options.js";
27
27
  import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
28
28
  import { buildProvenance, } from "../pipeline/provenance.js";
29
29
  import { generateReportTitle } from "../pipeline/report-title.js";
30
+ import { stampReportValidity } from "../pipeline/report-validity.js";
30
31
  import { buildSlimReportSummary } from "../_vendor/ailf-core/index.js";
31
32
  import { generateReportId, } from "../report-store.js";
32
33
  import { withRetry } from "../sinks/retry.js";
@@ -214,8 +215,14 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
214
215
  // -----------------------------------------------------------------------
215
216
  // 5. Write to Sanity (system of record)
216
217
  // -----------------------------------------------------------------------
218
+ // Stamp the data-health validity axis + normalize classification (D0059)
219
+ // — the same server-computed forward guarantee the pipeline write path
220
+ // applies, so reports published via this command carry validity too.
221
+ const stampedReport = stampReportValidity(report, now);
217
222
  console.log(" Writing to Sanity Content Lake...");
218
- const sanityResult = store ? await store.write(report) : null;
223
+ const sanityResult = store
224
+ ? await store.write(stampedReport)
225
+ : null;
219
226
  if (sanityResult) {
220
227
  console.log(` ✅ Report written: ${sanityResult}`);
221
228
  }
@@ -237,7 +244,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
237
244
  console.log();
238
245
  console.log(` Delivering to ${sinks.length} sink(s)...`);
239
246
  const settled = await Promise.allSettled(sinks.map(async (sink) => {
240
- const result = await withRetry(() => sink.publish(report));
247
+ const result = await withRetry(() => sink.publish(stampedReport));
241
248
  return { name: sink.name, result };
242
249
  }));
243
250
  for (const outcome of settled) {
@@ -16,6 +16,7 @@ import { assoc, buildSlimReportSummary, } from "../../_vendor/ailf-core/index.js
16
16
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
17
17
  import { buildProvenance, } from "../../pipeline/provenance.js";
18
18
  import { generateReportTitle } from "../../pipeline/report-title.js";
19
+ import { stampReportValidity } from "../../pipeline/report-validity.js";
19
20
  import { generateReportId } from "../../report-store.js";
20
21
  import { withRetry } from "../../sinks/retry.js";
21
22
  export class PublishReportStep {
@@ -145,21 +146,28 @@ export class PublishReportStep {
145
146
  testResults: slimSummary.testResults.map(slimTestResult),
146
147
  };
147
148
  }
149
+ // Stamp the data-health `validity` axis (D0059) and normalize
150
+ // `provenance.classification` on the report now that it is fully assembled
151
+ // (degradation + slim summary settled). The verdict is server-computed
152
+ // from the report's own data — never the caller envelope (D0037) — and
153
+ // assessed at the report's completion time. From here on, the stamped
154
+ // report is what reaches the snapshot artifact, the store, and the sinks.
155
+ const stampedReport = stampReportValidity(report, now);
148
156
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
149
157
  state.reportId = reportId;
150
158
  // W0050 — migrated from ctx.collector.capture to the unified writer.
151
159
  // reportSnapshot: full Report JSON for replay (run-scoped, bulk).
152
- await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), report);
160
+ await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), stampedReport);
153
161
  // autoComparison: delta vs baseline (run-scoped, bulk, optional).
154
162
  if (comparison) {
155
163
  await ctx.artifactWriter.emit("autoComparison", assoc(ctx), comparison);
156
164
  }
157
165
  // Write to store (system of record — best-effort, P5)
158
166
  const sanityResult = ctx.reportStore
159
- ? await ctx.reportStore.write(report)
167
+ ? await ctx.reportStore.write(stampedReport)
160
168
  : null;
161
169
  // Run sinks (fire-and-forget, P6)
162
- const publishResult = await runSinks(report, ctx);
170
+ const publishResult = await runSinks(stampedReport, ctx);
163
171
  // sinkResults: per-sink outcome (run-scoped, per-entry keyed by sink name).
164
172
  for (const r of publishResult.sinkResults) {
165
173
  await ctx.artifactWriter.emit("sinkResults", assoc(ctx, { name: r.name }), {
@@ -0,0 +1,32 @@
1
+ /**
2
+ * stampReportValidity — apply the report-trustworthiness detector at write time.
3
+ *
4
+ * The eval write path's forward guarantee (D0059): every newly written report
5
+ * carries a top-level `validity` data-health stamp so the trustworthiness gap
6
+ * cannot recur on new reports. Lives in `pipeline/` (not the orchestration
7
+ * step) so both report-write paths — `PublishReportStep` and the standalone
8
+ * `publish` command — import it without a command→orchestration-step coupling.
9
+ *
10
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
11
+ * @see docs/design-docs/report-trustworthiness-model.md
12
+ */
13
+ import { type Report } from "../_vendor/ailf-core/index.d.ts";
14
+ /**
15
+ * Stamp the data-health `validity` axis (D0059) onto a report and normalize
16
+ * its `provenance.classification` to the canonical spelling.
17
+ *
18
+ * Runs the pure detector (`assessReportValidity`) over the assembled report.
19
+ * `Report` structurally satisfies the detector's `ReportValidityInput`
20
+ * (`provenance` extends `RunContext`; `summary` is a `ReportSummary`), so no
21
+ * adapter is needed. The verdict is **server-computed from the report's own
22
+ * data** (D0037): `assessedAt` is injected by the caller (the report's
23
+ * completion time) and nothing is read from the caller envelope.
24
+ *
25
+ * `classification` is patched only when the detector returns one — it returns
26
+ * `undefined` when the existing value is already canonical and no Tier-1 rule
27
+ * fired, so the patch is idempotent and never clobbers a correct (or
28
+ * human-corrected) value. Tier-2 review flags are not persisted here; the
29
+ * one-shot backfill consumes them. Returns a new report; the input is not
30
+ * mutated.
31
+ */
32
+ export declare function stampReportValidity(report: Report, assessedAt: string): Report;
@@ -0,0 +1,43 @@
1
+ /**
2
+ * stampReportValidity — apply the report-trustworthiness detector at write time.
3
+ *
4
+ * The eval write path's forward guarantee (D0059): every newly written report
5
+ * carries a top-level `validity` data-health stamp so the trustworthiness gap
6
+ * cannot recur on new reports. Lives in `pipeline/` (not the orchestration
7
+ * step) so both report-write paths — `PublishReportStep` and the standalone
8
+ * `publish` command — import it without a command→orchestration-step coupling.
9
+ *
10
+ * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
11
+ * @see docs/design-docs/report-trustworthiness-model.md
12
+ */
13
+ import { assessReportValidity } from "../_vendor/ailf-core/index.js";
14
+ /**
15
+ * Stamp the data-health `validity` axis (D0059) onto a report and normalize
16
+ * its `provenance.classification` to the canonical spelling.
17
+ *
18
+ * Runs the pure detector (`assessReportValidity`) over the assembled report.
19
+ * `Report` structurally satisfies the detector's `ReportValidityInput`
20
+ * (`provenance` extends `RunContext`; `summary` is a `ReportSummary`), so no
21
+ * adapter is needed. The verdict is **server-computed from the report's own
22
+ * data** (D0037): `assessedAt` is injected by the caller (the report's
23
+ * completion time) and nothing is read from the caller envelope.
24
+ *
25
+ * `classification` is patched only when the detector returns one — it returns
26
+ * `undefined` when the existing value is already canonical and no Tier-1 rule
27
+ * fired, so the patch is idempotent and never clobbers a correct (or
28
+ * human-corrected) value. Tier-2 review flags are not persisted here; the
29
+ * one-shot backfill consumes them. Returns a new report; the input is not
30
+ * mutated.
31
+ */
32
+ export function stampReportValidity(report, assessedAt) {
33
+ const { classification, validity } = assessReportValidity(report, {
34
+ assessedAt,
35
+ });
36
+ return {
37
+ ...report,
38
+ provenance: classification
39
+ ? { ...report.provenance, classification }
40
+ : report.provenance,
41
+ validity,
42
+ };
43
+ }
@@ -225,6 +225,7 @@ export interface SanityReportDoc {
225
225
  };
226
226
  tag: null | string;
227
227
  title: null | string;
228
+ validity?: Report["validity"];
228
229
  }
229
230
  export declare function toSanityReportDoc(report: Report): SanityReportDoc;
230
231
  /**
@@ -491,6 +491,7 @@ export function toSanityReportDoc(report) {
491
491
  },
492
492
  tag: report.tag ?? null,
493
493
  title: report.title ?? null,
494
+ ...(report.validity ? { validity: report.validity } : {}),
494
495
  };
495
496
  }
496
497
  /**
@@ -534,6 +535,7 @@ export function toReport(doc) {
534
535
  summary: doc.summary,
535
536
  tag: doc.tag,
536
537
  title: doc.title,
538
+ validity: doc.validity,
537
539
  };
538
540
  }
539
541
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "7.2.3",
3
+ "version": "7.3.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"