@sanity/ailf 7.2.3 → 7.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +38 -0
- package/config/bigquery/README.md +39 -7
- package/config/bigquery/views/reports.sql +6 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +30 -0
- package/dist/_vendor/ailf-core/schemas/report.js +21 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +4 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.d.ts +116 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.js +128 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +19 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/report-validity.d.ts +60 -0
- package/dist/_vendor/ailf-core/types/report-validity.js +42 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +3 -2
- package/dist/_vendor/ailf-shared/index.d.ts +2 -1
- package/dist/_vendor/ailf-shared/index.js +2 -1
- package/dist/_vendor/ailf-shared/run-classification.d.ts +53 -0
- package/dist/_vendor/ailf-shared/run-classification.js +111 -0
- package/dist/_vendor/ailf-shared/trustworthiness.d.ts +97 -0
- package/dist/_vendor/ailf-shared/trustworthiness.js +86 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/commands/publish.js +9 -2
- package/dist/orchestration/steps/publish-report-step.js +11 -3
- package/dist/pipeline/calculate-scores.js +8 -2
- package/dist/pipeline/report-validity.d.ts +32 -0
- package/dist/pipeline/report-validity.js +43 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/package.json +1 -1
|
@@ -78,11 +78,12 @@ export const HELP_TOPICS = [
|
|
|
78
78
|
},
|
|
79
79
|
{
|
|
80
80
|
"id": "reading-score-trends",
|
|
81
|
-
"title": "Reading
|
|
82
|
-
"body": "## What
|
|
81
|
+
"title": "Reading the Analytics View",
|
|
82
|
+
"body": "## What this view answers\n\nThe Analytics view is built around one question: **did your doc changes move the\nscore, and why?** Rather than open on a chart and leave you to find the story,\nit leads with the answer — a plain-language verdict and the areas that moved\nmost — then lets you drill down into the evidence.\n\n## The control bar\n\nThe top row picks what you're looking at:\n\n- **Metric** — which number to track (composite score, doc lift, retrieval gap,\n and so on).\n- **Break down by** — how to split it (feature area, team, model, source).\n- **Bucket** — how to group runs over time (per run, per day).\n- **Range** — how far back to look (for example, the last 30 days).\n\nThe second row holds the active **filter chips** — use _Add filter_ to scope to\na team, source, or mode — and a scope hint (reports in scope vs. total). Every\nknob and filter is saved in the URL, so a shared link reproduces exactly what\nyou see. Use **Copy link** to grab it.\n\n## Overall — the read\n\nThe **verdict strip** is the headline. In plain language it says whether docs\nare pulling ahead or slipping, and shows the headline metric with its change (Δ)\nsince the start of the range, a model → agent → docs decomposition bar, and a\ncoverage cell (how many reports and high-confidence groups are in scope).\n\n## Movers\n\nThe **movers board** leads with the top **Improved** and **Regressed** areas as\ncards — not the average. Each card shows the area, its value and Δ, a\ndecomposition bar, the release that most likely caused the move, and a\nconfidence read. A low-confidence **watch** callout flags big swings backed by\ntoo few runs: watch them, don't celebrate them yet.\n\nClick a mover card to reveal and decompose that series in the evidence chart.\n\n## The evidence\n\nThe **focus chart** has two modes:\n\n- **Compare** plots the selected series over time. 
It defaults to a focused set\n (the movers plus the highest-volume areas) with a _show all_ expansion, and\n draws release markers inline.\n- **Decompose** shows the ceiling / floor / actual band for a single series,\n with causal story cards anchored to each release marker (for example, _\"Docs\n +3 ~5 −1 → doc-lift +8 measured around this release\"_).\n\nDecompose is offered for the composite metric broken down by feature area — the\ncase where the model → agent → docs story is meaningful.\n\n## The breakdown table\n\nOne row per area (or per whatever you broke down by), each with an inline\ndecomposition bar, a sparkline, confidence, Δ, \"docs add,\" and a report count.\nSort any column, and click a row to cross-highlight it in the chart. Export the\ntable to CSV.\n\n## Meaningful change vs. noise\n\nSmall movements between runs are normal — they come from model non-determinism\nand grader variance. This view leans on **confidence** (how many runs back a\nnumber) and the **movers ranking** rather than a single ±point threshold: trust\na sustained move in a high-confidence area over a large swing in a\nlow-confidence one. The low-confidence watch exists precisely to stop you\nover-reading thin data.\n\n## Measured, not invented\n\nThe causal story is computed from real data, never fabricated. Release markers\ncome from the doc-change counts already recorded in each report, and the\n\"measured around this release\" doc-lift effect is derived from the real ceiling\n− floor series around the marker. Per-area prose (\"the editor API changed\") is\nintentionally not shown — the data carries change counts, not hand-written\nexplanations.",
|
|
83
83
|
"source": "docs/help/reading-score-trends.md",
|
|
84
84
|
"related": [
|
|
85
85
|
"scoring-model",
|
|
86
|
+
"doc-lift",
|
|
86
87
|
"comparing-runs"
|
|
87
88
|
]
|
|
88
89
|
},
|
|
@@ -31,6 +31,7 @@ export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.j
|
|
|
31
31
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
32
32
|
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
33
33
|
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, type SlugLike, } from "./owner-teams.js";
|
|
34
|
-
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
34
|
+
export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
35
35
|
export { type RunTrigger } from "./run-trigger.js";
|
|
36
36
|
export { type RunContext } from "./run-context.js";
|
|
37
|
+
export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, type TrustGateReport, } from "./trustworthiness.js";
|
|
@@ -30,4 +30,5 @@ export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
|
|
|
30
30
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
31
31
|
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
32
32
|
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, } from "./owner-teams.js";
|
|
33
|
-
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
33
|
+
export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
34
|
+
export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, } from "./trustworthiness.js";
|
|
@@ -19,6 +19,59 @@
|
|
|
19
19
|
export type RunClassification = "official" | "adhoc" | "experimental" | "test" | "external";
|
|
20
20
|
export declare const RUN_CLASSIFICATIONS: readonly RunClassification[];
|
|
21
21
|
export declare function isRunClassification(value: unknown): value is RunClassification;
|
|
22
|
+
/**
|
|
23
|
+
* Normalize a free-form classification value to a canonical
|
|
24
|
+
* {@link RunClassification}.
|
|
25
|
+
*
|
|
26
|
+
* - Trims and lowercases.
|
|
27
|
+
* - Maps the legacy `ad-hoc` spelling onto canonical `adhoc`.
|
|
28
|
+
* - Defaults empty / unknown input to `adhoc` — D0037's documented
|
|
29
|
+
* default bucket, biased away from the canonical `official` series.
|
|
30
|
+
*
|
|
31
|
+
* Pure and deterministic — reused by the detector (`W-report-validity-detector`)
|
|
32
|
+
* and the backfill (`W-backfill-report-validity`).
|
|
33
|
+
*
|
|
34
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
35
|
+
*/
|
|
36
|
+
export declare function normalizeRunClassification(value: string | undefined | null): RunClassification;
|
|
37
|
+
/**
|
|
38
|
+
* Collapse a free-form executor name onto its canonical identity slug.
|
|
39
|
+
*
|
|
40
|
+
* - Trims and lowercases.
|
|
41
|
+
* - Maps known spellings (above) to one identity.
|
|
42
|
+
* - Passes unknown names through (trimmed + lowercased).
|
|
43
|
+
* - Returns `undefined` for empty / nullish input.
|
|
44
|
+
*
|
|
45
|
+
* Pure and deterministic — used by the validity detector
|
|
46
|
+
* (`W-report-validity-detector`) to recognize a known human before the
|
|
47
|
+
* generated-id heuristic runs, and by the backfill to de-drift
|
|
48
|
+
* `provenance.executor.name`.
|
|
49
|
+
*
|
|
50
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
51
|
+
*/
|
|
52
|
+
export declare function canonicalizeExecutorIdentity(name: string | undefined | null): string | undefined;
|
|
53
|
+
/** Whether an executor name collapses to a recognized human identity. */
|
|
54
|
+
export declare function isKnownExecutorIdentity(name: string | undefined | null): boolean;
|
|
55
|
+
/**
|
|
56
|
+
* Heuristic: does an executor name look like a *generated* handle/id rather
|
|
57
|
+
* than a human name? (D0059 §Context flagged ids like `gDVzuuHam`,
|
|
58
|
+
* `gL78msEDh` in the report store.)
|
|
59
|
+
*
|
|
60
|
+
* Deterministic and deliberately conservative — it judges *shape* only:
|
|
61
|
+
* a single token (no whitespace) of length 7–12, alphanumeric, mixing
|
|
62
|
+
* upper- and lower-case, and either containing a digit or showing ≥4
|
|
63
|
+
* upper/lower transitions. The transition floor is calibrated against the
|
|
64
|
+
* observed sample so two-word PascalCase names ("JohnSmith" — 3
|
|
65
|
+
* transitions) are NOT flagged; the generated ids (≥4 transitions or a
|
|
66
|
+
* digit) are. Known identities are excluded by the caller
|
|
67
|
+
* ({@link isKnownExecutorIdentity}) before this runs, so collapsed
|
|
68
|
+
* spellings like `GabeStah` never reach it as a positive.
|
|
69
|
+
*
|
|
70
|
+
* False positives are tolerable: the detector only uses this to propose an
|
|
71
|
+
* `experimental` classification, which is reversible (label-and-exclude,
|
|
72
|
+
* never delete) and surfaced for human review during the backfill.
|
|
73
|
+
*/
|
|
74
|
+
export declare function looksLikeGeneratedExecutorId(name: string | undefined | null): boolean;
|
|
22
75
|
/**
|
|
23
76
|
* Attribution — which team and (optionally) individual the run *belongs to*.
|
|
24
77
|
*
|
|
@@ -21,6 +21,117 @@ export function isRunClassification(value) {
|
|
|
21
21
|
return (typeof value === "string" &&
|
|
22
22
|
RUN_CLASSIFICATIONS.includes(value));
|
|
23
23
|
}
|
|
24
|
+
/**
|
|
25
|
+
* Lowercase legacy spelling → canonical classification. The `RunClassification`
|
|
26
|
+
* type has long been canonical `adhoc`, but historical report data carries the
|
|
27
|
+
* hyphenated `ad-hoc` spelling (D0059 §Context). Only observed drift belongs
|
|
28
|
+
* here.
|
|
29
|
+
*/
|
|
30
|
+
const RUN_CLASSIFICATION_ALIASES = {
    "ad-hoc": "adhoc",
};
/**
 * Coerce a free-form classification string into a canonical
 * {@link RunClassification}.
 *
 * Steps, in order:
 *   1. Nullish / empty / whitespace-only input falls back to `adhoc`.
 *   2. The input is trimmed and lowercased.
 *   3. A legacy spelling (`ad-hoc`) is rewritten to its canonical form.
 *   4. Anything that is still not a recognized classification falls back
 *      to `adhoc`.
 *
 * @param value Raw classification as found on a report (may be nullish).
 * @returns A canonical classification; `adhoc` whenever the input cannot be
 *   resolved to a known one.
 *
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function normalizeRunClassification(value) {
    const fallback = "adhoc";
    // Falsy input never reaches `.trim()`, mirroring the nullish guard.
    const key = value ? value.trim().toLowerCase() : "";
    if (key === "") {
        return fallback;
    }
    const aliased = RUN_CLASSIFICATION_ALIASES[key];
    const candidate = aliased ?? key;
    if (isRunClassification(candidate)) {
        return candidate;
    }
    return fallback;
}
|
|
56
|
+
/**
|
|
57
|
+
* Lowercased executor-name spelling → canonical identity slug. One human
|
|
58
|
+
* appears under several spellings in the historical report store
|
|
59
|
+
* (D0059 §Context: `Gabe Wyatt` / `GabeStah` / `gabewyatt`); collapsing
|
|
60
|
+
* them lets attribution and `classification` queries treat them as one
|
|
61
|
+
* person. Only observed drift belongs here — unknown names pass through.
|
|
62
|
+
*/
|
|
63
|
+
const EXECUTOR_IDENTITY_ALIASES = {
    "gabe wyatt": "gabe-wyatt",
    gabestah: "gabe-wyatt",
    gabewyatt: "gabe-wyatt",
};
/**
 * Whether `key` is an alias recorded directly on the alias table.
 *
 * The table is a plain object, so a bare `key in table` or `table[key]`
 * also sees inherited members such as `constructor` and `__proto__`. An
 * executor literally named "constructor" would then be reported as a known
 * human and canonicalized to a function instead of a string. Checking own
 * properties only keeps free-form names from colliding with the prototype.
 */
function hasExecutorAlias(key) {
    return Object.prototype.hasOwnProperty.call(EXECUTOR_IDENTITY_ALIASES, key);
}
/**
 * Collapse a free-form executor name onto its canonical identity slug.
 *
 * - Trims and lowercases.
 * - Maps known spellings (the alias table) to one identity.
 * - Passes unknown names through (trimmed + lowercased), including names
 *   that happen to match an inherited object member.
 * - Returns `undefined` for empty / whitespace-only / nullish input.
 *
 * Pure and deterministic — used by the validity detector
 * (`W-report-validity-detector`) to recognize a known human before the
 * generated-id heuristic runs, and by the backfill to de-drift
 * `provenance.executor.name`.
 *
 * @param name Raw executor name (may be nullish).
 * @returns The canonical slug, the normalized name, or `undefined`.
 *
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function canonicalizeExecutorIdentity(name) {
    if (!name)
        return undefined;
    const trimmed = name.trim().toLowerCase();
    if (!trimmed)
        return undefined;
    // Own-key lookup only: never resolve through Object.prototype.
    return hasExecutorAlias(trimmed) ? EXECUTOR_IDENTITY_ALIASES[trimmed] : trimmed;
}
/**
 * Whether an executor name collapses to a recognized human identity.
 *
 * @param name Raw executor name (may be nullish).
 * @returns `true` only when the trimmed, lowercased name is an alias listed
 *   in the table itself; `false` for nullish / empty input and for every
 *   other name.
 */
export function isKnownExecutorIdentity(name) {
    if (!name)
        return false;
    return hasExecutorAlias(name.trim().toLowerCase());
}
|
|
97
|
+
/**
|
|
98
|
+
* Heuristic: does an executor name look like a *generated* handle/id rather
|
|
99
|
+
* than a human name? (D0059 §Context flagged ids like `gDVzuuHam`,
|
|
100
|
+
* `gL78msEDh` in the report store.)
|
|
101
|
+
*
|
|
102
|
+
* Deterministic and deliberately conservative — it judges *shape* only:
|
|
103
|
+
* a single token (no whitespace) of length 7–12, alphanumeric, mixing
|
|
104
|
+
* upper- and lower-case, and either containing a digit or showing ≥4
|
|
105
|
+
* upper/lower transitions. The transition floor is calibrated against the
|
|
106
|
+
* observed sample so two-word PascalCase names ("JohnSmith" — 3
|
|
107
|
+
* transitions) are NOT flagged; the generated ids (≥4 transitions or a
|
|
108
|
+
* digit) are. Known identities are excluded by the caller
|
|
109
|
+
* ({@link isKnownExecutorIdentity}) before this runs, so collapsed
|
|
110
|
+
* spellings like `GabeStah` never reach it as a positive.
|
|
111
|
+
*
|
|
112
|
+
* False positives are tolerable: the detector only uses this to propose an
|
|
113
|
+
* `experimental` classification, which is reversible (label-and-exclude,
|
|
114
|
+
* never delete) and surfaced for human review during the backfill.
|
|
115
|
+
*/
|
|
116
|
+
export function looksLikeGeneratedExecutorId(name) {
    // Falsy input becomes the empty string, which the length gate rejects.
    const candidate = name ? name.trim() : "";
    const lengthInRange = candidate.length >= 7 && candidate.length <= 12;
    if (!lengthInRange || !/^[A-Za-z0-9]+$/.test(candidate)) {
        return false;
    }
    const mixesCase = /[A-Z]/.test(candidate) && /[a-z]/.test(candidate);
    if (!mixesCase) {
        return false;
    }
    // Any digit in a mixed-case token is enough on its own.
    if (/[0-9]/.test(candidate)) {
        return true;
    }
    // Letters only from here on: count how often adjacent characters switch
    // between upper- and lower-case.
    const upperFlags = Array.from(candidate, (ch) => ch >= "A" && ch <= "Z");
    const flips = upperFlags.reduce((count, isUpper, index) => (index > 0 && isUpper !== upperFlags[index - 1] ? count + 1 : count), 0);
    return flips >= 4;
}
|
|
24
135
|
export const RUN_EXECUTOR_SURFACES = [
|
|
25
136
|
"cli",
|
|
26
137
|
"studio",
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trustworthiness.ts — The single trust gate for reports (D0059).
|
|
3
|
+
*
|
|
4
|
+
* `includeInDefaultTrends` is the one definition of "show this report by
|
|
5
|
+
* default." Every surface (dashboard analytics, Studio presets, the BigQuery
|
|
6
|
+
* `reports.sql` view) references this predicate so the gate cannot drift
|
|
7
|
+
* between consumers.
|
|
8
|
+
*
|
|
9
|
+
* Two orthogonal axes decide inclusion:
|
|
10
|
+
*
|
|
11
|
+
* - **Validity (data health, D0059)** — the *primary* gate. A report is
|
|
12
|
+
* included only when its `validity.status` is `ok` OR validity is absent
|
|
13
|
+
* (pre-stamp reads are trusted until backfilled — the rollout is additive
|
|
14
|
+
* and nullable). Any non-`ok` status (`degraded` / `incomplete` /
|
|
15
|
+
* `suspect`) excludes the report regardless of intent.
|
|
16
|
+
* - **Intent (run classification, D0037)** — a *secondary* exclusion. The
|
|
17
|
+
* explicit `test` and `experimental` classifications are dropped;
|
|
18
|
+
* `adhoc` / `official` / `external` (and a missing classification) are kept.
|
|
19
|
+
* `adhoc` is intentionally included — it holds real production one-offs;
|
|
20
|
+
* the validity gate, not the intent gate, removes the bad ones inside it.
|
|
21
|
+
*
|
|
22
|
+
* We model a slim subset of the core `Report` shape (the two read axes) rather
|
|
23
|
+
* than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
|
|
24
|
+
* package is the dependency-graph leaf and imports nothing from core. A full
|
|
25
|
+
* core `Report` is structurally assignable to {@link TrustGateReport}.
|
|
26
|
+
*
|
|
27
|
+
* The predicate is total — it never throws — and is kept trivially
|
|
28
|
+
* translatable to the two query-language forms it is materialized as on the
|
|
29
|
+
* other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
|
|
30
|
+
* Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
|
|
31
|
+
* SQL boolean in the BigQuery `reports.sql` view
|
|
32
|
+
* ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
|
|
33
|
+
* function, so the one gate cannot drift between consumers; a cross-check test
|
|
34
|
+
* asserts all three forms agree across the full truth table.
|
|
35
|
+
*
|
|
36
|
+
* Note the SQL form is NULL-safe on *both* axes: a bare
|
|
37
|
+
* `classification NOT IN ('test','experimental')` would evaluate to `NULL`
|
|
38
|
+
* (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
|
|
39
|
+
* excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
|
|
40
|
+
* `classification IS NULL OR …`.
|
|
41
|
+
*
|
|
42
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
43
|
+
* @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
|
|
44
|
+
*/
|
|
45
|
+
import type { RunClassification } from "./run-classification.js";
|
|
46
|
+
/**
|
|
47
|
+
* Slim subset of a core `Report` — only the two axes the trust gate reads.
|
|
48
|
+
*
|
|
49
|
+
* `validity.status` is typed as a bare `string` (not core's
|
|
50
|
+
* `ReportValidityStatus`) so this leaf package imports nothing from
|
|
51
|
+
* `@sanity/ailf-core`; the predicate only distinguishes `"ok"` from
|
|
52
|
+
* everything else. `validity` absent/`null` ⇒ pre-stamp read ⇒ trusted.
|
|
53
|
+
*/
|
|
54
|
+
export interface TrustGateReport {
|
|
55
|
+
/** Data-health axis (D0059), top-level on the report. */
|
|
56
|
+
validity?: {
|
|
57
|
+
status: string;
|
|
58
|
+
} | null;
|
|
59
|
+
/** Run-intent axis (D0037), under provenance. */
|
|
60
|
+
provenance?: {
|
|
61
|
+
classification?: RunClassification | null;
|
|
62
|
+
} | null;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Whether a report should appear in default trend views.
|
|
66
|
+
*
|
|
67
|
+
* Validity is the primary gate; intent is a secondary exclusion. See the
|
|
68
|
+
* module header for the full rationale and the equivalent SQL.
|
|
69
|
+
*
|
|
70
|
+
* @returns `true` when the report is trustworthy enough to show by default.
|
|
71
|
+
*/
|
|
72
|
+
export declare function includeInDefaultTrends(report: TrustGateReport): boolean;
|
|
73
|
+
/**
|
|
74
|
+
* GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
|
|
75
|
+
* `ailf.report` document. Drop it into a Studio structure filter with the
|
|
76
|
+
* document-type guard, e.g.
|
|
77
|
+
* `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
|
|
78
|
+
*
|
|
79
|
+
* GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
|
|
80
|
+
* unclassified report passes the intent clause without an explicit
|
|
81
|
+
* `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
|
|
82
|
+
* The `!defined(validity.status)` clause makes the absent-validity case trusted.
|
|
83
|
+
*/
|
|
84
|
+
export declare const INCLUDE_IN_DEFAULT_TRENDS_GROQ = "(!defined(validity.status) || validity.status == \"ok\") && !(provenance.classification in [\"test\", \"experimental\"])";
|
|
85
|
+
/**
|
|
86
|
+
* SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
|
|
87
|
+
* flattened `ailf.reports` BigQuery row (columns `validity_status`,
|
|
88
|
+
* `classification`). Materialized verbatim as the `include_in_default_trends`
|
|
89
|
+
* column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
|
|
90
|
+
* asserts the view embeds this exact string.
|
|
91
|
+
*
|
|
92
|
+
* Both axes are NULL-safe so the column matches the TS predicate row-for-row:
|
|
93
|
+
* `classification NOT IN (...)` alone is `NULL` for an unclassified row under
|
|
94
|
+
* SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
|
|
95
|
+
* silently dropping pre-taxonomy reports the TS predicate keeps.
|
|
96
|
+
*/
|
|
97
|
+
export declare const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trustworthiness.ts — The single trust gate for reports (D0059).
|
|
3
|
+
*
|
|
4
|
+
* `includeInDefaultTrends` is the one definition of "show this report by
|
|
5
|
+
* default." Every surface (dashboard analytics, Studio presets, the BigQuery
|
|
6
|
+
* `reports.sql` view) references this predicate so the gate cannot drift
|
|
7
|
+
* between consumers.
|
|
8
|
+
*
|
|
9
|
+
* Two orthogonal axes decide inclusion:
|
|
10
|
+
*
|
|
11
|
+
* - **Validity (data health, D0059)** — the *primary* gate. A report is
|
|
12
|
+
* included only when its `validity.status` is `ok` OR validity is absent
|
|
13
|
+
* (pre-stamp reads are trusted until backfilled — the rollout is additive
|
|
14
|
+
* and nullable). Any non-`ok` status (`degraded` / `incomplete` /
|
|
15
|
+
* `suspect`) excludes the report regardless of intent.
|
|
16
|
+
* - **Intent (run classification, D0037)** — a *secondary* exclusion. The
|
|
17
|
+
* explicit `test` and `experimental` classifications are dropped;
|
|
18
|
+
* `adhoc` / `official` / `external` (and a missing classification) are kept.
|
|
19
|
+
* `adhoc` is intentionally included — it holds real production one-offs;
|
|
20
|
+
* the validity gate, not the intent gate, removes the bad ones inside it.
|
|
21
|
+
*
|
|
22
|
+
* We model a slim subset of the core `Report` shape (the two read axes) rather
|
|
23
|
+
* than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
|
|
24
|
+
* package is the dependency-graph leaf and imports nothing from core. A full
|
|
25
|
+
* core `Report` is structurally assignable to {@link TrustGateReport}.
|
|
26
|
+
*
|
|
27
|
+
* The predicate is total — it never throws — and is kept trivially
|
|
28
|
+
* translatable to the two query-language forms it is materialized as on the
|
|
29
|
+
* other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
|
|
30
|
+
* Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
|
|
31
|
+
* SQL boolean in the BigQuery `reports.sql` view
|
|
32
|
+
* ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
|
|
33
|
+
* function, so the one gate cannot drift between consumers; a cross-check test
|
|
34
|
+
* asserts all three forms agree across the full truth table.
|
|
35
|
+
*
|
|
36
|
+
* Note the SQL form is NULL-safe on *both* axes: a bare
|
|
37
|
+
* `classification NOT IN ('test','experimental')` would evaluate to `NULL`
|
|
38
|
+
* (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
|
|
39
|
+
* excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
|
|
40
|
+
* `classification IS NULL OR …`.
|
|
41
|
+
*
|
|
42
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
43
|
+
* @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
|
|
44
|
+
*/
|
|
45
|
+
/**
|
|
46
|
+
* Whether a report should appear in default trend views.
|
|
47
|
+
*
|
|
48
|
+
* Validity is the primary gate; intent is a secondary exclusion. See the
|
|
49
|
+
* module header for the full rationale and the equivalent SQL.
|
|
50
|
+
*
|
|
51
|
+
* @returns `true` when the report is trustworthy enough to show by default.
|
|
52
|
+
*/
|
|
53
|
+
export function includeInDefaultTrends(report) {
    // Read both axes up front; either container may be absent or null.
    const healthStatus = report.validity?.status;
    const intent = report.provenance?.classification;
    // Primary gate: an unassessed report (no status) is trusted, otherwise
    // only an explicit "ok" passes.
    const failsValidity = healthStatus != null && healthStatus !== "ok";
    // Secondary exclusion: only the explicit test/experimental intents are
    // dropped; a missing classification is kept.
    const excludedByIntent = intent === "test" || intent === "experimental";
    return !(failsValidity || excludedByIntent);
}
|
|
62
|
+
/**
|
|
63
|
+
* GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
|
|
64
|
+
* `ailf.report` document. Drop it into a Studio structure filter with the
|
|
65
|
+
* document-type guard, e.g.
|
|
66
|
+
* `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
|
|
67
|
+
*
|
|
68
|
+
* GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
|
|
69
|
+
* unclassified report passes the intent clause without an explicit
|
|
70
|
+
* `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
|
|
71
|
+
* The `!defined(validity.status)` clause makes the absent-validity case trusted.
|
|
72
|
+
*/
|
|
73
|
+
export const INCLUDE_IN_DEFAULT_TRENDS_GROQ = '(!defined(validity.status) || validity.status == "ok") && !(provenance.classification in ["test", "experimental"])';
|
|
74
|
+
/**
|
|
75
|
+
* SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
|
|
76
|
+
* flattened `ailf.reports` BigQuery row (columns `validity_status`,
|
|
77
|
+
* `classification`). Materialized verbatim as the `include_in_default_trends`
|
|
78
|
+
* column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
|
|
79
|
+
* asserts the view embeds this exact string.
|
|
80
|
+
*
|
|
81
|
+
* Both axes are NULL-safe so the column matches the TS predicate row-for-row:
|
|
82
|
+
* `classification NOT IN (...)` alone is `NULL` for an unclassified row under
|
|
83
|
+
* SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
|
|
84
|
+
* silently dropping pre-taxonomy reports the TS predicate keeps.
|
|
85
|
+
*/
|
|
86
|
+
export const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
|
|
@@ -1564,8 +1564,8 @@ export declare const RepoConfigSchema: z.ZodObject<{
|
|
|
1564
1564
|
summary: z.ZodOptional<z.ZodObject<{
|
|
1565
1565
|
onRun: z.ZodOptional<z.ZodEnum<{
|
|
1566
1566
|
never: "never";
|
|
1567
|
-
always: "always";
|
|
1568
1567
|
auto: "auto";
|
|
1568
|
+
always: "always";
|
|
1569
1569
|
}>>;
|
|
1570
1570
|
}, z.core.$strip>>;
|
|
1571
1571
|
taskSource: z.ZodOptional<z.ZodObject<{
|
package/dist/commands/publish.js
CHANGED
|
@@ -27,6 +27,7 @@ import { addOutputDirOption } from "./shared/options.js";
|
|
|
27
27
|
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
28
28
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
29
29
|
import { generateReportTitle } from "../pipeline/report-title.js";
|
|
30
|
+
import { stampReportValidity } from "../pipeline/report-validity.js";
|
|
30
31
|
import { buildSlimReportSummary } from "../_vendor/ailf-core/index.js";
|
|
31
32
|
import { generateReportId, } from "../report-store.js";
|
|
32
33
|
import { withRetry } from "../sinks/retry.js";
|
|
@@ -214,8 +215,14 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
|
214
215
|
// -----------------------------------------------------------------------
|
|
215
216
|
// 5. Write to Sanity (system of record)
|
|
216
217
|
// -----------------------------------------------------------------------
|
|
218
|
+
// Stamp the data-health validity axis + normalize classification (D0059)
|
|
219
|
+
// — the same server-computed forward guarantee the pipeline write path
|
|
220
|
+
// applies, so reports published via this command carry validity too.
|
|
221
|
+
const stampedReport = stampReportValidity(report, now);
|
|
217
222
|
console.log(" Writing to Sanity Content Lake...");
|
|
218
|
-
const sanityResult = store
|
|
223
|
+
const sanityResult = store
|
|
224
|
+
? await store.write(stampedReport)
|
|
225
|
+
: null;
|
|
219
226
|
if (sanityResult) {
|
|
220
227
|
console.log(` ✅ Report written: ${sanityResult}`);
|
|
221
228
|
}
|
|
@@ -237,7 +244,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
|
237
244
|
console.log();
|
|
238
245
|
console.log(` Delivering to ${sinks.length} sink(s)...`);
|
|
239
246
|
const settled = await Promise.allSettled(sinks.map(async (sink) => {
|
|
240
|
-
const result = await withRetry(() => sink.publish(
|
|
247
|
+
const result = await withRetry(() => sink.publish(stampedReport));
|
|
241
248
|
return { name: sink.name, result };
|
|
242
249
|
}));
|
|
243
250
|
for (const outcome of settled) {
|
|
@@ -16,6 +16,7 @@ import { assoc, buildSlimReportSummary, } from "../../_vendor/ailf-core/index.js
|
|
|
16
16
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
17
17
|
import { buildProvenance, } from "../../pipeline/provenance.js";
|
|
18
18
|
import { generateReportTitle } from "../../pipeline/report-title.js";
|
|
19
|
+
import { stampReportValidity } from "../../pipeline/report-validity.js";
|
|
19
20
|
import { generateReportId } from "../../report-store.js";
|
|
20
21
|
import { withRetry } from "../../sinks/retry.js";
|
|
21
22
|
export class PublishReportStep {
|
|
@@ -145,21 +146,28 @@ export class PublishReportStep {
|
|
|
145
146
|
testResults: slimSummary.testResults.map(slimTestResult),
|
|
146
147
|
};
|
|
147
148
|
}
|
|
149
|
+
// Stamp the data-health `validity` axis (D0059) and normalize
|
|
150
|
+
// `provenance.classification` on the report now that it is fully assembled
|
|
151
|
+
// (degradation + slim summary settled). The verdict is server-computed
|
|
152
|
+
// from the report's own data — never the caller envelope (D0037) — and
|
|
153
|
+
// assessed at the report's completion time. From here on, the stamped
|
|
154
|
+
// report is what reaches the snapshot artifact, the store, and the sinks.
|
|
155
|
+
const stampedReport = stampReportValidity(report, now);
|
|
148
156
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
149
157
|
state.reportId = reportId;
|
|
150
158
|
// W0050 — migrated from ctx.collector.capture to the unified writer.
|
|
151
159
|
// reportSnapshot: full Report JSON for replay (run-scoped, bulk).
|
|
152
|
-
await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx),
|
|
160
|
+
await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), stampedReport);
|
|
153
161
|
// autoComparison: delta vs baseline (run-scoped, bulk, optional).
|
|
154
162
|
if (comparison) {
|
|
155
163
|
await ctx.artifactWriter.emit("autoComparison", assoc(ctx), comparison);
|
|
156
164
|
}
|
|
157
165
|
// Write to store (system of record — best-effort, P5)
|
|
158
166
|
const sanityResult = ctx.reportStore
|
|
159
|
-
? await ctx.reportStore.write(
|
|
167
|
+
? await ctx.reportStore.write(stampedReport)
|
|
160
168
|
: null;
|
|
161
169
|
// Run sinks (fire-and-forget, P6)
|
|
162
|
-
const publishResult = await runSinks(
|
|
170
|
+
const publishResult = await runSinks(stampedReport, ctx);
|
|
163
171
|
// sinkResults: per-sink outcome (run-scoped, per-entry keyed by sink name).
|
|
164
172
|
for (const r of publishResult.sinkResults) {
|
|
165
173
|
await ctx.artifactWriter.emit("sinkResults", assoc(ctx, { name: r.name }), {
|
|
@@ -1479,9 +1479,15 @@ export async function calculateAndWriteScores(options) {
|
|
|
1479
1479
|
logger: log,
|
|
1480
1480
|
});
|
|
1481
1481
|
// Mutate-in-place so subsequent steps (validateGraderJudgmentsCalibration,
|
|
1482
|
-
// persist) see the consensus-merged scores.
|
|
1482
|
+
// persist) see the consensus-merged scores. Snapshot first: the runner's
|
|
1483
|
+
// no-borderline fast path returns the SAME array reference it received,
|
|
1484
|
+
// so `regraded` may alias `judgments`. Truncating `judgments` would then
|
|
1485
|
+
// empty `regraded` before the spread reads it, silently wiping every
|
|
1486
|
+
// judgment (extract N, persist 0) — the divergence the post-persist guard
|
|
1487
|
+
// aborts on. Copying breaks the alias regardless of what the runner returns.
|
|
1488
|
+
const merged = [...regraded];
|
|
1483
1489
|
judgments.length = 0;
|
|
1484
|
-
judgments.push(...
|
|
1490
|
+
judgments.push(...merged);
|
|
1485
1491
|
if (consistencyByJudgment.size > 0) {
|
|
1486
1492
|
log.info(`Borderline consensus merged ${consistencyByJudgment.size} judgment(s)`);
|
|
1487
1493
|
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stampReportValidity — apply the report-trustworthiness detector at write time.
|
|
3
|
+
*
|
|
4
|
+
* The eval write path's forward guarantee (D0059): every newly written report
|
|
5
|
+
* carries a top-level `validity` data-health stamp so the trustworthiness gap
|
|
6
|
+
* cannot recur on new reports. Lives in `pipeline/` (not the orchestration
|
|
7
|
+
* step) so both report-write paths — `PublishReportStep` and the standalone
|
|
8
|
+
* `publish` command — import it without a command→orchestration-step coupling.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
11
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
12
|
+
*/
|
|
13
|
+
import { type Report } from "../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
/**
|
|
15
|
+
* Stamp the data-health `validity` axis (D0059) onto a report and normalize
|
|
16
|
+
* its `provenance.classification` to the canonical spelling.
|
|
17
|
+
*
|
|
18
|
+
* Runs the pure detector (`assessReportValidity`) over the assembled report.
|
|
19
|
+
* `Report` structurally satisfies the detector's `ReportValidityInput`
|
|
20
|
+
* (`provenance` extends `RunContext`; `summary` is a `ReportSummary`), so no
|
|
21
|
+
* adapter is needed. The verdict is **server-computed from the report's own
|
|
22
|
+
* data** (D0037): `assessedAt` is injected by the caller (the report's
|
|
23
|
+
* completion time) and nothing is read from the caller envelope.
|
|
24
|
+
*
|
|
25
|
+
* `classification` is patched only when the detector returns one — it returns
|
|
26
|
+
* `undefined` when the existing value is already canonical and no Tier-1 rule
|
|
27
|
+
* fired, so the patch is idempotent and never clobbers a correct (or
|
|
28
|
+
* human-corrected) value. Tier-2 review flags are not persisted here; the
|
|
29
|
+
* one-shot backfill consumes them. Returns a new report; the input is not
|
|
30
|
+
* mutated.
|
|
31
|
+
*/
|
|
32
|
+
export declare function stampReportValidity(report: Report, assessedAt: string): Report;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stampReportValidity — apply the report-trustworthiness detector at write time.
|
|
3
|
+
*
|
|
4
|
+
* The eval write path's forward guarantee (D0059): every newly written report
|
|
5
|
+
* carries a top-level `validity` data-health stamp so the trustworthiness gap
|
|
6
|
+
* cannot recur on new reports. Lives in `pipeline/` (not the orchestration
|
|
7
|
+
* step) so both report-write paths — `PublishReportStep` and the standalone
|
|
8
|
+
* `publish` command — import it without a command→orchestration-step coupling.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
11
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
12
|
+
*/
|
|
13
|
+
import { assessReportValidity } from "../_vendor/ailf-core/index.js";
|
|
14
|
+
/**
|
|
15
|
+
* Stamp the data-health `validity` axis (D0059) onto a report and normalize
|
|
16
|
+
* its `provenance.classification` to the canonical spelling.
|
|
17
|
+
*
|
|
18
|
+
* Runs the pure detector (`assessReportValidity`) over the assembled report.
|
|
19
|
+
* `Report` structurally satisfies the detector's `ReportValidityInput`
|
|
20
|
+
* (`provenance` extends `RunContext`; `summary` is a `ReportSummary`), so no
|
|
21
|
+
* adapter is needed. The verdict is **server-computed from the report's own
|
|
22
|
+
* data** (D0037): `assessedAt` is injected by the caller (the report's
|
|
23
|
+
* completion time) and nothing is read from the caller envelope.
|
|
24
|
+
*
|
|
25
|
+
* `classification` is patched only when the detector returns one — it returns
|
|
26
|
+
* `undefined` when the existing value is already canonical and no Tier-1 rule
|
|
27
|
+
* fired, so the patch is idempotent and never clobbers a correct (or
|
|
28
|
+
* human-corrected) value. Tier-2 review flags are not persisted here; the
|
|
29
|
+
* one-shot backfill consumes them. Returns a new report; the input is not
|
|
30
|
+
* mutated.
|
|
31
|
+
*/
|
|
32
|
+
export function stampReportValidity(report, assessedAt) {
    // Run the detector over the assembled report; the assessment timestamp is
    // supplied by the caller rather than read from a clock here.
    const assessment = assessReportValidity(report, { assessedAt });
    const { classification, validity } = assessment;
    // Start from the existing provenance and rebuild it only when the detector
    // handed back a classification, so an absent result leaves the original
    // provenance object in place.
    let provenance = report.provenance;
    if (classification) {
        provenance = { ...report.provenance, classification };
    }
    // Shallow copy: the input report object itself is never written to.
    return { ...report, provenance, validity };
}
|
package/dist/report-store.d.ts
CHANGED
package/dist/report-store.js
CHANGED
|
@@ -491,6 +491,7 @@ export function toSanityReportDoc(report) {
|
|
|
491
491
|
},
|
|
492
492
|
tag: report.tag ?? null,
|
|
493
493
|
title: report.title ?? null,
|
|
494
|
+
...(report.validity ? { validity: report.validity } : {}),
|
|
494
495
|
};
|
|
495
496
|
}
|
|
496
497
|
/**
|
|
@@ -534,6 +535,7 @@ export function toReport(doc) {
|
|
|
534
535
|
summary: doc.summary,
|
|
535
536
|
tag: doc.tag,
|
|
536
537
|
title: doc.title,
|
|
538
|
+
validity: doc.validity,
|
|
537
539
|
};
|
|
538
540
|
}
|
|
539
541
|
/**
|