@sanity/ailf 7.2.2 → 7.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +38 -0
- package/config/bigquery/README.md +39 -7
- package/config/bigquery/views/reports.sql +6 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +22 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/report.d.ts +30 -0
- package/dist/_vendor/ailf-core/schemas/report.js +21 -2
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +14 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +4 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.d.ts +116 -0
- package/dist/_vendor/ailf-core/services/report-validity-detector.js +128 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +19 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/report-validity.d.ts +60 -0
- package/dist/_vendor/ailf-core/types/report-validity.js +42 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +4 -3
- package/dist/_vendor/ailf-shared/glossary.d.ts +32 -0
- package/dist/_vendor/ailf-shared/glossary.js +35 -0
- package/dist/_vendor/ailf-shared/index.d.ts +2 -1
- package/dist/_vendor/ailf-shared/index.js +2 -1
- package/dist/_vendor/ailf-shared/run-classification.d.ts +53 -0
- package/dist/_vendor/ailf-shared/run-classification.js +111 -0
- package/dist/_vendor/ailf-shared/trustworthiness.d.ts +97 -0
- package/dist/_vendor/ailf-shared/trustworthiness.js +86 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +8 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +10 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +12 -2
- package/dist/artifact-capture/gcs-artifact-writer.js +18 -0
- package/dist/commands/publish.js +9 -2
- package/dist/orchestration/steps/publish-report-step.js +11 -3
- package/dist/orchestration/steps/run-eval-step.js +56 -3
- package/dist/pipeline/cache-hit-restore.d.ts +37 -1
- package/dist/pipeline/cache-hit-restore.js +108 -1
- package/dist/pipeline/report-validity.d.ts +32 -0
- package/dist/pipeline/report-validity.js +43 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/package.json +1 -1
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report validity (data-health axis)
|
|
3
|
+
*
|
|
4
|
+
* `ReportValidity` is the post-hoc data-health assessment of a published
|
|
5
|
+
* report, orthogonal to `provenance.classification` (run intent, D0037).
|
|
6
|
+
* It is a top-level sibling of the legacy `ReportDegradation` flag, which it
|
|
7
|
+
* subsumes (`degraded:true → validity.status:"degraded"`).
|
|
8
|
+
*
|
|
9
|
+
* Authored independently of the Zod schema so the schema can assert
|
|
10
|
+
* `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
|
|
11
|
+
* (D0045). Populated by the confidence-tiered detector
|
|
12
|
+
* (`W-report-validity-detector`) and the eval write path
|
|
13
|
+
* (`W-stamp-validity-write-path`); gated everywhere by the shared
|
|
14
|
+
* `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
17
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* The validity-status vocabulary. Single `as const` tuple so the runtime Zod
|
|
21
|
+
* `z.enum(...)` and the `ReportValidityStatus` type derive from one source —
|
|
22
|
+
* the same drift-proofing the `DEGRADED_ENRICHMENT_FIELDS` tuple uses.
|
|
23
|
+
*
|
|
24
|
+
* - `ok` — trustworthy; included in default trends.
|
|
25
|
+
* - `degraded` — enrichment/grading failed (subsumes the legacy `degraded`
|
|
26
|
+
* flag).
|
|
27
|
+
* - `incomplete` — expected grains genuinely missing, after the report-shape
|
|
28
|
+
* caveat is accounted for (see design doc).
|
|
29
|
+
* - `suspect` — passed structural checks but flagged for review (anomaly or
|
|
30
|
+
* ambiguous heuristic).
|
|
31
|
+
*/
|
|
32
|
+
export declare const REPORT_VALIDITY_STATUSES: readonly ["ok", "degraded", "incomplete", "suspect"];
|
|
33
|
+
export type ReportValidityStatus = (typeof REPORT_VALIDITY_STATUSES)[number];
|
|
34
|
+
/**
|
|
35
|
+
* How the validity verdict was reached. A `"manual"` verdict is authoritative
|
|
36
|
+
* — re-running the detector never overwrites it (the detector only replaces
|
|
37
|
+
* verdicts whose method is `"auto"`).
|
|
38
|
+
*/
|
|
39
|
+
export type ReportValidityMethod = "auto" | "manual";
|
|
40
|
+
/**
|
|
41
|
+
* Post-hoc data-health assessment of a published report.
|
|
42
|
+
*
|
|
43
|
+
* Top-level on the `Report` (a judgment about the report's *data*), NOT under
|
|
44
|
+
* `provenance` (which records run *intent*). Additive and nullable: pre-stamp
|
|
45
|
+
* reports have no `validity` field and are treated as trustworthy until backfilled.
|
|
46
|
+
*/
|
|
47
|
+
export interface ReportValidity {
|
|
48
|
+
/** Data-health verdict. */
|
|
49
|
+
status: ReportValidityStatus;
|
|
50
|
+
/** Which detector rules fired — the audit trail behind `status`. */
|
|
51
|
+
reasons: string[];
|
|
52
|
+
/** Whether an automated rule or a human produced this verdict. */
|
|
53
|
+
method: ReportValidityMethod;
|
|
54
|
+
/** Detector ruleset version, so re-assessments are comparable. */
|
|
55
|
+
rulesetVersion: string;
|
|
56
|
+
/** When the verdict was produced (ISO 8601 UTC). */
|
|
57
|
+
assessedAt: string;
|
|
58
|
+
}
|
|
59
|
+
/** Type guard for {@link ReportValidityStatus}. */
|
|
60
|
+
export declare function isReportValidityStatus(value: unknown): value is ReportValidityStatus;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — Report validity (data-health axis)
|
|
3
|
+
*
|
|
4
|
+
* `ReportValidity` is the post-hoc data-health assessment of a published
|
|
5
|
+
* report, orthogonal to `provenance.classification` (run intent, D0037).
|
|
6
|
+
* It is a top-level sibling of the legacy `ReportDegradation` flag, which it
|
|
7
|
+
* subsumes (`degraded:true → validity.status:"degraded"`).
|
|
8
|
+
*
|
|
9
|
+
* Authored independently of the Zod schema so the schema can assert
|
|
10
|
+
* `satisfies z.ZodType<ReportValidity>` and turn drift into a build error
|
|
11
|
+
* (D0045). Populated by the confidence-tiered detector
|
|
12
|
+
* (`W-report-validity-detector`) and the eval write path
|
|
13
|
+
* (`W-stamp-validity-write-path`); gated everywhere by the shared
|
|
14
|
+
* `includeInDefaultTrends` predicate (`W-trustworthiness-predicate`).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
17
|
+
* @see docs/design-docs/report-trustworthiness-model.md
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* The validity-status vocabulary. Single `as const` tuple so the runtime Zod
|
|
21
|
+
* `z.enum(...)` and the `ReportValidityStatus` type derive from one source —
|
|
22
|
+
* the same drift-proofing the `DEGRADED_ENRICHMENT_FIELDS` tuple uses.
|
|
23
|
+
*
|
|
24
|
+
* - `ok` — trustworthy; included in default trends.
|
|
25
|
+
* - `degraded` — enrichment/grading failed (subsumes the legacy `degraded`
|
|
26
|
+
* flag).
|
|
27
|
+
* - `incomplete` — expected grains genuinely missing, after the report-shape
|
|
28
|
+
* caveat is accounted for (see design doc).
|
|
29
|
+
* - `suspect` — passed structural checks but flagged for review (anomaly or
|
|
30
|
+
* ambiguous heuristic).
|
|
31
|
+
*/
|
|
32
|
+
export const REPORT_VALIDITY_STATUSES = [
|
|
33
|
+
"ok",
|
|
34
|
+
"degraded",
|
|
35
|
+
"incomplete",
|
|
36
|
+
"suspect",
|
|
37
|
+
];
|
|
38
|
+
/** Type guard for {@link ReportValidityStatus}. */
|
|
39
|
+
export function isReportValidityStatus(value) {
|
|
40
|
+
return (typeof value === "string" &&
|
|
41
|
+
REPORT_VALIDITY_STATUSES.includes(value));
|
|
42
|
+
}
|
|
@@ -78,11 +78,12 @@ export const HELP_TOPICS = [
|
|
|
78
78
|
},
|
|
79
79
|
{
|
|
80
80
|
"id": "reading-score-trends",
|
|
81
|
-
"title": "Reading
|
|
82
|
-
"body": "## What
|
|
81
|
+
"title": "Reading the Analytics View",
|
|
82
|
+
"body": "## What this view answers\n\nThe Analytics view is built around one question: **did your doc changes move the\nscore, and why?** Rather than open on a chart and leave you to find the story,\nit leads with the answer — a plain-language verdict and the areas that moved\nmost — then lets you drill down into the evidence.\n\n## The control bar\n\nThe top row picks what you're looking at:\n\n- **Metric** — which number to track (composite score, doc lift, retrieval gap,\n and so on).\n- **Break down by** — how to split it (feature area, team, model, source).\n- **Bucket** — how to group runs over time (per run, per day).\n- **Range** — how far back to look (for example, the last 30 days).\n\nThe second row holds the active **filter chips** — use _Add filter_ to scope to\na team, source, or mode — and a scope hint (reports in scope vs. total). Every\nknob and filter is saved in the URL, so a shared link reproduces exactly what\nyou see. Use **Copy link** to grab it.\n\n## Overall — the read\n\nThe **verdict strip** is the headline. In plain language it says whether docs\nare pulling ahead or slipping, and shows the headline metric with its change (Δ)\nsince the start of the range, a model → agent → docs decomposition bar, and a\ncoverage cell (how many reports and high-confidence groups are in scope).\n\n## Movers\n\nThe **movers board** leads with the top **Improved** and **Regressed** areas as\ncards — not the average. Each card shows the area, its value and Δ, a\ndecomposition bar, the release that most likely caused the move, and a\nconfidence read. A low-confidence **watch** callout flags big swings backed by\ntoo few runs: watch them, don't celebrate them yet.\n\nClick a mover card to reveal and decompose that series in the evidence chart.\n\n## The evidence\n\nThe **focus chart** has two modes:\n\n- **Compare** plots the selected series over time. 
It defaults to a focused set\n (the movers plus the highest-volume areas) with a _show all_ expansion, and\n draws release markers inline.\n- **Decompose** shows the ceiling / floor / actual band for a single series,\n with causal story cards anchored to each release marker (for example, _\"Docs\n +3 ~5 −1 → doc-lift +8 measured around this release\"_).\n\nDecompose is offered for the composite metric broken down by feature area — the\ncase where the model → agent → docs story is meaningful.\n\n## The breakdown table\n\nOne row per area (or per whatever you broke down by), each with an inline\ndecomposition bar, a sparkline, confidence, Δ, \"docs add,\" and a report count.\nSort any column, and click a row to cross-highlight it in the chart. Export the\ntable to CSV.\n\n## Meaningful change vs. noise\n\nSmall movements between runs are normal — they come from model non-determinism\nand grader variance. This view leans on **confidence** (how many runs back a\nnumber) and the **movers ranking** rather than a single ±point threshold: trust\na sustained move in a high-confidence area over a large swing in a\nlow-confidence one. The low-confidence watch exists precisely to stop you\nover-reading thin data.\n\n## Measured, not invented\n\nThe causal story is computed from real data, never fabricated. Release markers\ncome from the doc-change counts already recorded in each report, and the\n\"measured around this release\" doc-lift effect is derived from the real ceiling\n− floor series around the marker. Per-area prose (\"the editor API changed\") is\nintentionally not shown — the data carries change counts, not hand-written\nexplanations.",
|
|
83
83
|
"source": "docs/help/reading-score-trends.md",
|
|
84
84
|
"related": [
|
|
85
85
|
"scoring-model",
|
|
86
|
+
"doc-lift",
|
|
86
87
|
"comparing-runs"
|
|
87
88
|
]
|
|
88
89
|
},
|
|
@@ -142,7 +143,7 @@ export const HELP_TOPICS = [
|
|
|
142
143
|
{
|
|
143
144
|
"id": "glossary",
|
|
144
145
|
"title": "Glossary",
|
|
145
|
-
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: ⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0–100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0–100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Δ**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Δ**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap Δ**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Δ**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.\n\n**Needs Attention (70–79)**\n: Feature areas scoring 70–79. 
These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. 
Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion Δ**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness Δ**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage Δ**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Δ**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode — a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
146
|
+
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: ⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0–100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0–100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Δ**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Δ**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap Δ**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Δ**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.\n\n**Needs Attention (70–79)**\n: Feature areas scoring 70–79. 
These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. 
Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion Δ**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness Δ**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage Δ**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Δ**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode — a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.\n\n**Gold**\n: Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.\n\n**Baseline**\n: Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.\n\n**Naive**\n: Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.\n\n**Optimized**\n: Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.\n\n**Normal**\n: Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.\n\n**Fail**\n: Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.\n\n**Low dim**\n: Low dim — the run produced output but at least one grading dimension scored below 60.\n\n**OK**\n: OK — the run produced output and every grading dimension scored 60 or above.",
|
|
146
147
|
"source": "packages/shared/src/glossary.ts",
|
|
147
148
|
"tags": [
|
|
148
149
|
"reference",
|
|
@@ -314,5 +314,37 @@ export declare const GLOSSARY: {
|
|
|
314
314
|
readonly label: "Cross-Repo";
|
|
315
315
|
readonly long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
|
|
316
316
|
};
|
|
317
|
+
readonly variantGold: {
|
|
318
|
+
readonly label: "Gold";
|
|
319
|
+
readonly long: "Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.";
|
|
320
|
+
};
|
|
321
|
+
readonly variantBaseline: {
|
|
322
|
+
readonly label: "Baseline";
|
|
323
|
+
readonly long: "Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.";
|
|
324
|
+
};
|
|
325
|
+
readonly engineNaive: {
|
|
326
|
+
readonly label: "Naive";
|
|
327
|
+
readonly long: "Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.";
|
|
328
|
+
};
|
|
329
|
+
readonly engineOptimized: {
|
|
330
|
+
readonly label: "Optimized";
|
|
331
|
+
readonly long: "Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.";
|
|
332
|
+
};
|
|
333
|
+
readonly engineNormal: {
|
|
334
|
+
readonly label: "Normal";
|
|
335
|
+
readonly long: "Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.";
|
|
336
|
+
};
|
|
337
|
+
readonly statusFail: {
|
|
338
|
+
readonly label: "Fail";
|
|
339
|
+
readonly long: "Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.";
|
|
340
|
+
};
|
|
341
|
+
readonly statusLowDim: {
|
|
342
|
+
readonly label: "Low dim";
|
|
343
|
+
readonly long: "Low dim — the run produced output but at least one grading dimension scored below 60.";
|
|
344
|
+
};
|
|
345
|
+
readonly statusOk: {
|
|
346
|
+
readonly label: "OK";
|
|
347
|
+
readonly long: "OK — the run produced output and every grading dimension scored 60 or above.";
|
|
348
|
+
};
|
|
317
349
|
};
|
|
318
350
|
export type GlossarySlug = keyof typeof GLOSSARY;
|
|
@@ -327,4 +327,39 @@ export const GLOSSARY = {
|
|
|
327
327
|
label: "Cross-Repo",
|
|
328
328
|
long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
329
329
|
},
|
|
330
|
+
// -- Variant values (per-test docs condition) ------------------------------
|
|
331
|
+
variantGold: {
|
|
332
|
+
label: "Gold",
|
|
333
|
+
long: "Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.",
|
|
334
|
+
},
|
|
335
|
+
variantBaseline: {
|
|
336
|
+
label: "Baseline",
|
|
337
|
+
long: "Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.",
|
|
338
|
+
},
|
|
339
|
+
// -- Execution mode values (how the model was driven) ----------------------
|
|
340
|
+
engineNaive: {
|
|
341
|
+
label: "Naive",
|
|
342
|
+
long: "Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.",
|
|
343
|
+
},
|
|
344
|
+
engineOptimized: {
|
|
345
|
+
label: "Optimized",
|
|
346
|
+
long: "Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.",
|
|
347
|
+
},
|
|
348
|
+
engineNormal: {
|
|
349
|
+
label: "Normal",
|
|
350
|
+
long: "Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.",
|
|
351
|
+
},
|
|
352
|
+
// -- Status values (per-test outcome) --------------------------------------
|
|
353
|
+
statusFail: {
|
|
354
|
+
label: "Fail",
|
|
355
|
+
long: "Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.",
|
|
356
|
+
},
|
|
357
|
+
statusLowDim: {
|
|
358
|
+
label: "Low dim",
|
|
359
|
+
long: "Low dim — the run produced output but at least one grading dimension scored below 60.",
|
|
360
|
+
},
|
|
361
|
+
statusOk: {
|
|
362
|
+
label: "OK",
|
|
363
|
+
long: "OK — the run produced output and every grading dimension scored 60 or above.",
|
|
364
|
+
},
|
|
330
365
|
};
|
|
@@ -31,6 +31,7 @@ export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.j
|
|
|
31
31
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
32
32
|
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
33
33
|
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, type SlugLike, } from "./owner-teams.js";
|
|
34
|
-
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
34
|
+
export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
35
35
|
export { type RunTrigger } from "./run-trigger.js";
|
|
36
36
|
export { type RunContext } from "./run-context.js";
|
|
37
|
+
export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, type TrustGateReport, } from "./trustworthiness.js";
|
|
@@ -30,4 +30,5 @@ export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
|
|
|
30
30
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
31
31
|
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
32
32
|
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, } from "./owner-teams.js";
|
|
33
|
-
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
33
|
+
export { canonicalizeExecutorIdentity, isKnownExecutorIdentity, isRunClassification, looksLikeGeneratedExecutorId, normalizeRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
34
|
+
export { includeInDefaultTrends, INCLUDE_IN_DEFAULT_TRENDS_GROQ, INCLUDE_IN_DEFAULT_TRENDS_SQL, } from "./trustworthiness.js";
|
|
@@ -19,6 +19,59 @@
|
|
|
19
19
|
export type RunClassification = "official" | "adhoc" | "experimental" | "test" | "external";
|
|
20
20
|
export declare const RUN_CLASSIFICATIONS: readonly RunClassification[];
|
|
21
21
|
export declare function isRunClassification(value: unknown): value is RunClassification;
|
|
22
|
+
/**
|
|
23
|
+
* Normalize a free-form classification value to a canonical
|
|
24
|
+
* {@link RunClassification}.
|
|
25
|
+
*
|
|
26
|
+
* - Trims and lowercases.
|
|
27
|
+
* - Maps the legacy `ad-hoc` spelling onto canonical `adhoc`.
|
|
28
|
+
* - Defaults empty / unknown input to `adhoc` — D0037's documented
|
|
29
|
+
* default bucket, biased away from the canonical `official` series.
|
|
30
|
+
*
|
|
31
|
+
* Pure and deterministic — reused by the detector (`W-report-validity-detector`)
|
|
32
|
+
* and the backfill (`W-backfill-report-validity`).
|
|
33
|
+
*
|
|
34
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
35
|
+
*/
|
|
36
|
+
export declare function normalizeRunClassification(value: string | undefined | null): RunClassification;
|
|
37
|
+
/**
|
|
38
|
+
* Collapse a free-form executor name onto its canonical identity slug.
|
|
39
|
+
*
|
|
40
|
+
* - Trims and lowercases.
|
|
41
|
+
* - Maps known alias spellings (e.g. `Gabe Wyatt` / `GabeStah` / `gabewyatt`) to one identity.
|
|
42
|
+
* - Passes unknown names through (trimmed + lowercased).
|
|
43
|
+
* - Returns `undefined` for empty / nullish input.
|
|
44
|
+
*
|
|
45
|
+
* Pure and deterministic — used by the validity detector
|
|
46
|
+
* (`W-report-validity-detector`) to recognize a known human before the
|
|
47
|
+
* generated-id heuristic runs, and by the backfill to de-drift
|
|
48
|
+
* `provenance.executor.name`.
|
|
49
|
+
*
|
|
50
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
51
|
+
*/
|
|
52
|
+
export declare function canonicalizeExecutorIdentity(name: string | undefined | null): string | undefined;
|
|
53
|
+
/** Whether an executor name collapses to a recognized human identity. */
|
|
54
|
+
export declare function isKnownExecutorIdentity(name: string | undefined | null): boolean;
|
|
55
|
+
/**
|
|
56
|
+
* Heuristic: does an executor name look like a *generated* handle/id rather
|
|
57
|
+
* than a human name? (D0059 §Context flagged ids like `gDVzuuHam`,
|
|
58
|
+
* `gL78msEDh` in the report store.)
|
|
59
|
+
*
|
|
60
|
+
* Deterministic and deliberately conservative — it judges *shape* only:
|
|
61
|
+
* a single token (no whitespace) of length 7–12, alphanumeric, mixing
|
|
62
|
+
* upper- and lower-case, and either containing a digit or showing ≥4
|
|
63
|
+
* upper/lower transitions. The transition floor is calibrated against the
|
|
64
|
+
* observed sample so two-word PascalCase names ("JohnSmith" — 3
|
|
65
|
+
* transitions) are NOT flagged; the generated ids (≥4 transitions or a
|
|
66
|
+
* digit) are. Known identities are excluded by the caller
|
|
67
|
+
* ({@link isKnownExecutorIdentity}) before this runs, so collapsed
|
|
68
|
+
* spellings like `GabeStah` never reach it as a positive.
|
|
69
|
+
*
|
|
70
|
+
* False positives are tolerable: the detector only uses this to propose an
|
|
71
|
+
* `experimental` classification, which is reversible (label-and-exclude,
|
|
72
|
+
* never delete) and surfaced for human review during the backfill.
|
|
73
|
+
*/
|
|
74
|
+
export declare function looksLikeGeneratedExecutorId(name: string | undefined | null): boolean;
|
|
22
75
|
/**
|
|
23
76
|
* Attribution — which team and (optionally) individual the run *belongs to*.
|
|
24
77
|
*
|
|
@@ -21,6 +21,117 @@ export function isRunClassification(value) {
|
|
|
21
21
|
return (typeof value === "string" &&
|
|
22
22
|
RUN_CLASSIFICATIONS.includes(value));
|
|
23
23
|
}
|
|
24
|
+
/**
 * Legacy classification spellings, keyed by their lowercased form, mapped to
 * the canonical {@link RunClassification} value. Historical report data still
 * carries the hyphenated `ad-hoc` spelling (D0059 §Context) even though the
 * type itself is canonical `adhoc`. Add entries only for drift that has
 * actually been observed.
 */
const RUN_CLASSIFICATION_ALIASES = {
    "ad-hoc": "adhoc",
};
/**
 * Coerce a free-form classification string into a canonical
 * {@link RunClassification}.
 *
 * Steps, in order:
 * 1. Nullish / empty / whitespace-only input resolves to `adhoc`.
 * 2. The value is trimmed and lowercased.
 * 3. A legacy spelling (`ad-hoc`) is rewritten to its canonical form.
 * 4. Anything that is still not a recognized classification resolves to
 *    `adhoc` — D0037's documented default bucket, biased away from the
 *    canonical `official` series.
 *
 * Pure and deterministic — shared by the detector
 * (`W-report-validity-detector`) and the backfill
 * (`W-backfill-report-validity`).
 *
 * @param value Raw classification as stored on a report; may be nullish.
 * @returns A canonical classification; never throws for string input.
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function normalizeRunClassification(value) {
    const key = value ? value.trim().toLowerCase() : "";
    if (key === "")
        return "adhoc";
    const aliased = RUN_CLASSIFICATION_ALIASES[key];
    const candidate = aliased ?? key;
    if (isRunClassification(candidate))
        return candidate;
    return "adhoc";
}
|
|
56
|
+
/**
 * Lowercased executor-name spelling → canonical identity slug. One human
 * appears under several spellings in the historical report store
 * (D0059 §Context: `Gabe Wyatt` / `GabeStah` / `gabewyatt`); collapsing
 * them lets attribution and `classification` queries treat them as one
 * person. Only observed drift belongs here — unknown names pass through.
 */
const EXECUTOR_IDENTITY_ALIASES = {
    "gabe wyatt": "gabe-wyatt",
    gabestah: "gabe-wyatt",
    gabewyatt: "gabe-wyatt",
};
/**
 * Resolve an already-normalized (trimmed + lowercased) key against the alias
 * table, consulting OWN properties only.
 *
 * A plain object literal inherits from `Object.prototype`, so a bare
 * `table[key]` or `key in table` also matches inherited members such as
 * `constructor` and `__proto__`. Executor names are free-form input, so those
 * keys must be treated as ordinary unknown names.
 *
 * @param key Normalized executor name.
 * @returns The canonical identity slug, or `undefined` when the key is not a
 *   registered alias.
 */
function lookupExecutorAlias(key) {
    return Object.prototype.hasOwnProperty.call(EXECUTOR_IDENTITY_ALIASES, key)
        ? EXECUTOR_IDENTITY_ALIASES[key]
        : undefined;
}
/**
 * Collapse a free-form executor name onto its canonical identity slug.
 *
 * - Trims and lowercases.
 * - Maps known spellings (the alias table above) to one identity.
 * - Passes unknown names through (trimmed + lowercased).
 * - Returns `undefined` for empty / nullish / whitespace-only input.
 *
 * Pure and deterministic — used by the validity detector
 * (`W-report-validity-detector`) to recognize a known human before the
 * generated-id heuristic runs, and by the backfill to de-drift
 * `provenance.executor.name`.
 *
 * @param name Raw executor name; may be nullish.
 * @returns The canonical slug, the normalized name, or `undefined`.
 * @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
 */
export function canonicalizeExecutorIdentity(name) {
    if (!name)
        return undefined;
    const trimmed = name.trim().toLowerCase();
    if (!trimmed)
        return undefined;
    return lookupExecutorAlias(trimmed) ?? trimmed;
}
/**
 * Whether an executor name collapses to a recognized human identity.
 *
 * @param name Raw executor name; may be nullish.
 * @returns `true` only when the normalized name is a registered alias.
 */
export function isKnownExecutorIdentity(name) {
    if (!name)
        return false;
    return lookupExecutorAlias(name.trim().toLowerCase()) !== undefined;
}
|
|
97
|
+
/**
 * Heuristic: does an executor name look like a *generated* handle/id rather
 * than a human name? (D0059 §Context flagged ids like `gDVzuuHam`,
 * `gL78msEDh` in the report store.)
 *
 * Deterministic and deliberately conservative — it judges *shape* only. After
 * trimming, the name is flagged when ALL of the following hold:
 *
 * - it is a single token of length 7–12;
 * - it is purely ASCII alphanumeric;
 * - it mixes upper- and lower-case letters;
 * - it either contains a digit or shows ≥4 upper/lower transitions.
 *
 * The transition floor is calibrated against the observed sample so two-word
 * PascalCase names ("JohnSmith" — 3 transitions) are NOT flagged; the
 * generated ids (≥4 transitions or a digit) are. Known identities are
 * excluded by the caller ({@link isKnownExecutorIdentity}) before this runs.
 *
 * False positives are tolerable: the detector only uses this to propose an
 * `experimental` classification, which is reversible (label-and-exclude,
 * never delete) and surfaced for human review during the backfill.
 *
 * @param name Raw executor name; may be nullish.
 * @returns `true` when the name has the shape of a generated id.
 */
export function looksLikeGeneratedExecutorId(name) {
    if (!name)
        return false;
    const token = name.trim();
    if (token.length < 7 || token.length > 12)
        return false;
    if (!/^[A-Za-z0-9]+$/.test(token))
        return false;
    if (!/[A-Z]/.test(token) || !/[a-z]/.test(token))
        return false;
    if (/[0-9]/.test(token))
        return true;
    // From here the token is ASCII letters only, so "not upper" means "lower".
    // Classify each character once by char code and carry the previous result
    // forward instead of running two regex tests per loop iteration.
    const isUpperAt = (index) => {
        const code = token.charCodeAt(index);
        return code >= 65 && code <= 90; // 'A'..'Z'
    };
    let transitions = 0;
    let previousUpper = isUpperAt(0);
    for (let i = 1; i < token.length; i++) {
        const currentUpper = isUpperAt(i);
        if (currentUpper !== previousUpper)
            transitions++;
        previousUpper = currentUpper;
    }
    return transitions >= 4;
}
|
|
24
135
|
export const RUN_EXECUTOR_SURFACES = [
|
|
25
136
|
"cli",
|
|
26
137
|
"studio",
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trustworthiness.ts — The single trust gate for reports (D0059).
|
|
3
|
+
*
|
|
4
|
+
* `includeInDefaultTrends` is the one definition of "show this report by
|
|
5
|
+
* default." Every surface (dashboard analytics, Studio presets, the BigQuery
|
|
6
|
+
* `reports.sql` view) references this predicate so the gate cannot drift
|
|
7
|
+
* between consumers.
|
|
8
|
+
*
|
|
9
|
+
* Two orthogonal axes decide inclusion:
|
|
10
|
+
*
|
|
11
|
+
* - **Validity (data health, D0059)** — the *primary* gate. A report is
|
|
12
|
+
* included only when its `validity.status` is `ok` OR validity is absent
|
|
13
|
+
* (pre-stamp reads are trusted until backfilled — the rollout is additive
|
|
14
|
+
* and nullable). Any non-`ok` status (`degraded` / `incomplete` /
|
|
15
|
+
* `suspect`) excludes the report regardless of intent.
|
|
16
|
+
* - **Intent (run classification, D0037)** — a *secondary* exclusion. The
|
|
17
|
+
* explicit `test` and `experimental` classifications are dropped;
|
|
18
|
+
* `adhoc` / `official` / `external` (and a missing classification) are kept.
|
|
19
|
+
* `adhoc` is intentionally included — it holds real production one-offs;
|
|
20
|
+
* the validity gate, not the intent gate, removes the bad ones inside it.
|
|
21
|
+
*
|
|
22
|
+
* We model a slim subset of the core `Report` shape (the two read axes) rather
|
|
23
|
+
* than importing `Report` / `ReportValidity` from `@sanity/ailf-core`: this
|
|
24
|
+
* package is the dependency-graph leaf and imports nothing from core. A full
|
|
25
|
+
* core `Report` is structurally assignable to {@link TrustGateReport}.
|
|
26
|
+
*
|
|
27
|
+
* The predicate is total — it never throws — and is kept trivially
|
|
28
|
+
* translatable to the two query-language forms it is materialized as on the
|
|
29
|
+
* other surfaces (`W-studio-bigquery-validity`): the GROQ filter behind the
|
|
30
|
+
* Studio "Trustworthy" preset ({@link INCLUDE_IN_DEFAULT_TRENDS_GROQ}) and the
|
|
31
|
+
* SQL boolean in the BigQuery `reports.sql` view
|
|
32
|
+
* ({@link INCLUDE_IN_DEFAULT_TRENDS_SQL}). Those constants live here, beside the
|
|
33
|
+
* function, so the one gate cannot drift between consumers; a cross-check test
|
|
34
|
+
* asserts all three forms agree across the full truth table.
|
|
35
|
+
*
|
|
36
|
+
* Note the SQL form is NULL-safe on *both* axes: a bare
|
|
37
|
+
* `classification NOT IN ('test','experimental')` would evaluate to `NULL`
|
|
38
|
+
* (not `TRUE`) for an unclassified row under SQL three-valued logic, silently
|
|
39
|
+
* excluding pre-taxonomy reports the TS predicate keeps — hence the explicit
|
|
40
|
+
* `classification IS NULL OR …`.
|
|
41
|
+
*
|
|
42
|
+
* @see docs/decisions/D0059-report-validity-axis-and-trustworthiness-gate.md
|
|
43
|
+
* @see docs/design-docs/report-trustworthiness-model.md — §Decision/3
|
|
44
|
+
*/
|
|
45
|
+
import type { RunClassification } from "./run-classification.js";
|
|
46
|
+
/**
|
|
47
|
+
* Slim subset of a core `Report` — only the two axes the trust gate reads.
|
|
48
|
+
*
|
|
49
|
+
* `validity.status` is typed as a bare `string` (not core's
|
|
50
|
+
* `ReportValidityStatus`) so this leaf package imports nothing from
|
|
51
|
+
* `@sanity/ailf-core`; the predicate only distinguishes `"ok"` from
|
|
52
|
+
* everything else. `validity` absent/`null` ⇒ pre-stamp read ⇒ trusted.
|
|
53
|
+
*/
|
|
54
|
+
export interface TrustGateReport {
|
|
55
|
+
/** Data-health axis (D0059), top-level on the report. */
|
|
56
|
+
validity?: {
|
|
57
|
+
status: string;
|
|
58
|
+
} | null;
|
|
59
|
+
/** Run-intent axis (D0037), under provenance. */
|
|
60
|
+
provenance?: {
|
|
61
|
+
classification?: RunClassification | null;
|
|
62
|
+
} | null;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Whether a report should appear in default trend views.
|
|
66
|
+
*
|
|
67
|
+
* Validity is the primary gate; intent is a secondary exclusion. See the
|
|
68
|
+
* module header for the full rationale and the equivalent SQL.
|
|
69
|
+
*
|
|
70
|
+
* @returns `true` when the report is trustworthy enough to show by default.
|
|
71
|
+
*/
|
|
72
|
+
export declare function includeInDefaultTrends(report: TrustGateReport): boolean;
|
|
73
|
+
/**
|
|
74
|
+
* GROQ form of {@link includeInDefaultTrends}, as a boolean expression over an
|
|
75
|
+
* `ailf.report` document. Drop it into a Studio structure filter with the
|
|
76
|
+
* document-type guard, e.g.
|
|
77
|
+
* `` `_type == "ailf.report" && ${INCLUDE_IN_DEFAULT_TRENDS_GROQ}` ``.
|
|
78
|
+
*
|
|
79
|
+
* GROQ's `in` returns `false` (not `null`) for an absent left operand, so an
|
|
80
|
+
* unclassified report passes the intent clause without an explicit
|
|
81
|
+
* `defined(...)` guard — matching the TS predicate's "missing ⇒ kept" rule.
|
|
82
|
+
* The `!defined(validity.status)` clause makes the absent-validity case trusted.
|
|
83
|
+
*/
|
|
84
|
+
export declare const INCLUDE_IN_DEFAULT_TRENDS_GROQ = "(!defined(validity.status) || validity.status == \"ok\") && !(provenance.classification in [\"test\", \"experimental\"])";
|
|
85
|
+
/**
|
|
86
|
+
* SQL form of {@link includeInDefaultTrends}, as a boolean expression over the
|
|
87
|
+
* flattened `ailf.reports` BigQuery row (columns `validity_status`,
|
|
88
|
+
* `classification`). Materialized verbatim as the `include_in_default_trends`
|
|
89
|
+
* column in `packages/eval/config/bigquery/views/reports.sql`; an eval test
|
|
90
|
+
* asserts the view embeds this exact string.
|
|
91
|
+
*
|
|
92
|
+
* Both axes are NULL-safe so the column matches the TS predicate row-for-row:
|
|
93
|
+
* `classification NOT IN (...)` alone is `NULL` for an unclassified row under
|
|
94
|
+
* SQL three-valued logic, which a `WHERE`/boolean context treats as `FALSE` —
|
|
95
|
+
* silently dropping pre-taxonomy reports the TS predicate keeps.
|
|
96
|
+
*/
|
|
97
|
+
export declare const INCLUDE_IN_DEFAULT_TRENDS_SQL = "(validity_status IS NULL OR validity_status = 'ok') AND (classification IS NULL OR classification NOT IN ('test', 'experimental'))";
|