@sanity/ailf-studio 2.2.1 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +44 -8
- package/dist/index.js +330 -30
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -119,10 +119,11 @@ declare const RunTaskEvaluationAction: DocumentActionComponent;
|
|
|
119
119
|
* ```
|
|
120
120
|
*
|
|
121
121
|
* `ailfStructure` renders filtered entries for `ailf.task`, `ailf.team`,
|
|
122
|
-
* `ailf.featureArea`, and `ailf.report` in place of their
|
|
123
|
-
* list items, plus a top-level diagnostic for teams whose
|
|
124
|
-
* reference unknown event-type strings. Other document types
|
|
125
|
-
* at their Studio default via
|
|
122
|
+
* `ailf.user`, `ailf.featureArea`, and `ailf.report` in place of their
|
|
123
|
+
* default Studio list items, plus a top-level diagnostic for teams whose
|
|
124
|
+
* channel events reference unknown event-type strings. Other document types
|
|
125
|
+
* are preserved at their Studio default via
|
|
126
|
+
* `S.documentTypeListItems().filter(...)`.
|
|
126
127
|
* Consumers who already maintain a custom structure can splice individual
|
|
127
128
|
* helpers in via the per-type exports.
|
|
128
129
|
*/
|
|
@@ -135,9 +136,10 @@ declare const RunTaskEvaluationAction: DocumentActionComponent;
|
|
|
135
136
|
declare function ailfTaskStructureItem(S: StructureBuilder): ReturnType<StructureBuilder["listItem"]>;
|
|
136
137
|
/**
|
|
137
138
|
* Full structure resolver that replaces the default entries for
|
|
138
|
-
* `ailf.task`, `ailf.team`, `ailf.
|
|
139
|
-
* filtered AILF views, exposes a top-level
|
|
140
|
-
* and keeps every other document type list item
|
|
139
|
+
* `ailf.task`, `ailf.team`, `ailf.user`, `ailf.featureArea`, and
|
|
140
|
+
* `ailf.report` with filtered AILF views, exposes a top-level
|
|
141
|
+
* unknown-events diagnostic, and keeps every other document type list item
|
|
142
|
+
* at its Studio default.
|
|
141
143
|
*/
|
|
142
144
|
declare const ailfStructure: StructureResolver;
|
|
143
145
|
|
|
@@ -512,6 +514,7 @@ declare const featureAreaSchema: {
|
|
|
512
514
|
preview?: sanity.PreviewConfig<{
|
|
513
515
|
areaId: string;
|
|
514
516
|
description: string;
|
|
517
|
+
team: string;
|
|
515
518
|
}, Record<string, unknown>> | undefined;
|
|
516
519
|
};
|
|
517
520
|
|
|
@@ -639,6 +642,39 @@ declare const teamSchema: {
|
|
|
639
642
|
}, Record<string, unknown>> | undefined;
|
|
640
643
|
};
|
|
641
644
|
|
|
645
|
+
/**
|
|
646
|
+
* schema/user.ts
|
|
647
|
+
*
|
|
648
|
+
* Sanity document schema for `ailf.user` — a per-account user document that
|
|
649
|
+
* stores self-declared team affiliation (references to `ailf.team`) plus UI
|
|
650
|
+
* preferences, and is the primary source for dashboard personalization.
|
|
651
|
+
*
|
|
652
|
+
* This schema is a deliberate **subset** of the canonical `AilfUser` domain
|
|
653
|
+
* type (`packages/core/src/types/user.ts`), per D0045 (TS-first domain type →
|
|
654
|
+
* Zod `satisfies` → Sanity-as-subset; Studio schemas are hand-authored and
|
|
655
|
+
* never TypeGen-inverted).
|
|
656
|
+
*
|
|
657
|
+
* Every field is `readOnly: true`. The dashboard App SDK is the write surface
|
|
658
|
+
* (lazy `createIfNotExists` + patch via `useApplyDocumentActions`); Studio is
|
|
659
|
+
* for operator inspection/triage only. `readOnly` is a Studio-UI concern and
|
|
660
|
+
* does not affect the dashboard's programmatic writes.
|
|
661
|
+
*
|
|
662
|
+
* Relationship to D0055: `ailf.user` is the user-side identity/affiliation axis
|
|
663
|
+
* deferred by the team-entity work — distinct from `area.team` ownership and
|
|
664
|
+
* `team.notifications[].scope` subscription.
|
|
665
|
+
*
|
|
666
|
+
* @see docs/design-docs/user-settings.md § Type-architecture placement
|
|
667
|
+
*/
|
|
668
|
+
declare const userSchema: {
|
|
669
|
+
type: "document";
|
|
670
|
+
name: "ailf.user";
|
|
671
|
+
} & Omit<sanity.DocumentDefinition, "preview"> & {
|
|
672
|
+
preview?: sanity.PreviewConfig<{
|
|
673
|
+
displayName: string;
|
|
674
|
+
email: string;
|
|
675
|
+
}, Record<string, unknown>> | undefined;
|
|
676
|
+
};
|
|
677
|
+
|
|
642
678
|
/**
|
|
643
679
|
* schema/webhook-config.ts
|
|
644
680
|
*
|
|
@@ -1230,4 +1266,4 @@ interface AilfPluginOptions {
|
|
|
1230
1266
|
*/
|
|
1231
1267
|
declare const ailfPlugin: sanity.Plugin<void | AilfPluginOptions>;
|
|
1232
1268
|
|
|
1233
|
-
export { type AilfPluginOptions, ArchiveTaskAction, AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GraduateToNativeAction, HelpDrawer, HelpProvider, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, RestoreTaskAction, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfStructure, ailfTaskStructureItem, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, teamSchema, useHelp, webhookConfigSchema };
|
|
1269
|
+
export { type AilfPluginOptions, ArchiveTaskAction, AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GraduateToNativeAction, HelpDrawer, HelpProvider, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, RestoreTaskAction, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfStructure, ailfTaskStructureItem, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, teamSchema, useHelp, userSchema, webhookConfigSchema };
|
package/dist/index.js
CHANGED
|
@@ -530,6 +530,41 @@ var GLOSSARY = {
|
|
|
530
530
|
triggerCrossRepo: {
|
|
531
531
|
label: "Cross-Repo",
|
|
532
532
|
long: "Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks."
|
|
533
|
+
},
|
|
534
|
+
// -- Variant values (per-test docs condition) ------------------------------
|
|
535
|
+
variantGold: {
|
|
536
|
+
label: "Gold",
|
|
537
|
+
long: "Gold variant \u2014 the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do."
|
|
538
|
+
},
|
|
539
|
+
variantBaseline: {
|
|
540
|
+
label: "Baseline",
|
|
541
|
+
long: "Baseline variant \u2014 no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift."
|
|
542
|
+
},
|
|
543
|
+
// -- Execution mode values (how the model was driven) ----------------------
|
|
544
|
+
engineNaive: {
|
|
545
|
+
label: "Naive",
|
|
546
|
+
long: "Naive \u2014 the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy."
|
|
547
|
+
},
|
|
548
|
+
engineOptimized: {
|
|
549
|
+
label: "Optimized",
|
|
550
|
+
long: "Optimized \u2014 the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval."
|
|
551
|
+
},
|
|
552
|
+
engineNormal: {
|
|
553
|
+
label: "Normal",
|
|
554
|
+
long: "Normal \u2014 a direct vendor-API call. The model produces a single-shot completion with no agent loop."
|
|
555
|
+
},
|
|
556
|
+
// -- Status values (per-test outcome) --------------------------------------
|
|
557
|
+
statusFail: {
|
|
558
|
+
label: "Fail",
|
|
559
|
+
long: "Fail \u2014 the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced."
|
|
560
|
+
},
|
|
561
|
+
statusLowDim: {
|
|
562
|
+
label: "Low dim",
|
|
563
|
+
long: "Low dim \u2014 the run produced output but at least one grading dimension scored below 60."
|
|
564
|
+
},
|
|
565
|
+
statusOk: {
|
|
566
|
+
label: "OK",
|
|
567
|
+
long: "OK \u2014 the run produced output and every grading dimension scored 60 or above."
|
|
533
568
|
}
|
|
534
569
|
};
|
|
535
570
|
|
|
@@ -609,6 +644,17 @@ following:
|
|
|
609
644
|
"scoring-model"
|
|
610
645
|
]
|
|
611
646
|
},
|
|
647
|
+
{
|
|
648
|
+
"id": "failure-modes",
|
|
649
|
+
"title": "Failure Modes",
|
|
650
|
+
"body": "## What this view is for\n\nThe Recommendations view tells you which fixes to make. This view tells you what\nkind of problem you have. It groups the run's weaknesses by the documentation\nissue behind them, so you can see patterns across the whole evaluation rather\nthan one fix at a time. If most of your weak spots are the same kind of problem,\nthat is a signal about how to spend your docs effort.\n\n## What you are looking at\n\nRecent reports show **interpretive cards** drawn from the run's diagnosis:\n\n- **Weakest area** names the single feature area dragging the score down most,\n the failure mode behind it, and a confidence level with the sample size, so\n you know how strong the signal is.\n- **Failure mode** highlights one category of problem, which scoring dimension\n it shows up in, and how often it occurred across the tests that were checked.\n- **Area summary** gives a plain-language read on how an area is doing and why.\n\nOlder reports show a **category breakdown** instead. Each failure category is a\nchip with a count. Selecting a chip lists the gaps in that category, and each\ngap shows an estimated score lift if fixed, a confidence level, a short\nremediation note, and the specific tasks that exposed it. You can click a task\nto jump to it.\n\n## The failure modes\n\nEach weakness is sorted into one of these categories. The category is the\nfastest way to know what kind of work the fix needs:\n\n- **Missing docs**: the doc the model needed does not exist or is not indexed.\n The fix is to write new documentation.\n- **Incorrect docs**: a doc has a factual error or a wrong example. The fix is\n to correct it.\n- **Outdated docs**: a doc exists but reflects a previous API surface. The fix\n is to bring it up to date.\n- **Poor structure**: the information is correct but hard for an agent to find\n or skim. The fix is to reorganize or clarify.\n- **Model limitation**: the model struggles even with correct docs available.\n This is not a documentation problem, so treat it as context rather than a\n to-do.\n- **Unclassified**: the run could not categorize the weakness. Use the linked\n tasks and the grader's notes to judge it yourself.\n\nDepending on the evaluation mode you may see additional categories, including\nones specific to agent behavior such as tool misuse or missing error handling.\n\n## How to use it\n\nStart with the category that has the most gaps or the highest combined lift. The\ncategory tells you the shape of the work before you open a single page: write,\ncorrect, update, or restructure. Categories that are not documentation problems,\nsuch as model limitation, are worth noting but are not yours to fix in the docs.\n\n## Related views\n\n- **Recommendations** turns these weaknesses into a ranked list of specific\n edits.\n- **Low-scoring judgments** shows the grader's raw notes on the tests that\n scored lowest, which is the most granular signal behind any failure mode.\n\n## When this view is empty\n\nIf a report shows no failure modes, the evaluation either classified nothing\nworth flagging or the run predates this view. A clean result here usually means\nthe docs held up across the evaluated tasks.",
|
|
651
|
+
"source": "docs/help/failure-modes.md",
|
|
652
|
+
"related": [
|
|
653
|
+
"recommendations",
|
|
654
|
+
"scoring-model",
|
|
655
|
+
"negative-doc-lift"
|
|
656
|
+
]
|
|
657
|
+
},
|
|
612
658
|
{
|
|
613
659
|
"id": "getting-started",
|
|
614
660
|
"title": "Getting Started",
|
|
@@ -661,23 +707,183 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
|
661
707
|
{
|
|
662
708
|
"id": "interpreting-diagnostics",
|
|
663
709
|
"title": "Interpreting Diagnostics",
|
|
664
|
-
"body": "##
|
|
710
|
+
"body": "## Reading the health of your docs\n\nA report scores each feature area on how well your documentation lets AI coding\ntools implement that feature. Reading those scores well is what turns a number\ninto a plan: it tells you where the docs are working, where they are not, and\nwhat kind of problem you are dealing with.\n\n## Health bands\n\nEach area's score falls into one of three bands:\n\n- **Strong (80 and above)**: docs are working well. Agents produce correct,\n complete implementations. No action needed unless you see a regression.\n- **Needs attention (70 to 79)**: docs are okay but have gaps. A specific\n dimension such as code correctness or doc coverage may be dragging the score\n down. Worth investigating.\n- **Weak (below 70)**: docs are not providing enough support. Agents\n consistently struggle with these features. These need priority attention.\n\n## Strong areas are signal too\n\nIt is easy to focus only on what is broken, but the strong areas are worth\nreading. They show what good looks like in your docs: clear structure, accurate\nexamples, the patterns agents can follow. When you fix a weak area, that is the\nbar to copy.\n\n## Key diagnostic signals\n\nA low score has a reason behind it. These signals tell you which reason, and\nwhat to do about it:\n\n| Signal | What it means | What to do |\n| ------------------------------ | ------------------------------------------- | ---------------------------------------- |\n| **Negative doc lift** | Docs are worse than no docs | Rewrite or remove the offending docs |\n| **Large retrieval gap** | Good docs exist but agents cannot find them | Improve page titles, metadata, structure |\n| **Low code correctness** | Agents find the docs but produce bad code | Add or fix code examples |\n| **Low doc coverage** | The docs do not cover what the task needs | Write new documentation |\n| **Efficiency anomaly (>100%)** | Agents do better without the docs | Injected docs may be confusing the model |\n\n## Where to go next\n\nWhen you know which areas are weak and why, the **Recommendations** view turns\nthat into a ranked list of specific edits, and the **Failure modes** view groups\nthe weaknesses by the kind of documentation problem behind them.",
|
|
665
711
|
"source": "docs/help/interpreting-diagnostics.md",
|
|
666
712
|
"related": [
|
|
667
|
-
"
|
|
668
|
-
"
|
|
713
|
+
"recommendations",
|
|
714
|
+
"failure-modes",
|
|
715
|
+
"scoring-model"
|
|
669
716
|
]
|
|
670
717
|
},
|
|
671
718
|
{
|
|
672
719
|
"id": "reading-score-trends",
|
|
673
|
-
"title": "Reading
|
|
674
|
-
"body":
|
|
720
|
+
"title": "Reading the Analytics View",
|
|
721
|
+
"body": `## What this view answers
|
|
722
|
+
|
|
723
|
+
The Analytics view is built around one question: **did your doc changes move the
|
|
724
|
+
score, and why?** Rather than open on a chart and leave you to find the story,
|
|
725
|
+
it leads with the answer \u2014 a plain-language verdict and the areas that moved
|
|
726
|
+
most \u2014 then lets you drill down into the evidence.
|
|
727
|
+
|
|
728
|
+
## The control bar
|
|
729
|
+
|
|
730
|
+
The top row picks what you're looking at:
|
|
731
|
+
|
|
732
|
+
- **Metric** \u2014 which number to track (composite score, doc lift, retrieval gap,
|
|
733
|
+
and so on).
|
|
734
|
+
- **Break down by** \u2014 how to split it (feature area, team, model, source).
|
|
735
|
+
- **Bucket** \u2014 how to group runs over time (per run, per day).
|
|
736
|
+
- **Range** \u2014 how far back to look (for example, the last 30 days).
|
|
737
|
+
|
|
738
|
+
The second row holds the active **filter chips** \u2014 use _Add filter_ to scope to
|
|
739
|
+
a team, source, or mode \u2014 and a scope hint (reports in scope vs. total). Every
|
|
740
|
+
knob and filter is saved in the URL, so a shared link reproduces exactly what
|
|
741
|
+
you see. Use **Copy link** to grab it.
|
|
742
|
+
|
|
743
|
+
## Overall \u2014 the read
|
|
744
|
+
|
|
745
|
+
The **verdict strip** is the headline. In plain language it says whether docs
|
|
746
|
+
are pulling ahead or slipping, and shows the headline metric with its change (\u0394)
|
|
747
|
+
since the start of the range, a model \u2192 agent \u2192 docs decomposition bar, and a
|
|
748
|
+
coverage cell (how many reports and high-confidence groups are in scope).
|
|
749
|
+
|
|
750
|
+
## Movers
|
|
751
|
+
|
|
752
|
+
The **movers board** leads with the top **Improved** and **Regressed** areas as
|
|
753
|
+
cards \u2014 not the average. Each card shows the area, its value and \u0394, a
|
|
754
|
+
decomposition bar, the release that most likely caused the move, and a
|
|
755
|
+
confidence read. A low-confidence **watch** callout flags big swings backed by
|
|
756
|
+
too few runs: watch them, don't celebrate them yet.
|
|
757
|
+
|
|
758
|
+
Click a mover card to reveal and decompose that series in the evidence chart.
|
|
759
|
+
|
|
760
|
+
## The evidence
|
|
761
|
+
|
|
762
|
+
The **focus chart** has two modes:
|
|
763
|
+
|
|
764
|
+
- **Compare** plots the selected series over time. It defaults to a focused set
|
|
765
|
+
(the movers plus the highest-volume areas) with a _show all_ expansion, and
|
|
766
|
+
draws release markers inline.
|
|
767
|
+
- **Decompose** shows the ceiling / floor / actual band for a single series,
|
|
768
|
+
with causal story cards anchored to each release marker (for example, _"Docs
|
|
769
|
+
+3 ~5 \u22121 \u2192 doc-lift +8 measured around this release"_).
|
|
770
|
+
|
|
771
|
+
Decompose is offered for the composite metric broken down by feature area \u2014 the
|
|
772
|
+
case where the model \u2192 agent \u2192 docs story is meaningful.
|
|
773
|
+
|
|
774
|
+
## The breakdown table
|
|
775
|
+
|
|
776
|
+
One row per area (or per whatever you broke down by), each with an inline
|
|
777
|
+
decomposition bar, a sparkline, confidence, \u0394, "docs add," and a report count.
|
|
778
|
+
Sort any column, and click a row to cross-highlight it in the chart. Export the
|
|
779
|
+
table to CSV.
|
|
780
|
+
|
|
781
|
+
## Meaningful change vs. noise
|
|
782
|
+
|
|
783
|
+
Small movements between runs are normal \u2014 they come from model non-determinism
|
|
784
|
+
and grader variance. This view leans on **confidence** (how many runs back a
|
|
785
|
+
number) and the **movers ranking** rather than a single \xB1point threshold: trust
|
|
786
|
+
a sustained move in a high-confidence area over a large swing in a
|
|
787
|
+
low-confidence one. The low-confidence watch exists precisely to stop you
|
|
788
|
+
over-reading thin data.
|
|
789
|
+
|
|
790
|
+
## Measured, not invented
|
|
791
|
+
|
|
792
|
+
The causal story is computed from real data, never fabricated. Release markers
|
|
793
|
+
come from the doc-change counts already recorded in each report, and the
|
|
794
|
+
"measured around this release" doc-lift effect is derived from the real ceiling
|
|
795
|
+
\u2212 floor series around the marker. Per-area prose ("the editor API changed") is
|
|
796
|
+
intentionally not shown \u2014 the data carries change counts, not hand-written
|
|
797
|
+
explanations.`,
|
|
675
798
|
"source": "docs/help/reading-score-trends.md",
|
|
676
799
|
"related": [
|
|
677
800
|
"scoring-model",
|
|
801
|
+
"doc-lift",
|
|
678
802
|
"comparing-runs"
|
|
679
803
|
]
|
|
680
804
|
},
|
|
805
|
+
{
|
|
806
|
+
"id": "recommendations",
|
|
807
|
+
"title": "Recommendations",
|
|
808
|
+
"body": `## What this view is for
|
|
809
|
+
|
|
810
|
+
This is the "what do I fix" view. The scores tell you how well your
|
|
811
|
+
documentation supports AI coding tools. This view turns those scores into a
|
|
812
|
+
ranked list of specific changes, so you can spend your time on the edits that
|
|
813
|
+
should move the score the most.
|
|
814
|
+
|
|
815
|
+
Everything here comes from the same evaluation run you are looking at, and it
|
|
816
|
+
points at your own documentation pages rather than giving generic advice.
|
|
817
|
+
|
|
818
|
+
## What you are looking at
|
|
819
|
+
|
|
820
|
+
Recent reports show a set of **diagnosis cards**. Each card answers one question
|
|
821
|
+
about the run.
|
|
822
|
+
|
|
823
|
+
**Top recommendations** is the main card. It opens with a short summary, then
|
|
824
|
+
lists a few suggested changes ranked by priority. Each suggestion has:
|
|
825
|
+
|
|
826
|
+
- A **priority** tag of high, medium, or low that tells you what to do first.
|
|
827
|
+
- A **title** that names the change in one line.
|
|
828
|
+
- A **description** of the specific fix, usually quoting the exact symbol,
|
|
829
|
+
query, or pattern involved.
|
|
830
|
+
- A **doc reference** showing which page, and the section when it is known, the
|
|
831
|
+
change applies to. Every reference points to a real page that was part of this
|
|
832
|
+
run, so you can open it and start editing.
|
|
833
|
+
|
|
834
|
+
You may also see supporting cards:
|
|
835
|
+
|
|
836
|
+
- **Doc attribution spotlight** shows which documentation pages most influenced
|
|
837
|
+
the results, and whether each one helped or hurt. Use it to confirm a
|
|
838
|
+
recommendation is pointing at the right page.
|
|
839
|
+
- **Low-confidence attribution** lists results where the link between a doc and
|
|
840
|
+
an outcome was uncertain. Treat anything flagged here as a lead to verify, not
|
|
841
|
+
a settled conclusion.
|
|
842
|
+
- **Regression vs baseline** appears when you are comparing against an earlier
|
|
843
|
+
run. It shows which areas moved up or down and the likely reason for each
|
|
844
|
+
change.
|
|
845
|
+
|
|
846
|
+
## How to use it
|
|
847
|
+
|
|
848
|
+
Work top down. Start with the high-priority suggestions, open the referenced
|
|
849
|
+
page, and make the change. Priority reflects how much each change is expected to
|
|
850
|
+
help, so the top of the list is usually where your effort goes furthest.
|
|
851
|
+
|
|
852
|
+
The recommendations are written by a model that reads this run's results. They
|
|
853
|
+
are grounded in your actual docs and cannot reference a page that was not in the
|
|
854
|
+
run, but they are still suggestions. Read the linked page before acting, and use
|
|
855
|
+
the confidence signals to decide how much to trust each item.
|
|
856
|
+
|
|
857
|
+
## Where this comes from
|
|
858
|
+
|
|
859
|
+
A recommendation is the end of a chain: a test scored low, the grader said why,
|
|
860
|
+
the run classified that into a failure mode, and this view proposes the edit. If
|
|
861
|
+
you want to see the failure modes themselves, grouped by category, open the
|
|
862
|
+
**Failure modes** view. If you want the grader's raw notes on the lowest scores,
|
|
863
|
+
open the **Low-scoring judgments** view.
|
|
864
|
+
|
|
865
|
+
## Older reports
|
|
866
|
+
|
|
867
|
+
Reports created before the diagnosis cards shipped show a simpler list instead.
|
|
868
|
+
Each row names a feature area, the failure mode behind it, an estimated score
|
|
869
|
+
lift if you fix it, a confidence level, and the tasks that exposed the gap. The
|
|
870
|
+
estimated lift is conservative. It assumes fixing the gap raises the weak
|
|
871
|
+
dimension only to the median of the others, so the real improvement can be
|
|
872
|
+
higher.
|
|
873
|
+
|
|
874
|
+
## When this view is empty
|
|
875
|
+
|
|
876
|
+
If a report shows no recommendations, the evaluation either ran and found
|
|
877
|
+
nothing worth flagging, or the run predates this feature. A score with no
|
|
878
|
+
recommendations is usually a good sign, because it means the docs held up across
|
|
879
|
+
the evaluated tasks.`,
|
|
880
|
+
"source": "docs/help/recommendations.md",
|
|
881
|
+
"related": [
|
|
882
|
+
"failure-modes",
|
|
883
|
+
"interpreting-diagnostics",
|
|
884
|
+
"scoring-model"
|
|
885
|
+
]
|
|
886
|
+
},
|
|
681
887
|
{
|
|
682
888
|
"id": "retrieval-gap",
|
|
683
889
|
"title": "Retrieval Gap & Infrastructure Efficiency",
|
|
@@ -700,17 +906,6 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
|
700
906
|
"eval-modes"
|
|
701
907
|
]
|
|
702
908
|
},
|
|
703
|
-
{
|
|
704
|
-
"id": "weaknesses-recommendations",
|
|
705
|
-
"title": "Weaknesses & Recommendations",
|
|
706
|
-
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** \u2014 Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** \u2014 Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** \u2014 How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** \u2014 specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** \u2014 The type of doc problem identified:\n - `missing-docs` \u2014 The functionality isn't documented at all.\n - `incorrect-docs` \u2014 The docs contain factual errors.\n - `outdated-docs` \u2014 The docs describe an old API version or pattern.\n - `poor-structure` \u2014 The docs exist but are hard to find or understand.\n- **Estimated lift** \u2014 How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate \u2014 actual improvement may be higher.\n- **Confidence** \u2014 How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** \u2014 Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** \u2014 a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong \u2014 missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
707
|
-
"source": "docs/help/weaknesses-recommendations.md",
|
|
708
|
-
"related": [
|
|
709
|
-
"interpreting-diagnostics",
|
|
710
|
-
"scoring-model",
|
|
711
|
-
"negative-doc-lift"
|
|
712
|
-
]
|
|
713
|
-
},
|
|
714
909
|
{
|
|
715
910
|
"id": "how-agents-work",
|
|
716
911
|
"title": "How AI Agents Find Documentation",
|
|
@@ -734,7 +929,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
|
734
929
|
{
|
|
735
930
|
"id": "glossary",
|
|
736
931
|
"title": "Glossary",
|
|
737
|
-
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall \u0394**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual \u0394**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap \u0394**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency \u0394**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Needs Attention (70\u201379)**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion \u0394**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness \u0394**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage \u0394**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area \u0394**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
932
|
+
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall \u0394**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual \u0394**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap \u0394**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency \u0394**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Needs Attention (70\u201379)**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion \u0394**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness \u0394**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage \u0394**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area \u0394**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.\n\n**Gold**\n: Gold variant \u2014 the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.\n\n**Baseline**\n: Baseline variant \u2014 no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.\n\n**Naive**\n: Naive \u2014 the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.\n\n**Optimized**\n: Optimized \u2014 the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.\n\n**Normal**\n: Normal \u2014 a direct vendor-API call. The model produces a single-shot completion with no agent loop.\n\n**Fail**\n: Fail \u2014 the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.\n\n**Low dim**\n: Low dim \u2014 the run produced output but at least one grading dimension scored below 60.\n\n**OK**\n: OK \u2014 the run produced output and every grading dimension scored 60 or above.",
|
|
738
933
|
"source": "packages/shared/src/glossary.ts",
|
|
739
934
|
"tags": [
|
|
740
935
|
"reference",
|
|
@@ -1296,16 +1491,22 @@ var featureAreaSchema = defineType2({
|
|
|
1296
1491
|
],
|
|
1297
1492
|
name: "ailf.featureArea",
|
|
1298
1493
|
preview: {
|
|
1299
|
-
|
|
1494
|
+
// `team` derefs the owning-team reference one hop to its displayName,
|
|
1495
|
+
// so the area list shows ownership at a glance (the dataset-level
|
|
1496
|
+
// "Unowned areas" diagnostic lives in structure.ts).
|
|
1497
|
+
prepare({ areaId, description, team }) {
|
|
1300
1498
|
const id = areaId !== null && typeof areaId === "object" && "current" in areaId ? areaId.current : void 0;
|
|
1499
|
+
const idStr = typeof id === "string" ? id : "";
|
|
1500
|
+
const teamStr = typeof team === "string" ? team : "";
|
|
1301
1501
|
return {
|
|
1302
|
-
subtitle:
|
|
1502
|
+
subtitle: teamStr ? `${idStr} \xB7 ${teamStr}` : idStr,
|
|
1303
1503
|
title: typeof description === "string" ? description : "Feature Area"
|
|
1304
1504
|
};
|
|
1305
1505
|
},
|
|
1306
1506
|
select: {
|
|
1307
1507
|
areaId: "areaId",
|
|
1308
|
-
description: "description"
|
|
1508
|
+
description: "description",
|
|
1509
|
+
team: "team.displayName"
|
|
1309
1510
|
}
|
|
1310
1511
|
},
|
|
1311
1512
|
title: "AILF Feature Area",
|
|
@@ -4409,16 +4610,97 @@ var teamSchema = defineType6({
|
|
|
4409
4610
|
type: "document"
|
|
4410
4611
|
});
|
|
4411
4612
|
|
|
4613
|
+
// src/schema/user.ts
|
|
4614
|
+
import { defineArrayMember as defineArrayMember2, defineField as defineField7, defineType as defineType7 } from "sanity";
|
|
4615
|
+
var userSchema = defineType7({
|
|
4616
|
+
fields: [
|
|
4617
|
+
defineField7({
|
|
4618
|
+
description: "Sanity account id (CurrentUser.id) \u2014 the stable, globally-unique key. Mirrored as a field so GROQ can join/filter; the document _id is the deterministic `ailf.user.${sanityUserId}`.",
|
|
4619
|
+
name: "sanityUserId",
|
|
4620
|
+
readOnly: true,
|
|
4621
|
+
title: "Sanity User ID",
|
|
4622
|
+
type: "string"
|
|
4623
|
+
}),
|
|
4624
|
+
defineField7({
|
|
4625
|
+
description: "Denormalized email for display / joins (lowercased on write).",
|
|
4626
|
+
name: "email",
|
|
4627
|
+
readOnly: true,
|
|
4628
|
+
title: "Email",
|
|
4629
|
+
type: "string"
|
|
4630
|
+
}),
|
|
4631
|
+
defineField7({
|
|
4632
|
+
description: "Display-name snapshot from CurrentUser.name.",
|
|
4633
|
+
name: "displayName",
|
|
4634
|
+
readOnly: true,
|
|
4635
|
+
title: "Display Name",
|
|
4636
|
+
type: "string"
|
|
4637
|
+
}),
|
|
4638
|
+
defineField7({
|
|
4639
|
+
description: "Self-declared team affiliation \u2014 drives dashboard personalization only. References to ailf.team documents in the same dataset.",
|
|
4640
|
+
name: "teams",
|
|
4641
|
+
of: [
|
|
4642
|
+
defineArrayMember2({
|
|
4643
|
+
to: [{ type: "ailf.team" }],
|
|
4644
|
+
type: "reference"
|
|
4645
|
+
})
|
|
4646
|
+
],
|
|
4647
|
+
readOnly: true,
|
|
4648
|
+
title: "Teams",
|
|
4649
|
+
type: "array"
|
|
4650
|
+
}),
|
|
4651
|
+
defineField7({
|
|
4652
|
+
description: "Per-user UI personalization.",
|
|
4653
|
+
fields: [
|
|
4654
|
+
defineField7({
|
|
4655
|
+
description: "The user's default team \u2014 one of `teams`. Distinct from `teams` so 'which team's view do I default to' can differ from 'all teams I affiliate with'. The slug consumers need is derived in GROQ.",
|
|
4656
|
+
name: "primaryTeam",
|
|
4657
|
+
readOnly: true,
|
|
4658
|
+
title: "Primary Team",
|
|
4659
|
+
to: [{ type: "ailf.team" }],
|
|
4660
|
+
type: "reference"
|
|
4661
|
+
})
|
|
4662
|
+
],
|
|
4663
|
+
name: "preferences",
|
|
4664
|
+
readOnly: true,
|
|
4665
|
+
title: "Preferences",
|
|
4666
|
+
type: "object"
|
|
4667
|
+
}),
|
|
4668
|
+
defineField7({
|
|
4669
|
+
description: "ISO 8601 UTC \u2014 stamped on each save by the dashboard.",
|
|
4670
|
+
name: "updatedAt",
|
|
4671
|
+
readOnly: true,
|
|
4672
|
+
title: "Updated At",
|
|
4673
|
+
type: "datetime"
|
|
4674
|
+
})
|
|
4675
|
+
],
|
|
4676
|
+
name: "ailf.user",
|
|
4677
|
+
preview: {
|
|
4678
|
+
prepare({ displayName, email }) {
|
|
4679
|
+
const title = typeof displayName === "string" && displayName ? displayName : typeof email === "string" && email ? email : "(unknown user)";
|
|
4680
|
+
return {
|
|
4681
|
+
subtitle: typeof email === "string" ? email : "",
|
|
4682
|
+
title
|
|
4683
|
+
};
|
|
4684
|
+
},
|
|
4685
|
+
select: {
|
|
4686
|
+
displayName: "displayName",
|
|
4687
|
+
email: "email"
|
|
4688
|
+
}
|
|
4689
|
+
},
|
|
4690
|
+
title: "AILF User",
|
|
4691
|
+
type: "document"
|
|
4692
|
+
});
|
|
4693
|
+
|
|
4412
4694
|
// src/schema/webhook-config.ts
|
|
4413
|
-
import { ALL_FIELDS_GROUP as ALL_FIELDS_GROUP7, defineField as
|
|
4414
|
-
var webhookConfigSchema =
|
|
4695
|
+
import { ALL_FIELDS_GROUP as ALL_FIELDS_GROUP7, defineField as defineField8, defineType as defineType8 } from "sanity";
|
|
4696
|
+
var webhookConfigSchema = defineType8({
|
|
4415
4697
|
groups: [
|
|
4416
4698
|
{ name: "main", title: "Main", default: true },
|
|
4417
4699
|
{ name: "optional", title: "Optional" },
|
|
4418
4700
|
ALL_FIELDS_GROUP7
|
|
4419
4701
|
],
|
|
4420
4702
|
fields: [
|
|
4421
|
-
|
|
4703
|
+
defineField8({
|
|
4422
4704
|
description: "When enabled, publishing articles will automatically trigger AI Literacy evaluations for affected feature areas.",
|
|
4423
4705
|
group: ["main", "all-fields"],
|
|
4424
4706
|
initialValue: false,
|
|
@@ -4426,7 +4708,7 @@ var webhookConfigSchema = defineType7({
|
|
|
4426
4708
|
title: "Evaluate on Publish",
|
|
4427
4709
|
type: "boolean"
|
|
4428
4710
|
}),
|
|
4429
|
-
|
|
4711
|
+
defineField8({
|
|
4430
4712
|
description: "Which evaluation mode to use for webhook-triggered evaluations.",
|
|
4431
4713
|
group: ["main", "all-fields"],
|
|
4432
4714
|
initialValue: "baseline",
|
|
@@ -4442,7 +4724,7 @@ var webhookConfigSchema = defineType7({
|
|
|
4442
4724
|
title: "Evaluation Mode",
|
|
4443
4725
|
type: "string"
|
|
4444
4726
|
}),
|
|
4445
|
-
|
|
4727
|
+
defineField8({
|
|
4446
4728
|
description: "Maximum evaluations per day. Prevents runaway costs from rapid editing.",
|
|
4447
4729
|
group: ["main", "all-fields"],
|
|
4448
4730
|
initialValue: 20,
|
|
@@ -4451,7 +4733,7 @@ var webhookConfigSchema = defineType7({
|
|
|
4451
4733
|
type: "number",
|
|
4452
4734
|
validation: (rule) => rule.min(1).max(100)
|
|
4453
4735
|
}),
|
|
4454
|
-
|
|
4736
|
+
defineField8({
|
|
4455
4737
|
description: "Seconds to wait after the last edit before dispatching. Coalesces rapid edits into a single evaluation.",
|
|
4456
4738
|
group: ["optional", "all-fields"],
|
|
4457
4739
|
initialValue: 300,
|
|
@@ -4460,7 +4742,7 @@ var webhookConfigSchema = defineType7({
|
|
|
4460
4742
|
type: "number",
|
|
4461
4743
|
validation: (rule) => rule.min(10).max(3600)
|
|
4462
4744
|
}),
|
|
4463
|
-
|
|
4745
|
+
defineField8({
|
|
4464
4746
|
description: "Specific feature areas to evaluate. Leave empty to evaluate all affected areas automatically.",
|
|
4465
4747
|
group: ["optional", "all-fields"],
|
|
4466
4748
|
name: "areas",
|
|
@@ -4468,7 +4750,7 @@ var webhookConfigSchema = defineType7({
|
|
|
4468
4750
|
title: "Area Filter",
|
|
4469
4751
|
type: "array"
|
|
4470
4752
|
}),
|
|
4471
|
-
|
|
4753
|
+
defineField8({
|
|
4472
4754
|
description: "Slack webhook URL for notifications about webhook-triggered evaluations.",
|
|
4473
4755
|
group: ["optional", "all-fields"],
|
|
4474
4756
|
name: "notifySlack",
|
|
@@ -4615,7 +4897,7 @@ function deriveHelpTopic(routerState) {
|
|
|
4615
4897
|
if (routerState.reportId) {
|
|
4616
4898
|
switch (routerState.tab) {
|
|
4617
4899
|
case "diagnostics":
|
|
4618
|
-
return routerState.subTab === "strengths" ? "interpreting-diagnostics" : "
|
|
4900
|
+
return routerState.subTab === "strengths" ? "interpreting-diagnostics" : "recommendations";
|
|
4619
4901
|
case "activity":
|
|
4620
4902
|
return "how-agents-work";
|
|
4621
4903
|
default:
|
|
@@ -15813,6 +16095,21 @@ function ailfTeamsStructureItem(S) {
|
|
|
15813
16095
|
])
|
|
15814
16096
|
);
|
|
15815
16097
|
}
|
|
16098
|
+
function ailfUsersStructureItem(S) {
|
|
16099
|
+
return S.listItem().id("ailfUsers").title("Users").child(
|
|
16100
|
+
S.list().id("ailfUsersViews").title("Users").items([
|
|
16101
|
+
S.listItem().id("allUsers").title("All users").child(
|
|
16102
|
+
S.documentTypeList("ailf.user").id("ailfUsersAll").title("All users")
|
|
16103
|
+
),
|
|
16104
|
+
S.divider(),
|
|
16105
|
+
S.listItem().id("usersWithoutTeams").title("\u26A0 Users without teams").icon(WarningOutlineIcon5).child(
|
|
16106
|
+
S.documentTypeList("ailf.user").id("ailfUsersWithoutTeams").title("\u26A0 Users without teams").apiVersion(API_VERSION).filter(
|
|
16107
|
+
'_type == "ailf.user" && (!defined(teams) || count(teams) == 0)'
|
|
16108
|
+
)
|
|
16109
|
+
)
|
|
16110
|
+
])
|
|
16111
|
+
);
|
|
16112
|
+
}
|
|
15816
16113
|
function ailfAreasStructureItem(S) {
|
|
15817
16114
|
return S.listItem().id("ailfAreas").title("Areas").child(
|
|
15818
16115
|
S.list().id("ailfAreasViews").title("Areas").items([
|
|
@@ -15851,13 +16148,14 @@ function ailfChannelsWithUnknownEventsItem(S) {
|
|
|
15851
16148
|
var ailfStructure = (S) => S.list().id("root").title("Content").items([
|
|
15852
16149
|
ailfTaskStructureItem(S),
|
|
15853
16150
|
ailfTeamsStructureItem(S),
|
|
16151
|
+
ailfUsersStructureItem(S),
|
|
15854
16152
|
ailfAreasStructureItem(S),
|
|
15855
16153
|
ailfReportsStructureItem(S),
|
|
15856
16154
|
ailfChannelsWithUnknownEventsItem(S),
|
|
15857
16155
|
S.divider(),
|
|
15858
16156
|
...S.documentTypeListItems().filter((listItem) => {
|
|
15859
16157
|
const id = listItem.getId();
|
|
15860
|
-
return id !== "ailf.task" && id !== "ailf.team" && id !== "ailf.featureArea" && id !== "ailf.report";
|
|
16158
|
+
return id !== "ailf.task" && id !== "ailf.team" && id !== "ailf.user" && id !== "ailf.featureArea" && id !== "ailf.report";
|
|
15861
16159
|
})
|
|
15862
16160
|
]);
|
|
15863
16161
|
|
|
@@ -16207,6 +16505,7 @@ var ailfPlugin = definePlugin((options) => ({
|
|
|
16207
16505
|
reportSchema,
|
|
16208
16506
|
taskSchema,
|
|
16209
16507
|
teamSchema,
|
|
16508
|
+
userSchema,
|
|
16210
16509
|
webhookConfigSchema
|
|
16211
16510
|
]
|
|
16212
16511
|
},
|
|
@@ -16252,5 +16551,6 @@ export {
|
|
|
16252
16551
|
taskSchema,
|
|
16253
16552
|
teamSchema,
|
|
16254
16553
|
useHelp,
|
|
16554
|
+
userSchema,
|
|
16255
16555
|
webhookConfigSchema
|
|
16256
16556
|
};
|