@sanity/ailf 6.1.1 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/constants.d.ts +12 -0
- package/dist/_vendor/ailf-core/constants.js +12 -0
- package/dist/_vendor/ailf-shared/editorial-reference.d.ts +48 -0
- package/dist/_vendor/ailf-shared/editorial-reference.js +43 -0
- package/dist/_vendor/ailf-shared/gcs-defaults.d.ts +16 -0
- package/dist/_vendor/ailf-shared/gcs-defaults.js +16 -0
- package/dist/_vendor/ailf-shared/generated/help-content.d.ts +2 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +140 -0
- package/dist/_vendor/ailf-shared/glossary.d.ts +318 -0
- package/dist/_vendor/ailf-shared/glossary.js +330 -0
- package/dist/_vendor/ailf-shared/help-content.d.ts +10 -0
- package/dist/_vendor/ailf-shared/help-content.js +10 -0
- package/dist/_vendor/ailf-shared/help-topics.d.ts +26 -0
- package/dist/_vendor/ailf-shared/help-topics.js +1 -0
- package/dist/_vendor/ailf-shared/index.d.ts +5 -0
- package/dist/_vendor/ailf-shared/index.js +4 -0
- package/dist/composition-root.js +7 -5
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/sanity/client.d.ts +6 -21
- package/dist/sanity/client.js +20 -22
- package/dist/webhook/eval-request-handler.d.ts +32 -29
- package/dist/webhook/eval-request-handler.js +90 -50
- package/package.json +3 -3
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* glossary.ts
|
|
3
|
+
*
|
|
4
|
+
* Centralized metric glossary used by Studio and dashboard alike. Each entry
|
|
5
|
+
* is keyed by a canonical slug. The value carries:
|
|
6
|
+
*
|
|
7
|
+
* - `label`: the canonical user-facing label, e.g., "Overall Score". Used
|
|
8
|
+
* by the dashboard's `<MetricLabel>` and may be reused as table-header
|
|
9
|
+
* copy where space allows.
|
|
10
|
+
* - `long`: full description for tooltips and help drawers. Copy was
|
|
11
|
+
* ported verbatim from the Studio-era `packages/studio/src/glossary.ts`.
|
|
12
|
+
*
|
|
13
|
+
* A `short` field is a planned future addition for tight contexts where the
|
|
14
|
+
* `long` description is too verbose. Adding it later is a non-breaking
|
|
15
|
+
* schema change.
|
|
16
|
+
*
|
|
17
|
+
* @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
|
|
18
|
+
* @see docs/architecture.md (scoring model)
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Shape of a single glossary entry: a `label` and a `long` description,
 * both plain strings.
 */
export interface GlossaryEntry {
|
|
21
|
+
/** Canonical user-facing label, e.g. "Overall Score". */
label: string;
|
|
22
|
+
/** Full description intended for tooltips and help drawers. */
long: string;
|
|
23
|
+
}
|
|
24
|
+
/**
 * Ambient declaration of the glossary table. Every slug maps to a readonly
 * `label` / `long` pair, and both fields are typed as their exact string
 * literals rather than as `string`.
 */
export declare const GLOSSARY: {
|
|
25
|
+
// -- Overview stats
readonly overallScore: {
|
|
26
|
+
readonly label: "Overall Score";
|
|
27
|
+
readonly long: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
|
|
28
|
+
};
|
|
29
|
+
readonly docLift: {
|
|
30
|
+
readonly label: "Doc Lift";
|
|
31
|
+
readonly long: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
|
|
32
|
+
};
|
|
33
|
+
readonly actualScore: {
|
|
34
|
+
readonly label: "Actual Score";
|
|
35
|
+
readonly long: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
|
|
36
|
+
};
|
|
37
|
+
readonly retrievalGap: {
|
|
38
|
+
readonly label: "Retrieval Gap";
|
|
39
|
+
readonly long: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
|
|
40
|
+
};
|
|
41
|
+
readonly infraEfficiency: {
|
|
42
|
+
readonly label: "Infra Efficiency";
|
|
43
|
+
readonly long: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
|
|
44
|
+
};
|
|
45
|
+
// -- Three-layer decomposition columns
readonly floor: {
|
|
46
|
+
readonly label: "Floor";
|
|
47
|
+
readonly long: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
|
|
48
|
+
};
|
|
49
|
+
readonly ceiling: {
|
|
50
|
+
readonly label: "Ceiling";
|
|
51
|
+
readonly long: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
|
|
52
|
+
};
|
|
53
|
+
readonly actual: {
|
|
54
|
+
readonly label: "Actual";
|
|
55
|
+
readonly long: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
|
|
56
|
+
};
|
|
57
|
+
readonly retGap: {
|
|
58
|
+
readonly label: "Ret. Gap";
|
|
59
|
+
readonly long: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
|
|
60
|
+
};
|
|
61
|
+
readonly efficiency: {
|
|
62
|
+
readonly label: "Efficiency";
|
|
63
|
+
readonly long: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
|
|
64
|
+
};
|
|
65
|
+
readonly invertedRetGap: {
|
|
66
|
+
readonly label: "Inverted Retrieval Gap";
|
|
67
|
+
readonly long: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
|
|
68
|
+
};
|
|
69
|
+
// -- Per-area score columns
readonly score: {
|
|
70
|
+
readonly label: "Score";
|
|
71
|
+
readonly long: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
|
|
72
|
+
};
|
|
73
|
+
readonly taskCompletion: {
|
|
74
|
+
readonly label: "Task Completion";
|
|
75
|
+
readonly long: "Can the LLM implement the requested feature? Graded 0–100.";
|
|
76
|
+
};
|
|
77
|
+
readonly codeCorrectness: {
|
|
78
|
+
readonly label: "Code Correctness";
|
|
79
|
+
readonly long: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
|
|
80
|
+
};
|
|
81
|
+
readonly docCoverage: {
|
|
82
|
+
readonly label: "Doc Coverage";
|
|
83
|
+
readonly long: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
|
|
84
|
+
};
|
|
85
|
+
readonly tests: {
|
|
86
|
+
readonly label: "Tests";
|
|
87
|
+
readonly long: "Number of test cases in this feature area.";
|
|
88
|
+
};
|
|
89
|
+
// -- Comparison deltas
readonly overallDelta: {
|
|
90
|
+
readonly label: "Overall Δ";
|
|
91
|
+
readonly long: "Change in overall score between the two runs. Positive means the experiment scored higher.";
|
|
92
|
+
};
|
|
93
|
+
readonly actualDelta: {
|
|
94
|
+
readonly label: "Actual Δ";
|
|
95
|
+
readonly long: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
|
|
96
|
+
};
|
|
97
|
+
readonly retGapDelta: {
|
|
98
|
+
readonly label: "Ret. Gap Δ";
|
|
99
|
+
readonly long: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.";
|
|
100
|
+
};
|
|
101
|
+
readonly efficiencyDelta: {
|
|
102
|
+
readonly label: "Efficiency Δ";
|
|
103
|
+
readonly long: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.";
|
|
104
|
+
};
|
|
105
|
+
// -- Comparison table columns
readonly baseline: {
|
|
106
|
+
readonly label: "Baseline";
|
|
107
|
+
readonly long: "The reference run you're comparing against.";
|
|
108
|
+
};
|
|
109
|
+
readonly experiment: {
|
|
110
|
+
readonly label: "Experiment";
|
|
111
|
+
readonly long: "The new run you're evaluating.";
|
|
112
|
+
};
|
|
113
|
+
readonly delta: {
|
|
114
|
+
readonly label: "Delta";
|
|
115
|
+
readonly long: "Difference between experiment and baseline. Positive means improvement, negative means regression.";
|
|
116
|
+
};
|
|
117
|
+
readonly change: {
|
|
118
|
+
readonly label: "Change";
|
|
119
|
+
readonly long: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
|
|
120
|
+
};
|
|
121
|
+
// -- Grader judgments
readonly lowScoringJudgments: {
|
|
122
|
+
readonly label: "Low-Scoring Judgments";
|
|
123
|
+
readonly long: "The grading model's explanations for tests that scored below 70/100.";
|
|
124
|
+
};
|
|
125
|
+
readonly judgmentReason: {
|
|
126
|
+
readonly label: "Judgment Reason";
|
|
127
|
+
readonly long: "The grading model's natural language explanation of what went wrong.";
|
|
128
|
+
};
|
|
129
|
+
// -- Diagnostics overview
readonly healthStrong: {
|
|
130
|
+
readonly label: "Strong (80+)";
|
|
131
|
+
readonly long: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.";
|
|
132
|
+
};
|
|
133
|
+
readonly healthAttention: {
|
|
134
|
+
readonly label: "Needs Attention (70–79)";
|
|
135
|
+
readonly long: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.";
|
|
136
|
+
};
|
|
137
|
+
readonly healthWeak: {
|
|
138
|
+
readonly label: "Weak (<70)";
|
|
139
|
+
readonly long: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
|
|
140
|
+
};
|
|
141
|
+
readonly negativeDocLiftMetric: {
|
|
142
|
+
readonly label: "Negative Doc Lift";
|
|
143
|
+
readonly long: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
|
|
144
|
+
};
|
|
145
|
+
readonly weakAreas: {
|
|
146
|
+
readonly label: "Weak Areas";
|
|
147
|
+
readonly long: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
|
|
148
|
+
};
|
|
149
|
+
readonly docsHurt: {
|
|
150
|
+
readonly label: "Docs Hurt Performance";
|
|
151
|
+
readonly long: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.";
|
|
152
|
+
};
|
|
153
|
+
readonly retrievalIssues: {
|
|
154
|
+
readonly label: "Retrieval Issues";
|
|
155
|
+
readonly long: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
|
|
156
|
+
};
|
|
157
|
+
readonly dimWeaknesses: {
|
|
158
|
+
readonly label: "Dimension Weaknesses";
|
|
159
|
+
readonly long: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
|
|
160
|
+
};
|
|
161
|
+
readonly efficiencyAnomalies: {
|
|
162
|
+
readonly label: "Efficiency Anomalies";
|
|
163
|
+
readonly long: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
|
|
164
|
+
};
|
|
165
|
+
readonly docLiftWins: {
|
|
166
|
+
readonly label: "Doc Lift Wins";
|
|
167
|
+
readonly long: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
|
|
168
|
+
};
|
|
169
|
+
readonly retrievalExcellence: {
|
|
170
|
+
readonly label: "Retrieval Excellence";
|
|
171
|
+
readonly long: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
|
|
172
|
+
};
|
|
173
|
+
// -- Model breakdown
readonly modelBreakdown: {
|
|
174
|
+
readonly label: "Model Breakdown";
|
|
175
|
+
readonly long: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.";
|
|
176
|
+
};
|
|
177
|
+
// -- Strengths (positive diagnostics)
readonly strengths: {
|
|
178
|
+
readonly label: "Strengths";
|
|
179
|
+
readonly long: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
|
|
180
|
+
};
|
|
181
|
+
// -- Recommendations / gap analysis
readonly recommendations: {
|
|
182
|
+
readonly label: "Recommendations";
|
|
183
|
+
readonly long: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
|
|
184
|
+
};
|
|
185
|
+
readonly totalPotentialLift: {
|
|
186
|
+
readonly label: "Total Potential Lift";
|
|
187
|
+
readonly long: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
|
|
188
|
+
};
|
|
189
|
+
readonly failureMode: {
|
|
190
|
+
readonly label: "Failure Mode";
|
|
191
|
+
readonly long: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).";
|
|
192
|
+
};
|
|
193
|
+
readonly estimatedLift: {
|
|
194
|
+
readonly label: "Estimated Lift";
|
|
195
|
+
readonly long: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
|
|
196
|
+
};
|
|
197
|
+
readonly confidence: {
|
|
198
|
+
readonly label: "Confidence";
|
|
199
|
+
readonly long: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.";
|
|
200
|
+
};
|
|
201
|
+
// -- Agent behavior
readonly agentBehaviorOverview: {
|
|
202
|
+
readonly label: "Agent Behavior";
|
|
203
|
+
readonly long: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
|
|
204
|
+
};
|
|
205
|
+
readonly searchQueries: {
|
|
206
|
+
readonly label: "Search Queries";
|
|
207
|
+
readonly long: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
|
|
208
|
+
};
|
|
209
|
+
readonly docSlugsVisited: {
|
|
210
|
+
readonly label: "Unique Doc Slugs";
|
|
211
|
+
readonly long: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
|
|
212
|
+
};
|
|
213
|
+
readonly externalDomains: {
|
|
214
|
+
readonly label: "External Domains";
|
|
215
|
+
readonly long: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.";
|
|
216
|
+
};
|
|
217
|
+
readonly avgDocPagesVisited: {
|
|
218
|
+
readonly label: "Avg Pages Visited";
|
|
219
|
+
readonly long: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.";
|
|
220
|
+
};
|
|
221
|
+
readonly avgSearchesPerformed: {
|
|
222
|
+
readonly label: "Avg Searches";
|
|
223
|
+
readonly long: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.";
|
|
224
|
+
};
|
|
225
|
+
readonly avgNetworkTimeMs: {
|
|
226
|
+
readonly label: "Avg Network Time";
|
|
227
|
+
readonly long: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.";
|
|
228
|
+
};
|
|
229
|
+
readonly totalRequests: {
|
|
230
|
+
readonly label: "Total Requests";
|
|
231
|
+
readonly long: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.";
|
|
232
|
+
};
|
|
233
|
+
readonly totalBytesDownloaded: {
|
|
234
|
+
readonly label: "Total Bytes Downloaded";
|
|
235
|
+
readonly long: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.";
|
|
236
|
+
};
|
|
237
|
+
// -- Per-dimension deltas (dim* slugs)
readonly dimTaskCompletion: {
|
|
238
|
+
readonly label: "Task Completion Δ";
|
|
239
|
+
readonly long: "Change in task completion between runs. Positive means implementations are more complete.";
|
|
240
|
+
};
|
|
241
|
+
readonly dimCodeCorrectness: {
|
|
242
|
+
readonly label: "Code Correctness Δ";
|
|
243
|
+
readonly long: "Change in code correctness between runs. Positive means better code quality.";
|
|
244
|
+
};
|
|
245
|
+
readonly dimDocCoverage: {
|
|
246
|
+
readonly label: "Doc Coverage Δ";
|
|
247
|
+
readonly long: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
|
|
248
|
+
};
|
|
249
|
+
readonly areaDelta: {
|
|
250
|
+
readonly label: "Area Δ";
|
|
251
|
+
readonly long: "Score change for this area compared to the previous evaluation run.";
|
|
252
|
+
};
|
|
253
|
+
// -- Doc sources (source* slugs)
readonly sourceProduction: {
|
|
254
|
+
readonly label: "Production";
|
|
255
|
+
readonly long: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
|
|
256
|
+
};
|
|
257
|
+
readonly sourceBranch: {
|
|
258
|
+
readonly label: "Branch";
|
|
259
|
+
readonly long: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
|
|
260
|
+
};
|
|
261
|
+
readonly sourceLocal: {
|
|
262
|
+
readonly label: "Local";
|
|
263
|
+
readonly long: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
|
|
264
|
+
};
|
|
265
|
+
// -- Report fields (report* slugs)
readonly reportScore: {
|
|
266
|
+
readonly label: "Score";
|
|
267
|
+
readonly long: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
|
|
268
|
+
};
|
|
269
|
+
readonly reportMode: {
|
|
270
|
+
readonly label: "Mode";
|
|
271
|
+
readonly long: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
|
|
272
|
+
};
|
|
273
|
+
readonly reportTrigger: {
|
|
274
|
+
readonly label: "Trigger";
|
|
275
|
+
readonly long: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
|
|
276
|
+
};
|
|
277
|
+
// -- Evaluation modes (mode* slugs)
readonly modeBaseline: {
|
|
278
|
+
readonly label: "Baseline";
|
|
279
|
+
readonly long: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
|
|
280
|
+
};
|
|
281
|
+
readonly modeFull: {
|
|
282
|
+
readonly label: "Full";
|
|
283
|
+
readonly long: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
|
|
284
|
+
};
|
|
285
|
+
readonly modeAgentic: {
|
|
286
|
+
readonly label: "Agentic";
|
|
287
|
+
readonly long: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
|
|
288
|
+
};
|
|
289
|
+
readonly modeObserved: {
|
|
290
|
+
readonly label: "Observed";
|
|
291
|
+
readonly long: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
|
|
292
|
+
};
|
|
293
|
+
readonly modeDebug: {
|
|
294
|
+
readonly label: "Debug";
|
|
295
|
+
readonly long: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
|
|
296
|
+
};
|
|
297
|
+
// -- Run triggers (trigger* slugs)
readonly triggerManual: {
|
|
298
|
+
readonly label: "Manual";
|
|
299
|
+
readonly long: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
|
|
300
|
+
};
|
|
301
|
+
readonly triggerCi: {
|
|
302
|
+
readonly label: "CI";
|
|
303
|
+
readonly long: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
|
|
304
|
+
};
|
|
305
|
+
readonly triggerSchedule: {
|
|
306
|
+
readonly label: "Scheduled";
|
|
307
|
+
readonly long: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
|
|
308
|
+
};
|
|
309
|
+
readonly triggerWebhook: {
|
|
310
|
+
readonly label: "Webhook";
|
|
311
|
+
readonly long: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
|
|
312
|
+
};
|
|
313
|
+
readonly triggerCrossRepo: {
|
|
314
|
+
readonly label: "Cross-Repo";
|
|
315
|
+
readonly long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
|
|
316
|
+
};
|
|
317
|
+
};
|
|
318
|
+
/** Union of every key of `GLOSSARY`, i.e. the set of valid glossary slugs. */
export type GlossarySlug = keyof typeof GLOSSARY;
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* glossary.ts
|
|
3
|
+
*
|
|
4
|
+
* Centralized metric glossary used by Studio and dashboard alike. Each entry
|
|
5
|
+
* is keyed by a canonical slug. The value carries:
|
|
6
|
+
*
|
|
7
|
+
* - `label`: the canonical user-facing label, e.g., "Overall Score". Used
|
|
8
|
+
* by the dashboard's `<MetricLabel>` and may be reused as table-header
|
|
9
|
+
* copy where space allows.
|
|
10
|
+
* - `long`: full description for tooltips and help drawers. Copy was
|
|
11
|
+
* ported verbatim from the Studio-era `packages/studio/src/glossary.ts`.
|
|
12
|
+
*
|
|
13
|
+
* A `short` field is a planned future addition for tight contexts where the
|
|
14
|
+
* `long` description is too verbose. Adding it later is a non-breaking
|
|
15
|
+
* schema change.
|
|
16
|
+
*
|
|
17
|
+
* @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
|
|
18
|
+
* @see docs/architecture.md (scoring model)
|
|
19
|
+
*/
|
|
20
|
+
export const GLOSSARY = {
|
|
21
|
+
// -- Overview stats -------------------------------------------------------
|
|
22
|
+
overallScore: {
|
|
23
|
+
label: "Overall Score",
|
|
24
|
+
long: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
|
|
25
|
+
},
|
|
26
|
+
docLift: {
|
|
27
|
+
label: "Doc Lift",
|
|
28
|
+
long: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.",
|
|
29
|
+
},
|
|
30
|
+
actualScore: {
|
|
31
|
+
label: "Actual Score",
|
|
32
|
+
long: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.",
|
|
33
|
+
},
|
|
34
|
+
retrievalGap: {
|
|
35
|
+
label: "Retrieval Gap",
|
|
36
|
+
long: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.",
|
|
37
|
+
},
|
|
38
|
+
infraEfficiency: {
|
|
39
|
+
label: "Infra Efficiency",
|
|
40
|
+
long: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.",
|
|
41
|
+
},
|
|
42
|
+
// -- Three-layer decomposition columns ------------------------------------
|
|
43
|
+
floor: {
|
|
44
|
+
label: "Floor",
|
|
45
|
+
long: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.",
|
|
46
|
+
},
|
|
47
|
+
ceiling: {
|
|
48
|
+
label: "Ceiling",
|
|
49
|
+
long: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.",
|
|
50
|
+
},
|
|
51
|
+
actual: {
|
|
52
|
+
label: "Actual",
|
|
53
|
+
long: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.",
|
|
54
|
+
},
|
|
55
|
+
retGap: {
|
|
56
|
+
label: "Ret. Gap",
|
|
57
|
+
long: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.",
|
|
58
|
+
},
|
|
59
|
+
efficiency: {
|
|
60
|
+
label: "Efficiency",
|
|
61
|
+
long: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).",
|
|
62
|
+
},
|
|
63
|
+
invertedRetGap: {
|
|
64
|
+
label: "Inverted Retrieval Gap",
|
|
65
|
+
long: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.",
|
|
66
|
+
},
|
|
67
|
+
// -- Per-area score columns -----------------------------------------------
|
|
68
|
+
score: {
|
|
69
|
+
label: "Score",
|
|
70
|
+
long: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.",
|
|
71
|
+
},
|
|
72
|
+
taskCompletion: {
|
|
73
|
+
label: "Task Completion",
|
|
74
|
+
long: "Can the LLM implement the requested feature? Graded 0–100.",
|
|
75
|
+
},
|
|
76
|
+
codeCorrectness: {
|
|
77
|
+
label: "Code Correctness",
|
|
78
|
+
long: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.",
|
|
79
|
+
},
|
|
80
|
+
docCoverage: {
|
|
81
|
+
label: "Doc Coverage",
|
|
82
|
+
long: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.",
|
|
83
|
+
},
|
|
84
|
+
tests: {
|
|
85
|
+
label: "Tests",
|
|
86
|
+
long: "Number of test cases in this feature area.",
|
|
87
|
+
},
|
|
88
|
+
// -- Comparison deltas ----------------------------------------------------
|
|
89
|
+
overallDelta: {
|
|
90
|
+
label: "Overall Δ",
|
|
91
|
+
long: "Change in overall score between the two runs. Positive means the experiment scored higher.",
|
|
92
|
+
},
|
|
93
|
+
actualDelta: {
|
|
94
|
+
label: "Actual Δ",
|
|
95
|
+
long: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.",
|
|
96
|
+
},
|
|
97
|
+
retGapDelta: {
|
|
98
|
+
label: "Ret. Gap Δ",
|
|
99
|
+
long: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.",
|
|
100
|
+
},
|
|
101
|
+
efficiencyDelta: {
|
|
102
|
+
label: "Efficiency Δ",
|
|
103
|
+
long: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.",
|
|
104
|
+
},
|
|
105
|
+
// -- Comparison table columns ---------------------------------------------
|
|
106
|
+
baseline: {
|
|
107
|
+
label: "Baseline",
|
|
108
|
+
long: "The reference run you're comparing against.",
|
|
109
|
+
},
|
|
110
|
+
experiment: {
|
|
111
|
+
label: "Experiment",
|
|
112
|
+
long: "The new run you're evaluating.",
|
|
113
|
+
},
|
|
114
|
+
delta: {
|
|
115
|
+
label: "Delta",
|
|
116
|
+
long: "Difference between experiment and baseline. Positive means improvement, negative means regression.",
|
|
117
|
+
},
|
|
118
|
+
change: {
|
|
119
|
+
label: "Change",
|
|
120
|
+
long: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).",
|
|
121
|
+
},
|
|
122
|
+
// -- Grader judgments ------------------------------------------------------
|
|
123
|
+
lowScoringJudgments: {
|
|
124
|
+
label: "Low-Scoring Judgments",
|
|
125
|
+
long: "The grading model's explanations for tests that scored below 70/100.",
|
|
126
|
+
},
|
|
127
|
+
judgmentReason: {
|
|
128
|
+
label: "Judgment Reason",
|
|
129
|
+
long: "The grading model's natural language explanation of what went wrong.",
|
|
130
|
+
},
|
|
131
|
+
// -- Diagnostics overview ---------------------------------------------------
|
|
132
|
+
healthStrong: {
|
|
133
|
+
label: "Strong (80+)",
|
|
134
|
+
long: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.",
|
|
135
|
+
},
|
|
136
|
+
healthAttention: {
|
|
137
|
+
label: "Needs Attention (70–79)",
|
|
138
|
+
long: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.",
|
|
139
|
+
},
|
|
140
|
+
healthWeak: {
|
|
141
|
+
label: "Weak (<70)",
|
|
142
|
+
long: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.",
|
|
143
|
+
},
|
|
144
|
+
negativeDocLiftMetric: {
|
|
145
|
+
label: "Negative Doc Lift",
|
|
146
|
+
long: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.",
|
|
147
|
+
},
|
|
148
|
+
weakAreas: {
|
|
149
|
+
label: "Weak Areas",
|
|
150
|
+
long: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.",
|
|
151
|
+
},
|
|
152
|
+
docsHurt: {
|
|
153
|
+
label: "Docs Hurt Performance",
|
|
154
|
+
long: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.",
|
|
155
|
+
},
|
|
156
|
+
retrievalIssues: {
|
|
157
|
+
label: "Retrieval Issues",
|
|
158
|
+
long: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.",
|
|
159
|
+
},
|
|
160
|
+
dimWeaknesses: {
|
|
161
|
+
label: "Dimension Weaknesses",
|
|
162
|
+
long: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).",
|
|
163
|
+
},
|
|
164
|
+
efficiencyAnomalies: {
|
|
165
|
+
label: "Efficiency Anomalies",
|
|
166
|
+
long: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.",
|
|
167
|
+
},
|
|
168
|
+
docLiftWins: {
|
|
169
|
+
label: "Doc Lift Wins",
|
|
170
|
+
long: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.",
|
|
171
|
+
},
|
|
172
|
+
retrievalExcellence: {
|
|
173
|
+
label: "Retrieval Excellence",
|
|
174
|
+
long: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.",
|
|
175
|
+
},
|
|
176
|
+
// -- Model breakdown --------------------------------------------------------
|
|
177
|
+
modelBreakdown: {
|
|
178
|
+
label: "Model Breakdown",
|
|
179
|
+
long: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.",
|
|
180
|
+
},
|
|
181
|
+
// -- Strengths (positive diagnostics) ---------------------------------------
|
|
182
|
+
strengths: {
|
|
183
|
+
label: "Strengths",
|
|
184
|
+
long: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.",
|
|
185
|
+
},
|
|
186
|
+
// -- Recommendations / gap analysis ----------------------------------------
|
|
187
|
+
recommendations: {
|
|
188
|
+
label: "Recommendations",
|
|
189
|
+
long: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.",
|
|
190
|
+
},
|
|
191
|
+
totalPotentialLift: {
|
|
192
|
+
label: "Total Potential Lift",
|
|
193
|
+
long: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.",
|
|
194
|
+
},
|
|
195
|
+
failureMode: {
|
|
196
|
+
label: "Failure Mode",
|
|
197
|
+
long: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).",
|
|
198
|
+
},
|
|
199
|
+
estimatedLift: {
|
|
200
|
+
label: "Estimated Lift",
|
|
201
|
+
long: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.",
|
|
202
|
+
},
|
|
203
|
+
confidence: {
|
|
204
|
+
label: "Confidence",
|
|
205
|
+
long: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.",
|
|
206
|
+
},
|
|
207
|
+
// -- Agent behavior --------------------------------------------------------
|
|
208
|
+
agentBehaviorOverview: {
|
|
209
|
+
label: "Agent Behavior",
|
|
210
|
+
long: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.",
|
|
211
|
+
},
|
|
212
|
+
searchQueries: {
|
|
213
|
+
label: "Search Queries",
|
|
214
|
+
long: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.",
|
|
215
|
+
},
|
|
216
|
+
docSlugsVisited: {
|
|
217
|
+
label: "Unique Doc Slugs",
|
|
218
|
+
long: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.",
|
|
219
|
+
},
|
|
220
|
+
externalDomains: {
|
|
221
|
+
label: "External Domains",
|
|
222
|
+
long: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.",
|
|
223
|
+
},
|
|
224
|
+
avgDocPagesVisited: {
|
|
225
|
+
label: "Avg Pages Visited",
|
|
226
|
+
long: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.",
|
|
227
|
+
},
|
|
228
|
+
avgSearchesPerformed: {
|
|
229
|
+
label: "Avg Searches",
|
|
230
|
+
long: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.",
|
|
231
|
+
},
|
|
232
|
+
avgNetworkTimeMs: {
|
|
233
|
+
label: "Avg Network Time",
|
|
234
|
+
long: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.",
|
|
235
|
+
},
|
|
236
|
+
totalRequests: {
|
|
237
|
+
label: "Total Requests",
|
|
238
|
+
long: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.",
|
|
239
|
+
},
|
|
240
|
+
totalBytesDownloaded: {
|
|
241
|
+
label: "Total Bytes Downloaded",
|
|
242
|
+
long: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.",
|
|
243
|
+
},
|
|
244
|
+
// -- Dimension deltas -----------------------------------------------------
|
|
245
|
+
dimTaskCompletion: {
|
|
246
|
+
label: "Task Completion Δ",
|
|
247
|
+
long: "Change in task completion between runs. Positive means implementations are more complete.",
|
|
248
|
+
},
|
|
249
|
+
dimCodeCorrectness: {
|
|
250
|
+
label: "Code Correctness Δ",
|
|
251
|
+
long: "Change in code correctness between runs. Positive means better code quality.",
|
|
252
|
+
},
|
|
253
|
+
dimDocCoverage: {
|
|
254
|
+
label: "Doc Coverage Δ",
|
|
255
|
+
long: "Change in doc coverage between runs. Positive means the docs are providing more useful information.",
|
|
256
|
+
},
|
|
257
|
+
// -- Per-area trend delta ----------------------------------------------------
|
|
258
|
+
areaDelta: {
|
|
259
|
+
label: "Area Δ",
|
|
260
|
+
long: "Score change for this area compared to the previous evaluation run.",
|
|
261
|
+
},
|
|
262
|
+
// -- Source values -----------------------------------------------------------
|
|
263
|
+
sourceProduction: {
|
|
264
|
+
label: "Production",
|
|
265
|
+
long: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.",
|
|
266
|
+
},
|
|
267
|
+
sourceBranch: {
|
|
268
|
+
label: "Branch",
|
|
269
|
+
long: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.",
|
|
270
|
+
},
|
|
271
|
+
sourceLocal: {
|
|
272
|
+
label: "Local",
|
|
273
|
+
long: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.",
|
|
274
|
+
},
|
|
275
|
+
// -- Report list columns ----------------------------------------------------
|
|
276
|
+
reportScore: {
|
|
277
|
+
label: "Score",
|
|
278
|
+
long: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
|
|
279
|
+
},
|
|
280
|
+
reportMode: {
|
|
281
|
+
label: "Mode",
|
|
282
|
+
long: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.",
|
|
283
|
+
},
|
|
284
|
+
reportTrigger: {
|
|
285
|
+
label: "Trigger",
|
|
286
|
+
long: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.",
|
|
287
|
+
},
|
|
288
|
+
// -- Mode values -----------------------------------------------------------
|
|
289
|
+
modeBaseline: {
|
|
290
|
+
label: "Baseline",
|
|
291
|
+
long: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).",
|
|
292
|
+
},
|
|
293
|
+
modeFull: {
|
|
294
|
+
label: "Full",
|
|
295
|
+
long: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.",
|
|
296
|
+
},
|
|
297
|
+
modeAgentic: {
|
|
298
|
+
label: "Agentic",
|
|
299
|
+
long: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?",
|
|
300
|
+
},
|
|
301
|
+
modeObserved: {
|
|
302
|
+
label: "Observed",
|
|
303
|
+
long: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.",
|
|
304
|
+
},
|
|
305
|
+
modeDebug: {
|
|
306
|
+
label: "Debug",
|
|
307
|
+
long: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.",
|
|
308
|
+
},
|
|
309
|
+
// -- Trigger values --------------------------------------------------------
|
|
310
|
+
triggerManual: {
|
|
311
|
+
label: "Manual",
|
|
312
|
+
long: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.",
|
|
313
|
+
},
|
|
314
|
+
triggerCi: {
|
|
315
|
+
label: "CI",
|
|
316
|
+
long: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.",
|
|
317
|
+
},
|
|
318
|
+
triggerSchedule: {
|
|
319
|
+
label: "Scheduled",
|
|
320
|
+
long: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.",
|
|
321
|
+
},
|
|
322
|
+
triggerWebhook: {
|
|
323
|
+
label: "Webhook",
|
|
324
|
+
long: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.",
|
|
325
|
+
},
|
|
326
|
+
triggerCrossRepo: {
|
|
327
|
+
label: "Cross-Repo",
|
|
328
|
+
long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
329
|
+
},
|
|
330
|
+
};
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Re-export of the build-generated help-topic table.
|
|
3
|
+
*
|
|
4
|
+
* The underlying file `src/generated/help-content.ts` is emitted by
|
|
5
|
+
* `scripts/extract-help.ts` and is gitignored. Run `pnpm extract-help`
|
|
6
|
+
* (invoked automatically by this package's `prebuild`) to (re)generate it.
|
|
7
|
+
*
|
|
8
|
+
* @see scripts/extract-help.ts
|
|
9
|
+
*/
|
|
10
|
+
export { HELP_TOPICS } from "./generated/help-content.js";
|