@sanity/ailf-studio 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.d.ts +28 -13
- package/dist/index.js +283 -190
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -138,7 +138,7 @@ registration step needed. There are four ways to execute tasks:
|
|
|
138
138
|
| **Scheduled pipeline** | GitHub Actions cron (daily + weekly) | All enabled tasks |
|
|
139
139
|
|
|
140
140
|
See the
|
|
141
|
-
[CONTRIBUTING_TASKS](https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/
|
|
141
|
+
[CONTRIBUTING_TASKS](https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md#running-your-task)
|
|
142
142
|
guide for the full execution flow and details on each method.
|
|
143
143
|
|
|
144
144
|
## Dashboard Views
|
package/dist/index.d.ts
CHANGED
|
@@ -26,7 +26,7 @@ import { DocumentRef } from './document-ref.js';
|
|
|
26
26
|
* `ownership: "repo"` (i.e., active mirrors). Native tasks and
|
|
27
27
|
* already-graduated tasks never see it.
|
|
28
28
|
*
|
|
29
|
-
* @see docs/exec-plans/task-lifecycle/phase-1-ownership.md
|
|
29
|
+
* @see docs/archive/exec-plans/task-lifecycle/phase-1-ownership.md
|
|
30
30
|
*/
|
|
31
31
|
|
|
32
32
|
declare const GraduateToNativeAction: DocumentActionComponent;
|
|
@@ -90,7 +90,7 @@ declare function ReleasePicker(props: StringInputProps): react_jsx_runtime.JSX.E
|
|
|
90
90
|
*
|
|
91
91
|
* Paired with `SyncStatusBadge` to show sync freshness for active mirrors.
|
|
92
92
|
*
|
|
93
|
-
* @see docs/exec-plans/task-lifecycle/phase-1-ownership.md
|
|
93
|
+
* @see docs/archive/exec-plans/task-lifecycle/phase-1-ownership.md
|
|
94
94
|
*/
|
|
95
95
|
interface GitAuthorInfo {
|
|
96
96
|
gitName?: string;
|
|
@@ -130,7 +130,7 @@ declare function MirrorBanner({ origin, ownership }: MirrorBannerProps): react_j
|
|
|
130
130
|
* Uses the `origin.lastSyncedAt` timestamp that the pipeline sets
|
|
131
131
|
* on every mirror upsert (Phase 5a).
|
|
132
132
|
*
|
|
133
|
-
* @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
133
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
134
134
|
*/
|
|
135
135
|
interface SyncStatusBadgeProps {
|
|
136
136
|
/** ISO 8601 timestamp from origin.lastSyncedAt */
|
|
@@ -399,10 +399,25 @@ interface ReportListItem {
|
|
|
399
399
|
interface ScoreItem {
|
|
400
400
|
codeCorrectness: number;
|
|
401
401
|
docCoverage: number;
|
|
402
|
+
/**
|
|
403
|
+
* Generic dimension scores map — all dimensions by camelCase key (0–100).
|
|
404
|
+
*
|
|
405
|
+
* Non-literacy modes (agent-harness, mcp-server) store their actual
|
|
406
|
+
* dimensions here (e.g., agentOutput, toolUsage). Literacy mode may
|
|
407
|
+
* also populate this alongside the three legacy named fields above.
|
|
408
|
+
*
|
|
409
|
+
* UI components should read from this map via `resolveDimensions()` in
|
|
410
|
+
* `lib/dimensions.ts` rather than hardcoding the three named fields.
|
|
411
|
+
*/
|
|
412
|
+
dimensions?: Record<string, number>;
|
|
402
413
|
docLift: number;
|
|
403
414
|
/** Sanity documents used for this feature area's evaluation */
|
|
404
415
|
documents?: DocumentRef[];
|
|
405
416
|
feature: string;
|
|
417
|
+
/** Grouping strategy — "task" for agent-harness, "feature" for literacy */
|
|
418
|
+
groupType?: "aggregate" | "feature" | "task";
|
|
419
|
+
/** True when floor > ceiling (docs hurt performance) */
|
|
420
|
+
negativeDocLift?: boolean;
|
|
406
421
|
taskCompletion: number;
|
|
407
422
|
testCount: number;
|
|
408
423
|
totalScore: number;
|
|
@@ -611,24 +626,24 @@ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
|
|
|
611
626
|
* stat cards, table headers, and comparison views.
|
|
612
627
|
*
|
|
613
628
|
* @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
|
|
614
|
-
* @see docs/
|
|
629
|
+
* @see docs/architecture.md (scoring model)
|
|
615
630
|
*/
|
|
616
631
|
declare const GLOSSARY: {
|
|
617
|
-
readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
|
|
618
|
-
readonly docLift: "How much the docs help, compared to the model's training data alone.
|
|
632
|
+
readonly overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
|
|
633
|
+
readonly docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
|
|
619
634
|
readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
|
|
620
635
|
readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
|
|
621
636
|
readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
|
|
622
|
-
readonly floor: "
|
|
637
|
+
readonly floor: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
|
|
623
638
|
readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
|
|
624
639
|
readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
|
|
625
640
|
readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
|
|
626
641
|
readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
|
|
627
642
|
readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
|
|
628
|
-
readonly score: "
|
|
643
|
+
readonly score: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
|
|
629
644
|
readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
|
|
630
645
|
readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
|
|
631
|
-
readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
|
|
646
|
+
readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
|
|
632
647
|
readonly tests: "Number of test cases in this feature area.";
|
|
633
648
|
readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
|
|
634
649
|
readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
|
|
@@ -645,7 +660,7 @@ declare const GLOSSARY: {
|
|
|
645
660
|
readonly healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
|
|
646
661
|
readonly negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
|
|
647
662
|
readonly weakAreas: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
|
|
648
|
-
readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation
|
|
663
|
+
readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.";
|
|
649
664
|
readonly retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
|
|
650
665
|
readonly dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
|
|
651
666
|
readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
|
|
@@ -674,7 +689,7 @@ declare const GLOSSARY: {
|
|
|
674
689
|
readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
|
|
675
690
|
readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
|
|
676
691
|
readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
|
|
677
|
-
readonly reportScore: "The overall
|
|
692
|
+
readonly reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
|
|
678
693
|
readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
|
|
679
694
|
readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
|
|
680
695
|
readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
|
|
@@ -899,7 +914,7 @@ declare const reportSchema: {
|
|
|
899
914
|
* repositories. Mirrored tasks have a read-only `origin` block that
|
|
900
915
|
* tracks their source repo provenance.
|
|
901
916
|
*
|
|
902
|
-
* @see docs/
|
|
917
|
+
* @see docs/contributing-tasks.md#running-your-task — full execution guide
|
|
903
918
|
* @see docs/design-docs/tasks-as-content.md
|
|
904
919
|
* @see docs/design-docs/tasks-as-content.md#decision-8-domain-specific-assertion-types-not-a-promptfoo-subset
|
|
905
920
|
*/
|
|
@@ -909,11 +924,11 @@ declare const taskSchema: {
|
|
|
909
924
|
} & Omit<sanity.DocumentDefinition, "preview"> & {
|
|
910
925
|
preview?: sanity.PreviewConfig<{
|
|
911
926
|
area: string;
|
|
912
|
-
description: string;
|
|
913
927
|
id: string;
|
|
914
928
|
origin: string;
|
|
915
929
|
ownership: string;
|
|
916
930
|
status: string;
|
|
931
|
+
title: string;
|
|
917
932
|
}, Record<string, unknown>> | undefined;
|
|
918
933
|
};
|
|
919
934
|
|
package/dist/index.js
CHANGED
|
@@ -117,12 +117,24 @@ function scoreGrade(score) {
|
|
|
117
117
|
// ../shared/dist/noise-threshold.js
|
|
118
118
|
var NOISE_THRESHOLD = 2;
|
|
119
119
|
|
|
120
|
-
// ../shared/dist/
|
|
121
|
-
var
|
|
122
|
-
"
|
|
123
|
-
"
|
|
124
|
-
"
|
|
125
|
-
|
|
120
|
+
// ../shared/dist/eval-modes.js
|
|
121
|
+
var CANONICAL_EVAL_MODES = [
|
|
122
|
+
"literacy",
|
|
123
|
+
"mcp-server",
|
|
124
|
+
"agent-harness",
|
|
125
|
+
"knowledge-probe",
|
|
126
|
+
"custom"
|
|
127
|
+
];
|
|
128
|
+
var LEGACY_EVAL_MODE_ALIASES = [
|
|
129
|
+
"baseline",
|
|
130
|
+
"agentic",
|
|
131
|
+
"observed",
|
|
132
|
+
"full"
|
|
133
|
+
];
|
|
134
|
+
var RAW_EVAL_MODES = [
|
|
135
|
+
...CANONICAL_EVAL_MODES,
|
|
136
|
+
...LEGACY_EVAL_MODE_ALIASES
|
|
137
|
+
];
|
|
126
138
|
|
|
127
139
|
// src/types.ts
|
|
128
140
|
function formatPercent(n) {
|
|
@@ -2218,8 +2230,8 @@ var taskSchema = defineType5({
|
|
|
2218
2230
|
defineField5({
|
|
2219
2231
|
description: "Human-readable label shown in reports and Studio",
|
|
2220
2232
|
group: ["main", "all-fields"],
|
|
2221
|
-
name: "description",
|
|
2222
|
-
title: "Description",
|
|
2233
|
+
name: "title",
|
|
2234
|
+
title: "Title",
|
|
2223
2235
|
type: "string",
|
|
2224
2236
|
validation: (rule) => rule.required()
|
|
2225
2237
|
}),
|
|
@@ -2280,9 +2292,9 @@ var taskSchema = defineType5({
|
|
|
2280
2292
|
defineField5({
|
|
2281
2293
|
description: "The implementation prompt sent to the evaluated LLM. This is the user-facing request (e.g., 'Write GROQ queries for a blog app...'). Plain text or Markdown.",
|
|
2282
2294
|
group: ["main", "all-fields"],
|
|
2283
|
-
name: "
|
|
2295
|
+
name: "promptText",
|
|
2284
2296
|
rows: 10,
|
|
2285
|
-
title: "
|
|
2297
|
+
title: "Prompt Text",
|
|
2286
2298
|
type: "text",
|
|
2287
2299
|
validation: (rule) => rule.required()
|
|
2288
2300
|
}),
|
|
@@ -2292,7 +2304,7 @@ var taskSchema = defineType5({
|
|
|
2292
2304
|
defineField5({
|
|
2293
2305
|
description: "Feature area this task belongs to. Used for score aggregation and --area CLI filtering.",
|
|
2294
2306
|
group: ["main", "all-fields"],
|
|
2295
|
-
name: "
|
|
2307
|
+
name: "area",
|
|
2296
2308
|
title: "Feature Area",
|
|
2297
2309
|
to: [{ type: "ailf.featureArea" }],
|
|
2298
2310
|
type: "reference",
|
|
@@ -2312,7 +2324,7 @@ var taskSchema = defineType5({
|
|
|
2312
2324
|
defineField5({
|
|
2313
2325
|
description: "Documentation that the LLM should use to complete this task. These become the gold-standard doc context injected in baseline mode. Each entry is either a single article document or a content release (whose articles are all included automatically).",
|
|
2314
2326
|
group: ["main", "all-fields"],
|
|
2315
|
-
name: "
|
|
2327
|
+
name: "contextDocs",
|
|
2316
2328
|
of: [
|
|
2317
2329
|
{
|
|
2318
2330
|
components: {
|
|
@@ -2437,7 +2449,7 @@ var taskSchema = defineType5({
|
|
|
2437
2449
|
defineField5({
|
|
2438
2450
|
description: "Grading criteria for evaluating the LLM's output. At least one assertion is required. Use llm-rubric with a template for scored evaluation, or value-based assertions for exact checks.",
|
|
2439
2451
|
group: ["main", "all-fields"],
|
|
2440
|
-
name: "
|
|
2452
|
+
name: "assertions",
|
|
2441
2453
|
of: [
|
|
2442
2454
|
{
|
|
2443
2455
|
fields: [
|
|
@@ -2585,8 +2597,8 @@ var taskSchema = defineType5({
|
|
|
2585
2597
|
type: "boolean"
|
|
2586
2598
|
}),
|
|
2587
2599
|
defineField5({
|
|
2588
|
-
description: 'Rubric mode for baseline. "
|
|
2589
|
-
initialValue: "
|
|
2600
|
+
description: 'Rubric mode for baseline. "full" uses the same rubric as gold, "abbreviated" uses a shorter rubric, "none" skips rubric grading.',
|
|
2601
|
+
initialValue: "full",
|
|
2590
2602
|
name: "rubric",
|
|
2591
2603
|
options: {
|
|
2592
2604
|
list: [
|
|
@@ -2916,11 +2928,11 @@ var taskSchema = defineType5({
|
|
|
2916
2928
|
preview: {
|
|
2917
2929
|
prepare({
|
|
2918
2930
|
area,
|
|
2919
|
-
description,
|
|
2920
2931
|
id,
|
|
2921
2932
|
origin,
|
|
2922
2933
|
ownership,
|
|
2923
|
-
status
|
|
2934
|
+
status,
|
|
2935
|
+
title
|
|
2924
2936
|
}) {
|
|
2925
2937
|
const taskId = id !== null && typeof id === "object" && "current" in id ? id.current : void 0;
|
|
2926
2938
|
const isMirror = ownership === "repo" || !ownership && origin !== null && typeof origin === "object" && "repo" in origin;
|
|
@@ -2944,16 +2956,16 @@ var taskSchema = defineType5({
|
|
|
2944
2956
|
const statusIcon = status === "archived" ? "\u{1F4E6} " : status === "draft" ? "\u{1F9EA} " : status === "paused" ? "\u23F8\uFE0F " : "";
|
|
2945
2957
|
return {
|
|
2946
2958
|
subtitle: `${areaStr}${typeof taskId === "string" ? taskId : ""}${syncInfo}`,
|
|
2947
|
-
title: `${prefix}${statusIcon}${typeof description === "string" ? description : "Task"}`
|
|
2959
|
+
title: `${prefix}${statusIcon}${typeof title === "string" ? title : "Task"}`
|
|
2948
2960
|
};
|
|
2949
2961
|
},
|
|
2950
2962
|
select: {
|
|
2951
|
-
area: "
|
|
2952
|
-
description: "description",
|
|
2963
|
+
area: "area.areaId",
|
|
2953
2964
|
id: "id",
|
|
2954
2965
|
origin: "origin",
|
|
2955
2966
|
ownership: "ownership",
|
|
2956
|
-
status: "status"
|
|
2967
|
+
status: "status",
|
|
2968
|
+
title: "title"
|
|
2957
2969
|
}
|
|
2958
2970
|
},
|
|
2959
2971
|
// Document-level read-only when owned by a repo.
|
|
@@ -3138,16 +3150,6 @@ function useHelp() {
|
|
|
3138
3150
|
|
|
3139
3151
|
// src/generated/help-content.ts
|
|
3140
3152
|
var HELP_TOPICS = [
|
|
3141
|
-
{
|
|
3142
|
-
"id": "eval-modes",
|
|
3143
|
-
"title": "Evaluation Modes",
|
|
3144
|
-
"body": "The framework runs each task under different conditions to isolate what helps AI\nagents succeed:\n\n**Full** (default) \u2014 Runs baseline + agentic sequentially in a single pipeline\nrun, then merges results into a **three-layer decomposition**: floor (model\nknowledge alone), ceiling (gold-standard docs injected), and actual\n(agent-retrieved docs). The retrieval gap (ceiling \u2212 actual) measures how much\ndocumentation quality is lost to discoverability.\n\n**Baseline** \u2014 Docs are fetched from Sanity via GROQ and injected directly into\nthe prompt. Each task runs twice: once with docs, once without. The LLM never\ntouches the internet. The score difference is the **Doc Lift** \u2014 how much\ndocumentation helps vs. training data alone.\n\n**Agentic** \u2014 The LLM gets no docs in the prompt. Instead it receives tools\n(`web_search`, `fetch_page`, `list_docs`) and must find relevant documentation\non its own. Two sub-modes test different discovery strategies:\n\n- **Naive** \u2014 web search \u2192 reads pages via Jina Reader (simulates a human)\n- **Optimized** \u2014 starts from `llms.txt` \u2192 fetches `.md` versions directly\n\n**Observed** \u2014 Same as baseline (docs injected in prompt), but the LLM's HTTP\nactivity is recorded through an instrumented proxy (`InstrumentedProvider`).\nCaptures which API endpoints the model references without changing its behavior.",
|
|
3145
|
-
"source": "docs/ARCHITECTURE.md",
|
|
3146
|
-
"related": [
|
|
3147
|
-
"scoring-model",
|
|
3148
|
-
"three-layer"
|
|
3149
|
-
]
|
|
3150
|
-
},
|
|
3151
3153
|
{
|
|
3152
3154
|
"id": "negative-doc-lift",
|
|
3153
3155
|
"title": "When Docs Hurt: Negative Doc Lift",
|
|
@@ -3305,7 +3307,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
|
3305
3307
|
{
|
|
3306
3308
|
"id": "scoring-model",
|
|
3307
3309
|
"title": "Understanding Scores",
|
|
3308
|
-
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task
|
|
3310
|
+
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs): Total = Task \xD7 0.50 + Code \xD7 0.25 + Docs \xD7 0.25\nBaseline (no docs): Total = Task \xD7 0.60 + Code \xD7 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling \u2212 floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100** | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379** | Needs attention \u2014 there may be gaps in specific dimensions |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. 
This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
|
|
3309
3311
|
"source": "docs/help/scoring-model.md",
|
|
3310
3312
|
"related": [
|
|
3311
3313
|
"three-layer",
|
|
@@ -3327,17 +3329,27 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
|
3327
3329
|
{
|
|
3328
3330
|
"id": "how-agents-work",
|
|
3329
3331
|
"title": "How AI Agents Find Documentation",
|
|
3330
|
-
"body": "Understanding how popular AI coding agents retrieve and use documentation is\ncentral to the ai-literacy-framework evaluation framework. This document\nexplains the mechanisms used by common agents and how our test modes simulate\nthem.\n\n## The Documentation Access Problem\n\nWhen a developer asks an AI coding assistant \"Set up a Sanity Studio with a\ncustom blog schema,\" the agent needs to find and read the relevant Sanity\ndocumentation. But different agents do this in fundamentally different ways, and\nthose differences directly impact the quality of the response.\n\nThe framework measures this impact through four evaluation modes: **full**\n(default \u2014 runs baseline + agentic together), **baseline** (docs in prompt),\n**agentic** (tool-calling with real web access), and **observed** (instrumented\nsingle-call).\n\n## How Popular Agents Work\n\n### Claude Code (Anthropic)\n\nClaude Code has built-in tools including `WebSearchTool` and `WebFetchTool`.\nWhen a user asks a Sanity question:\n\n1. The model decides whether to search the web\n2. If so, it calls `WebSearchTool` with a query string\n3. Search results come back as structured data (titles, URLs, snippets)\n4. The model may call `WebFetchTool` to read specific pages\n5. The fetched content is returned as **rendered text** \u2014 Claude Code's fetch\n tool handles JavaScript rendering internally, so even SPA pages return\n readable content\n6. The model synthesizes the fetched docs with its training data to produce an\n answer\n\n**Key characteristic**: Claude Code sees the web as rendered, readable text. It\ndoesn't get raw HTML soup. But it also doesn't know about agent-friendly\nendpoints like `.md` files or `llms.txt` \u2014 it fetches the same HTML pages a\nbrowser would load.\n\n### ChatGPT (OpenAI)\n\nChatGPT's browsing capability uses Bing search under the hood:\n\n1. The model decides to search (users can also explicitly ask it to browse)\n2. 
It searches via Bing, getting ranked results\n3. It can \"click\" on results to read page content\n4. Pages are rendered server-side and returned as text\n5. The model reads relevant sections and synthesizes an answer\n\n**Key characteristic**: ChatGPT's browsing is similar to Claude Code \u2014 it gets\nrendered content. The URLs visited are returned in citations. It also has no\nawareness of `llms.txt` or `.md` endpoints.\n\n### Cursor\n\nCursor takes a different approach:\n\n1. It maintains a pre-built index of popular documentation sites (`@docs`)\n2. Users can manually add documentation sources\n3. It also has web search capability for unknown topics\n4. Codebase context is injected automatically from the project\n\n**Key characteristic**: Cursor's `@docs` feature means it may have indexed\nSanity docs already, but the index may be outdated. For unknown topics, it falls\nback to web search like the other agents.\n\n### GitHub Copilot\n\nCopilot primarily relies on:\n\n1. The model's training data (parametric knowledge)\n2. Codebase context from the current project\n3. Bing search for `@workspace` queries in newer versions\n\n**Key characteristic**: Copilot historically had no web access, relying entirely\non training data. Newer versions can search, but the experience is similar to\nChatGPT.\n\n## The JavaScript SPA Problem\n\nSanity's documentation site (`sanity.io/docs`) is built with Next.js \u2014 a\nJavaScript single-page application. When an agent makes a raw HTTP request:\n\n```\nGET https://www.sanity.io/docs/schema-types\n\u2192 Returns ~125KB of HTML that is mostly:\n - <script> tags for Next.js bundles\n - React hydration data\n - Navigation chrome\n - Very little actual documentation text\n```\n\nReal agents handle this differently than a raw `fetch()`:\n\n| Agent | Raw fetch? | Gets readable content? | How? 
|\n| ------------- | ---------- | ---------------------- | ---------------------------------- |\n| Claude Code | No | Yes | Built-in rendering in WebFetchTool |\n| ChatGPT | No | Yes | Server-side rendering via Bing |\n| Cursor | No | Yes | Pre-built doc index |\n| Raw `fetch()` | Yes | **No** | Gets HTML soup |\n\nThis is why the agentic provider uses **Jina Reader** (`r.jina.ai`) as a\nreadability proxy in \"naive\" mode \u2014 it simulates the rendering capability that\nreal agents have built in.\n\n## Sanity's Agent-Friendly Endpoints\n\nSanity has invested in making their documentation accessible to AI agents\nthrough special endpoints:\n\n### `.md` endpoint\n\nAppending `.md` to any docs URL returns pure markdown:\n\n```\nGET https://www.sanity.io/docs/schema-types.md\nContent-Type: text/markdown;charset=UTF-8\n\n# Schema types\nSchema types are used to define the shape of your content...\n```\n\nThis returns **clean markdown** \u2014 no HTML, no JavaScript, no navigation. Just\nthe documentation content. 
Typical response size: 2-10KB (vs 125KB for the HTML\npage).\n\n### `llms.txt`\n\nSanity provides an `llms.txt` file at `https://www.sanity.io/docs/llms.txt` \u2014 a\nstructured listing of all documentation pages designed for AI agent consumption:\n\n```\n# Sanity\n## Docs\n- [Manage Sanity with code](https://www.sanity.io/docs/blueprints)\n- [Introduction](https://www.sanity.io/docs/blueprints-introduction)\n- [Deploy with GitHub Actions](https://www.sanity.io/docs/blueprints/blueprint-action)\n...\n```\n\nThis follows the emerging [llms.txt standard](https://llmstxt.org/) \u2014 a\nmachine-readable table of contents that tells agents what documentation is\navailable and where to find it.\n\n### Impact on Agent Performance\n\nOur smoke tests show the dramatic difference these endpoints make:\n\n| Metric | Naive Agent (Jina) | Optimized Agent (.md) |\n| ---------------- | ------------------ | --------------------- |\n| Result | \u274C FAIL | \u2705 PASS |\n| Latency | 57.9s | 15.2s (3.8\xD7 faster) |\n| Bytes downloaded | 108 KB | 59 KB (45% less) |\n| Total requests | 9 | 6 (33% fewer) |\n| Search queries | 3 | 0 (used llms.txt) |\n\nThe optimized agent skips search entirely \u2014 it calls `list_docs(\"sanity.io\")` to\nget the `llms.txt` table of contents, identifies the relevant pages, and fetches\nthem directly as `.md`. 
No search round-trips, no proxy overhead, no content\ncleaning needed.\n\n## How Test Modes Map to Real Agents\n\n| Mode | Config | Simulates | Documentation Access |\n| -------------------------- | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------- |\n| `eval` (baseline) | `promptfooconfig.yaml` | No agent \u2014 docs in prompt | Docs are injected directly into the prompt context, with and without variants |\n| `eval:observed` | `promptfooconfig.observed.yaml` | Non-agentic API call | Single OpenAI API call, records the HTTP request but model doesn't browse |\n| `eval:agentic` (naive) | `promptfooconfig.agentic.yaml` | Claude Code, ChatGPT today | Model has `web_search` + `fetch_page` tools; pages fetched via Jina Reader (simulates JS rendering) |\n| `eval:agentic` (optimized) | `promptfooconfig.agentic.yaml` | Ideal future agent | Model has `web_search` + `fetch_page` + `list_docs` tools; fetches `.md` endpoints directly, uses `llms.txt` for discovery |\n\n### Why Both Naive and Optimized?\n\nThe comparison between naive and optimized modes answers a critical business\nquestion:\n\n> **\"How much does investing in agent-friendly documentation endpoints (`.md`,\n> `llms.txt`) improve the AI developer experience?\"**\n\nIf the optimized agent significantly outperforms the naive agent, it validates\nthe investment in these endpoints. The data from our tests provides concrete\nevidence for this.\n\n## Limitations of the Simulation\n\nWhile the agentic provider faithfully simulates agent behavior, there are\ndifferences from real agents:\n\n1. **Search quality**: We use DuckDuckGo via Jina as a search fallback. Real\n agents use Bing (ChatGPT) or their own search (Claude Code). Search result\n quality varies.\n\n2. 
**Page rendering**: Jina Reader is a good proxy for JS rendering, but may\n produce slightly different output than what Claude Code or ChatGPT's internal\n renderers produce.\n\n3. **Context window management**: Real agents have sophisticated context\n management \u2014 they may truncate long pages, summarize content, or use sliding\n windows. Our provider returns content up to a fixed limit (12KB).\n\n4. **Codebase context**: Real agents (especially Cursor and Copilot) inject the\n developer's current codebase into context. Our eval doesn't simulate this \u2014\n it only tests documentation retrieval.\n\n5. **Multi-turn interactions**: A real developer might have a conversation with\n their agent, refining the request. Our eval tests single-turn interactions.\n\n## Future Directions\n\n- **Subprocess agents**: Run actual `claude` CLI or other agent CLIs as\n subprocesses and capture their real network traffic via HTTP proxy\n interception\n- **Anthropic/OpenAI native tools**: Use Claude's built-in `web_search` tool or\n OpenAI's `web_search_preview` in the Responses API for more faithful\n simulation\n- **Agent-specific configs**: Separate configs that match each agent's exact\n tool set and system prompt\n- **Codebase context injection**: Simulate a project workspace to test how\n agents combine docs with code context",
|
|
3332
|
+
"body": "Understanding how popular AI coding agents retrieve and use documentation is\ncentral to the ai-literacy-framework evaluation framework. This document\nexplains the mechanisms used by common agents and how our test modes simulate\nthem.\n\n## The Documentation Access Problem\n\nWhen a developer asks an AI coding assistant \"Set up a Sanity Studio with a\ncustom blog schema,\" the agent needs to find and read the relevant Sanity\ndocumentation. But different agents do this in fundamentally different ways, and\nthose differences directly impact the quality of the response.\n\nThe framework measures this impact through four evaluation modes: **full**\n(default \u2014 runs baseline + agentic together), **baseline** (docs in prompt),\n**agentic** (tool-calling with real web access), and **observed** (instrumented\nsingle-call).\n\n## How Popular Agents Work\n\n### Claude Code (Anthropic)\n\nClaude Code has built-in tools including `WebSearchTool` and `WebFetchTool`.\nWhen a user asks a Sanity question:\n\n1. The model decides whether to search the web\n2. If so, it calls `WebSearchTool` with a query string\n3. Search results come back as structured data (titles, URLs, snippets)\n4. The model may call `WebFetchTool` to read specific pages\n5. The fetched content is returned as **rendered text** \u2014 Claude Code's fetch\n tool handles JavaScript rendering internally, so even SPA pages return\n readable content\n6. The model synthesizes the fetched docs with its training data to produce an\n answer\n\n**Key characteristic**: Claude Code sees the web as rendered, readable text. It\ndoesn't get raw HTML soup. But it also doesn't know about agent-friendly\nendpoints like `.md` files or `llms.txt` \u2014 it fetches the same HTML pages a\nbrowser would load.\n\n### ChatGPT (OpenAI)\n\nChatGPT's browsing capability uses Bing search under the hood:\n\n1. The model decides to search (users can also explicitly ask it to browse)\n2. 
It searches via Bing, getting ranked results\n3. It can \"click\" on results to read page content\n4. Pages are rendered server-side and returned as text\n5. The model reads relevant sections and synthesizes an answer\n\n**Key characteristic**: ChatGPT's browsing is similar to Claude Code \u2014 it gets\nrendered content. The URLs visited are returned in citations. It also has no\nawareness of `llms.txt` or `.md` endpoints.\n\n### Cursor\n\nCursor takes a different approach:\n\n1. It maintains a pre-built index of popular documentation sites (`@docs`)\n2. Users can manually add documentation sources\n3. It also has web search capability for unknown topics\n4. Codebase context is injected automatically from the project\n\n**Key characteristic**: Cursor's `@docs` feature means it may have indexed\nSanity docs already, but the index may be outdated. For unknown topics, it falls\nback to web search like the other agents.\n\n### GitHub Copilot\n\nCopilot primarily relies on:\n\n1. The model's training data (parametric knowledge)\n2. Codebase context from the current project\n3. Bing search for `@workspace` queries in newer versions\n\n**Key characteristic**: Copilot historically had no web access, relying entirely\non training data. Newer versions can search, but the experience is similar to\nChatGPT.\n\n## The JavaScript SPA Problem\n\nSanity's documentation site (`sanity.io/docs`) is built with Next.js \u2014 a\nJavaScript single-page application. When an agent makes a raw HTTP request:\n\n```\nGET https://www.sanity.io/docs/schema-types\n\u2192 Returns ~125KB of HTML that is mostly:\n - <script> tags for Next.js bundles\n - React hydration data\n - Navigation chrome\n - Very little actual documentation text\n```\n\nReal agents handle this differently than a raw `fetch()`:\n\n| Agent | Raw fetch? | Gets readable content? | How? 
|\n| ------------- | ---------- | ---------------------- | ---------------------------------- |\n| Claude Code | No | Yes | Built-in rendering in WebFetchTool |\n| ChatGPT | No | Yes | Server-side rendering via Bing |\n| Cursor | No | Yes | Pre-built doc index |\n| Raw `fetch()` | Yes | **No** | Gets HTML soup |\n\nThis is why the agentic provider uses **Jina Reader** (`r.jina.ai`) as a\nreadability proxy in \"naive\" mode \u2014 it simulates the rendering capability that\nreal agents have built in.\n\n## Sanity's Agent-Friendly Endpoints\n\nSanity has invested in making their documentation accessible to AI agents\nthrough special endpoints:\n\n### `.md` endpoint\n\nAppending `.md` to any docs URL returns pure markdown:\n\n```\nGET https://www.sanity.io/docs/schema-types.md\nContent-Type: text/markdown;charset=UTF-8\n\n# Schema types\nSchema types are used to define the shape of your content...\n```\n\nThis returns **clean markdown** \u2014 no HTML, no JavaScript, no navigation. Just\nthe documentation content. 
Typical response size: 2-10KB (vs 125KB for the HTML\npage).\n\n### `llms.txt`\n\nSanity provides an `llms.txt` file at `https://www.sanity.io/docs/llms.txt` \u2014 a\nstructured listing of all documentation pages designed for AI agent consumption:\n\n```\n# Sanity\n## Docs\n- [Manage Sanity with code](https://www.sanity.io/docs/blueprints)\n- [Introduction](https://www.sanity.io/docs/blueprints-introduction)\n- [Deploy with GitHub Actions](https://www.sanity.io/docs/blueprints/blueprint-action)\n...\n```\n\nThis follows the emerging [llms.txt standard](https://llmstxt.org/) \u2014 a\nmachine-readable table of contents that tells agents what documentation is\navailable and where to find it.\n\n### Impact on Agent Performance\n\nOur smoke tests show the dramatic difference these endpoints make:\n\n| Metric | Naive Agent (Jina) | Optimized Agent (.md) |\n| ---------------- | ------------------ | --------------------- |\n| Result | \u274C FAIL | \u2705 PASS |\n| Latency | 57.9s | 15.2s (3.8\xD7 faster) |\n| Bytes downloaded | 108 KB | 59 KB (45% less) |\n| Total requests | 9 | 6 (33% fewer) |\n| Search queries | 3 | 0 (used llms.txt) |\n\nThe optimized agent skips search entirely \u2014 it calls `list_docs(\"sanity.io\")` to\nget the `llms.txt` table of contents, identifies the relevant pages, and fetches\nthem directly as `.md`. 
No search round-trips, no proxy overhead, no content\ncleaning needed.\n\n## How Test Modes Map to Real Agents\n\n| Mode | Config | Simulates | Documentation Access |\n| -------------------------- | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------- |\n| `eval` (baseline) | `promptfooconfig.yaml` | No agent \u2014 docs in prompt | Docs are injected directly into the prompt context, with and without variants |\n| `eval:observed` | `promptfooconfig.observed.yaml` | Non-agentic API call | Single OpenAI API call, records the HTTP request but model doesn't browse |\n| `eval:agentic` (naive) | `promptfooconfig.agentic.yaml` | Claude Code, ChatGPT today | Model has `web_search` + `fetch_page` tools; pages fetched via Jina Reader (simulates JS rendering) |\n| `eval:agentic` (optimized) | `promptfooconfig.agentic.yaml` | Ideal future agent | Model has `web_search` + `fetch_page` + `list_docs` tools; fetches `.md` endpoints directly, uses `llms.txt` for discovery |\n| `agent-harness` | compiled via compiler | Real agent in sandbox | Agent harness mode evaluates real agent behavior in a sandboxed environment (Docker, tempdir, git-worktree) |\n\n### Why Both Naive and Optimized?\n\nThe comparison between naive and optimized modes answers a critical business\nquestion:\n\n> **\"How much does investing in agent-friendly documentation endpoints (`.md`,\n> `llms.txt`) improve the AI developer experience?\"**\n\nIf the optimized agent significantly outperforms the naive agent, it validates\nthe investment in these endpoints. The data from our tests provides concrete\nevidence for this.\n\n## Limitations of the Simulation\n\nWhile the agentic provider faithfully simulates agent behavior, there are\ndifferences from real agents:\n\n1. **Search quality**: We use DuckDuckGo via Jina as a search fallback. 
Real\n agents use Bing (ChatGPT) or their own search (Claude Code). Search result\n quality varies.\n\n2. **Page rendering**: Jina Reader is a good proxy for JS rendering, but may\n produce slightly different output than what Claude Code or ChatGPT's internal\n renderers produce.\n\n3. **Context window management**: Real agents have sophisticated context\n management \u2014 they may truncate long pages, summarize content, or use sliding\n windows. Our provider returns content up to a fixed limit (12KB).\n\n4. **Codebase context**: Real agents (especially Cursor and Copilot) inject the\n developer's current codebase into context. Our eval doesn't simulate this \u2014\n it only tests documentation retrieval.\n\n5. **Multi-turn interactions**: A real developer might have a conversation with\n their agent, refining the request. Our eval tests single-turn interactions.\n\n## Future Directions\n\nThe architecture overhaul (Phase 4: agent harness mode) addressed several of\nthese goals \u2014 real agents can now be evaluated in sandboxed environments with\nfixture provisioning, tool manifests, and process-quality scoring. Remaining\ndirections:\n\n- **Subprocess agents** _(partially addressed by agent harness mode)_: The\n harness supports running agents via entrypoints in Docker, tempdir, or\n git-worktree sandboxes. Real `claude` CLI or other agent CLIs can be\n configured as harness entrypoints.\n- **Anthropic/OpenAI native tools**: Use Claude's built-in `web_search` tool or\n OpenAI's `web_search_preview` in the Responses API for more faithful\n simulation of the agentic mode\n- **Agent-specific configs**: The compiler's mode handler system makes it\n straightforward to create per-agent configurations\n- **Codebase context injection** _(partially addressed by fixture\n provisioning)_: The agent harness fixture provisioner can inject project\n workspaces, dependency manifests, and code contexts into sandbox environments",
|
|
3331
3333
|
"source": "docs/how-agents-work.md",
|
|
3332
3334
|
"related": [
|
|
3333
3335
|
"eval-modes",
|
|
3334
3336
|
"retrieval-gap"
|
|
3335
3337
|
]
|
|
3336
3338
|
},
|
|
3339
|
+
{
|
|
3340
|
+
"id": "eval-modes",
|
|
3341
|
+
"title": "Evaluation Modes",
|
|
3342
|
+
"body": '> **This guide is for:** Anyone using AILF who wants to understand what modes\n> exist and when to use each one.\n\nAILF supports five canonical evaluation modes. Each mode measures a different\naspect of AI tool effectiveness.\n\n## Mode overview\n\n| Mode | What it measures | When to use it |\n| ------------------- | ---------------------------------------------------- | ------------------------------------- |\n| **literacy** | Can AI agents implement features using your docs? | Testing documentation quality |\n| **mcp-server** | Can an LLM correctly use your MCP server\'s tools? | Testing MCP server implementations |\n| **knowledge-probe** | What does the model know without any docs? | Measuring baseline model knowledge |\n| **agent-harness** | Can an autonomous agent complete tasks in a sandbox? | Testing agent capabilities end-to-end |\n| **custom** | Whatever you define | Building your own evaluation type |\n\n## Choosing a mode\n\n```\nWhat do you want to test?\n \u2502\n \u251C\u2500\u2500 "Are our docs helping AI agents?" \u2500\u2500\u2500\u2500\u2500\u2500\u2192 literacy\n \u251C\u2500\u2500 "Does our MCP server work correctly?" \u2500\u2500\u2192 mcp-server\n \u251C\u2500\u2500 "What does the model already know?" \u2500\u2500\u2500\u2500\u2192 knowledge-probe\n \u251C\u2500\u2500 "Can an agent complete real tasks?" \u2500\u2500\u2500\u2500\u2192 agent-harness\n \u2514\u2500\u2500 "Something else entirely" \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2192 custom\n```',
|
|
3343
|
+
"source": "docs/modes.md",
|
|
3344
|
+
"related": [
|
|
3345
|
+
"scoring-model",
|
|
3346
|
+
"three-layer"
|
|
3347
|
+
]
|
|
3348
|
+
},
|
|
3337
3349
|
{
|
|
3338
3350
|
"id": "glossary",
|
|
3339
3351
|
"title": "Glossary",
|
|
3340
|
-
"body": "**Overall Score**\n: A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Score without any documentation. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.\n\n**Score**\n: Weighted score for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? 
Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. 
The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. 
Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. 
Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. 
Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. 
Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
3352
|
+
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. 
Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
3341
3353
|
"source": "packages/studio/src/glossary.ts",
|
|
3342
3354
|
"tags": [
|
|
3343
3355
|
"reference",
|
|
@@ -3386,23 +3398,23 @@ import { useClient as useClient3 } from "sanity";
|
|
|
3386
3398
|
// src/glossary.ts
|
|
3387
3399
|
var GLOSSARY = {
|
|
3388
3400
|
// -- Overview stats -------------------------------------------------------
|
|
3389
|
-
overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
|
|
3390
|
-
docLift: "How much the docs help, compared to the model's training data alone.
|
|
3401
|
+
overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
|
|
3402
|
+
docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.",
|
|
3391
3403
|
actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.",
|
|
3392
3404
|
retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.",
|
|
3393
3405
|
infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.",
|
|
3394
3406
|
// -- Three-layer decomposition columns ------------------------------------
|
|
3395
|
-
floor: "
|
|
3407
|
+
floor: "Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.",
|
|
3396
3408
|
ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.",
|
|
3397
3409
|
actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.",
|
|
3398
3410
|
retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.",
|
|
3399
3411
|
efficiency: "What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).",
|
|
3400
3412
|
invertedRetGap: "\u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.",
|
|
3401
3413
|
// -- Per-area score columns -----------------------------------------------
|
|
3402
|
-
score: "
|
|
3414
|
+
score: "Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.",
|
|
3403
3415
|
taskCompletion: "Can the LLM implement the requested feature? Graded 0\u2013100.",
|
|
3404
3416
|
codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.",
|
|
3405
|
-
docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100.",
|
|
3417
|
+
docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.",
|
|
3406
3418
|
tests: "Number of test cases in this feature area.",
|
|
3407
3419
|
// -- Comparison deltas ----------------------------------------------------
|
|
3408
3420
|
overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.",
|
|
@@ -3423,7 +3435,7 @@ var GLOSSARY = {
|
|
|
3423
3435
|
healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.",
|
|
3424
3436
|
negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.",
|
|
3425
3437
|
weakAreas: "Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.",
|
|
3426
|
-
docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation
|
|
3438
|
+
docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.",
|
|
3427
3439
|
retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.",
|
|
3428
3440
|
dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).",
|
|
3429
3441
|
efficiencyAnomalies: "Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.",
|
|
@@ -3460,7 +3472,7 @@ var GLOSSARY = {
|
|
|
3460
3472
|
sourceBranch: "Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.",
|
|
3461
3473
|
sourceLocal: "Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.",
|
|
3462
3474
|
// -- Report list columns ----------------------------------------------------
|
|
3463
|
-
reportScore: "The overall
|
|
3475
|
+
reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
|
|
3464
3476
|
reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.",
|
|
3465
3477
|
reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.",
|
|
3466
3478
|
// -- Mode values -----------------------------------------------------------
|
|
@@ -3477,6 +3489,82 @@ var GLOSSARY = {
|
|
|
3477
3489
|
triggerCrossRepo: "Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks."
|
|
3478
3490
|
};
|
|
3479
3491
|
|
|
3492
|
+
// src/lib/dimensions.ts
|
|
3493
|
+
/**
 * Hover-tip text for each known scoring dimension, keyed by dimension key.
 *
 * The three literacy dimensions (code correctness, doc coverage, task
 * completion) reuse the wording already defined in GLOSSARY so the copy
 * stays in one place; the remaining dimensions carry their own text.
 * Keys that are absent here have no tooltip.
 */
var DIMENSION_TOOLTIPS = {
  // Dimensions with tooltip text defined locally.
  agentOutput: "Quality and completeness of the agent's output. Graded 0\u2013100.",
  assertionPassRate: "Fraction of structural assertions that passed. Graded 0\u2013100.",

  // Literacy dimensions: text comes straight from the glossary.
  codeCorrectness: GLOSSARY.codeCorrectness,
  docCoverage: GLOSSARY.docCoverage,
  taskCompletion: GLOSSARY.taskCompletion,

  toolUsage: "How effectively the agent used available tools (file read/write, shell, etc.). Graded 0\u2013100."
};
|
|
3501
|
+
/**
 * Turn a camelCase dimension key into a human-readable, title-cased label,
 * e.g. "taskCompletion" becomes "Task Completion".
 *
 * @param key camelCase dimension key.
 * @returns The key with a space inserted before every ASCII capital letter,
 *   its first character upper-cased, and surrounding whitespace removed.
 */
function dimensionLabel(key) {
  // Put a space in front of each capital so word boundaries become visible.
  const spaced = key.replace(/[A-Z]/g, (upper) => ` ${upper}`);
  // Capitalise the leading character (a no-op when it is already a space).
  const capitalised = spaced.charAt(0).toUpperCase() + spaced.slice(1);
  // A key that started with a capital now has a leading space; drop it.
  return capitalised.trim();
}
|
|
3504
|
+
/**
 * Compact labels for the known dimensions, keyed by dimension key.
 * Consulted by dimensionShortLabel before it falls back to a derived label.
 */
var SHORT_LABELS = {
  agentOutput: "Agent",
  assertionPassRate: "Assert",
  codeCorrectness: "Code",
  docCoverage: "Docs",
  taskCompletion: "Task",
  toolUsage: "Tools"
};

/**
 * Return a short label for a dimension key.
 *
 * Known keys use the curated entry in SHORT_LABELS. Any other key falls back
 * to the first word of its full label (see dimensionLabel), so
 * "customMetric" yields "Custom".
 *
 * @param key camelCase dimension key.
 * @returns A short label string.
 */
function dimensionShortLabel(key) {
  // Only own entries count. A bare `SHORT_LABELS[key]` would also resolve
  // inherited Object.prototype members, so a data-driven key such as
  // "constructor" or "toString" would return a function instead of a label.
  if (Object.prototype.hasOwnProperty.call(SHORT_LABELS, key)) {
    return SHORT_LABELS[key];
  }
  return dimensionLabel(key).split(" ")[0];
}
|
|
3515
|
+
/**
 * The three default literacy dimensions, in display order.
 *
 * Each entry pairs the dimension `key` with the `field` that holds its value
 * directly on a score object. Code in this file reads these entries when a
 * score carries no dynamic `dimensions` map.
 */
var LITERACY_DEFAULTS = ["taskCompletion", "codeCorrectness", "docCoverage"].map(
  // Key and field name are the same for every default dimension.
  (name) => ({ key: name, field: name })
);
|
|
3520
|
+
/**
 * Build the list of dimensions to display for one score entry.
 *
 * When the score has a non-empty `dimensions` map, every entry of that map
 * is returned, ordered by key with `localeCompare`. Otherwise the three
 * LITERACY_DEFAULTS are returned in their declared order, reading each value
 * from the matching top-level field of the score (0 when that field is null
 * or undefined).
 *
 * @param score Score entry; may carry `dimensions` and/or the legacy
 *   top-level dimension fields.
 * @returns Array of `{ key, label, tooltip, value }`; `tooltip` is "" for
 *   keys with no entry in DIMENSION_TOOLTIPS.
 */
function resolveDimensions(score) {
  // Shared row shape for both the dynamic and the default path.
  const toRow = (key, value) => ({
    key,
    label: dimensionLabel(key),
    tooltip: DIMENSION_TOOLTIPS[key] ?? "",
    value
  });

  const dynamicEntries = score.dimensions ? Object.entries(score.dimensions) : [];
  if (dynamicEntries.length === 0) {
    // No dynamic dimensions: fall back to the default literacy fields.
    return LITERACY_DEFAULTS.map(({ key, field }) => toRow(key, score[field] ?? 0));
  }

  dynamicEntries.sort((left, right) => left[0].localeCompare(right[0]));
  return dynamicEntries.map(([key, value]) => toRow(key, value));
}
|
|
3537
|
+
/**
 * Report whether an evaluation mode should be treated as literacy mode.
 *
 * A missing (falsy) mode counts as literacy; any other value must equal
 * "literacy" exactly.
 *
 * @param mode Evaluation mode; may be undefined, null or empty.
 * @returns true for a falsy mode or "literacy", false otherwise.
 */
function isLiteracyMode(mode) {
  return !mode || mode === "literacy";
}
|
|
3541
|
+
/**
 * Collect the distinct dimension keys that appear across a list of scores.
 *
 * A score with a non-empty `dimensions` map contributes that map's keys; any
 * other score contributes the keys of LITERACY_DEFAULTS.
 *
 * @param scores Iterable of score entries.
 * @returns The unique keys, sorted with the default `Array.prototype.sort`
 *   ordering.
 */
function collectDimensionKeys(scores) {
  const seen = new Set();
  for (const score of scores) {
    const ownKeys = score.dimensions ? Object.keys(score.dimensions) : [];
    // An empty or missing map means the score uses the default dimensions.
    const contributed = ownKeys.length > 0 ? ownKeys : LITERACY_DEFAULTS.map((entry) => entry.key);
    contributed.forEach((name) => seen.add(name));
  }
  return Array.from(seen).sort();
}
|
|
3556
|
+
/**
 * Read the value of one dimension from a score entry.
 *
 * Lookup order:
 *  1. `score.dimensions[key]`, when it is neither null nor undefined;
 *  2. for the three literacy keys only, the top-level field of the same
 *     name on the score;
 *  3. otherwise 0.
 *
 * @param score Score entry.
 * @param key Dimension key.
 * @returns The dimension's value, or 0 when none is available.
 */
function getDimensionValue(score, key) {
  const dynamicValue = score.dimensions?.[key];
  if (dynamicValue != null) return dynamicValue;

  switch (key) {
    // Literacy dimensions also live directly on the score under the same name.
    case "codeCorrectness":
    case "docCoverage":
    case "taskCompletion":
      return score[key] ?? 0;
    default:
      return 0;
  }
}
|
|
3567
|
+
|
|
3480
3568
|
// src/lib/comparison.ts
|
|
3481
3569
|
function scoreMap(summary) {
|
|
3482
3570
|
return new Map(summary.scores.map((s) => [s.feature, s]));
|
|
@@ -3543,14 +3631,26 @@ function computeThreeLayerDeltas(baseline, experiment) {
|
|
|
3543
3631
|
/**
 * Compute per-area, per-dimension score deltas between two runs.
 *
 * For every area returned by `allAreas(baseline, experiment)` and every
 * dimension key found in either run, the delta is
 * `experiment value - baseline value`. An area that is missing from one of
 * the runs contributes 0 for that side.
 *
 * @param baseline Summary of the reference run (reads `.scores`).
 * @param experiment Summary of the run under evaluation (reads `.scores`).
 * @returns One record per area: `{ area, deltas, taskDelta, codeDelta,
 *   docDelta }`. `deltas` is keyed by dimension key; the three `*Delta`
 *   fields mirror the literacy entries of `deltas` (0 when absent) and are
 *   kept for backward compatibility.
 */
function computeDimensionDeltas(baseline, experiment) {
  const baselineByArea = scoreMap(baseline);
  const experimentByArea = scoreMap(experiment);
  const dimensionKeys = collectDimensionKeys([...baseline.scores, ...experiment.scores]);

  // Value of a dimension for an area entry, or 0 when the area is absent.
  const valueOrZero = (item, key) => (item ? getDimensionValue(item, key) : 0);

  return allAreas(baseline, experiment).map((area) => {
    const before = baselineByArea.get(area);
    const after = experimentByArea.get(area);

    const deltas = {};
    dimensionKeys.forEach((key) => {
      const beforeValue = valueOrZero(before, key);
      const afterValue = valueOrZero(after, key);
      deltas[key] = afterValue - beforeValue;
    });

    // Legacy accessors for backward compatibility
    const legacy = (name) => deltas[name] ?? 0;
    return {
      area,
      deltas,
      taskDelta: legacy("taskCompletion"),
      codeDelta: legacy("codeCorrectness"),
      docDelta: legacy("docCoverage")
    };
  });
}
|
|
@@ -3593,7 +3693,10 @@ var latestReportsQuery = (
|
|
|
3593
3693
|
taskCompletion,
|
|
3594
3694
|
codeCorrectness,
|
|
3595
3695
|
docCoverage,
|
|
3696
|
+
dimensions,
|
|
3596
3697
|
docLift,
|
|
3698
|
+
groupType,
|
|
3699
|
+
negativeDocLift,
|
|
3597
3700
|
testCount,
|
|
3598
3701
|
actualScore,
|
|
3599
3702
|
retrievalGap,
|
|
@@ -6033,6 +6136,7 @@ function DiagnosticsOverview({
|
|
|
6033
6136
|
overall,
|
|
6034
6137
|
comparison,
|
|
6035
6138
|
durationMs,
|
|
6139
|
+
mode,
|
|
6036
6140
|
totalTests
|
|
6037
6141
|
}) {
|
|
6038
6142
|
if (scores.length === 0) return null;
|
|
@@ -6043,6 +6147,7 @@ function DiagnosticsOverview({
|
|
|
6043
6147
|
const weak = scores.filter((s) => s.totalScore < SCORE_CAUTION);
|
|
6044
6148
|
const negativeDocLiftCount = scores.filter((s) => s.docLift < 0).length;
|
|
6045
6149
|
const hasAgenticData = overall.avgActualScore != null;
|
|
6150
|
+
const showDocMetrics = !mode || mode === "literacy";
|
|
6046
6151
|
const improved = comparison?.improved ?? [];
|
|
6047
6152
|
const regressed = comparison?.regressed ?? [];
|
|
6048
6153
|
const unchanged = comparison?.unchanged ?? [];
|
|
@@ -6057,7 +6162,7 @@ function DiagnosticsOverview({
|
|
|
6057
6162
|
style: {
|
|
6058
6163
|
display: "grid",
|
|
6059
6164
|
gap: 12,
|
|
6060
|
-
gridTemplateColumns: "repeat(3, 1fr)"
|
|
6165
|
+
gridTemplateColumns: showDocMetrics ? "repeat(3, 1fr)" : "repeat(2, 1fr)"
|
|
6061
6166
|
},
|
|
6062
6167
|
children: [
|
|
6063
6168
|
/* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx25(
|
|
@@ -6066,11 +6171,11 @@ function DiagnosticsOverview({
|
|
|
6066
6171
|
delta: comparison?.deltas.overall,
|
|
6067
6172
|
label: "AVG SCORE",
|
|
6068
6173
|
sentiment: scoreSentiment(overall.avgScore),
|
|
6069
|
-
subtitle: "With-docs ceiling",
|
|
6174
|
+
subtitle: showDocMetrics ? "With-docs ceiling" : "Weighted composite",
|
|
6070
6175
|
value: Math.round(overall.avgScore)
|
|
6071
6176
|
}
|
|
6072
6177
|
) }),
|
|
6073
|
-
/* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
|
|
6178
|
+
showDocMetrics && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
|
|
6074
6179
|
ScoreCard,
|
|
6075
6180
|
{
|
|
6076
6181
|
delta: comparison?.deltas.docLift,
|
|
@@ -6080,7 +6185,7 @@ function DiagnosticsOverview({
|
|
|
6080
6185
|
value: Math.round(overall.avgDocLift)
|
|
6081
6186
|
}
|
|
6082
6187
|
) }),
|
|
6083
|
-
/* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
|
|
6188
|
+
showDocMetrics ? /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
|
|
6084
6189
|
ScoreCard,
|
|
6085
6190
|
{
|
|
6086
6191
|
label: "FLOOR",
|
|
@@ -6088,6 +6193,13 @@ function DiagnosticsOverview({
|
|
|
6088
6193
|
subtitle: "Without docs baseline",
|
|
6089
6194
|
value: Math.round(overall.avgFloorScore ?? 0)
|
|
6090
6195
|
}
|
|
6196
|
+
) }) : /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(
|
|
6197
|
+
ScoreCard,
|
|
6198
|
+
{
|
|
6199
|
+
label: "TESTS",
|
|
6200
|
+
subtitle: "Total test cases",
|
|
6201
|
+
value: totalTests ?? 0
|
|
6202
|
+
}
|
|
6091
6203
|
) })
|
|
6092
6204
|
]
|
|
6093
6205
|
}
|
|
@@ -6098,10 +6210,10 @@ function DiagnosticsOverview({
|
|
|
6098
6210
|
style: {
|
|
6099
6211
|
display: "grid",
|
|
6100
6212
|
gap: 12,
|
|
6101
|
-
gridTemplateColumns: "repeat(3, 1fr)"
|
|
6213
|
+
gridTemplateColumns: showDocMetrics ? "repeat(3, 1fr)" : "repeat(2, 1fr)"
|
|
6102
6214
|
},
|
|
6103
6215
|
children: [
|
|
6104
|
-
/* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
|
|
6216
|
+
showDocMetrics && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
|
|
6105
6217
|
MetricCard,
|
|
6106
6218
|
{
|
|
6107
6219
|
label: "Negative Doc Lift",
|
|
@@ -6109,7 +6221,7 @@ function DiagnosticsOverview({
|
|
|
6109
6221
|
value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
|
|
6110
6222
|
}
|
|
6111
6223
|
) }),
|
|
6112
|
-
/* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests ?? 0) }) }),
|
|
6224
|
+
showDocMetrics && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests ?? 0) }) }),
|
|
6113
6225
|
durationMs != null && durationMs > 0 ? /* @__PURE__ */ jsx25(
|
|
6114
6226
|
HoverTip,
|
|
6115
6227
|
{
|
|
@@ -6649,7 +6761,9 @@ function groupByArea(judgments) {
|
|
|
6649
6761
|
}
|
|
6650
6762
|
return [...byArea.entries()].sort(([a], [b]) => a.localeCompare(b)).map(([area, js]) => [area, js.sort((a, b) => a.score - b.score)]);
|
|
6651
6763
|
}
|
|
6652
|
-
|
|
6764
|
+
/**
 * Turn a hyphen-separated dimension name into a title-cased label,
 * e.g. "task-completion" becomes "Task Completion".
 *
 * @param dim Hyphen-separated dimension name.
 * @returns The segments joined by single spaces, each with its first
 *   character upper-cased. Empty segments are kept as empty strings.
 */
function dimensionLabel2(dim) {
  const words = [];
  for (const segment of dim.split("-")) {
    words.push(segment.charAt(0).toUpperCase() + segment.slice(1));
  }
  return words.join(" ");
}
|
|
6653
6767
|
function JudgmentList({
|
|
6654
6768
|
focus,
|
|
6655
6769
|
judgments,
|
|
@@ -6741,7 +6855,7 @@ function JudgmentCard({
|
|
|
6741
6855
|
const [expanded, setExpanded] = useState12(focused);
|
|
6742
6856
|
const cardRef = useRef5(null);
|
|
6743
6857
|
const toast = useToast2();
|
|
6744
|
-
const dimLabel =
|
|
6858
|
+
const dimLabel = dimensionLabel2(judgment.dimension);
|
|
6745
6859
|
const sep = judgment.taskId.indexOf(" - ");
|
|
6746
6860
|
const taskName = sep > 0 ? judgment.taskId.substring(sep + 3) : judgment.taskId;
|
|
6747
6861
|
useEffect8(() => {
|
|
@@ -7762,22 +7876,40 @@ import React3, {
|
|
|
7762
7876
|
import { WarningOutlineIcon as WarningOutlineIcon2 } from "@sanity/icons";
|
|
7763
7877
|
import { Box as Box19, Flex as Flex22, Stack as Stack25, Text as Text30 } from "@sanity/ui";
|
|
7764
7878
|
import { Fragment as Fragment11, jsx as jsx41, jsxs as jsxs29 } from "react/jsx-runtime";
|
|
7879
|
+
/**
 * Hover-tip text for each known scoring dimension, keyed by dimension key.
 *
 * NOTE(review): the entries are identical to DIMENSION_TOOLTIPS defined
 * earlier in this bundle (src/lib/dimensions.ts); consider sharing one
 * table at the source level so the two cannot drift apart.
 */
var DIMENSION_TOOLTIPS2 = {
  // Dimensions with tooltip text defined locally.
  agentOutput: "Quality and completeness of the agent's output. Graded 0\u2013100.",
  assertionPassRate: "Fraction of structural assertions that passed. Graded 0\u2013100.",

  // Literacy dimensions: text comes straight from the glossary.
  codeCorrectness: GLOSSARY.codeCorrectness,
  docCoverage: GLOSSARY.docCoverage,
  taskCompletion: GLOSSARY.taskCompletion,

  toolUsage: "How effectively the agent used available tools (file read/write, shell, etc.). Graded 0\u2013100."
};
|
|
7765
7887
|
/**
 * Map a container width to the layout tier used by the table.
 *
 * @param width Width to classify.
 * @returns "full" at 900 and above, "compact" from 600 up to (not
 *   including) 900, and "narrow" for anything else.
 */
function tableTier2(width) {
  return width >= 900 ? "full" : width >= 600 ? "compact" : "narrow";
}
|
|
7770
|
-
function gridColumns(tier, hasActual) {
|
|
7892
|
+
/**
 * Build the CSS `grid-template-columns` value for the area-scores table.
 *
 * Every tier starts with a fixed-width first column, one flexible `1fr`
 * column, and one `1fr` column per dimension. Optional fixed-width columns
 * are then appended depending on the tier:
 *  - "full":    + 80px and 72px when `showLift`, + 72px when `hasActual`
 *  - "compact": + 80px when `showLift`
 *  - "narrow":  no extra columns
 *
 * @param tier Layout tier: "full", "compact" or "narrow".
 * @param dimCount Number of dimension columns to render.
 * @param showLift Whether the lift-related columns are shown.
 * @param hasActual Whether the extra "full"-tier column for actual scores
 *   is shown.
 * @returns The space-separated track list, or undefined for an unknown tier.
 */
function gridColumns(tier, dimCount, showLift, hasActual) {
  const dims = Array.from({ length: dimCount }).fill("1fr").join(" ");

  if (tier === "narrow") {
    return `56px 1fr ${dims}`;
  }
  if (tier !== "full" && tier !== "compact") {
    // Unknown tier: no template.
    return undefined;
  }

  const isFull = tier === "full";
  const tracks = [isFull ? "120px" : "96px", "1fr", dims];
  if (showLift) {
    tracks.push("80px");
    // Only the full layout has room for the second lift-related column.
    if (isFull) tracks.push("72px");
  }
  if (isFull && hasActual) {
    tracks.push("72px");
  }
  return tracks.join(" ");
}
|
|
7780
7911
|
function AreaScoresGrid({
|
|
7912
|
+
mode,
|
|
7781
7913
|
scores,
|
|
7782
7914
|
perArea,
|
|
7783
7915
|
perModel
|
|
@@ -7788,6 +7920,8 @@ function AreaScoresGrid({
|
|
|
7788
7920
|
() => scores.some((s) => s.actualScore != null),
|
|
7789
7921
|
[scores]
|
|
7790
7922
|
);
|
|
7923
|
+
const showLift = isLiteracyMode(mode);
|
|
7924
|
+
const dimKeys = useMemo6(() => collectDimensionKeys(scores), [scores]);
|
|
7791
7925
|
const [sortField, setSortField] = useState17("score");
|
|
7792
7926
|
const [sortDir, setSortDir] = useState17("desc");
|
|
7793
7927
|
const handleSort = useCallback22(
|
|
@@ -7809,16 +7943,10 @@ function AreaScoresGrid({
|
|
|
7809
7943
|
return (a.totalScore - b.totalScore) * dir;
|
|
7810
7944
|
case "area":
|
|
7811
7945
|
return a.feature.localeCompare(b.feature) * dir;
|
|
7812
|
-
case "task":
|
|
7813
|
-
return (a.taskCompletion - b.taskCompletion) * dir;
|
|
7814
|
-
case "code":
|
|
7815
|
-
return (a.codeCorrectness - b.codeCorrectness) * dir;
|
|
7816
|
-
case "docs":
|
|
7817
|
-
return (a.docCoverage - b.docCoverage) * dir;
|
|
7818
7946
|
case "lift":
|
|
7819
7947
|
return (a.docLift - b.docLift) * dir;
|
|
7820
7948
|
default:
|
|
7821
|
-
return
|
|
7949
|
+
return (getDimensionValue(a, sortField) - getDimensionValue(b, sortField)) * dir;
|
|
7822
7950
|
}
|
|
7823
7951
|
});
|
|
7824
7952
|
}, [scores, sortField, sortDir]);
|
|
@@ -7848,7 +7976,12 @@ function AreaScoresGrid({
|
|
|
7848
7976
|
borderBottom: "1px solid var(--card-border-color)",
|
|
7849
7977
|
display: "grid",
|
|
7850
7978
|
gap: "0 12px",
|
|
7851
|
-
gridTemplateColumns: gridColumns(
|
|
7979
|
+
gridTemplateColumns: gridColumns(
|
|
7980
|
+
tier,
|
|
7981
|
+
dimKeys.length,
|
|
7982
|
+
showLift,
|
|
7983
|
+
hasActual
|
|
7984
|
+
),
|
|
7852
7985
|
padding: "12px 16px 8px"
|
|
7853
7986
|
},
|
|
7854
7987
|
children: [
|
|
@@ -7859,7 +7992,7 @@ function AreaScoresGrid({
|
|
|
7859
7992
|
direction: sortDir,
|
|
7860
7993
|
label: "Score",
|
|
7861
7994
|
onClick: () => handleSort("score"),
|
|
7862
|
-
tooltip:
|
|
7995
|
+
tooltip: GLOSSARY.score
|
|
7863
7996
|
}
|
|
7864
7997
|
),
|
|
7865
7998
|
/* @__PURE__ */ jsx41(
|
|
@@ -7871,37 +8004,18 @@ function AreaScoresGrid({
|
|
|
7871
8004
|
onClick: () => handleSort("area")
|
|
7872
8005
|
}
|
|
7873
8006
|
),
|
|
7874
|
-
/* @__PURE__ */ jsx41(
|
|
8007
|
+
dimKeys.map((key) => /* @__PURE__ */ jsx41(
|
|
7875
8008
|
ColHeader3,
|
|
7876
8009
|
{
|
|
7877
|
-
active: sortField ===
|
|
8010
|
+
active: sortField === key,
|
|
7878
8011
|
direction: sortDir,
|
|
7879
|
-
label:
|
|
7880
|
-
onClick: () => handleSort(
|
|
7881
|
-
tooltip:
|
|
7882
|
-
}
|
|
7883
|
-
|
|
7884
|
-
|
|
7885
|
-
|
|
7886
|
-
{
|
|
7887
|
-
active: sortField === "code",
|
|
7888
|
-
direction: sortDir,
|
|
7889
|
-
label: "Code",
|
|
7890
|
-
onClick: () => handleSort("code"),
|
|
7891
|
-
tooltip: GLOSSARY.codeCorrectness
|
|
7892
|
-
}
|
|
7893
|
-
),
|
|
7894
|
-
/* @__PURE__ */ jsx41(
|
|
7895
|
-
ColHeader3,
|
|
7896
|
-
{
|
|
7897
|
-
active: sortField === "docs",
|
|
7898
|
-
direction: sortDir,
|
|
7899
|
-
label: "Docs",
|
|
7900
|
-
onClick: () => handleSort("docs"),
|
|
7901
|
-
tooltip: GLOSSARY.docCoverage
|
|
7902
|
-
}
|
|
7903
|
-
),
|
|
7904
|
-
tier !== "narrow" && /* @__PURE__ */ jsx41(
|
|
8012
|
+
label: dimensionShortLabel(key),
|
|
8013
|
+
onClick: () => handleSort(key),
|
|
8014
|
+
tooltip: DIMENSION_TOOLTIPS2[key] ?? `${dimensionShortLabel(key)} dimension score (0\u2013100).`
|
|
8015
|
+
},
|
|
8016
|
+
key
|
|
8017
|
+
)),
|
|
8018
|
+
tier !== "narrow" && showLift && /* @__PURE__ */ jsx41(
|
|
7905
8019
|
ColHeader3,
|
|
7906
8020
|
{
|
|
7907
8021
|
active: sortField === "lift",
|
|
@@ -7911,7 +8025,7 @@ function AreaScoresGrid({
|
|
|
7911
8025
|
tooltip: GLOSSARY.docLift
|
|
7912
8026
|
}
|
|
7913
8027
|
),
|
|
7914
|
-
tier === "full" && /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
|
|
8028
|
+
tier === "full" && showLift && /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
|
|
7915
8029
|
tier === "full" && hasActual && /* @__PURE__ */ jsx41(ColHeader3, { label: "Actual", tooltip: GLOSSARY.actualScore })
|
|
7916
8030
|
]
|
|
7917
8031
|
}
|
|
@@ -7922,15 +8036,19 @@ function AreaScoresGrid({
|
|
|
7922
8036
|
{
|
|
7923
8037
|
area,
|
|
7924
8038
|
delta: perArea?.[area.feature],
|
|
8039
|
+
dimKeys,
|
|
7925
8040
|
hasActual,
|
|
8041
|
+
showLift,
|
|
7926
8042
|
tier
|
|
7927
8043
|
}
|
|
7928
8044
|
),
|
|
7929
8045
|
modelScoresByFeature && /* @__PURE__ */ jsx41(
|
|
7930
8046
|
ModelSubRows,
|
|
7931
8047
|
{
|
|
8048
|
+
dimKeys,
|
|
7932
8049
|
hasActual,
|
|
7933
8050
|
models: modelScoresByFeature.get(area.feature),
|
|
8051
|
+
showLift,
|
|
7934
8052
|
tier
|
|
7935
8053
|
}
|
|
7936
8054
|
)
|
|
@@ -7938,26 +8056,32 @@ function AreaScoresGrid({
|
|
|
7938
8056
|
] });
|
|
7939
8057
|
}
|
|
7940
8058
|
function ModelSubRows({
|
|
8059
|
+
dimKeys,
|
|
7941
8060
|
hasActual,
|
|
7942
8061
|
models,
|
|
8062
|
+
showLift,
|
|
7943
8063
|
tier
|
|
7944
8064
|
}) {
|
|
7945
8065
|
if (!models || models.length === 0) return null;
|
|
7946
8066
|
return /* @__PURE__ */ jsx41(Fragment11, { children: models.map((entry) => /* @__PURE__ */ jsx41(
|
|
7947
8067
|
ModelRow,
|
|
7948
8068
|
{
|
|
8069
|
+
dimKeys,
|
|
7949
8070
|
hasActual,
|
|
7950
8071
|
label: entry.label,
|
|
7951
8072
|
scores: entry.scores,
|
|
8073
|
+
showLift,
|
|
7952
8074
|
tier
|
|
7953
8075
|
},
|
|
7954
8076
|
entry.label
|
|
7955
8077
|
)) });
|
|
7956
8078
|
}
|
|
7957
8079
|
function ModelRow({
|
|
8080
|
+
dimKeys,
|
|
7958
8081
|
hasActual,
|
|
7959
8082
|
label,
|
|
7960
8083
|
scores,
|
|
8084
|
+
showLift,
|
|
7961
8085
|
tier
|
|
7962
8086
|
}) {
|
|
7963
8087
|
const isNarrow = tier === "narrow";
|
|
@@ -7970,7 +8094,12 @@ function ModelRow({
|
|
|
7970
8094
|
borderBottom: "1px solid var(--card-border-color)",
|
|
7971
8095
|
display: "grid",
|
|
7972
8096
|
gap: "0 12px",
|
|
7973
|
-
gridTemplateColumns: gridColumns(
|
|
8097
|
+
gridTemplateColumns: gridColumns(
|
|
8098
|
+
tier,
|
|
8099
|
+
dimKeys.length,
|
|
8100
|
+
showLift,
|
|
8101
|
+
hasActual
|
|
8102
|
+
),
|
|
7974
8103
|
padding: isNarrow ? "6px 12px 6px 20px" : "6px 16px 6px 28px"
|
|
7975
8104
|
},
|
|
7976
8105
|
children: [
|
|
@@ -7987,34 +8116,17 @@ function ModelRow({
|
|
|
7987
8116
|
}
|
|
7988
8117
|
) }),
|
|
7989
8118
|
/* @__PURE__ */ jsx41(Flex22, { align: "center", gap: 2, children: /* @__PURE__ */ jsx41(Text30, { muted: true, size: 1, children: label }) }),
|
|
7990
|
-
/* @__PURE__ */ jsx41(
|
|
7991
|
-
DimCell,
|
|
7992
|
-
{
|
|
7993
|
-
area: label,
|
|
7994
|
-
dim: "Task Completion",
|
|
7995
|
-
size: "small",
|
|
7996
|
-
value: scores.taskCompletion
|
|
7997
|
-
}
|
|
7998
|
-
),
|
|
7999
|
-
/* @__PURE__ */ jsx41(
|
|
8119
|
+
dimKeys.map((key) => /* @__PURE__ */ jsx41(
|
|
8000
8120
|
DimCell,
|
|
8001
8121
|
{
|
|
8002
8122
|
area: label,
|
|
8003
|
-
dim:
|
|
8123
|
+
dim: dimensionLabel(key),
|
|
8004
8124
|
size: "small",
|
|
8005
|
-
value: scores
|
|
8006
|
-
}
|
|
8007
|
-
|
|
8008
|
-
|
|
8009
|
-
|
|
8010
|
-
{
|
|
8011
|
-
area: label,
|
|
8012
|
-
dim: "Doc Coverage",
|
|
8013
|
-
size: "small",
|
|
8014
|
-
value: scores.docCoverage
|
|
8015
|
-
}
|
|
8016
|
-
),
|
|
8017
|
-
!isNarrow && /* @__PURE__ */ jsxs29(
|
|
8125
|
+
value: getDimensionValue(scores, key)
|
|
8126
|
+
},
|
|
8127
|
+
key
|
|
8128
|
+
)),
|
|
8129
|
+
!isNarrow && showLift && /* @__PURE__ */ jsxs29(
|
|
8018
8130
|
Text30,
|
|
8019
8131
|
{
|
|
8020
8132
|
size: 1,
|
|
@@ -8029,7 +8141,7 @@ function ModelRow({
|
|
|
8029
8141
|
]
|
|
8030
8142
|
}
|
|
8031
8143
|
),
|
|
8032
|
-
tier === "full" && /* @__PURE__ */ jsx41(
|
|
8144
|
+
tier === "full" && showLift && /* @__PURE__ */ jsx41(
|
|
8033
8145
|
Text30,
|
|
8034
8146
|
{
|
|
8035
8147
|
muted: true,
|
|
@@ -8057,7 +8169,9 @@ function ModelRow({
|
|
|
8057
8169
|
function AreaRow({
|
|
8058
8170
|
area,
|
|
8059
8171
|
delta,
|
|
8172
|
+
dimKeys,
|
|
8060
8173
|
hasActual,
|
|
8174
|
+
showLift,
|
|
8061
8175
|
tier
|
|
8062
8176
|
}) {
|
|
8063
8177
|
const isNarrow = tier === "narrow";
|
|
@@ -8069,7 +8183,12 @@ function AreaRow({
|
|
|
8069
8183
|
borderBottom: "1px solid var(--card-border-color)",
|
|
8070
8184
|
display: "grid",
|
|
8071
8185
|
gap: "0 12px",
|
|
8072
|
-
gridTemplateColumns: gridColumns(
|
|
8186
|
+
gridTemplateColumns: gridColumns(
|
|
8187
|
+
tier,
|
|
8188
|
+
dimKeys.length,
|
|
8189
|
+
showLift,
|
|
8190
|
+
hasActual
|
|
8191
|
+
),
|
|
8073
8192
|
padding: isNarrow ? "8px 12px" : "10px 16px"
|
|
8074
8193
|
},
|
|
8075
8194
|
children: [
|
|
@@ -8079,7 +8198,7 @@ function AreaRow({
|
|
|
8079
8198
|
{
|
|
8080
8199
|
text: /* @__PURE__ */ jsxs29(Text30, { size: 2, style: { lineHeight: 1.5 }, children: [
|
|
8081
8200
|
/* @__PURE__ */ jsx41("span", { style: { fontWeight: 600 }, children: area.feature }),
|
|
8082
|
-
"
|
|
8201
|
+
" score:",
|
|
8083
8202
|
" ",
|
|
8084
8203
|
/* @__PURE__ */ jsx41(
|
|
8085
8204
|
"span",
|
|
@@ -8095,8 +8214,7 @@ function AreaRow({
|
|
|
8095
8214
|
/* @__PURE__ */ jsx41("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
|
|
8096
8215
|
".",
|
|
8097
8216
|
" ",
|
|
8098
|
-
GLOSSARY.score
|
|
8099
|
-
" This is the ceiling \u2014 with gold-standard docs injected."
|
|
8217
|
+
GLOSSARY.score
|
|
8100
8218
|
] }),
|
|
8101
8219
|
children: /* @__PURE__ */ jsx41(
|
|
8102
8220
|
"div",
|
|
@@ -8123,7 +8241,7 @@ function AreaRow({
|
|
|
8123
8241
|
] }),
|
|
8124
8242
|
/* @__PURE__ */ jsxs29(Flex22, { align: "center", gap: 2, wrap: "wrap", children: [
|
|
8125
8243
|
/* @__PURE__ */ jsx41(Text30, { size: 2, weight: "medium", children: area.feature }),
|
|
8126
|
-
area.negativeDocLift && /* @__PURE__ */ jsx41(HoverTip, { text: GLOSSARY.docsHurt, children: /* @__PURE__ */ jsx41(
|
|
8244
|
+
area.negativeDocLift && showLift && /* @__PURE__ */ jsx41(HoverTip, { text: GLOSSARY.docsHurt, children: /* @__PURE__ */ jsx41(
|
|
8127
8245
|
"span",
|
|
8128
8246
|
{
|
|
8129
8247
|
style: {
|
|
@@ -8140,31 +8258,16 @@ function AreaRow({
|
|
|
8140
8258
|
}
|
|
8141
8259
|
) })
|
|
8142
8260
|
] }),
|
|
8143
|
-
/* @__PURE__ */ jsx41(
|
|
8144
|
-
DimCell,
|
|
8145
|
-
{
|
|
8146
|
-
area: area.feature,
|
|
8147
|
-
dim: "Task Completion",
|
|
8148
|
-
value: area.taskCompletion
|
|
8149
|
-
}
|
|
8150
|
-
),
|
|
8151
|
-
/* @__PURE__ */ jsx41(
|
|
8152
|
-
DimCell,
|
|
8153
|
-
{
|
|
8154
|
-
area: area.feature,
|
|
8155
|
-
dim: "Code Correctness",
|
|
8156
|
-
value: area.codeCorrectness
|
|
8157
|
-
}
|
|
8158
|
-
),
|
|
8159
|
-
/* @__PURE__ */ jsx41(
|
|
8261
|
+
dimKeys.map((key) => /* @__PURE__ */ jsx41(
|
|
8160
8262
|
DimCell,
|
|
8161
8263
|
{
|
|
8162
8264
|
area: area.feature,
|
|
8163
|
-
dim:
|
|
8164
|
-
value: area
|
|
8165
|
-
}
|
|
8166
|
-
|
|
8167
|
-
|
|
8265
|
+
dim: dimensionLabel(key),
|
|
8266
|
+
value: getDimensionValue(area, key)
|
|
8267
|
+
},
|
|
8268
|
+
key
|
|
8269
|
+
)),
|
|
8270
|
+
!isNarrow && showLift && /* @__PURE__ */ jsx41(
|
|
8168
8271
|
HoverTip,
|
|
8169
8272
|
{
|
|
8170
8273
|
text: /* @__PURE__ */ jsxs29(Text30, { size: 2, style: { lineHeight: 1.5 }, children: [
|
|
@@ -8206,7 +8309,7 @@ function AreaRow({
|
|
|
8206
8309
|
)
|
|
8207
8310
|
}
|
|
8208
8311
|
),
|
|
8209
|
-
tier === "full" && /* @__PURE__ */ jsx41(
|
|
8312
|
+
tier === "full" && showLift && /* @__PURE__ */ jsx41(
|
|
8210
8313
|
Text30,
|
|
8211
8314
|
{
|
|
8212
8315
|
muted: true,
|
|
@@ -8243,11 +8346,10 @@ function DimCell({
|
|
|
8243
8346
|
size = "normal",
|
|
8244
8347
|
value
|
|
8245
8348
|
}) {
|
|
8246
|
-
const
|
|
8247
|
-
|
|
8248
|
-
|
|
8249
|
-
|
|
8250
|
-
};
|
|
8349
|
+
const dimKey = Object.keys(DIMENSION_TOOLTIPS2).find(
|
|
8350
|
+
(k) => dimensionLabel(k) === dim
|
|
8351
|
+
);
|
|
8352
|
+
const tooltip = dimKey ? DIMENSION_TOOLTIPS2[dimKey] : "";
|
|
8251
8353
|
const textSize = size === "small" ? 0 : 1;
|
|
8252
8354
|
const barHeight = size === "small" ? 3 : 4;
|
|
8253
8355
|
return /* @__PURE__ */ jsx41(
|
|
@@ -8274,7 +8376,7 @@ function DimCell({
|
|
|
8274
8376
|
/* @__PURE__ */ jsx41("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
|
|
8275
8377
|
".",
|
|
8276
8378
|
" ",
|
|
8277
|
-
|
|
8379
|
+
tooltip
|
|
8278
8380
|
] }),
|
|
8279
8381
|
children: /* @__PURE__ */ jsxs29(Stack25, { space: 1, style: { width: "100%" }, children: [
|
|
8280
8382
|
/* @__PURE__ */ jsx41(
|
|
@@ -8509,6 +8611,7 @@ function useModelSelection({
|
|
|
8509
8611
|
// src/components/report-detail/StrengthsList.tsx
|
|
8510
8612
|
import { jsx as jsx43, jsxs as jsxs31 } from "react/jsx-runtime";
|
|
8511
8613
|
function StrengthsList({
|
|
8614
|
+
mode,
|
|
8512
8615
|
scores,
|
|
8513
8616
|
comparison,
|
|
8514
8617
|
perModel
|
|
@@ -8548,6 +8651,7 @@ function StrengthsList({
|
|
|
8548
8651
|
/* @__PURE__ */ jsx43(
|
|
8549
8652
|
AreaScoresGrid,
|
|
8550
8653
|
{
|
|
8654
|
+
mode,
|
|
8551
8655
|
perArea: comparison?.deltas?.perArea,
|
|
8552
8656
|
perModel: expandedPerModel,
|
|
8553
8657
|
scores: displayedScores
|
|
@@ -8613,6 +8717,7 @@ import {
|
|
|
8613
8717
|
import { Box as Box21, Flex as Flex25, Stack as Stack27, Text as Text33 } from "@sanity/ui";
|
|
8614
8718
|
import { jsx as jsx44, jsxs as jsxs32 } from "react/jsx-runtime";
|
|
8615
8719
|
function WeaknessesList({
|
|
8720
|
+
mode,
|
|
8616
8721
|
scores,
|
|
8617
8722
|
comparison,
|
|
8618
8723
|
perModel
|
|
@@ -8658,6 +8763,7 @@ function WeaknessesList({
|
|
|
8658
8763
|
/* @__PURE__ */ jsx44(
|
|
8659
8764
|
AreaScoresGrid,
|
|
8660
8765
|
{
|
|
8766
|
+
mode,
|
|
8661
8767
|
perArea,
|
|
8662
8768
|
perModel: expandedPerModel,
|
|
8663
8769
|
scores: weakAreas
|
|
@@ -8947,40 +9053,22 @@ function dimTip(area, dim, score, description) {
|
|
|
8947
9053
|
] });
|
|
8948
9054
|
}
|
|
8949
9055
|
function getDimensionWeaknesses(area) {
|
|
9056
|
+
const dims = resolveDimensions(area);
|
|
8950
9057
|
const result = [];
|
|
8951
|
-
|
|
8952
|
-
|
|
8953
|
-
|
|
8954
|
-
|
|
8955
|
-
|
|
8956
|
-
|
|
8957
|
-
|
|
8958
|
-
|
|
8959
|
-
|
|
8960
|
-
|
|
8961
|
-
|
|
8962
|
-
|
|
8963
|
-
|
|
8964
|
-
|
|
8965
|
-
tip: dimTip(
|
|
8966
|
-
area.feature,
|
|
8967
|
-
"Code Correctness",
|
|
8968
|
-
Math.round(area.codeCorrectness),
|
|
8969
|
-
GLOSSARY.codeCorrectness
|
|
8970
|
-
),
|
|
8971
|
-
value: Math.round(area.codeCorrectness)
|
|
8972
|
-
});
|
|
8973
|
-
if (area.docCoverage < DIMENSION_WEAKNESS)
|
|
8974
|
-
result.push({
|
|
8975
|
-
dimension: "Doc Coverage",
|
|
8976
|
-
tip: dimTip(
|
|
8977
|
-
area.feature,
|
|
8978
|
-
"Doc Coverage",
|
|
8979
|
-
Math.round(area.docCoverage),
|
|
8980
|
-
GLOSSARY.docCoverage
|
|
8981
|
-
),
|
|
8982
|
-
value: Math.round(area.docCoverage)
|
|
8983
|
-
});
|
|
9058
|
+
for (const dim of dims) {
|
|
9059
|
+
if (dim.value < DIMENSION_WEAKNESS) {
|
|
9060
|
+
result.push({
|
|
9061
|
+
dimension: dim.label,
|
|
9062
|
+
tip: dimTip(
|
|
9063
|
+
area.feature,
|
|
9064
|
+
dim.label,
|
|
9065
|
+
Math.round(dim.value),
|
|
9066
|
+
dim.tooltip
|
|
9067
|
+
),
|
|
9068
|
+
value: Math.round(dim.value)
|
|
9069
|
+
});
|
|
9070
|
+
}
|
|
9071
|
+
}
|
|
8984
9072
|
return result;
|
|
8985
9073
|
}
|
|
8986
9074
|
|
|
@@ -9137,6 +9225,7 @@ function ReportDetail({
|
|
|
9137
9225
|
{
|
|
9138
9226
|
comparison,
|
|
9139
9227
|
durationMs: report.durationMs,
|
|
9228
|
+
mode: provenance.mode,
|
|
9140
9229
|
overall: summary.overall,
|
|
9141
9230
|
scores: summary.scores,
|
|
9142
9231
|
totalTests
|
|
@@ -9153,6 +9242,7 @@ function ReportDetail({
|
|
|
9153
9242
|
comparison,
|
|
9154
9243
|
focus,
|
|
9155
9244
|
judgments: summary.lowScoringJudgments,
|
|
9245
|
+
mode: provenance.mode,
|
|
9156
9246
|
onNavigate: (newSubTab, newFocus) => onTabChange("diagnostics", newSubTab, newFocus),
|
|
9157
9247
|
perModel: summary.perModel,
|
|
9158
9248
|
recommendations: summary.recommendations,
|
|
@@ -9189,6 +9279,7 @@ function DiagnosticsPanel({
|
|
|
9189
9279
|
comparison,
|
|
9190
9280
|
focus,
|
|
9191
9281
|
judgments,
|
|
9282
|
+
mode,
|
|
9192
9283
|
onNavigate,
|
|
9193
9284
|
perModel,
|
|
9194
9285
|
recommendations,
|
|
@@ -9254,6 +9345,7 @@ function DiagnosticsPanel({
|
|
|
9254
9345
|
StrengthsList,
|
|
9255
9346
|
{
|
|
9256
9347
|
comparison,
|
|
9348
|
+
mode,
|
|
9257
9349
|
perModel,
|
|
9258
9350
|
scores
|
|
9259
9351
|
}
|
|
@@ -9263,6 +9355,7 @@ function DiagnosticsPanel({
|
|
|
9263
9355
|
WeaknessesList,
|
|
9264
9356
|
{
|
|
9265
9357
|
comparison,
|
|
9358
|
+
mode,
|
|
9266
9359
|
perModel,
|
|
9267
9360
|
scores
|
|
9268
9361
|
}
|