npm - @sanity/ailf-studio - Versions diffs - 1.0.0 → 1.1.1 - Mend

@sanity/ailf-studio 1.0.0 → 1.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -2585,8 +2585,8 @@ var taskSchema = defineType5({
           type: "boolean"
         }),
         defineField5({
-          description: 'Rubric mode for baseline. "abbreviated" uses a shorter rubric, "full" uses the same rubric as gold, "none" skips rubric grading.',
-          initialValue: "abbreviated",
+          description: 'Rubric mode for baseline. "full" uses the same rubric as gold, "abbreviated" uses a shorter rubric, "none" skips rubric grading.',
+          initialValue: "full",
           name: "rubric",
           options: {
             list: [
@@ -3064,14 +3064,14 @@ import {
   Box as Box25,
   Button as Button9,
   Container,
-  Flex as Flex30,
+  Flex as Flex31,
   Stack as Stack34,
   Tab as Tab2,
   TabList as TabList2,
   TabPanel as TabPanel2,
-  Text as Text40
+  Text as Text41
 } from "@sanity/ui";
-import { useCallback as useCallback25 } from "react";
+import { useCallback as useCallback27 } from "react";
 import { useRouter as useRouter3 } from "sanity/router";
 // src/lib/help-context.ts
@@ -3305,7 +3305,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
   {
     "id": "scoring-model",
     "title": "Understanding Scores",
-    "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n  Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n  and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n  needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task:\n\n```\nTotal = Task Completion \xD7 0.50 + Code Correctness \xD7 0.25 + Doc Coverage \xD7 0.25\n```\n\nThis weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range  | Interpretation                                                    |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100**   | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379**    | Needs attention \u2014 there may be gaps in specific dimensions        |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area             |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n  alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n  prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n  docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. 
Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n  reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
+    "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n  Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n  and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n  needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs):    Total = Task \xD7 0.50 + Code \xD7 0.25 + Docs \xD7 0.25\nBaseline (no docs):  Total = Task \xD7 0.60 + Code \xD7 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling \u2212 floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range  | Interpretation                                                    |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100**   | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379**    | Needs attention \u2014 there may be gaps in specific dimensions        |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area             |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. 
This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n  alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n  prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n  docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n  reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
     "source": "docs/help/scoring-model.md",
     "related": [
       "three-layer",
@@ -3337,7 +3337,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
   {
     "id": "glossary",
     "title": "Glossary",
-    "body": "**Overall Score**\n: A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Score without any documentation. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.\n\n**Score**\n: Weighted score for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? 
Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. 
The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. 
Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. 
High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. 
Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. 
Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
+    "body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%. The floor uses a different profile (Task \xD7 60% + Code \xD7 40%, no Doc Coverage).\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. 
Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
     "source": "packages/studio/src/glossary.ts",
     "tags": [
       "reference",
@@ -3386,23 +3386,23 @@ import { useClient as useClient3 } from "sanity";
 // src/glossary.ts
 var GLOSSARY = {
   // -- Overview stats -------------------------------------------------------
-  overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
-  docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.",
+  overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
+  docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.",
   actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.",
   retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.",
   infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.",
   // -- Three-layer decomposition columns ------------------------------------
-  floor: "Score without any documentation. This tells you what the model already knows from its training data.",
+  floor: "Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.",
   ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.",
   actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.",
   retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.",
   efficiency: "What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).",
   invertedRetGap: "\u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.",
   // -- Per-area score columns -----------------------------------------------
-  score: "Weighted score for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.",
+  score: "Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%. The floor uses a different profile (Task \xD7 60% + Code \xD7 40%, no Doc Coverage).",
   taskCompletion: "Can the LLM implement the requested feature? Graded 0\u2013100.",
   codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.",
-  docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100.",
+  docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.",
   tests: "Number of test cases in this feature area.",
   // -- Comparison deltas ----------------------------------------------------
   overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.",
@@ -3429,6 +3429,8 @@ var GLOSSARY = {
   efficiencyAnomalies: "Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.",
   docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.",
   retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.",
+  // -- Model breakdown --------------------------------------------------------
+  modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.",
   // -- Strengths (positive diagnostics) ---------------------------------------
   strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.",
   // -- Recommendations / gap analysis ----------------------------------------
@@ -3458,7 +3460,7 @@ var GLOSSARY = {
   sourceBranch: "Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.",
   sourceLocal: "Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.",
   // -- Report list columns ----------------------------------------------------
-  reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
+  reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
   reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.",
   reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.",
   // -- Mode values -----------------------------------------------------------
@@ -5591,10 +5593,10 @@ function LatestReports({
 import { ArrowLeftIcon as ArrowLeftIcon3 } from "@sanity/icons";
 import {
   Badge as Badge7,
-  Box as Box23,
+  Box as Box22,
   Button as Button8,
   Flex as Flex26,
-  Stack as Stack29,
+  Stack as Stack28,
   Tab,
   TabList,
   TabPanel,
@@ -5602,10 +5604,10 @@ import {
   Tooltip as Tooltip8
 } from "@sanity/ui";
 import {
-  useCallback as useCallback23,
+  useCallback as useCallback25,
   useEffect as useEffect9,
-  useMemo as useMemo7,
-  useState as useState18
+  useMemo as useMemo9,
+  useState as useState19
 } from "react";
 import { useClient as useClient10 } from "sanity";
@@ -5971,21 +5973,6 @@ function scoreBg(score) {
 function scoreBorder(score) {
   return COLORS[colorForScore(score)].border;
 }
-function scoreBoxStyle(score) {
-  const key = colorForScore(score);
-  return {
-    alignItems: "center",
-    backgroundColor: COLORS[key].bg,
-    borderRadius: 6,
-    color: COLORS[key].text,
-    display: "flex",
-    fontFamily: "var(--font-code-size)",
-    fontWeight: 700,
-    height: 48,
-    justifyContent: "center",
-    width: 48
-  };
-}
 function barFillColor(score) {
   const key = colorForScore(score);
   switch (key) {
@@ -6055,160 +6042,180 @@ function DiagnosticsOverview({
   );
   const weak = scores.filter((s) => s.totalScore < SCORE_CAUTION);
   const negativeDocLiftCount = scores.filter((s) => s.docLift < 0).length;
+  const hasAgenticData = overall.avgActualScore != null;
   const improved = comparison?.improved ?? [];
   const regressed = comparison?.regressed ?? [];
   const unchanged = comparison?.unchanged ?? [];
   const hasComparison = improved.length > 0 || regressed.length > 0 || unchanged.length > 0;
   return /* @__PURE__ */ jsxs21(Stack18, { space: 4, children: [
-    /* @__PURE__ */ jsxs21(
-      "div",
-      {
-        style: {
-          display: "grid",
-          gap: 12,
-          gridTemplateColumns: "repeat(4, 1fr)"
-        },
-        children: [
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx25(
-            ScoreCard,
-            {
-              delta: comparison?.deltas.overall,
-              label: "AVG SCORE",
-              sentiment: scoreSentiment(overall.avgScore),
-              subtitle: "Overall quality score",
-              value: Math.round(overall.avgScore)
-            }
-          ) }),
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
-            ScoreCard,
-            {
-              delta: comparison?.deltas.docLift,
-              label: "AVG DOC LIFT",
-              sentiment: docLiftSentiment(overall.avgDocLift),
-              subtitle: "Improvement with docs",
-              value: Math.round(overall.avgDocLift)
-            }
-          ) }),
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.ceiling, children: /* @__PURE__ */ jsx25(
-            ScoreCard,
-            {
-              label: "AVG CEILING",
-              sentiment: scoreSentiment(overall.avgCeilingScore ?? 0),
-              subtitle: "Best case performance",
-              value: Math.round(overall.avgCeilingScore ?? 0)
-            }
-          ) }),
-          overall.avgInfrastructureEfficiency != null ? /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency, children: /* @__PURE__ */ jsx25(
-            ScoreCard,
-            {
-              label: "EFFICIENCY",
-              sentiment: efficiencySentiment(
-                overall.avgInfrastructureEfficiency
-              ),
-              subtitle: "Infra utilization",
-              suffix: "%",
-              value: Math.round(overall.avgInfrastructureEfficiency * 100)
-            }
-          ) }) : /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
-            ScoreCard,
-            {
-              label: "AVG FLOOR",
-              sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
-              subtitle: "Model-only baseline",
-              value: Math.round(overall.avgFloorScore ?? 0)
-            }
-          ) })
-        ]
-      }
-    ),
-    /* @__PURE__ */ jsxs21(
-      "div",
-      {
-        style: { display: "grid", gap: 12, gridTemplateColumns: "1fr 1fr 1fr" },
-        children: [
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthStrong, children: /* @__PURE__ */ jsx25(
-            HealthCard,
-            {
-              color: strong.length > 0 ? "emerald" : "muted",
-              count: strong.length,
-              icon: /* @__PURE__ */ jsx25(CheckmarkCircleIcon, {}),
-              label: "Strong (80+)"
-            }
-          ) }),
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthAttention, children: /* @__PURE__ */ jsx25(
-            HealthCard,
-            {
-              color: attention.length === 0 ? "muted" : "amber",
-              count: attention.length,
-              icon: /* @__PURE__ */ jsx25(WarningOutlineIcon, {}),
-              label: "Attention (70-79)"
-            }
-          ) }),
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthWeak, children: /* @__PURE__ */ jsx25(
-            HealthCard,
-            {
-              color: weak.length === 0 ? "muted" : "red",
-              count: weak.length,
-              icon: /* @__PURE__ */ jsx25(ErrorOutlineIcon, {}),
-              label: "Weak (<70)"
-            }
-          ) })
-        ]
-      }
-    ),
-    /* @__PURE__ */ jsxs21(
-      "div",
-      {
-        style: {
-          display: "grid",
-          gap: 12,
-          gridTemplateColumns: "repeat(auto-fit, minmax(140px, 1fr))"
-        },
-        children: [
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
-            MetricCard,
-            {
-              label: "Avg Floor",
-              sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
-              value: String(Math.round(overall.avgFloorScore ?? 0))
-            }
-          ) }),
-          overall.avgActualScore != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.actualScore, children: /* @__PURE__ */ jsx25(
-            MetricCard,
-            {
-              label: "Avg Actual",
-              sentiment: scoreSentiment(overall.avgActualScore),
-              value: String(Math.round(overall.avgActualScore))
-            }
-          ) }),
-          overall.avgRetrievalGap != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.retrievalGap, children: /* @__PURE__ */ jsx25(
-            MetricCard,
-            {
-              label: "Avg Retrieval Gap",
-              sentiment: retrievalGapSentiment(overall.avgRetrievalGap),
-              value: overall.avgRetrievalGap.toFixed(1)
-            }
-          ) }),
-          /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
-            MetricCard,
-            {
-              label: "Negative Doc Lift",
-              sentiment: negativeDocLiftSentiment(negativeDocLiftCount),
-              value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
-            }
-          ) }),
-          totalTests != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests) }) }),
-          durationMs != null && durationMs > 0 && /* @__PURE__ */ jsx25(
-            HoverTip,
-            {
-              display: "block",
-              text: "Total wall-clock time for the evaluation pipeline run.",
-              children: /* @__PURE__ */ jsx25(MetricCard, { label: "Duration", value: formatDuration(durationMs) })
-            }
-          )
-        ]
-      }
-    ),
+    /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
+      /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Baseline" }) }),
+      /* @__PURE__ */ jsxs21(Stack18, { space: 3, padding: 3, children: [
+        /* @__PURE__ */ jsxs21(
+          "div",
+          {
+            style: {
+              display: "grid",
+              gap: 12,
+              gridTemplateColumns: "repeat(3, 1fr)"
+            },
+            children: [
+              /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx25(
+                ScoreCard,
+                {
+                  delta: comparison?.deltas.overall,
+                  label: "AVG SCORE",
+                  sentiment: scoreSentiment(overall.avgScore),
+                  subtitle: "With-docs ceiling",
+                  value: Math.round(overall.avgScore)
+                }
+              ) }),
+              /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
+                ScoreCard,
+                {
+                  delta: comparison?.deltas.docLift,
+                  label: "DOC LIFT",
+                  sentiment: docLiftSentiment(overall.avgDocLift),
+                  subtitle: "Improvement from docs",
+                  value: Math.round(overall.avgDocLift)
+                }
+              ) }),
+              /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
+                ScoreCard,
+                {
+                  label: "FLOOR",
+                  sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
+                  subtitle: "Without docs baseline",
+                  value: Math.round(overall.avgFloorScore ?? 0)
+                }
+              ) })
+            ]
+          }
+        ),
+        /* @__PURE__ */ jsxs21(
+          "div",
+          {
+            style: {
+              display: "grid",
+              gap: 12,
+              gridTemplateColumns: "repeat(3, 1fr)"
+            },
+            children: [
+              /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
+                MetricCard,
+                {
+                  label: "Negative Doc Lift",
+                  sentiment: negativeDocLiftSentiment(negativeDocLiftCount),
+                  value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
+                }
+              ) }),
+              /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests ?? 0) }) }),
+              durationMs != null && durationMs > 0 ? /* @__PURE__ */ jsx25(
+                HoverTip,
+                {
+                  display: "block",
+                  text: "Total wall-clock time for the evaluation pipeline run.",
+                  children: /* @__PURE__ */ jsx25(
+                    MetricCard,
+                    {
+                      label: "Duration",
+                      value: formatDuration(durationMs)
+                    }
+                  )
+                }
+              ) : /* @__PURE__ */ jsx25("div", {})
+            ]
+          }
+        )
+      ] })
+    ] }),
+    hasAgenticData && /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
+      /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Agent Performance" }) }),
+      /* @__PURE__ */ jsx25(Stack18, { space: 3, padding: 3, children: /* @__PURE__ */ jsxs21(
+        "div",
+        {
+          style: {
+            display: "grid",
+            gap: 12,
+            gridTemplateColumns: "repeat(3, 1fr)"
+          },
+          children: [
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.actualScore, children: /* @__PURE__ */ jsx25(
+              ScoreCard,
+              {
+                delta: comparison?.deltas.actualDelta,
+                label: "ACTUAL SCORE",
+                sentiment: scoreSentiment(overall.avgActualScore),
+                subtitle: "Agent-retrieved docs",
+                value: Math.round(overall.avgActualScore)
+              }
+            ) }),
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.retrievalGap, children: /* @__PURE__ */ jsx25(
+              ScoreCard,
+              {
+                label: "RETRIEVAL GAP",
+                sentiment: overall.avgRetrievalGap != null ? retrievalGapSentiment(overall.avgRetrievalGap) : void 0,
+                subtitle: "Lost to findability",
+                suffix: "pts",
+                value: overall.avgRetrievalGap != null ? Math.round(overall.avgRetrievalGap) : 0
+              }
+            ) }),
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency, children: /* @__PURE__ */ jsx25(
+              ScoreCard,
+              {
+                label: "EFFICIENCY",
+                sentiment: overall.avgInfrastructureEfficiency != null ? efficiencySentiment(overall.avgInfrastructureEfficiency) : void 0,
+                subtitle: "Doc quality reaching agents",
+                suffix: "%",
+                value: overall.avgInfrastructureEfficiency != null ? Math.round(overall.avgInfrastructureEfficiency * 100) : 0
+              }
+            ) })
+          ]
+        }
+      ) })
+    ] }),
+    /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
+      /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Area Health" }) }),
+      /* @__PURE__ */ jsx25(Box15, { padding: 3, children: /* @__PURE__ */ jsxs21(
+        "div",
+        {
+          style: {
+            display: "grid",
+            gap: 12,
+            gridTemplateColumns: "1fr 1fr 1fr"
+          },
+          children: [
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthStrong, children: /* @__PURE__ */ jsx25(
+              HealthCard,
+              {
+                color: strong.length > 0 ? "emerald" : "muted",
+                count: strong.length,
+                icon: /* @__PURE__ */ jsx25(CheckmarkCircleIcon, {}),
+                label: "Strong (80+)"
+              }
+            ) }),
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthAttention, children: /* @__PURE__ */ jsx25(
+              HealthCard,
+              {
+                color: attention.length === 0 ? "muted" : "amber",
+                count: attention.length,
+                icon: /* @__PURE__ */ jsx25(WarningOutlineIcon, {}),
+                label: "Attention (70-79)"
+              }
+            ) }),
+            /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthWeak, children: /* @__PURE__ */ jsx25(
+              HealthCard,
+              {
+                color: weak.length === 0 ? "muted" : "red",
+                count: weak.length,
+                icon: /* @__PURE__ */ jsx25(ErrorOutlineIcon, {}),
+                label: "Weak (<70)"
+              }
+            ) })
+          ]
+        }
+      ) })
+    ] }),
     hasComparison && /* @__PURE__ */ jsxs21(Box15, { style: neutralCardStyle, children: [
       /* @__PURE__ */ jsx25(
         Box15,
@@ -6289,6 +6296,26 @@ function DiagnosticsOverview({
     ] })
   ] });
 }
+var sectionWrapperStyle = {
+  border: "1px solid var(--card-border-color)",
+  borderRadius: 6,
+  overflow: "hidden"
+};
+var sectionHeaderStyle = {
+  borderBottom: "1px solid var(--card-border-color)"
+};
+function SectionLabel({ label }) {
+  return /* @__PURE__ */ jsx25(
+    Text23,
+    {
+      muted: true,
+      size: 1,
+      style: { letterSpacing: "0.08em", textTransform: "uppercase" },
+      weight: "semibold",
+      children: label
+    }
+  );
+}
 function ScoreCard({
   delta,
   label,
@@ -7722,11 +7749,12 @@ function ReportHeader({
 }
 // src/components/report-detail/StrengthsList.tsx
+import { useMemo as useMemo8 } from "react";
 import { CheckmarkCircleIcon as CheckmarkCircleIcon2, SearchIcon as SearchIcon6 } from "@sanity/icons";
-import { Box as Box20, Flex as Flex23, Stack as Stack26, Text as Text31 } from "@sanity/ui";
+import { Box as Box20, Flex as Flex24, Stack as Stack26, Text as Text32 } from "@sanity/ui";
-// src/components/report-detail/StrengthsTable.tsx
-import {
+// src/components/report-detail/AreaScoresGrid.tsx
+import React3, {
   useCallback as useCallback22,
   useMemo as useMemo6,
   useState as useState17
@@ -7739,14 +7767,27 @@ function tableTier2(width) {
   if (width >= 600) return "compact";
   return "narrow";
 }
-var GRID3 = {
-  full: "120px 1fr 1fr 1fr 1fr 80px 72px 72px",
-  compact: "96px 1fr 1fr 1fr 1fr 80px",
-  narrow: "56px 1fr 1fr 1fr 1fr"
-};
-function StrengthsTable({ scores, perArea }) {
+function gridColumns(tier, hasActual) {
+  switch (tier) {
+    case "full":
+      return hasActual ? "120px 1fr 1fr 1fr 1fr 80px 72px 72px" : "120px 1fr 1fr 1fr 1fr 80px 72px";
+    case "compact":
+      return "96px 1fr 1fr 1fr 1fr 80px";
+    case "narrow":
+      return "56px 1fr 1fr 1fr 1fr";
+  }
+}
+function AreaScoresGrid({
+  scores,
+  perArea,
+  perModel
+}) {
   const { ref: containerRef, width } = useContainerWidth();
   const tier = tableTier2(width);
+  const hasActual = useMemo6(
+    () => scores.some((s) => s.actualScore != null),
+    [scores]
+  );
   const [sortField, setSortField] = useState17("score");
   const [sortDir, setSortDir] = useState17("desc");
   const handleSort = useCallback22(
@@ -7781,6 +7822,24 @@ function StrengthsTable({ scores, perArea }) {
       }
     });
   }, [scores, sortField, sortDir]);
+  const modelScoresByFeature = useMemo6(() => {
+    if (!perModel) return null;
+    const map = /* @__PURE__ */ new Map();
+    for (const model of perModel) {
+      for (const score of model.scores) {
+        let list = map.get(score.feature);
+        if (!list) {
+          list = [];
+          map.set(score.feature, list);
+        }
+        list.push({ label: model.label, scores: score });
+      }
+    }
+    for (const list of map.values()) {
+      list.sort((a, b) => a.label.localeCompare(b.label));
+    }
+    return map;
+  }, [perModel]);
   return /* @__PURE__ */ jsxs29(Box19, { ref: containerRef, style: { ...neutralCardStyle, overflow: "auto" }, children: [
     /* @__PURE__ */ jsxs29(
       "div",
@@ -7789,7 +7848,7 @@ function StrengthsTable({ scores, perArea }) {
           borderBottom: "1px solid var(--card-border-color)",
           display: "grid",
           gap: "0 12px",
-          gridTemplateColumns: GRID3[tier],
+          gridTemplateColumns: gridColumns(tier, hasActual),
           padding: "12px 16px 8px"
         },
         children: [
@@ -7800,7 +7859,7 @@ function StrengthsTable({ scores, perArea }) {
               direction: sortDir,
               label: "Score",
               onClick: () => handleSort("score"),
-              tooltip: GLOSSARY.score
+              tooltip: `${GLOSSARY.score} This is the ceiling score \u2014 with gold-standard docs injected.`
             }
           ),
           /* @__PURE__ */ jsx41(
@@ -7852,27 +7911,153 @@ function StrengthsTable({ scores, perArea }) {
               tooltip: GLOSSARY.docLift
             }
           ),
-          tier === "full" && /* @__PURE__ */ jsxs29(Fragment11, { children: [
-            /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
-            /* @__PURE__ */ jsx41(ColHeader3, { label: "Ceil", tooltip: GLOSSARY.ceiling })
-          ] })
+          tier === "full" && /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
+          tier === "full" && hasActual && /* @__PURE__ */ jsx41(ColHeader3, { label: "Actual", tooltip: GLOSSARY.actualScore })
         ]
       }
     ),
-    sorted.map((area) => /* @__PURE__ */ jsx41(
-      AreaRow,
-      {
-        area,
-        delta: perArea?.[area.feature],
-        tier
-      },
-      area.feature
-    ))
+    sorted.map((area) => /* @__PURE__ */ jsxs29(React3.Fragment, { children: [
+      /* @__PURE__ */ jsx41(
+        AreaRow,
+        {
+          area,
+          delta: perArea?.[area.feature],
+          hasActual,
+          tier
+        }
+      ),
+      modelScoresByFeature && /* @__PURE__ */ jsx41(
+        ModelSubRows,
+        {
+          hasActual,
+          models: modelScoresByFeature.get(area.feature),
+          tier
+        }
+      )
+    ] }, area.feature))
   ] });
 }
+function ModelSubRows({
+  hasActual,
+  models,
+  tier
+}) {
+  if (!models || models.length === 0) return null;
+  return /* @__PURE__ */ jsx41(Fragment11, { children: models.map((entry) => /* @__PURE__ */ jsx41(
+    ModelRow,
+    {
+      hasActual,
+      label: entry.label,
+      scores: entry.scores,
+      tier
+    },
+    entry.label
+  )) });
+}
+function ModelRow({
+  hasActual,
+  label,
+  scores,
+  tier
+}) {
+  const isNarrow = tier === "narrow";
+  return /* @__PURE__ */ jsxs29(
+    "div",
+    {
+      style: {
+        alignItems: "center",
+        backgroundColor: "var(--card-bg2-color, rgba(255,255,255,0.02))",
+        borderBottom: "1px solid var(--card-border-color)",
+        display: "grid",
+        gap: "0 12px",
+        gridTemplateColumns: gridColumns(tier, hasActual),
+        padding: isNarrow ? "6px 12px 6px 20px" : "6px 16px 6px 28px"
+      },
+      children: [
+        /* @__PURE__ */ jsx41(Flex22, { align: "center", children: /* @__PURE__ */ jsx41(
+          Text30,
+          {
+            size: 1,
+            style: {
+              color: scoreColor(scores.totalScore),
+              fontFamily: "var(--font-code-size, monospace)",
+              fontWeight: 600
+            },
+            children: Math.round(scores.totalScore)
+          }
+        ) }),
+        /* @__PURE__ */ jsx41(Flex22, { align: "center", gap: 2, children: /* @__PURE__ */ jsx41(Text30, { muted: true, size: 1, children: label }) }),
+        /* @__PURE__ */ jsx41(
+          DimCell,
+          {
+            area: label,
+            dim: "Task Completion",
+            size: "small",
+            value: scores.taskCompletion
+          }
+        ),
+        /* @__PURE__ */ jsx41(
+          DimCell,
+          {
+            area: label,
+            dim: "Code Correctness",
+            size: "small",
+            value: scores.codeCorrectness
+          }
+        ),
+        /* @__PURE__ */ jsx41(
+          DimCell,
+          {
+            area: label,
+            dim: "Doc Coverage",
+            size: "small",
+            value: scores.docCoverage
+          }
+        ),
+        !isNarrow && /* @__PURE__ */ jsxs29(
+          Text30,
+          {
+            size: 1,
+            style: {
+              color: scores.docLift >= 5 ? "#34d399" : scores.docLift < 0 ? "#f87171" : "var(--card-muted-fg-color)",
+              fontFamily: "var(--font-code-size, monospace)",
+              fontWeight: 500
+            },
+            children: [
+              scores.docLift > 0 ? "+" : "",
+              scores.docLift
+            ]
+          }
+        ),
+        tier === "full" && /* @__PURE__ */ jsx41(
+          Text30,
+          {
+            muted: true,
+            size: 1,
+            style: { fontFamily: "var(--font-code-size, monospace)" },
+            children: Math.round(scores.floorScore ?? 0)
+          }
+        ),
+        tier === "full" && hasActual && /* @__PURE__ */ jsx41(
+          Text30,
+          {
+            size: 1,
+            style: {
+              color: scores.actualScore != null ? scoreColor(scores.actualScore) : "var(--card-muted-fg-color)",
+              fontFamily: "var(--font-code-size, monospace)",
+              fontWeight: 500
+            },
+            children: scores.actualScore != null ? Math.round(scores.actualScore) : "\u2014"
+          }
+        )
+      ]
+    }
+  );
+}
 function AreaRow({
   area,
   delta,
+  hasActual,
   tier
 }) {
   const isNarrow = tier === "narrow";
@@ -7884,7 +8069,7 @@ function AreaRow({
         borderBottom: "1px solid var(--card-border-color)",
         display: "grid",
         gap: "0 12px",
-        gridTemplateColumns: GRID3[tier],
+        gridTemplateColumns: gridColumns(tier, hasActual),
         padding: isNarrow ? "8px 12px" : "10px 16px"
       },
       children: [
@@ -7894,7 +8079,7 @@ function AreaRow({
             {
               text: /* @__PURE__ */ jsxs29(Text30, { size: 2, style: { lineHeight: 1.5 }, children: [
                 /* @__PURE__ */ jsx41("span", { style: { fontWeight: 600 }, children: area.feature }),
-                " composite:",
+                " ceiling score:",
                 " ",
                 /* @__PURE__ */ jsx41(
                   "span",
@@ -7910,7 +8095,8 @@ function AreaRow({
                 /* @__PURE__ */ jsx41("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
                 ".",
                 " ",
-                GLOSSARY.score
+                GLOSSARY.score,
+                " This is the ceiling \u2014 with gold-standard docs injected."
               ] }),
               children: /* @__PURE__ */ jsx41(
                 "div",
@@ -8029,13 +8215,22 @@ function AreaRow({
             children: Math.round(area.floorScore ?? 0)
           }
         ),
-        tier === "full" && /* @__PURE__ */ jsx41(
-          Text30,
+        tier === "full" && hasActual && /* @__PURE__ */ jsx41(
+          HoverTip,
           {
-            muted: true,
-            size: 2,
-            style: { fontFamily: "var(--font-code-size, monospace)" },
-            children: Math.round(area.ceilingScore ?? 0)
+            text: area.actualScore != null ? `${area.feature} actual score: ${Math.round(area.actualScore)}/100. ${GLOSSARY.actualScore}` : `No agentic data for ${area.feature}.`,
+            children: /* @__PURE__ */ jsx41(
+              Text30,
+              {
+                size: 2,
+                style: {
+                  color: area.actualScore != null ? scoreColor(area.actualScore) : "var(--card-muted-fg-color)",
+                  fontFamily: "var(--font-code-size, monospace)",
+                  fontWeight: 600
+                },
+                children: area.actualScore != null ? Math.round(area.actualScore) : "\u2014"
+              }
+            )
           }
         )
       ]
@@ -8045,6 +8240,7 @@ function AreaRow({
 function DimCell({
   area,
   dim,
+  size = "normal",
   value
 }) {
   const glossary = {
@@ -8052,6 +8248,8 @@ function DimCell({
     "Code Correctness": GLOSSARY.codeCorrectness,
     "Doc Coverage": GLOSSARY.docCoverage
   };
+  const textSize = size === "small" ? 0 : 1;
+  const barHeight = size === "small" ? 3 : 4;
   return /* @__PURE__ */ jsx41(
     HoverTip,
     {
@@ -8082,7 +8280,7 @@ function DimCell({
         /* @__PURE__ */ jsx41(
           Text30,
           {
-            size: 1,
+            size: textSize,
             style: {
               color: scoreColor(value),
               fontFamily: "var(--font-code-size, monospace)",
@@ -8097,7 +8295,7 @@ function DimCell({
             style: {
               backgroundColor: "var(--card-border-color)",
               borderRadius: 999,
-              height: 4,
+              height: barHeight,
               overflow: "hidden",
               width: "100%"
             },
@@ -8170,51 +8368,219 @@ function ColHeader3({
   ] });
 }
-// src/components/report-detail/StrengthsList.tsx
+// src/components/report-detail/ModelSelector.tsx
+import { useCallback as useCallback23 } from "react";
+import { Flex as Flex23, Text as Text31 } from "@sanity/ui";
 import { jsx as jsx42, jsxs as jsxs30 } from "react/jsx-runtime";
-function StrengthsList({ scores, comparison }) {
-  const retrievalSuccesses = scores.filter(
+// Style shared by every pill in the model selector: a fully rounded,
+// single-line, non-selectable chip with a pointer cursor.
+var pillBase = {
+  borderColor: "var(--card-border-color)",
+  borderRadius: 999,
+  borderStyle: "solid",
+  borderWidth: 1,
+  cursor: "pointer",
+  fontSize: 13,
+  fontWeight: 500,
+  lineHeight: 1,
+  padding: "5px 12px",
+  transition: "all 150ms ease",
+  userSelect: "none",
+  whiteSpace: "nowrap"
+};
+// Unselected pill: the base style with a transparent background and muted text.
+var pillDefault = {
+  ...pillBase,
+  backgroundColor: "transparent",
+  color: "var(--card-muted-fg-color)"
+};
+// Selected pill: the base style with a green tint; overrides the base borderColor.
+var pillSelected = {
+  ...pillBase,
+  backgroundColor: "rgba(16,185,129,0.15)",
+  borderColor: "rgba(16,185,129,0.40)",
+  color: "#34d399"
+};
+function ModelSelector({
+  models,
+  selection,
+  onChange
+}) {
+  return /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 1, wrap: "wrap", children: [
+    /* @__PURE__ */ jsx42(
+      Pill2,
+      {
+        isSelected: selection === null,
+        label: "All Models",
+        onClick: () => onChange(null)
+      }
+    ),
+    models.map((model) => /* @__PURE__ */ jsx42(
+      Pill2,
+      {
+        isSelected: selection === model.modelId,
+        label: model.label,
+        onClick: () => onChange(model.modelId)
+      },
+      model.modelId
+    )),
+    /* @__PURE__ */ jsx42(
+      "div",
+      {
+        style: {
+          backgroundColor: "var(--card-border-color)",
+          height: 16,
+          marginInline: 4,
+          width: 1
+        }
+      }
+    ),
+    /* @__PURE__ */ jsx42(
+      Pill2,
+      {
+        isSelected: selection === "expanded",
+        label: "By Model",
+        onClick: () => onChange("expanded")
+      }
+    )
+  ] });
+}
+function Pill2({
+  isSelected,
+  label,
+  onClick
+}) {
+  const handleKeyDown = useCallback23(
+    (e) => {
+      if (e.key === "Enter" || e.key === " ") {
+        e.preventDefault();
+        onClick();
+      }
+    },
+    [onClick]
+  );
+  return /* @__PURE__ */ jsx42(
+    "span",
+    {
+      onClick,
+      onKeyDown: handleKeyDown,
+      role: "button",
+      style: isSelected ? pillSelected : pillDefault,
+      tabIndex: 0,
+      children: /* @__PURE__ */ jsx42(
+        Text31,
+        {
+          size: 1,
+          style: {
+            color: "inherit",
+            fontWeight: "inherit"
+          },
+          children: label
+        }
+      )
+    }
+  );
+}
+// src/components/report-detail/useModelSelection.ts
+import { useCallback as useCallback24, useMemo as useMemo7, useState as useState18 } from "react";
+function useModelSelection({
+  scores,
+  perModel
+}) {
+  const [selection, setSelection] = useState18(null);
+  const onSelectionChange = useCallback24((next) => {
+    setSelection(next);
+  }, []);
+  const isExpanded = selection === "expanded";
+  const hasModels = perModel != null && perModel.length > 1;
+  const resolvedScores = useMemo7(() => {
+    if (isExpanded || selection === null || !perModel) return scores;
+    const model = perModel.find((m) => m.modelId === selection);
+    return model?.scores ?? scores;
+  }, [isExpanded, selection, perModel, scores]);
+  const expandedPerModel = isExpanded ? perModel ?? void 0 : void 0;
+  return {
+    selection,
+    onSelectionChange,
+    resolvedScores,
+    isExpanded,
+    hasModels,
+    expandedPerModel
+  };
+}
+// src/components/report-detail/StrengthsList.tsx
+import { jsx as jsx43, jsxs as jsxs31 } from "react/jsx-runtime";
+function StrengthsList({
+  scores,
+  comparison,
+  perModel
+}) {
+  const {
+    selection,
+    onSelectionChange,
+    resolvedScores,
+    hasModels,
+    expandedPerModel
+  } = useModelSelection({ scores, perModel });
+  const displayedScores = useMemo8(
+    () => resolvedScores.filter((s) => s.totalScore >= SCORE_CAUTION),
+    [resolvedScores]
+  );
+  const retrievalSuccesses = displayedScores.filter(
     (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency >= EFFICIENCY_POSITIVE && !s.invertedRetrievalGap
   ).sort(
     (a, b) => (b.infrastructureEfficiency ?? 0) - (a.infrastructureEfficiency ?? 0)
   );
-  if (scores.length === 0) return null;
-  return /* @__PURE__ */ jsxs30(Stack26, { space: 5, children: [
-    /* @__PURE__ */ jsxs30(Stack26, { space: 3, children: [
-      /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 2, children: [
-        /* @__PURE__ */ jsx42(CheckmarkCircleIcon2, { style: { color: "#34d399" } }),
-        /* @__PURE__ */ jsx42(Text31, { size: 2, weight: "medium", children: "All Areas \u2014 Scores & Doc Lift" }),
-        /* @__PURE__ */ jsx42(InfoTip, { text: GLOSSARY.strengths })
+  if (displayedScores.length === 0) return null;
+  return /* @__PURE__ */ jsxs31(Stack26, { space: 5, children: [
+    /* @__PURE__ */ jsxs31(Stack26, { space: 3, children: [
+      /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, wrap: "wrap", children: [
+        /* @__PURE__ */ jsx43(CheckmarkCircleIcon2, { style: { color: "#34d399" } }),
+        /* @__PURE__ */ jsx43(Text32, { size: 2, weight: "medium", children: "Strong Areas (70+)" }),
+        /* @__PURE__ */ jsx43(InfoTip, { text: GLOSSARY.strengths }),
+        hasModels && /* @__PURE__ */ jsx43(Box20, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx43(
+          ModelSelector,
+          {
+            models: perModel,
+            onChange: onSelectionChange,
+            selection
+          }
+        ) })
       ] }),
-      /* @__PURE__ */ jsx42(StrengthsTable, { perArea: comparison?.deltas?.perArea, scores })
+      /* @__PURE__ */ jsx43(
+        AreaScoresGrid,
+        {
+          perArea: comparison?.deltas?.perArea,
+          perModel: expandedPerModel,
+          scores: displayedScores
+        }
+      )
     ] }),
-    retrievalSuccesses.length > 0 && /* @__PURE__ */ jsxs30(Box20, { style: neutralCardStyle, children: [
-      /* @__PURE__ */ jsx42(
+    retrievalSuccesses.length > 0 && /* @__PURE__ */ jsxs31(Box20, { style: neutralCardStyle, children: [
+      /* @__PURE__ */ jsx43(
         Box20,
         {
           padding: 4,
           style: { borderBottom: "1px solid var(--card-border-color)" },
-          children: /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 2, children: [
-            /* @__PURE__ */ jsx42(SearchIcon6, { style: { color: "#34d399" } }),
-            /* @__PURE__ */ jsxs30(Text31, { size: 2, weight: "medium", children: [
+          children: /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, children: [
+            /* @__PURE__ */ jsx43(SearchIcon6, { style: { color: "#34d399" } }),
+            /* @__PURE__ */ jsxs31(Text32, { size: 2, weight: "medium", children: [
               "Retrieval Successes (",
               Math.round(EFFICIENCY_POSITIVE * 100),
               "%+ efficiency)"
             ] }),
-            /* @__PURE__ */ jsx42(InfoTip, { text: GLOSSARY.retrievalExcellence })
+            /* @__PURE__ */ jsx43(InfoTip, { text: GLOSSARY.retrievalExcellence })
           ] })
         }
       ),
-      /* @__PURE__ */ jsx42(Stack26, { children: retrievalSuccesses.map((area, i) => /* @__PURE__ */ jsxs30(
-        Flex23,
+      /* @__PURE__ */ jsx43(Stack26, { children: retrievalSuccesses.map((area, i) => /* @__PURE__ */ jsxs31(
+        Flex24,
         {
           align: "center",
           justify: "space-between",
           padding: 4,
           style: i > 0 ? dividerStyle : void 0,
           children: [
-            /* @__PURE__ */ jsx42(Text31, { size: 2, children: area.feature }),
-            /* @__PURE__ */ jsx42(
+            /* @__PURE__ */ jsx43(Text32, { size: 2, children: area.feature }),
+            /* @__PURE__ */ jsx43(
               "span",
               {
                 style: {
@@ -8240,392 +8606,72 @@ function StrengthsList({ scores, comparison }) {
 import {
   ErrorOutlineIcon as ErrorOutlineIcon3,
   SearchIcon as SearchIcon7,
-  WarningOutlineIcon as WarningOutlineIcon4,
+  WarningOutlineIcon as WarningOutlineIcon3,
   BoltIcon as BoltIcon2,
   ArrowDownIcon as ArrowDownIcon2
 } from "@sanity/icons";
-import { Box as Box22, Flex as Flex25, Stack as Stack28, Text as Text33 } from "@sanity/ui";
-// src/components/report-detail/AreaScoreRow.tsx
-import { WarningOutlineIcon as WarningOutlineIcon3 } from "@sanity/icons";
-import { Box as Box21, Flex as Flex24, Stack as Stack27, Text as Text32 } from "@sanity/ui";
-import { jsx as jsx43, jsxs as jsxs31 } from "react/jsx-runtime";
-function AreaScoreRow({ area, showTrend }) {
-  return /* @__PURE__ */ jsx43(Box21, { style: { ...neutralCardStyle, padding: 20 }, children: /* @__PURE__ */ jsxs31(Stack27, { space: 4, children: [
-    /* @__PURE__ */ jsxs31(Flex24, { align: "flex-start", gap: 3, justify: "space-between", wrap: "wrap", children: [
-      /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 3, children: [
-        /* @__PURE__ */ jsx43(
-          HoverTip,
-          {
-            text: /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
-              /* @__PURE__ */ jsx43("span", { style: tipBold, children: area.feature }),
-              " composite score:",
-              " ",
-              /* @__PURE__ */ jsx43(
-                "span",
-                {
-                  style: { ...tipValue, color: scoreColor(area.totalScore) },
-                  children: Math.round(area.totalScore)
-                }
-              ),
-              /* @__PURE__ */ jsx43("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
-              ". ",
-              GLOSSARY.score
-            ] }),
-            children: /* @__PURE__ */ jsx43(Box21, { style: scoreBoxStyle(area.totalScore), children: /* @__PURE__ */ jsx43("span", { style: { fontSize: 20 }, children: Math.round(area.totalScore) }) })
-          }
-        ),
-        /* @__PURE__ */ jsxs31(Stack27, { space: 2, children: [
-          /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, wrap: "wrap", children: [
-            /* @__PURE__ */ jsx43(Text32, { size: 3, weight: "semibold", children: area.feature }),
-            area.negativeDocLift && /* @__PURE__ */ jsx43(HoverTip, { text: GLOSSARY.docsHurt, children: /* @__PURE__ */ jsxs31(
-              "span",
-              {
-                style: {
-                  alignItems: "center",
-                  backgroundColor: "rgba(239,68,68,0.2)",
-                  borderRadius: 4,
-                  color: "#f87171",
-                  display: "inline-flex",
-                  fontSize: 13,
-                  gap: 4,
-                  padding: "3px 8px"
-                },
-                children: [
-                  /* @__PURE__ */ jsx43(WarningOutlineIcon3, {}),
-                  "Docs Hurt"
-                ]
-              }
-            ) }),
-            area.invertedRetrievalGap && /* @__PURE__ */ jsx43(HoverTip, { text: GLOSSARY.invertedRetGap, children: /* @__PURE__ */ jsx43(
-              "span",
-              {
-                style: {
-                  backgroundColor: "rgba(245,158,11,0.2)",
-                  borderRadius: 4,
-                  color: "#fbbf24",
-                  fontSize: 13,
-                  padding: "3px 8px"
-                },
-                children: "Inverted Retrieval"
-              }
-            ) })
-          ] }),
-          /* @__PURE__ */ jsxs31(Text32, { muted: true, size: 2, children: [
-            area.testCount,
-            " test",
-            area.testCount === 1 ? "" : "s"
-          ] })
-        ] })
-      ] }),
-      showTrend && /* @__PURE__ */ jsx43(
-        "span",
-        {
-          style: {
-            backgroundColor: showTrend === "improved" ? "rgba(16,185,129,0.2)" : showTrend === "regressed" ? "rgba(239,68,68,0.2)" : "var(--card-muted-bg-color)",
-            borderRadius: 4,
-            color: showTrend === "improved" ? "#34d399" : showTrend === "regressed" ? "#f87171" : "var(--card-muted-fg-color)",
-            fontSize: 13,
-            fontWeight: 500,
-            padding: "4px 10px"
-          },
-          children: showTrend
-        }
-      )
-    ] }),
-    /* @__PURE__ */ jsxs31(
-      "div",
-      {
-        style: {
-          display: "grid",
-          gap: 16,
-          gridTemplateColumns: "1fr 1fr 1fr"
-        },
-        children: [
-          /* @__PURE__ */ jsx43(
-            DimBar,
-            {
-              label: "Task Completion",
-              tip: dimBarTip(
-                area.feature,
-                "Task Completion",
-                area.taskCompletion,
-                GLOSSARY.taskCompletion
-              ),
-              value: area.taskCompletion
-            }
-          ),
-          /* @__PURE__ */ jsx43(
-            DimBar,
-            {
-              label: "Code Correctness",
-              tip: dimBarTip(
-                area.feature,
-                "Code Correctness",
-                area.codeCorrectness,
-                GLOSSARY.codeCorrectness
-              ),
-              value: area.codeCorrectness
-            }
-          ),
-          /* @__PURE__ */ jsx43(
-            DimBar,
-            {
-              label: "Doc Coverage",
-              tip: dimBarTip(
-                area.feature,
-                "Doc Coverage",
-                area.docCoverage,
-                GLOSSARY.docCoverage
-              ),
-              value: area.docCoverage
-            }
-          )
-        ]
-      }
-    ),
-    /* @__PURE__ */ jsxs31(Flex24, { gap: 5, style: { ...dividerStyle, paddingTop: 12 }, wrap: "wrap", children: [
-      /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          color: area.negativeDocLift ? "#f87171" : "#34d399",
-          label: "Doc Lift",
-          tip: metricTip(
-            area.feature,
-            "Doc Lift",
-            `${area.docLift > 0 ? "+" : ""}${area.docLift}`,
-            GLOSSARY.docLift
-          ),
-          value: `${area.docLift > 0 ? "+" : ""}${area.docLift}`
-        }
-      ),
-      /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          label: "Ceiling",
-          tip: metricTip(
-            area.feature,
-            "Ceiling",
-            String(Math.round(area.ceilingScore ?? 0)),
-            GLOSSARY.ceiling
-          ),
-          value: String(Math.round(area.ceilingScore ?? 0))
-        }
-      ),
-      /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          label: "Floor",
-          tip: metricTip(
-            area.feature,
-            "Floor",
-            String(Math.round(area.floorScore ?? 0)),
-            GLOSSARY.floor
-          ),
-          value: String(Math.round(area.floorScore ?? 0))
-        }
-      ),
-      area.actualScore != null && /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          label: "Actual",
-          tip: metricTip(
-            area.feature,
-            "Actual",
-            String(Math.round(area.actualScore)),
-            GLOSSARY.actualScore
-          ),
-          value: String(Math.round(area.actualScore))
-        }
-      ),
-      area.infrastructureEfficiency != null && /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          color: efficiencyColor(area.infrastructureEfficiency),
-          label: "Efficiency",
-          tip: metricTip(
-            area.feature,
-            "Efficiency",
-            formatPercent(area.infrastructureEfficiency),
-            GLOSSARY.infraEfficiency
-          ),
-          value: formatPercent(area.infrastructureEfficiency)
-        }
-      ),
-      area.retrievalGap != null && /* @__PURE__ */ jsx43(
-        MetricPair,
-        {
-          label: "Ret Gap",
-          tip: metricTip(
-            area.feature,
-            "Retrieval Gap",
-            String(area.retrievalGap),
-            GLOSSARY.retrievalGap
-          ),
-          value: String(area.retrievalGap)
-        }
-      )
-    ] })
-  ] }) });
-}
-var tipValue = {
-  fontFamily: "var(--font-code-size, monospace)",
-  fontWeight: 600
-};
-var tipBold = { fontWeight: 600 };
-function dimBarTip(area, dim, score, description) {
-  return /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
-    /* @__PURE__ */ jsx43("span", { style: tipBold, children: area }),
-    " \u2192 ",
-    /* @__PURE__ */ jsx43("span", { style: tipBold, children: dim }),
-    ":",
-    " ",
-    /* @__PURE__ */ jsx43("span", { style: { ...tipValue, color: scoreColor(score) }, children: Math.round(score) }),
-    /* @__PURE__ */ jsx43("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
-    ".",
-    " ",
-    description
-  ] });
-}
-function DimBar({
-  label,
-  value,
-  tip
-}) {
-  const bar = /* @__PURE__ */ jsxs31(Stack27, { space: 2, style: { flex: 1 }, children: [
-    /* @__PURE__ */ jsxs31(Flex24, { align: "center", justify: "space-between", children: [
-      /* @__PURE__ */ jsx43(Text32, { muted: true, size: 1, children: label }),
-      /* @__PURE__ */ jsx43(
-        "span",
-        {
-          style: {
-            color: scoreColor(value),
-            fontFamily: "var(--font-code-size, monospace)",
-            fontSize: 14,
-            fontWeight: 600
-          },
-          children: Math.round(value)
-        }
-      )
-    ] }),
-    /* @__PURE__ */ jsx43(
-      Box21,
-      {
-        style: {
-          backgroundColor: "var(--card-border-color)",
-          borderRadius: 999,
-          height: 6,
-          overflow: "hidden"
-        },
-        children: /* @__PURE__ */ jsx43(
-          Box21,
-          {
-            style: {
-              backgroundColor: barFillColor(value),
-              borderRadius: 999,
-              height: "100%",
-              transition: "width 0.3s",
-              width: `${Math.min(value, 100)}%`
-            }
-          }
-        )
-      }
-    )
-  ] });
-  if (tip) {
-    return /* @__PURE__ */ jsx43(HoverTip, { text: tip, children: bar });
-  }
-  return bar;
-}
-function metricTip(area, metric, displayValue, description) {
-  return /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
-    /* @__PURE__ */ jsx43("span", { style: tipBold, children: area }),
-    " \u2192",
-    " ",
-    /* @__PURE__ */ jsx43("span", { style: tipBold, children: metric }),
-    ":",
-    " ",
-    /* @__PURE__ */ jsx43("span", { style: tipValue, children: displayValue }),
-    ". ",
-    description
-  ] });
-}
-function MetricPair({
-  color,
-  label,
-  value,
-  tip
-}) {
-  const content = /* @__PURE__ */ jsxs31(Text32, { muted: true, size: 1, children: [
-    label,
-    ":",
-    " ",
-    /* @__PURE__ */ jsx43(
-      "span",
-      {
-        style: {
-          color: color ?? "var(--card-fg-color)",
-          fontFamily: "var(--font-code-size, monospace)",
-          fontWeight: 500
-        },
-        children: value
-      }
-    )
-  ] });
-  if (tip) {
-    return /* @__PURE__ */ jsx43(HoverTip, { text: tip, children: content });
-  }
-  return content;
-}
-// src/components/report-detail/WeaknessesList.tsx
+import { Box as Box21, Flex as Flex25, Stack as Stack27, Text as Text33 } from "@sanity/ui";
 import { jsx as jsx44, jsxs as jsxs32 } from "react/jsx-runtime";
-function WeaknessesList({ scores, comparison }) {
-  const weakAreas = scores.filter((s) => s.totalScore < SCORE_CAUTION).sort((a, b) => a.totalScore - b.totalScore);
+function WeaknessesList({
+  scores,
+  comparison,
+  perModel
+}) {
+  const {
+    selection,
+    onSelectionChange,
+    resolvedScores,
+    hasModels,
+    expandedPerModel
+  } = useModelSelection({ scores, perModel });
+  const weakFeatures = new Set(
+    scores.filter((s) => s.totalScore < SCORE_CAUTION).map((s) => s.feature)
+  );
+  const weakAreas = resolvedScores.filter((s) => weakFeatures.has(s.feature)).sort((a, b) => a.totalScore - b.totalScore);
   const docsHurt = scores.filter((s) => s.negativeDocLift);
   const retrievalIssues = scores.filter(
     (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency < EFFICIENCY_CAUTION && !s.invertedRetrievalGap
   );
   const dimWeaknesses = scores.map((s) => ({ area: s, dims: getDimensionWeaknesses(s) })).filter(({ dims }) => dims.length > 0);
   const regressed = comparison?.regressed ?? [];
-  const improved = comparison?.improved ?? [];
-  const unchanged = comparison?.unchanged ?? [];
   const perArea = comparison?.deltas?.perArea;
   const efficiencyAnomalies = scores.filter(
     (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency > EFFICIENCY_ANOMALY
   );
   const hasContent = weakAreas.length > 0 || docsHurt.length > 0 || retrievalIssues.length > 0 || dimWeaknesses.length > 0 || regressed.length > 0 || efficiencyAnomalies.length > 0;
   if (!hasContent) return null;
-  const trendFor = (feature) => {
-    if (improved.includes(feature)) return "improved";
-    if (regressed.includes(feature)) return "regressed";
-    if (unchanged.includes(feature)) return "unchanged";
-    return null;
-  };
-  return /* @__PURE__ */ jsxs32(Stack28, { space: 5, children: [
-    weakAreas.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
-      /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
+  return /* @__PURE__ */ jsxs32(Stack27, { space: 5, children: [
+    weakAreas.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
+      /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, wrap: "wrap", children: [
         /* @__PURE__ */ jsx44(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
         /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Weak Areas (<70)" }),
-        /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.weakAreas })
+        /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.weakAreas }),
+        hasModels && /* @__PURE__ */ jsx44(Box21, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx44(
+          ModelSelector,
+          {
+            models: perModel,
+            onChange: onSelectionChange,
+            selection
+          }
+        ) })
       ] }),
-      /* @__PURE__ */ jsx44(Stack28, { space: 3, children: weakAreas.map((area) => /* @__PURE__ */ jsx44(
-        AreaScoreRow,
+      /* @__PURE__ */ jsx44(
+        AreaScoresGrid,
         {
-          area,
-          showTrend: trendFor(area.feature)
-        },
-        area.feature
-      )) })
+          perArea,
+          perModel: expandedPerModel,
+          scores: weakAreas
+        }
+      )
     ] }),
-    docsHurt.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
+    docsHurt.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
       /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
         /* @__PURE__ */ jsx44(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
         /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Docs Hurt Performance (Negative Doc Lift)" }),
         /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.docsHurt })
       ] }),
-      /* @__PURE__ */ jsx44(Box22, { style: sectionStyle("red"), children: docsHurt.map((area, i) => /* @__PURE__ */ jsxs32(
-        Box22,
+      /* @__PURE__ */ jsx44(Box21, { style: sectionStyle("red"), children: docsHurt.map((area, i) => /* @__PURE__ */ jsxs32(
+        Box21,
         {
           padding: 4,
           style: i > 0 ? { borderTop: "1px solid rgba(239,68,68,0.2)" } : void 0,
@@ -8661,7 +8707,7 @@ function WeaknessesList({ scores, comparison }) {
                 }
               )
             ] }),
-            /* @__PURE__ */ jsx44(Box22, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
+            /* @__PURE__ */ jsx44(Box21, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
               area.invertedRetrievalGap && /* @__PURE__ */ jsxs32("span", { style: { color: "#fbbf24" }, children: [
                 "Agent does better by NOT finding these docs.",
                 " "
@@ -8677,14 +8723,14 @@ function WeaknessesList({ scores, comparison }) {
         area.feature
       )) })
     ] }),
-    retrievalIssues.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
+    retrievalIssues.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
       /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
         /* @__PURE__ */ jsx44(SearchIcon7, { style: { color: "#fbbf24" } }),
         /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Retrieval Issues (<70% efficiency)" }),
         /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.retrievalIssues })
       ] }),
-      /* @__PURE__ */ jsx44(Box22, { style: sectionStyle("amber"), children: retrievalIssues.map((area, i) => /* @__PURE__ */ jsxs32(
-        Box22,
+      /* @__PURE__ */ jsx44(Box21, { style: sectionStyle("amber"), children: retrievalIssues.map((area, i) => /* @__PURE__ */ jsxs32(
+        Box21,
         {
           padding: 4,
           style: i > 0 ? { borderTop: "1px solid rgba(245,158,11,0.2)" } : void 0,
@@ -8720,7 +8766,7 @@ function WeaknessesList({ scores, comparison }) {
                 }
               )
             ] }),
-            /* @__PURE__ */ jsx44(Box22, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
+            /* @__PURE__ */ jsx44(Box21, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
               "Actual score (",
               Math.round(area.actualScore ?? 0),
               ") is much lower than ceiling (",
@@ -8735,14 +8781,14 @@ function WeaknessesList({ scores, comparison }) {
         area.feature
       )) })
     ] }),
-    dimWeaknesses.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
+    dimWeaknesses.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
       /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
-        /* @__PURE__ */ jsx44(WarningOutlineIcon4, { style: { color: "#fbbf24" } }),
+        /* @__PURE__ */ jsx44(WarningOutlineIcon3, { style: { color: "#fbbf24" } }),
         /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Dimension Weaknesses (<50)" }),
         /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.dimWeaknesses })
       ] }),
-      /* @__PURE__ */ jsx44(Box22, { style: neutralCardStyle, children: dimWeaknesses.map(({ area, dims }, i) => /* @__PURE__ */ jsxs32(
-        Box22,
+      /* @__PURE__ */ jsx44(Box21, { style: neutralCardStyle, children: dimWeaknesses.map(({ area, dims }, i) => /* @__PURE__ */ jsxs32(
+        Box21,
         {
           padding: 4,
           style: i > 0 ? dividerStyle : void 0,
@@ -8786,9 +8832,9 @@ function WeaknessesList({ scores, comparison }) {
         area.feature
       )) })
     ] }),
-    regressed.length > 0 && /* @__PURE__ */ jsxs32(Box22, { style: neutralCardStyle, children: [
+    regressed.length > 0 && /* @__PURE__ */ jsxs32(Box21, { style: neutralCardStyle, children: [
       /* @__PURE__ */ jsx44(
-        Box22,
+        Box21,
         {
           padding: 4,
           style: { borderBottom: "1px solid var(--card-border-color)" },
@@ -8798,7 +8844,7 @@ function WeaknessesList({ scores, comparison }) {
           ] })
         }
       ),
-      /* @__PURE__ */ jsx44(Stack28, { children: regressed.map((featureName, i) => {
+      /* @__PURE__ */ jsx44(Stack27, { children: regressed.map((featureName, i) => {
         const area = scores.find((s) => s.feature === featureName);
         const areaDelta = perArea?.[featureName];
         return /* @__PURE__ */ jsxs32(
@@ -8834,13 +8880,13 @@ function WeaknessesList({ scores, comparison }) {
         );
       }) })
     ] }),
-    efficiencyAnomalies.length > 0 && /* @__PURE__ */ jsxs32(Box22, { style: neutralCardStyle, children: [
+    efficiencyAnomalies.length > 0 && /* @__PURE__ */ jsxs32(Box21, { style: neutralCardStyle, children: [
       /* @__PURE__ */ jsx44(
-        Box22,
+        Box21,
         {
           padding: 4,
           style: { borderBottom: "1px solid var(--card-border-color)" },
-          children: /* @__PURE__ */ jsxs32(Stack28, { space: 2, children: [
+          children: /* @__PURE__ */ jsxs32(Stack27, { space: 2, children: [
             /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
               /* @__PURE__ */ jsx44(BoltIcon2, { style: { color: "#fbbf24" } }),
               /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Efficiency Anomalies (>100%)" }),
@@ -8850,7 +8896,7 @@ function WeaknessesList({ scores, comparison }) {
           ] })
         }
       ),
-      /* @__PURE__ */ jsx44(Stack28, { children: efficiencyAnomalies.map((area, i) => /* @__PURE__ */ jsxs32(
+      /* @__PURE__ */ jsx44(Stack27, { children: efficiencyAnomalies.map((area, i) => /* @__PURE__ */ jsxs32(
         Flex25,
         {
           align: "center",
@@ -8878,7 +8924,7 @@ function WeaknessesList({ scores, comparison }) {
     ] })
   ] });
 }
-var tipValue2 = {
+var tipValue = {
   color: "#f87171",
   fontFamily: "var(--font-code-size, monospace)",
   fontWeight: 600
@@ -8891,7 +8937,7 @@ function dimTip(area, dim, score, description) {
     /* @__PURE__ */ jsx44("span", { style: tipArea, children: area }),
     " scores",
     " ",
-    /* @__PURE__ */ jsx44("span", { style: tipValue2, children: score }),
+    /* @__PURE__ */ jsx44("span", { style: tipValue, children: score }),
     /* @__PURE__ */ jsx44("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
     " on",
     " ",
@@ -8956,8 +9002,8 @@ function ReportDetail({
   subTab
 }) {
   const client = useClient10({ apiVersion: API_VERSION });
-  const [loading, setLoading] = useState18(true);
-  const [report, setReport] = useState18(null);
+  const [loading, setLoading] = useState19(true);
+  const [report, setReport] = useState19(null);
   useEffect9(() => {
     let cancelled = false;
     setLoading(true);
@@ -8983,22 +9029,22 @@ function ReportDetail({
   const hasAgentActivity = Boolean(
     summary?.agentBehavior && summary.agentBehavior.length > 0
   );
-  const tabs = useMemo7(
+  const tabs = useMemo9(
     () => [OVERVIEW_TAB, DIAGNOSTICS_TAB, ACTIVITY_TAB],
     []
   );
-  const disabledTabs = useMemo7(() => {
+  const disabledTabs = useMemo9(() => {
     const set2 = /* @__PURE__ */ new Set();
     if (!hasDiagnostics) set2.add("diagnostics");
     if (!hasAgentActivity) set2.add("activity");
     return set2;
   }, [hasDiagnostics, hasAgentActivity]);
-  const currentTab = useMemo7(() => {
+  const currentTab = useMemo9(() => {
     const parsed = parseTab(activeTab);
     if (disabledTabs.has(parsed)) return "overview";
     return tabs.some((t) => t.id === parsed) ? parsed : "overview";
   }, [activeTab, disabledTabs, tabs]);
-  const handleTabClick = useCallback23(
+  const handleTabClick = useCallback25(
     (tabId) => {
       onTabChange(tabId === "overview" ? null : tabId, null, null);
     },
@@ -9008,7 +9054,7 @@ function ReportDetail({
     return /* @__PURE__ */ jsx45(LoadingState, { message: "Loading report\u2026" });
   }
   if (!report || !summary) {
-    return /* @__PURE__ */ jsx45(Box23, { padding: 5, children: /* @__PURE__ */ jsxs33(Stack29, { space: 4, children: [
+    return /* @__PURE__ */ jsx45(Box22, { padding: 5, children: /* @__PURE__ */ jsxs33(Stack28, { space: 4, children: [
       /* @__PURE__ */ jsx45(
         Button8,
         {
@@ -9023,7 +9069,7 @@ function ReportDetail({
   }
   const { comparison, provenance } = report;
   const totalTests = summary.scores.reduce((n, s) => n + s.testCount, 0);
-  return /* @__PURE__ */ jsx45(Box23, { padding: 4, children: /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
+  return /* @__PURE__ */ jsx45(Box22, { padding: 4, children: /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
     /* @__PURE__ */ jsx45(
       ReportHeader,
       {
@@ -9051,7 +9097,7 @@ function ReportDetail({
         return isDisabled && tooltip ? /* @__PURE__ */ jsx45(
           Tooltip8,
           {
-            content: /* @__PURE__ */ jsx45(Box23, { padding: 2, style: { maxWidth: 280 }, children: tooltip }),
+            content: /* @__PURE__ */ jsx45(Box22, { padding: 2, style: { maxWidth: 280 }, children: tooltip }),
             placement: "bottom",
             portal: true,
             children: /* @__PURE__ */ jsx45("span", { style: { display: "inline-block" }, children: tabElement })
@@ -9085,7 +9131,7 @@ function ReportDetail({
         "aria-labelledby": "tab-overview",
         hidden: currentTab !== "overview",
         id: "panel-overview",
-        children: /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
+        children: /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
           /* @__PURE__ */ jsx45(
             DiagnosticsOverview,
             {
@@ -9108,6 +9154,7 @@ function ReportDetail({
         focus,
         judgments: summary.lowScoringJudgments,
         onNavigate: (newSubTab, newFocus) => onTabChange("diagnostics", newSubTab, newFocus),
+        perModel: summary.perModel,
         recommendations: summary.recommendations,
         scores: summary.scores,
         subTab
@@ -9143,6 +9190,7 @@ function DiagnosticsPanel({
   focus,
   judgments,
   onNavigate,
+  perModel,
   recommendations,
   scores,
   subTab: subTabParam
@@ -9151,7 +9199,7 @@ function DiagnosticsPanel({
   const issueCount = scores.filter((s) => s.totalScore < SCORE_CAUTION).length + scores.filter((s) => s.negativeDocLift).length + scores.filter(
     (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency < EFFICIENCY_CAUTION && !s.invertedRetrievalGap
   ).length;
-  return /* @__PURE__ */ jsx45(TabPanel, { "aria-labelledby": "tab-diagnostics", id: "panel-diagnostics", children: /* @__PURE__ */ jsxs33(Stack29, { space: 4, children: [
+  return /* @__PURE__ */ jsx45(TabPanel, { "aria-labelledby": "tab-diagnostics", id: "panel-diagnostics", children: /* @__PURE__ */ jsxs33(Stack28, { space: 4, children: [
     /* @__PURE__ */ jsx45(
       Flex26,
       {
@@ -9202,9 +9250,23 @@ function DiagnosticsPanel({
         ))
       }
     ),
-    subTab === "strengths" && /* @__PURE__ */ jsx45(StrengthsList, { comparison, scores }),
-    subTab === "issues" && /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
-      /* @__PURE__ */ jsx45(WeaknessesList, { comparison, scores }),
+    subTab === "strengths" && /* @__PURE__ */ jsx45(
+      StrengthsList,
+      {
+        comparison,
+        perModel,
+        scores
+      }
+    ),
+    subTab === "issues" && /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
+      /* @__PURE__ */ jsx45(
+        WeaknessesList,
+        {
+          comparison,
+          perModel,
+          scores
+        }
+      ),
       recommendations && recommendations.gaps.length > 0 && /* @__PURE__ */ jsx45(RecommendationsSection, { recommendations }),
       judgments && judgments.length > 0 && /* @__PURE__ */ jsx45(
         JudgmentList,
@@ -9256,9 +9318,14 @@ function getDisabledTabTooltip(tabId, summary) {
   }
 }
+// src/components/report-detail/AreaScoreRow.tsx
+import { WarningOutlineIcon as WarningOutlineIcon4 } from "@sanity/icons";
+import { Box as Box23, Flex as Flex27, Stack as Stack29, Text as Text35 } from "@sanity/ui";
+import { jsx as jsx46, jsxs as jsxs34 } from "react/jsx-runtime";
 // src/components/report-detail/AreaScoreTable.tsx
 import React4 from "react";
-import { Card as Card17, Stack as Stack30, Text as Text36 } from "@sanity/ui";
+import { Card as Card17, Stack as Stack30, Text as Text37 } from "@sanity/ui";
 // src/lib/scoring.ts
 var HEX_MAP = {
@@ -9275,30 +9342,30 @@ function scoreHex(score) {
 }
 // src/components/primitives/ScoreCell.tsx
-import { Card as Card16, Text as Text35 } from "@sanity/ui";
-import { jsx as jsx46 } from "react/jsx-runtime";
+import { Card as Card16, Text as Text36 } from "@sanity/ui";
+import { jsx as jsx47 } from "react/jsx-runtime";
 // src/components/report-detail/AreaScoreTable.tsx
-import { jsx as jsx47, jsxs as jsxs34 } from "react/jsx-runtime";
+import { jsx as jsx48, jsxs as jsxs35 } from "react/jsx-runtime";
 // src/components/report-detail/ComparisonSummary.tsx
-import { Badge as Badge8, Box as Box24, Card as Card18, Flex as Flex27, Grid as Grid4, Stack as Stack31, Text as Text37, Tooltip as Tooltip9 } from "@sanity/ui";
-import { jsx as jsx48, jsxs as jsxs35 } from "react/jsx-runtime";
+import { Badge as Badge8, Box as Box24, Card as Card18, Flex as Flex28, Grid as Grid4, Stack as Stack31, Text as Text38, Tooltip as Tooltip9 } from "@sanity/ui";
+import { jsx as jsx49, jsxs as jsxs36 } from "react/jsx-runtime";
 // src/components/report-detail/OverviewStats.tsx
 import { Grid as Grid5 } from "@sanity/ui";
-import { jsx as jsx49, jsxs as jsxs36 } from "react/jsx-runtime";
+import { jsx as jsx50, jsxs as jsxs37 } from "react/jsx-runtime";
 // src/components/report-detail/ThreeLayerTable.tsx
 import React5 from "react";
-import { Badge as Badge9, Card as Card19, Flex as Flex28, Stack as Stack32, Text as Text38 } from "@sanity/ui";
-import { jsx as jsx50, jsxs as jsxs37 } from "react/jsx-runtime";
+import { Badge as Badge9, Card as Card19, Flex as Flex29, Stack as Stack32, Text as Text39 } from "@sanity/ui";
+import { jsx as jsx51, jsxs as jsxs38 } from "react/jsx-runtime";
 // src/components/ScoreTimeline.tsx
-import { Card as Card20, Flex as Flex29, Select as Select2, Stack as Stack33, Text as Text39 } from "@sanity/ui";
-import { useCallback as useCallback24, useEffect as useEffect10, useMemo as useMemo8, useState as useState19 } from "react";
+import { Card as Card20, Flex as Flex30, Select as Select2, Stack as Stack33, Text as Text40 } from "@sanity/ui";
+import { useCallback as useCallback26, useEffect as useEffect10, useMemo as useMemo10, useState as useState20 } from "react";
 import { useClient as useClient11 } from "sanity";
-import { jsx as jsx51, jsxs as jsxs38 } from "react/jsx-runtime";
+import { jsx as jsx52, jsxs as jsxs39 } from "react/jsx-runtime";
 var CHART_HEIGHT = 220;
 var CHART_WIDTH = 800;
 var PAD_BOTTOM = 30;
@@ -9333,11 +9400,11 @@ function scoreForPoint(point, area) {
 }
 function ScoreTimeline({ mode = null, source = null }) {
   const client = useClient11({ apiVersion: API_VERSION });
-  const [dataPoints, setDataPoints] = useState19([]);
-  const [loading, setLoading] = useState19(true);
-  const [rangeDays, setRangeDays] = useState19(30);
-  const [selectedArea, setSelectedArea] = useState19(null);
-  const areaNames = useMemo8(() => {
+  const [dataPoints, setDataPoints] = useState20([]);
+  const [loading, setLoading] = useState20(true);
+  const [rangeDays, setRangeDays] = useState20(30);
+  const [selectedArea, setSelectedArea] = useState20(null);
+  const areaNames = useMemo10(() => {
     const names = /* @__PURE__ */ new Set();
     for (const dp of dataPoints) {
       for (const s of dp.scores) {
@@ -9346,7 +9413,7 @@ function ScoreTimeline({ mode = null, source = null }) {
     }
     return Array.from(names).sort();
   }, [dataPoints]);
-  const fetchData = useCallback24(async () => {
+  const fetchData = useCallback26(async () => {
     setLoading(true);
     try {
       const startDate = rangeDays ? daysAgo(rangeDays) : "1970-01-01T00:00:00Z";
@@ -9364,7 +9431,7 @@ function ScoreTimeline({ mode = null, source = null }) {
   useEffect10(() => {
     void fetchData();
   }, [fetchData]);
-  const chartPoints = useMemo8(() => {
+  const chartPoints = useMemo10(() => {
     const pts = [];
     const scored = dataPoints.map((dp) => ({
       date: dp.completedAt,
@@ -9378,18 +9445,18 @@ function ScoreTimeline({ mode = null, source = null }) {
     });
     return pts;
   }, [dataPoints, selectedArea]);
-  const avgScore = useMemo8(() => {
+  const avgScore = useMemo10(() => {
     if (chartPoints.length === 0) return 0;
     return chartPoints.reduce((sum, p) => sum + p.score, 0) / chartPoints.length;
   }, [chartPoints]);
-  const handleRangeChange = useCallback24(
+  const handleRangeChange = useCallback26(
     (e) => {
       const val = e.currentTarget.value;
       setRangeDays(val === "all" ? null : Number(val));
     },
     []
   );
-  const handleAreaChange = useCallback24(
+  const handleAreaChange = useCallback26(
     (e) => {
       const val = e.currentTarget.value;
       setSelectedArea(val || null);
@@ -9397,22 +9464,22 @@ function ScoreTimeline({ mode = null, source = null }) {
     []
   );
   const polylinePoints = chartPoints.map((p) => `${p.x},${p.y}`).join(" ");
-  return /* @__PURE__ */ jsxs38(Stack33, { space: 4, children: [
-    /* @__PURE__ */ jsxs38(Flex29, { gap: 3, children: [
-      /* @__PURE__ */ jsx51(
+  return /* @__PURE__ */ jsxs39(Stack33, { space: 4, children: [
+    /* @__PURE__ */ jsxs39(Flex30, { gap: 3, children: [
+      /* @__PURE__ */ jsx52(
         Select2,
         {
           onChange: handleRangeChange,
           value: rangeDays?.toString() ?? "all",
-          children: TIME_RANGES.map((r) => /* @__PURE__ */ jsx51("option", { value: r.days?.toString() ?? "all", children: r.label }, r.label))
+          children: TIME_RANGES.map((r) => /* @__PURE__ */ jsx52("option", { value: r.days?.toString() ?? "all", children: r.label }, r.label))
         }
       ),
-      /* @__PURE__ */ jsxs38(Select2, { onChange: handleAreaChange, value: selectedArea ?? "", children: [
-        /* @__PURE__ */ jsx51("option", { value: "", children: "Overall" }),
-        areaNames.map((name) => /* @__PURE__ */ jsx51("option", { value: name, children: name }, name))
+      /* @__PURE__ */ jsxs39(Select2, { onChange: handleAreaChange, value: selectedArea ?? "", children: [
+        /* @__PURE__ */ jsx52("option", { value: "", children: "Overall" }),
+        areaNames.map((name) => /* @__PURE__ */ jsx52("option", { value: name, children: name }, name))
       ] })
     ] }),
-    /* @__PURE__ */ jsx51(Card20, { padding: 3, radius: 2, shadow: 1, children: loading ? /* @__PURE__ */ jsx51(Flex29, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx51(Text39, { muted: true, size: 2, children: "Loading\u2026" }) }) : chartPoints.length === 0 ? /* @__PURE__ */ jsx51(Flex29, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx51(Text39, { muted: true, size: 2, children: "No reports found for this time range" }) }) : /* @__PURE__ */ jsxs38(
+    /* @__PURE__ */ jsx52(Card20, { padding: 3, radius: 2, shadow: 1, children: loading ? /* @__PURE__ */ jsx52(Flex30, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "Loading\u2026" }) }) : chartPoints.length === 0 ? /* @__PURE__ */ jsx52(Flex30, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "No reports found for this time range" }) }) : /* @__PURE__ */ jsxs39(
       "svg",
       {
         style: { display: "block", width: "100%" },
@@ -9420,8 +9487,8 @@ function ScoreTimeline({ mode = null, source = null }) {
         children: [
           Y_TICKS.map((tick) => {
             const y = PAD_TOP + PLOT_HEIGHT - tick / Y_MAX * PLOT_HEIGHT;
-            return /* @__PURE__ */ jsxs38("g", { children: [
-              /* @__PURE__ */ jsx51(
+            return /* @__PURE__ */ jsxs39("g", { children: [
+              /* @__PURE__ */ jsx52(
                 "line",
                 {
                   stroke: "#ccc",
@@ -9432,7 +9499,7 @@ function ScoreTimeline({ mode = null, source = null }) {
                   y2: y
                 }
               ),
-              /* @__PURE__ */ jsx51(
+              /* @__PURE__ */ jsx52(
                 "text",
                 {
                   dominantBaseline: "middle",
@@ -9452,7 +9519,7 @@ function ScoreTimeline({ mode = null, source = null }) {
             chartPoints.length - 1
           ].map((idx) => {
             const p = chartPoints[idx];
-            return /* @__PURE__ */ jsx51(
+            return /* @__PURE__ */ jsx52(
               "text",
               {
                 fill: "#999",
@@ -9464,7 +9531,7 @@ function ScoreTimeline({ mode = null, source = null }) {
               },
               idx
             );
-          }) : chartPoints.map((p, idx) => /* @__PURE__ */ jsx51(
+          }) : chartPoints.map((p, idx) => /* @__PURE__ */ jsx52(
             "text",
             {
               fill: "#999",
@@ -9476,7 +9543,7 @@ function ScoreTimeline({ mode = null, source = null }) {
             },
             idx
           )),
-          /* @__PURE__ */ jsx51(
+          /* @__PURE__ */ jsx52(
             "polyline",
             {
               fill: "none",
@@ -9486,7 +9553,7 @@ function ScoreTimeline({ mode = null, source = null }) {
               strokeWidth: 2.5
             }
           ),
-          chartPoints.map((p, idx) => /* @__PURE__ */ jsx51(
+          chartPoints.map((p, idx) => /* @__PURE__ */ jsx52(
             "circle",
             {
               cx: p.x,
@@ -9495,7 +9562,7 @@ function ScoreTimeline({ mode = null, source = null }) {
               r: 4,
               stroke: "#fff",
               strokeWidth: 1.5,
-              children: /* @__PURE__ */ jsxs38("title", { children: [
+              children: /* @__PURE__ */ jsxs39("title", { children: [
                 formatDate(p.date),
                 ": ",
                 Math.round(p.score)
@@ -9506,7 +9573,7 @@ function ScoreTimeline({ mode = null, source = null }) {
         ]
       }
     ) }),
-    /* @__PURE__ */ jsxs38(Text39, { muted: true, size: 2, children: [
+    /* @__PURE__ */ jsxs39(Text40, { muted: true, size: 2, children: [
       chartPoints.length,
       " data point",
       chartPoints.length !== 1 ? "s" : ""
@@ -9516,15 +9583,15 @@ function ScoreTimeline({ mode = null, source = null }) {
 var ScoreTimeline_default = ScoreTimeline;
 // src/components/Dashboard.tsx
-import { jsx as jsx52, jsxs as jsxs39 } from "react/jsx-runtime";
+import { jsx as jsx53, jsxs as jsxs40 } from "react/jsx-runtime";
 var VIEW_PARAM_MAP = {
   compare: "compare",
   timeline: "timeline"
 };
 function Dashboard() {
-  return /* @__PURE__ */ jsx52(HelpProvider, { children: /* @__PURE__ */ jsxs39(Flex30, { style: { height: "100%" }, children: [
-    /* @__PURE__ */ jsx52(Box25, { flex: 1, overflow: "auto", children: /* @__PURE__ */ jsx52(DashboardContent, {}) }),
-    /* @__PURE__ */ jsx52(HelpDrawer, {})
+  return /* @__PURE__ */ jsx53(HelpProvider, { children: /* @__PURE__ */ jsxs40(Flex31, { style: { height: "100%" }, children: [
+    /* @__PURE__ */ jsx53(Box25, { flex: 1, overflow: "auto", children: /* @__PURE__ */ jsx53(DashboardContent, {}) }),
+    /* @__PURE__ */ jsx53(HelpDrawer, {})
   ] }) });
 }
 function DashboardContent() {
@@ -9535,7 +9602,7 @@ function DashboardContent() {
   const isDetail = reportId !== null;
   const activeTab = isDetail ? "latest" : VIEW_PARAM_MAP[routerState.view ?? ""] ?? "latest";
   const defaultTopic = deriveHelpTopic(routerState);
-  const navigateToTab = useCallback25(
+  const navigateToTab = useCallback27(
     (tab) => {
       if (tab === "latest") {
         router.navigate({});
@@ -9545,13 +9612,13 @@ function DashboardContent() {
     },
     [router]
   );
-  const handleSelectReport = useCallback25(
+  const handleSelectReport = useCallback27(
     (id) => {
       router.navigate({ reportId: id });
     },
     [router]
   );
-  const handleTabChange = useCallback25(
+  const handleTabChange = useCallback27(
     (tab, subTab, focus) => {
       if (!routerState.reportId) return;
       const state = {
@@ -9564,19 +9631,19 @@ function DashboardContent() {
     },
     [router, routerState.reportId]
   );
-  const handleBack = useCallback25(() => {
+  const handleBack = useCallback27(() => {
     router.navigate({});
   }, [router]);
-  const handleOpenHelp = useCallback25(() => {
+  const handleOpenHelp = useCallback27(() => {
     openHelp(defaultTopic);
   }, [openHelp, defaultTopic]);
-  return /* @__PURE__ */ jsx52(Container, { width: 4, children: /* @__PURE__ */ jsxs39(Stack34, { padding: 4, space: 4, children: [
-    /* @__PURE__ */ jsxs39(Flex30, { align: "center", gap: 3, children: [
-      /* @__PURE__ */ jsxs39(Stack34, { flex: 1, space: 1, children: [
-        /* @__PURE__ */ jsx52(Text40, { size: 4, weight: "bold", children: "AI Literacy Framework" }),
-        /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "Evaluation reports and score trends" })
+  return /* @__PURE__ */ jsx53(Container, { width: 4, children: /* @__PURE__ */ jsxs40(Stack34, { padding: 4, space: 4, children: [
+    /* @__PURE__ */ jsxs40(Flex31, { align: "center", gap: 3, children: [
+      /* @__PURE__ */ jsxs40(Stack34, { flex: 1, space: 1, children: [
+        /* @__PURE__ */ jsx53(Text41, { size: 4, weight: "bold", children: "AI Literacy Framework" }),
+        /* @__PURE__ */ jsx53(Text41, { muted: true, size: 2, children: "Evaluation reports and score trends" })
       ] }),
-      /* @__PURE__ */ jsx52(
+      /* @__PURE__ */ jsx53(
         Button9,
         {
           icon: HelpCircleIcon8,
@@ -9587,8 +9654,8 @@ function DashboardContent() {
         }
       )
     ] }),
-    !isDetail && /* @__PURE__ */ jsxs39(TabList2, { space: 1, children: [
-      /* @__PURE__ */ jsx52(
+    !isDetail && /* @__PURE__ */ jsxs40(TabList2, { space: 1, children: [
+      /* @__PURE__ */ jsx53(
         Tab2,
         {
           "aria-controls": "latest-panel",
@@ -9598,7 +9665,7 @@ function DashboardContent() {
           selected: activeTab === "latest"
         }
       ),
-      /* @__PURE__ */ jsx52(
+      /* @__PURE__ */ jsx53(
         Tab2,
         {
           "aria-controls": "timeline-panel",
@@ -9608,7 +9675,7 @@ function DashboardContent() {
           selected: activeTab === "timeline"
         }
       ),
-      /* @__PURE__ */ jsx52(
+      /* @__PURE__ */ jsx53(
         Tab2,
         {
           "aria-controls": "compare-panel",
@@ -9619,10 +9686,10 @@ function DashboardContent() {
         }
       )
     ] }),
-    !isDetail && activeTab === "latest" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "latest-tab", id: "latest-panel", children: /* @__PURE__ */ jsx52(LatestReports, { onSelectReport: handleSelectReport }) }),
-    !isDetail && activeTab === "timeline" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "timeline-tab", id: "timeline-panel", children: /* @__PURE__ */ jsx52(ScoreTimeline_default, {}) }),
-    !isDetail && activeTab === "compare" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "compare-tab", id: "compare-panel", children: /* @__PURE__ */ jsx52(ComparisonView, {}) }),
-    isDetail && reportId && /* @__PURE__ */ jsx52(
+    !isDetail && activeTab === "latest" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "latest-tab", id: "latest-panel", children: /* @__PURE__ */ jsx53(LatestReports, { onSelectReport: handleSelectReport }) }),
+    !isDetail && activeTab === "timeline" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "timeline-tab", id: "timeline-panel", children: /* @__PURE__ */ jsx53(ScoreTimeline_default, {}) }),
+    !isDetail && activeTab === "compare" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "compare-tab", id: "compare-panel", children: /* @__PURE__ */ jsx53(ComparisonView, {}) }),
+    isDetail && reportId && /* @__PURE__ */ jsx53(
       ReportDetail,
       {
         activeTab: routerState.tab ?? null,
@@ -9658,7 +9725,7 @@ function ailfTool(options = {}) {
 // src/actions/RunEvaluationAction.tsx
 import { BarChartIcon as BarChartIcon2 } from "@sanity/icons";
 import { useToast as useToast10 } from "@sanity/ui";
-import { useCallback as useCallback26, useEffect as useEffect11, useRef as useRef6, useState as useState20 } from "react";
+import { useCallback as useCallback28, useEffect as useEffect11, useRef as useRef6, useState as useState21 } from "react";
 import {
   getReleaseIdFromReleaseDocumentId as getReleaseIdFromReleaseDocumentId3,
   useClient as useClient12,
@@ -9689,7 +9756,7 @@ function createRunEvaluationAction(options = {}) {
     const projectId = useProjectId2();
     const currentUser = useCurrentUser4();
     const toast = useToast10();
-    const [state, setState] = useState20({ status: "loading" });
+    const [state, setState] = useState21({ status: "loading" });
     const requestedAtRef = useRef6(null);
     const perspectiveId = getReleaseIdFromReleaseDocumentId3(release._id);
     useEffect11(() => {
@@ -9785,7 +9852,7 @@ function createRunEvaluationAction(options = {}) {
       }, 15e3);
       return () => clearTimeout(timer);
     }, [client, perspectiveId, state]);
-    const handleRequest = useCallback26(async () => {
+    const handleRequest = useCallback28(async () => {
       const releaseTitle = release.metadata?.title ?? perspectiveId ?? "release";
       const tag = `release-${slugify3(releaseTitle)}-${dateStamp3()}`;
       const now = Date.now();