@sanity/ailf-studio 1.0.0 → 1.1.1

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2585,8 +2585,8 @@ var taskSchema = defineType5({
2585
2585
  type: "boolean"
2586
2586
  }),
2587
2587
  defineField5({
2588
- description: 'Rubric mode for baseline. "abbreviated" uses a shorter rubric, "full" uses the same rubric as gold, "none" skips rubric grading.',
2589
- initialValue: "abbreviated",
2588
+ description: 'Rubric mode for baseline. "full" uses the same rubric as gold, "abbreviated" uses a shorter rubric, "none" skips rubric grading.',
2589
+ initialValue: "full",
2590
2590
  name: "rubric",
2591
2591
  options: {
2592
2592
  list: [
@@ -3064,14 +3064,14 @@ import {
3064
3064
  Box as Box25,
3065
3065
  Button as Button9,
3066
3066
  Container,
3067
- Flex as Flex30,
3067
+ Flex as Flex31,
3068
3068
  Stack as Stack34,
3069
3069
  Tab as Tab2,
3070
3070
  TabList as TabList2,
3071
3071
  TabPanel as TabPanel2,
3072
- Text as Text40
3072
+ Text as Text41
3073
3073
  } from "@sanity/ui";
3074
- import { useCallback as useCallback25 } from "react";
3074
+ import { useCallback as useCallback27 } from "react";
3075
3075
  import { useRouter as useRouter3 } from "sanity/router";
3076
3076
 
3077
3077
  // src/lib/help-context.ts
@@ -3305,7 +3305,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
3305
3305
  {
3306
3306
  "id": "scoring-model",
3307
3307
  "title": "Understanding Scores",
3308
- "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task:\n\n```\nTotal = Task Completion \xD7 0.50 + Code Correctness \xD7 0.25 + Doc Coverage \xD7 0.25\n```\n\nThis weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100** | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379** | Needs attention \u2014 there may be gaps in specific dimensions |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. 
Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
3308
+ "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs): Total = Task \xD7 0.50 + Code \xD7 0.25 + Docs \xD7 0.25\nBaseline (no docs): Total = Task \xD7 0.60 + Code \xD7 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling \u2212 floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100** | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379** | Needs attention \u2014 there may be gaps in specific dimensions |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. 
This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
3309
3309
  "source": "docs/help/scoring-model.md",
3310
3310
  "related": [
3311
3311
  "three-layer",
@@ -3337,7 +3337,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
3337
3337
  {
3338
3338
  "id": "glossary",
3339
3339
  "title": "Glossary",
3340
- "body": "**Overall Score**\n: A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Score without any documentation. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.\n\n**Score**\n: Weighted score for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? 
Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. 
The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. 
Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. 
High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. 
Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. 
Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
3340
+ "body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%. The floor uses a different profile (Task \xD7 60% + Code \xD7 40%, no Doc Coverage).\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. 
Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
3341
3341
  "source": "packages/studio/src/glossary.ts",
3342
3342
  "tags": [
3343
3343
  "reference",
@@ -3386,23 +3386,23 @@ import { useClient as useClient3 } from "sanity";
3386
3386
  // src/glossary.ts
3387
3387
  var GLOSSARY = {
3388
3388
  // -- Overview stats -------------------------------------------------------
3389
- overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
3390
- docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.",
3389
+ overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
3390
+ docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.",
3391
3391
  actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.",
3392
3392
  retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.",
3393
3393
  infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.",
3394
3394
  // -- Three-layer decomposition columns ------------------------------------
3395
- floor: "Score without any documentation. This tells you what the model already knows from its training data.",
3395
+ floor: "Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.",
3396
3396
  ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.",
3397
3397
  actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.",
3398
3398
  retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.",
3399
3399
  efficiency: "What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).",
3400
3400
  invertedRetGap: "\u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.",
3401
3401
  // -- Per-area score columns -----------------------------------------------
3402
- score: "Weighted score for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.",
3402
+ score: "Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%. The floor uses a different profile (Task \xD7 60% + Code \xD7 40%, no Doc Coverage).",
3403
3403
  taskCompletion: "Can the LLM implement the requested feature? Graded 0\u2013100.",
3404
3404
  codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.",
3405
- docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100.",
3405
+ docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.",
3406
3406
  tests: "Number of test cases in this feature area.",
3407
3407
  // -- Comparison deltas ----------------------------------------------------
3408
3408
  overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.",
@@ -3429,6 +3429,8 @@ var GLOSSARY = {
3429
3429
  efficiencyAnomalies: "Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.",
3430
3430
  docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.",
3431
3431
  retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.",
3432
+ // -- Model breakdown --------------------------------------------------------
3433
+ modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.",
3432
3434
  // -- Strengths (positive diagnostics) ---------------------------------------
3433
3435
  strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.",
3434
3436
  // -- Recommendations / gap analysis ----------------------------------------
@@ -3458,7 +3460,7 @@ var GLOSSARY = {
3458
3460
  sourceBranch: "Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.",
3459
3461
  sourceLocal: "Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.",
3460
3462
  // -- Report list columns ----------------------------------------------------
3461
- reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
3463
+ reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
3462
3464
  reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.",
3463
3465
  reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.",
3464
3466
  // -- Mode values -----------------------------------------------------------
@@ -5591,10 +5593,10 @@ function LatestReports({
5591
5593
  import { ArrowLeftIcon as ArrowLeftIcon3 } from "@sanity/icons";
5592
5594
  import {
5593
5595
  Badge as Badge7,
5594
- Box as Box23,
5596
+ Box as Box22,
5595
5597
  Button as Button8,
5596
5598
  Flex as Flex26,
5597
- Stack as Stack29,
5599
+ Stack as Stack28,
5598
5600
  Tab,
5599
5601
  TabList,
5600
5602
  TabPanel,
@@ -5602,10 +5604,10 @@ import {
5602
5604
  Tooltip as Tooltip8
5603
5605
  } from "@sanity/ui";
5604
5606
  import {
5605
- useCallback as useCallback23,
5607
+ useCallback as useCallback25,
5606
5608
  useEffect as useEffect9,
5607
- useMemo as useMemo7,
5608
- useState as useState18
5609
+ useMemo as useMemo9,
5610
+ useState as useState19
5609
5611
  } from "react";
5610
5612
  import { useClient as useClient10 } from "sanity";
5611
5613
 
@@ -5971,21 +5973,6 @@ function scoreBg(score) {
5971
5973
  function scoreBorder(score) {
5972
5974
  return COLORS[colorForScore(score)].border;
5973
5975
  }
5974
- function scoreBoxStyle(score) {
5975
- const key = colorForScore(score);
5976
- return {
5977
- alignItems: "center",
5978
- backgroundColor: COLORS[key].bg,
5979
- borderRadius: 6,
5980
- color: COLORS[key].text,
5981
- display: "flex",
5982
- fontFamily: "var(--font-code-size)",
5983
- fontWeight: 700,
5984
- height: 48,
5985
- justifyContent: "center",
5986
- width: 48
5987
- };
5988
- }
5989
5976
  function barFillColor(score) {
5990
5977
  const key = colorForScore(score);
5991
5978
  switch (key) {
@@ -6055,160 +6042,180 @@ function DiagnosticsOverview({
6055
6042
  );
6056
6043
  const weak = scores.filter((s) => s.totalScore < SCORE_CAUTION);
6057
6044
  const negativeDocLiftCount = scores.filter((s) => s.docLift < 0).length;
6045
+ const hasAgenticData = overall.avgActualScore != null;
6058
6046
  const improved = comparison?.improved ?? [];
6059
6047
  const regressed = comparison?.regressed ?? [];
6060
6048
  const unchanged = comparison?.unchanged ?? [];
6061
6049
  const hasComparison = improved.length > 0 || regressed.length > 0 || unchanged.length > 0;
6062
6050
  return /* @__PURE__ */ jsxs21(Stack18, { space: 4, children: [
6063
- /* @__PURE__ */ jsxs21(
6064
- "div",
6065
- {
6066
- style: {
6067
- display: "grid",
6068
- gap: 12,
6069
- gridTemplateColumns: "repeat(4, 1fr)"
6070
- },
6071
- children: [
6072
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx25(
6073
- ScoreCard,
6074
- {
6075
- delta: comparison?.deltas.overall,
6076
- label: "AVG SCORE",
6077
- sentiment: scoreSentiment(overall.avgScore),
6078
- subtitle: "Overall quality score",
6079
- value: Math.round(overall.avgScore)
6080
- }
6081
- ) }),
6082
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
6083
- ScoreCard,
6084
- {
6085
- delta: comparison?.deltas.docLift,
6086
- label: "AVG DOC LIFT",
6087
- sentiment: docLiftSentiment(overall.avgDocLift),
6088
- subtitle: "Improvement with docs",
6089
- value: Math.round(overall.avgDocLift)
6090
- }
6091
- ) }),
6092
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.ceiling, children: /* @__PURE__ */ jsx25(
6093
- ScoreCard,
6094
- {
6095
- label: "AVG CEILING",
6096
- sentiment: scoreSentiment(overall.avgCeilingScore ?? 0),
6097
- subtitle: "Best case performance",
6098
- value: Math.round(overall.avgCeilingScore ?? 0)
6099
- }
6100
- ) }),
6101
- overall.avgInfrastructureEfficiency != null ? /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency, children: /* @__PURE__ */ jsx25(
6102
- ScoreCard,
6103
- {
6104
- label: "EFFICIENCY",
6105
- sentiment: efficiencySentiment(
6106
- overall.avgInfrastructureEfficiency
6107
- ),
6108
- subtitle: "Infra utilization",
6109
- suffix: "%",
6110
- value: Math.round(overall.avgInfrastructureEfficiency * 100)
6111
- }
6112
- ) }) : /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
6113
- ScoreCard,
6114
- {
6115
- label: "AVG FLOOR",
6116
- sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
6117
- subtitle: "Model-only baseline",
6118
- value: Math.round(overall.avgFloorScore ?? 0)
6119
- }
6120
- ) })
6121
- ]
6122
- }
6123
- ),
6124
- /* @__PURE__ */ jsxs21(
6125
- "div",
6126
- {
6127
- style: { display: "grid", gap: 12, gridTemplateColumns: "1fr 1fr 1fr" },
6128
- children: [
6129
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthStrong, children: /* @__PURE__ */ jsx25(
6130
- HealthCard,
6131
- {
6132
- color: strong.length > 0 ? "emerald" : "muted",
6133
- count: strong.length,
6134
- icon: /* @__PURE__ */ jsx25(CheckmarkCircleIcon, {}),
6135
- label: "Strong (80+)"
6136
- }
6137
- ) }),
6138
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthAttention, children: /* @__PURE__ */ jsx25(
6139
- HealthCard,
6140
- {
6141
- color: attention.length === 0 ? "muted" : "amber",
6142
- count: attention.length,
6143
- icon: /* @__PURE__ */ jsx25(WarningOutlineIcon, {}),
6144
- label: "Attention (70-79)"
6145
- }
6146
- ) }),
6147
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthWeak, children: /* @__PURE__ */ jsx25(
6148
- HealthCard,
6149
- {
6150
- color: weak.length === 0 ? "muted" : "red",
6151
- count: weak.length,
6152
- icon: /* @__PURE__ */ jsx25(ErrorOutlineIcon, {}),
6153
- label: "Weak (<70)"
6154
- }
6155
- ) })
6156
- ]
6157
- }
6158
- ),
6159
- /* @__PURE__ */ jsxs21(
6160
- "div",
6161
- {
6162
- style: {
6163
- display: "grid",
6164
- gap: 12,
6165
- gridTemplateColumns: "repeat(auto-fit, minmax(140px, 1fr))"
6166
- },
6167
- children: [
6168
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
6169
- MetricCard,
6170
- {
6171
- label: "Avg Floor",
6172
- sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
6173
- value: String(Math.round(overall.avgFloorScore ?? 0))
6174
- }
6175
- ) }),
6176
- overall.avgActualScore != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.actualScore, children: /* @__PURE__ */ jsx25(
6177
- MetricCard,
6178
- {
6179
- label: "Avg Actual",
6180
- sentiment: scoreSentiment(overall.avgActualScore),
6181
- value: String(Math.round(overall.avgActualScore))
6182
- }
6183
- ) }),
6184
- overall.avgRetrievalGap != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.retrievalGap, children: /* @__PURE__ */ jsx25(
6185
- MetricCard,
6186
- {
6187
- label: "Avg Retrieval Gap",
6188
- sentiment: retrievalGapSentiment(overall.avgRetrievalGap),
6189
- value: overall.avgRetrievalGap.toFixed(1)
6190
- }
6191
- ) }),
6192
- /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
6193
- MetricCard,
6194
- {
6195
- label: "Negative Doc Lift",
6196
- sentiment: negativeDocLiftSentiment(negativeDocLiftCount),
6197
- value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
6198
- }
6199
- ) }),
6200
- totalTests != null && /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests) }) }),
6201
- durationMs != null && durationMs > 0 && /* @__PURE__ */ jsx25(
6202
- HoverTip,
6203
- {
6204
- display: "block",
6205
- text: "Total wall-clock time for the evaluation pipeline run.",
6206
- children: /* @__PURE__ */ jsx25(MetricCard, { label: "Duration", value: formatDuration(durationMs) })
6207
- }
6208
- )
6209
- ]
6210
- }
6211
- ),
6051
+ /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
6052
+ /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Baseline" }) }),
6053
+ /* @__PURE__ */ jsxs21(Stack18, { space: 3, padding: 3, children: [
6054
+ /* @__PURE__ */ jsxs21(
6055
+ "div",
6056
+ {
6057
+ style: {
6058
+ display: "grid",
6059
+ gap: 12,
6060
+ gridTemplateColumns: "repeat(3, 1fr)"
6061
+ },
6062
+ children: [
6063
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx25(
6064
+ ScoreCard,
6065
+ {
6066
+ delta: comparison?.deltas.overall,
6067
+ label: "AVG SCORE",
6068
+ sentiment: scoreSentiment(overall.avgScore),
6069
+ subtitle: "With-docs ceiling",
6070
+ value: Math.round(overall.avgScore)
6071
+ }
6072
+ ) }),
6073
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx25(
6074
+ ScoreCard,
6075
+ {
6076
+ delta: comparison?.deltas.docLift,
6077
+ label: "DOC LIFT",
6078
+ sentiment: docLiftSentiment(overall.avgDocLift),
6079
+ subtitle: "Improvement from docs",
6080
+ value: Math.round(overall.avgDocLift)
6081
+ }
6082
+ ) }),
6083
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.floor, children: /* @__PURE__ */ jsx25(
6084
+ ScoreCard,
6085
+ {
6086
+ label: "FLOOR",
6087
+ sentiment: scoreSentiment(overall.avgFloorScore ?? 0),
6088
+ subtitle: "Without docs baseline",
6089
+ value: Math.round(overall.avgFloorScore ?? 0)
6090
+ }
6091
+ ) })
6092
+ ]
6093
+ }
6094
+ ),
6095
+ /* @__PURE__ */ jsxs21(
6096
+ "div",
6097
+ {
6098
+ style: {
6099
+ display: "grid",
6100
+ gap: 12,
6101
+ gridTemplateColumns: "repeat(3, 1fr)"
6102
+ },
6103
+ children: [
6104
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.negativeDocLiftMetric, children: /* @__PURE__ */ jsx25(
6105
+ MetricCard,
6106
+ {
6107
+ label: "Negative Doc Lift",
6108
+ sentiment: negativeDocLiftSentiment(negativeDocLiftCount),
6109
+ value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
6110
+ }
6111
+ ) }),
6112
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx25(MetricCard, { label: "Tests", value: String(totalTests ?? 0) }) }),
6113
+ durationMs != null && durationMs > 0 ? /* @__PURE__ */ jsx25(
6114
+ HoverTip,
6115
+ {
6116
+ display: "block",
6117
+ text: "Total wall-clock time for the evaluation pipeline run.",
6118
+ children: /* @__PURE__ */ jsx25(
6119
+ MetricCard,
6120
+ {
6121
+ label: "Duration",
6122
+ value: formatDuration(durationMs)
6123
+ }
6124
+ )
6125
+ }
6126
+ ) : /* @__PURE__ */ jsx25("div", {})
6127
+ ]
6128
+ }
6129
+ )
6130
+ ] })
6131
+ ] }),
6132
+ hasAgenticData && /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
6133
+ /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Agent Performance" }) }),
6134
+ /* @__PURE__ */ jsx25(Stack18, { space: 3, padding: 3, children: /* @__PURE__ */ jsxs21(
6135
+ "div",
6136
+ {
6137
+ style: {
6138
+ display: "grid",
6139
+ gap: 12,
6140
+ gridTemplateColumns: "repeat(3, 1fr)"
6141
+ },
6142
+ children: [
6143
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.actualScore, children: /* @__PURE__ */ jsx25(
6144
+ ScoreCard,
6145
+ {
6146
+ delta: comparison?.deltas.actualDelta,
6147
+ label: "ACTUAL SCORE",
6148
+ sentiment: scoreSentiment(overall.avgActualScore),
6149
+ subtitle: "Agent-retrieved docs",
6150
+ value: Math.round(overall.avgActualScore)
6151
+ }
6152
+ ) }),
6153
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.retrievalGap, children: /* @__PURE__ */ jsx25(
6154
+ ScoreCard,
6155
+ {
6156
+ label: "RETRIEVAL GAP",
6157
+ sentiment: overall.avgRetrievalGap != null ? retrievalGapSentiment(overall.avgRetrievalGap) : void 0,
6158
+ subtitle: "Lost to findability",
6159
+ suffix: "pts",
6160
+ value: overall.avgRetrievalGap != null ? Math.round(overall.avgRetrievalGap) : 0
6161
+ }
6162
+ ) }),
6163
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency, children: /* @__PURE__ */ jsx25(
6164
+ ScoreCard,
6165
+ {
6166
+ label: "EFFICIENCY",
6167
+ sentiment: overall.avgInfrastructureEfficiency != null ? efficiencySentiment(overall.avgInfrastructureEfficiency) : void 0,
6168
+ subtitle: "Doc quality reaching agents",
6169
+ suffix: "%",
6170
+ value: overall.avgInfrastructureEfficiency != null ? Math.round(overall.avgInfrastructureEfficiency * 100) : 0
6171
+ }
6172
+ ) })
6173
+ ]
6174
+ }
6175
+ ) })
6176
+ ] }),
6177
+ /* @__PURE__ */ jsxs21(Box15, { style: sectionWrapperStyle, children: [
6178
+ /* @__PURE__ */ jsx25(Box15, { padding: 3, style: sectionHeaderStyle, children: /* @__PURE__ */ jsx25(SectionLabel, { label: "Area Health" }) }),
6179
+ /* @__PURE__ */ jsx25(Box15, { padding: 3, children: /* @__PURE__ */ jsxs21(
6180
+ "div",
6181
+ {
6182
+ style: {
6183
+ display: "grid",
6184
+ gap: 12,
6185
+ gridTemplateColumns: "1fr 1fr 1fr"
6186
+ },
6187
+ children: [
6188
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthStrong, children: /* @__PURE__ */ jsx25(
6189
+ HealthCard,
6190
+ {
6191
+ color: strong.length > 0 ? "emerald" : "muted",
6192
+ count: strong.length,
6193
+ icon: /* @__PURE__ */ jsx25(CheckmarkCircleIcon, {}),
6194
+ label: "Strong (80+)"
6195
+ }
6196
+ ) }),
6197
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthAttention, children: /* @__PURE__ */ jsx25(
6198
+ HealthCard,
6199
+ {
6200
+ color: attention.length === 0 ? "muted" : "amber",
6201
+ count: attention.length,
6202
+ icon: /* @__PURE__ */ jsx25(WarningOutlineIcon, {}),
6203
+ label: "Attention (70-79)"
6204
+ }
6205
+ ) }),
6206
+ /* @__PURE__ */ jsx25(HoverTip, { display: "block", text: GLOSSARY.healthWeak, children: /* @__PURE__ */ jsx25(
6207
+ HealthCard,
6208
+ {
6209
+ color: weak.length === 0 ? "muted" : "red",
6210
+ count: weak.length,
6211
+ icon: /* @__PURE__ */ jsx25(ErrorOutlineIcon, {}),
6212
+ label: "Weak (<70)"
6213
+ }
6214
+ ) })
6215
+ ]
6216
+ }
6217
+ ) })
6218
+ ] }),
6212
6219
  hasComparison && /* @__PURE__ */ jsxs21(Box15, { style: neutralCardStyle, children: [
6213
6220
  /* @__PURE__ */ jsx25(
6214
6221
  Box15,
@@ -6289,6 +6296,26 @@ function DiagnosticsOverview({
6289
6296
  ] })
6290
6297
  ] });
6291
6298
  }
6299
+ var sectionWrapperStyle = {
6300
+ border: "1px solid var(--card-border-color)",
6301
+ borderRadius: 6,
6302
+ overflow: "hidden"
6303
+ };
6304
+ var sectionHeaderStyle = {
6305
+ borderBottom: "1px solid var(--card-border-color)"
6306
+ };
6307
+ function SectionLabel({ label }) {
6308
+ return /* @__PURE__ */ jsx25(
6309
+ Text23,
6310
+ {
6311
+ muted: true,
6312
+ size: 1,
6313
+ style: { letterSpacing: "0.08em", textTransform: "uppercase" },
6314
+ weight: "semibold",
6315
+ children: label
6316
+ }
6317
+ );
6318
+ }
6292
6319
  function ScoreCard({
6293
6320
  delta,
6294
6321
  label,
@@ -7722,11 +7749,12 @@ function ReportHeader({
7722
7749
  }
7723
7750
 
7724
7751
  // src/components/report-detail/StrengthsList.tsx
7752
+ import { useMemo as useMemo8 } from "react";
7725
7753
  import { CheckmarkCircleIcon as CheckmarkCircleIcon2, SearchIcon as SearchIcon6 } from "@sanity/icons";
7726
- import { Box as Box20, Flex as Flex23, Stack as Stack26, Text as Text31 } from "@sanity/ui";
7754
+ import { Box as Box20, Flex as Flex24, Stack as Stack26, Text as Text32 } from "@sanity/ui";
7727
7755
 
7728
- // src/components/report-detail/StrengthsTable.tsx
7729
- import {
7756
+ // src/components/report-detail/AreaScoresGrid.tsx
7757
+ import React3, {
7730
7758
  useCallback as useCallback22,
7731
7759
  useMemo as useMemo6,
7732
7760
  useState as useState17
@@ -7739,14 +7767,27 @@ function tableTier2(width) {
7739
7767
  if (width >= 600) return "compact";
7740
7768
  return "narrow";
7741
7769
  }
7742
- var GRID3 = {
7743
- full: "120px 1fr 1fr 1fr 1fr 80px 72px 72px",
7744
- compact: "96px 1fr 1fr 1fr 1fr 80px",
7745
- narrow: "56px 1fr 1fr 1fr 1fr"
7746
- };
7747
- function StrengthsTable({ scores, perArea }) {
7770
+ function gridColumns(tier, hasActual) {
7771
+ switch (tier) {
7772
+ case "full":
7773
+ return hasActual ? "120px 1fr 1fr 1fr 1fr 80px 72px 72px" : "120px 1fr 1fr 1fr 1fr 80px 72px";
7774
+ case "compact":
7775
+ return "96px 1fr 1fr 1fr 1fr 80px";
7776
+ case "narrow":
7777
+ return "56px 1fr 1fr 1fr 1fr";
7778
+ }
7779
+ }
7780
+ function AreaScoresGrid({
7781
+ scores,
7782
+ perArea,
7783
+ perModel
7784
+ }) {
7748
7785
  const { ref: containerRef, width } = useContainerWidth();
7749
7786
  const tier = tableTier2(width);
7787
+ const hasActual = useMemo6(
7788
+ () => scores.some((s) => s.actualScore != null),
7789
+ [scores]
7790
+ );
7750
7791
  const [sortField, setSortField] = useState17("score");
7751
7792
  const [sortDir, setSortDir] = useState17("desc");
7752
7793
  const handleSort = useCallback22(
@@ -7781,6 +7822,24 @@ function StrengthsTable({ scores, perArea }) {
7781
7822
  }
7782
7823
  });
7783
7824
  }, [scores, sortField, sortDir]);
7825
+ const modelScoresByFeature = useMemo6(() => {
7826
+ if (!perModel) return null;
7827
+ const map = /* @__PURE__ */ new Map();
7828
+ for (const model of perModel) {
7829
+ for (const score of model.scores) {
7830
+ let list = map.get(score.feature);
7831
+ if (!list) {
7832
+ list = [];
7833
+ map.set(score.feature, list);
7834
+ }
7835
+ list.push({ label: model.label, scores: score });
7836
+ }
7837
+ }
7838
+ for (const list of map.values()) {
7839
+ list.sort((a, b) => a.label.localeCompare(b.label));
7840
+ }
7841
+ return map;
7842
+ }, [perModel]);
7784
7843
  return /* @__PURE__ */ jsxs29(Box19, { ref: containerRef, style: { ...neutralCardStyle, overflow: "auto" }, children: [
7785
7844
  /* @__PURE__ */ jsxs29(
7786
7845
  "div",
@@ -7789,7 +7848,7 @@ function StrengthsTable({ scores, perArea }) {
7789
7848
  borderBottom: "1px solid var(--card-border-color)",
7790
7849
  display: "grid",
7791
7850
  gap: "0 12px",
7792
- gridTemplateColumns: GRID3[tier],
7851
+ gridTemplateColumns: gridColumns(tier, hasActual),
7793
7852
  padding: "12px 16px 8px"
7794
7853
  },
7795
7854
  children: [
@@ -7800,7 +7859,7 @@ function StrengthsTable({ scores, perArea }) {
7800
7859
  direction: sortDir,
7801
7860
  label: "Score",
7802
7861
  onClick: () => handleSort("score"),
7803
- tooltip: GLOSSARY.score
7862
+ tooltip: `${GLOSSARY.score} This is the ceiling score \u2014 with gold-standard docs injected.`
7804
7863
  }
7805
7864
  ),
7806
7865
  /* @__PURE__ */ jsx41(
@@ -7852,27 +7911,153 @@ function StrengthsTable({ scores, perArea }) {
7852
7911
  tooltip: GLOSSARY.docLift
7853
7912
  }
7854
7913
  ),
7855
- tier === "full" && /* @__PURE__ */ jsxs29(Fragment11, { children: [
7856
- /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
7857
- /* @__PURE__ */ jsx41(ColHeader3, { label: "Ceil", tooltip: GLOSSARY.ceiling })
7858
- ] })
7914
+ tier === "full" && /* @__PURE__ */ jsx41(ColHeader3, { label: "Floor", tooltip: GLOSSARY.floor }),
7915
+ tier === "full" && hasActual && /* @__PURE__ */ jsx41(ColHeader3, { label: "Actual", tooltip: GLOSSARY.actualScore })
7859
7916
  ]
7860
7917
  }
7861
7918
  ),
7862
- sorted.map((area) => /* @__PURE__ */ jsx41(
7863
- AreaRow,
7864
- {
7865
- area,
7866
- delta: perArea?.[area.feature],
7867
- tier
7868
- },
7869
- area.feature
7870
- ))
7919
+ sorted.map((area) => /* @__PURE__ */ jsxs29(React3.Fragment, { children: [
7920
+ /* @__PURE__ */ jsx41(
7921
+ AreaRow,
7922
+ {
7923
+ area,
7924
+ delta: perArea?.[area.feature],
7925
+ hasActual,
7926
+ tier
7927
+ }
7928
+ ),
7929
+ modelScoresByFeature && /* @__PURE__ */ jsx41(
7930
+ ModelSubRows,
7931
+ {
7932
+ hasActual,
7933
+ models: modelScoresByFeature.get(area.feature),
7934
+ tier
7935
+ }
7936
+ )
7937
+ ] }, area.feature))
7871
7938
  ] });
7872
7939
  }
7940
+ function ModelSubRows({
7941
+ hasActual,
7942
+ models,
7943
+ tier
7944
+ }) {
7945
+ if (!models || models.length === 0) return null;
7946
+ return /* @__PURE__ */ jsx41(Fragment11, { children: models.map((entry) => /* @__PURE__ */ jsx41(
7947
+ ModelRow,
7948
+ {
7949
+ hasActual,
7950
+ label: entry.label,
7951
+ scores: entry.scores,
7952
+ tier
7953
+ },
7954
+ entry.label
7955
+ )) });
7956
+ }
7957
+ function ModelRow({
7958
+ hasActual,
7959
+ label,
7960
+ scores,
7961
+ tier
7962
+ }) {
7963
+ const isNarrow = tier === "narrow";
7964
+ return /* @__PURE__ */ jsxs29(
7965
+ "div",
7966
+ {
7967
+ style: {
7968
+ alignItems: "center",
7969
+ backgroundColor: "var(--card-bg2-color, rgba(255,255,255,0.02))",
7970
+ borderBottom: "1px solid var(--card-border-color)",
7971
+ display: "grid",
7972
+ gap: "0 12px",
7973
+ gridTemplateColumns: gridColumns(tier, hasActual),
7974
+ padding: isNarrow ? "6px 12px 6px 20px" : "6px 16px 6px 28px"
7975
+ },
7976
+ children: [
7977
+ /* @__PURE__ */ jsx41(Flex22, { align: "center", children: /* @__PURE__ */ jsx41(
7978
+ Text30,
7979
+ {
7980
+ size: 1,
7981
+ style: {
7982
+ color: scoreColor(scores.totalScore),
7983
+ fontFamily: "var(--font-code-size, monospace)",
7984
+ fontWeight: 600
7985
+ },
7986
+ children: Math.round(scores.totalScore)
7987
+ }
7988
+ ) }),
7989
+ /* @__PURE__ */ jsx41(Flex22, { align: "center", gap: 2, children: /* @__PURE__ */ jsx41(Text30, { muted: true, size: 1, children: label }) }),
7990
+ /* @__PURE__ */ jsx41(
7991
+ DimCell,
7992
+ {
7993
+ area: label,
7994
+ dim: "Task Completion",
7995
+ size: "small",
7996
+ value: scores.taskCompletion
7997
+ }
7998
+ ),
7999
+ /* @__PURE__ */ jsx41(
8000
+ DimCell,
8001
+ {
8002
+ area: label,
8003
+ dim: "Code Correctness",
8004
+ size: "small",
8005
+ value: scores.codeCorrectness
8006
+ }
8007
+ ),
8008
+ /* @__PURE__ */ jsx41(
8009
+ DimCell,
8010
+ {
8011
+ area: label,
8012
+ dim: "Doc Coverage",
8013
+ size: "small",
8014
+ value: scores.docCoverage
8015
+ }
8016
+ ),
8017
+ !isNarrow && /* @__PURE__ */ jsxs29(
8018
+ Text30,
8019
+ {
8020
+ size: 1,
8021
+ style: {
8022
+ color: scores.docLift >= 5 ? "#34d399" : scores.docLift < 0 ? "#f87171" : "var(--card-muted-fg-color)",
8023
+ fontFamily: "var(--font-code-size, monospace)",
8024
+ fontWeight: 500
8025
+ },
8026
+ children: [
8027
+ scores.docLift > 0 ? "+" : "",
8028
+ scores.docLift
8029
+ ]
8030
+ }
8031
+ ),
8032
+ tier === "full" && /* @__PURE__ */ jsx41(
8033
+ Text30,
8034
+ {
8035
+ muted: true,
8036
+ size: 1,
8037
+ style: { fontFamily: "var(--font-code-size, monospace)" },
8038
+ children: Math.round(scores.floorScore ?? 0)
8039
+ }
8040
+ ),
8041
+ tier === "full" && hasActual && /* @__PURE__ */ jsx41(
8042
+ Text30,
8043
+ {
8044
+ size: 1,
8045
+ style: {
8046
+ color: scores.actualScore != null ? scoreColor(scores.actualScore) : "var(--card-muted-fg-color)",
8047
+ fontFamily: "var(--font-code-size, monospace)",
8048
+ fontWeight: 500
8049
+ },
8050
+ children: scores.actualScore != null ? Math.round(scores.actualScore) : "\u2014"
8051
+ }
8052
+ )
8053
+ ]
8054
+ }
8055
+ );
8056
+ }
7873
8057
  function AreaRow({
7874
8058
  area,
7875
8059
  delta,
8060
+ hasActual,
7876
8061
  tier
7877
8062
  }) {
7878
8063
  const isNarrow = tier === "narrow";
@@ -7884,7 +8069,7 @@ function AreaRow({
7884
8069
  borderBottom: "1px solid var(--card-border-color)",
7885
8070
  display: "grid",
7886
8071
  gap: "0 12px",
7887
- gridTemplateColumns: GRID3[tier],
8072
+ gridTemplateColumns: gridColumns(tier, hasActual),
7888
8073
  padding: isNarrow ? "8px 12px" : "10px 16px"
7889
8074
  },
7890
8075
  children: [
@@ -7894,7 +8079,7 @@ function AreaRow({
7894
8079
  {
7895
8080
  text: /* @__PURE__ */ jsxs29(Text30, { size: 2, style: { lineHeight: 1.5 }, children: [
7896
8081
  /* @__PURE__ */ jsx41("span", { style: { fontWeight: 600 }, children: area.feature }),
7897
- " composite:",
8082
+ " ceiling score:",
7898
8083
  " ",
7899
8084
  /* @__PURE__ */ jsx41(
7900
8085
  "span",
@@ -7910,7 +8095,8 @@ function AreaRow({
7910
8095
  /* @__PURE__ */ jsx41("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
7911
8096
  ".",
7912
8097
  " ",
7913
- GLOSSARY.score
8098
+ GLOSSARY.score,
8099
+ " This is the ceiling \u2014 with gold-standard docs injected."
7914
8100
  ] }),
7915
8101
  children: /* @__PURE__ */ jsx41(
7916
8102
  "div",
@@ -8029,13 +8215,22 @@ function AreaRow({
8029
8215
  children: Math.round(area.floorScore ?? 0)
8030
8216
  }
8031
8217
  ),
8032
- tier === "full" && /* @__PURE__ */ jsx41(
8033
- Text30,
8218
+ tier === "full" && hasActual && /* @__PURE__ */ jsx41(
8219
+ HoverTip,
8034
8220
  {
8035
- muted: true,
8036
- size: 2,
8037
- style: { fontFamily: "var(--font-code-size, monospace)" },
8038
- children: Math.round(area.ceilingScore ?? 0)
8221
+ text: area.actualScore != null ? `${area.feature} actual score: ${Math.round(area.actualScore)}/100. ${GLOSSARY.actualScore}` : `No agentic data for ${area.feature}.`,
8222
+ children: /* @__PURE__ */ jsx41(
8223
+ Text30,
8224
+ {
8225
+ size: 2,
8226
+ style: {
8227
+ color: area.actualScore != null ? scoreColor(area.actualScore) : "var(--card-muted-fg-color)",
8228
+ fontFamily: "var(--font-code-size, monospace)",
8229
+ fontWeight: 600
8230
+ },
8231
+ children: area.actualScore != null ? Math.round(area.actualScore) : "\u2014"
8232
+ }
8233
+ )
8039
8234
  }
8040
8235
  )
8041
8236
  ]
@@ -8045,6 +8240,7 @@ function AreaRow({
8045
8240
  function DimCell({
8046
8241
  area,
8047
8242
  dim,
8243
+ size = "normal",
8048
8244
  value
8049
8245
  }) {
8050
8246
  const glossary = {
@@ -8052,6 +8248,8 @@ function DimCell({
8052
8248
  "Code Correctness": GLOSSARY.codeCorrectness,
8053
8249
  "Doc Coverage": GLOSSARY.docCoverage
8054
8250
  };
8251
+ const textSize = size === "small" ? 0 : 1;
8252
+ const barHeight = size === "small" ? 3 : 4;
8055
8253
  return /* @__PURE__ */ jsx41(
8056
8254
  HoverTip,
8057
8255
  {
@@ -8082,7 +8280,7 @@ function DimCell({
8082
8280
  /* @__PURE__ */ jsx41(
8083
8281
  Text30,
8084
8282
  {
8085
- size: 1,
8283
+ size: textSize,
8086
8284
  style: {
8087
8285
  color: scoreColor(value),
8088
8286
  fontFamily: "var(--font-code-size, monospace)",
@@ -8097,7 +8295,7 @@ function DimCell({
8097
8295
  style: {
8098
8296
  backgroundColor: "var(--card-border-color)",
8099
8297
  borderRadius: 999,
8100
- height: 4,
8298
+ height: barHeight,
8101
8299
  overflow: "hidden",
8102
8300
  width: "100%"
8103
8301
  },
@@ -8170,51 +8368,219 @@ function ColHeader3({
8170
8368
  ] });
8171
8369
  }
8172
8370
 
8173
- // src/components/report-detail/StrengthsList.tsx
8371
+ // src/components/report-detail/ModelSelector.tsx
8372
+ import { useCallback as useCallback23 } from "react";
8373
+ import { Flex as Flex23, Text as Text31 } from "@sanity/ui";
8174
8374
  import { jsx as jsx42, jsxs as jsxs30 } from "react/jsx-runtime";
8175
- function StrengthsList({ scores, comparison }) {
8176
- const retrievalSuccesses = scores.filter(
8375
+ var pillBase = {
8376
+ borderColor: "var(--card-border-color)",
8377
+ borderRadius: 999,
8378
+ borderStyle: "solid",
8379
+ borderWidth: 1,
8380
+ cursor: "pointer",
8381
+ fontSize: 13,
8382
+ fontWeight: 500,
8383
+ lineHeight: 1,
8384
+ padding: "5px 12px",
8385
+ transition: "all 150ms ease",
8386
+ userSelect: "none",
8387
+ whiteSpace: "nowrap"
8388
+ };
8389
+ var pillDefault = {
8390
+ ...pillBase,
8391
+ backgroundColor: "transparent",
8392
+ color: "var(--card-muted-fg-color)"
8393
+ };
8394
+ var pillSelected = {
8395
+ ...pillBase,
8396
+ backgroundColor: "rgba(16,185,129,0.15)",
8397
+ borderColor: "rgba(16,185,129,0.40)",
8398
+ color: "#34d399"
8399
+ };
8400
+ function ModelSelector({
8401
+ models,
8402
+ selection,
8403
+ onChange
8404
+ }) {
8405
+ return /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 1, wrap: "wrap", children: [
8406
+ /* @__PURE__ */ jsx42(
8407
+ Pill2,
8408
+ {
8409
+ isSelected: selection === null,
8410
+ label: "All Models",
8411
+ onClick: () => onChange(null)
8412
+ }
8413
+ ),
8414
+ models.map((model) => /* @__PURE__ */ jsx42(
8415
+ Pill2,
8416
+ {
8417
+ isSelected: selection === model.modelId,
8418
+ label: model.label,
8419
+ onClick: () => onChange(model.modelId)
8420
+ },
8421
+ model.modelId
8422
+ )),
8423
+ /* @__PURE__ */ jsx42(
8424
+ "div",
8425
+ {
8426
+ style: {
8427
+ backgroundColor: "var(--card-border-color)",
8428
+ height: 16,
8429
+ marginInline: 4,
8430
+ width: 1
8431
+ }
8432
+ }
8433
+ ),
8434
+ /* @__PURE__ */ jsx42(
8435
+ Pill2,
8436
+ {
8437
+ isSelected: selection === "expanded",
8438
+ label: "By Model",
8439
+ onClick: () => onChange("expanded")
8440
+ }
8441
+ )
8442
+ ] });
8443
+ }
8444
+ function Pill2({
8445
+ isSelected,
8446
+ label,
8447
+ onClick
8448
+ }) {
8449
+ const handleKeyDown = useCallback23(
8450
+ (e) => {
8451
+ if (e.key === "Enter" || e.key === " ") {
8452
+ e.preventDefault();
8453
+ onClick();
8454
+ }
8455
+ },
8456
+ [onClick]
8457
+ );
8458
+ return /* @__PURE__ */ jsx42(
8459
+ "span",
8460
+ {
8461
+ onClick,
8462
+ onKeyDown: handleKeyDown,
8463
+ role: "button",
8464
+ style: isSelected ? pillSelected : pillDefault,
8465
+ tabIndex: 0,
8466
+ children: /* @__PURE__ */ jsx42(
8467
+ Text31,
8468
+ {
8469
+ size: 1,
8470
+ style: {
8471
+ color: "inherit",
8472
+ fontWeight: "inherit"
8473
+ },
8474
+ children: label
8475
+ }
8476
+ )
8477
+ }
8478
+ );
8479
+ }
8480
+
8481
+ // src/components/report-detail/useModelSelection.ts
8482
+ import { useCallback as useCallback24, useMemo as useMemo7, useState as useState18 } from "react";
8483
+ function useModelSelection({
8484
+ scores,
8485
+ perModel
8486
+ }) {
8487
+ const [selection, setSelection] = useState18(null);
8488
+ const onSelectionChange = useCallback24((next) => {
8489
+ setSelection(next);
8490
+ }, []);
8491
+ const isExpanded = selection === "expanded";
8492
+ const hasModels = perModel != null && perModel.length > 1;
8493
+ const resolvedScores = useMemo7(() => {
8494
+ if (isExpanded || selection === null || !perModel) return scores;
8495
+ const model = perModel.find((m) => m.modelId === selection);
8496
+ return model?.scores ?? scores;
8497
+ }, [isExpanded, selection, perModel, scores]);
8498
+ const expandedPerModel = isExpanded ? perModel ?? void 0 : void 0;
8499
+ return {
8500
+ selection,
8501
+ onSelectionChange,
8502
+ resolvedScores,
8503
+ isExpanded,
8504
+ hasModels,
8505
+ expandedPerModel
8506
+ };
8507
+ }
8508
+
8509
+ // src/components/report-detail/StrengthsList.tsx
8510
+ import { jsx as jsx43, jsxs as jsxs31 } from "react/jsx-runtime";
8511
+ function StrengthsList({
8512
+ scores,
8513
+ comparison,
8514
+ perModel
8515
+ }) {
8516
+ const {
8517
+ selection,
8518
+ onSelectionChange,
8519
+ resolvedScores,
8520
+ hasModels,
8521
+ expandedPerModel
8522
+ } = useModelSelection({ scores, perModel });
8523
+ const displayedScores = useMemo8(
8524
+ () => resolvedScores.filter((s) => s.totalScore >= SCORE_CAUTION),
8525
+ [resolvedScores]
8526
+ );
8527
+ const retrievalSuccesses = displayedScores.filter(
8177
8528
  (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency >= EFFICIENCY_POSITIVE && !s.invertedRetrievalGap
8178
8529
  ).sort(
8179
8530
  (a, b) => (b.infrastructureEfficiency ?? 0) - (a.infrastructureEfficiency ?? 0)
8180
8531
  );
8181
- if (scores.length === 0) return null;
8182
- return /* @__PURE__ */ jsxs30(Stack26, { space: 5, children: [
8183
- /* @__PURE__ */ jsxs30(Stack26, { space: 3, children: [
8184
- /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 2, children: [
8185
- /* @__PURE__ */ jsx42(CheckmarkCircleIcon2, { style: { color: "#34d399" } }),
8186
- /* @__PURE__ */ jsx42(Text31, { size: 2, weight: "medium", children: "All Areas \u2014 Scores & Doc Lift" }),
8187
- /* @__PURE__ */ jsx42(InfoTip, { text: GLOSSARY.strengths })
8532
+ if (displayedScores.length === 0) return null;
8533
+ return /* @__PURE__ */ jsxs31(Stack26, { space: 5, children: [
8534
+ /* @__PURE__ */ jsxs31(Stack26, { space: 3, children: [
8535
+ /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, wrap: "wrap", children: [
8536
+ /* @__PURE__ */ jsx43(CheckmarkCircleIcon2, { style: { color: "#34d399" } }),
8537
+ /* @__PURE__ */ jsx43(Text32, { size: 2, weight: "medium", children: "Strong Areas (70+)" }),
8538
+ /* @__PURE__ */ jsx43(InfoTip, { text: GLOSSARY.strengths }),
8539
+ hasModels && /* @__PURE__ */ jsx43(Box20, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx43(
8540
+ ModelSelector,
8541
+ {
8542
+ models: perModel,
8543
+ onChange: onSelectionChange,
8544
+ selection
8545
+ }
8546
+ ) })
8188
8547
  ] }),
8189
- /* @__PURE__ */ jsx42(StrengthsTable, { perArea: comparison?.deltas?.perArea, scores })
8548
+ /* @__PURE__ */ jsx43(
8549
+ AreaScoresGrid,
8550
+ {
8551
+ perArea: comparison?.deltas?.perArea,
8552
+ perModel: expandedPerModel,
8553
+ scores: displayedScores
8554
+ }
8555
+ )
8190
8556
  ] }),
8191
- retrievalSuccesses.length > 0 && /* @__PURE__ */ jsxs30(Box20, { style: neutralCardStyle, children: [
8192
- /* @__PURE__ */ jsx42(
8557
+ retrievalSuccesses.length > 0 && /* @__PURE__ */ jsxs31(Box20, { style: neutralCardStyle, children: [
8558
+ /* @__PURE__ */ jsx43(
8193
8559
  Box20,
8194
8560
  {
8195
8561
  padding: 4,
8196
8562
  style: { borderBottom: "1px solid var(--card-border-color)" },
8197
- children: /* @__PURE__ */ jsxs30(Flex23, { align: "center", gap: 2, children: [
8198
- /* @__PURE__ */ jsx42(SearchIcon6, { style: { color: "#34d399" } }),
8199
- /* @__PURE__ */ jsxs30(Text31, { size: 2, weight: "medium", children: [
8563
+ children: /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, children: [
8564
+ /* @__PURE__ */ jsx43(SearchIcon6, { style: { color: "#34d399" } }),
8565
+ /* @__PURE__ */ jsxs31(Text32, { size: 2, weight: "medium", children: [
8200
8566
  "Retrieval Successes (",
8201
8567
  Math.round(EFFICIENCY_POSITIVE * 100),
8202
8568
  "%+ efficiency)"
8203
8569
  ] }),
8204
- /* @__PURE__ */ jsx42(InfoTip, { text: GLOSSARY.retrievalExcellence })
8570
+ /* @__PURE__ */ jsx43(InfoTip, { text: GLOSSARY.retrievalExcellence })
8205
8571
  ] })
8206
8572
  }
8207
8573
  ),
8208
- /* @__PURE__ */ jsx42(Stack26, { children: retrievalSuccesses.map((area, i) => /* @__PURE__ */ jsxs30(
8209
- Flex23,
8574
+ /* @__PURE__ */ jsx43(Stack26, { children: retrievalSuccesses.map((area, i) => /* @__PURE__ */ jsxs31(
8575
+ Flex24,
8210
8576
  {
8211
8577
  align: "center",
8212
8578
  justify: "space-between",
8213
8579
  padding: 4,
8214
8580
  style: i > 0 ? dividerStyle : void 0,
8215
8581
  children: [
8216
- /* @__PURE__ */ jsx42(Text31, { size: 2, children: area.feature }),
8217
- /* @__PURE__ */ jsx42(
8582
+ /* @__PURE__ */ jsx43(Text32, { size: 2, children: area.feature }),
8583
+ /* @__PURE__ */ jsx43(
8218
8584
  "span",
8219
8585
  {
8220
8586
  style: {
@@ -8240,392 +8606,72 @@ function StrengthsList({ scores, comparison }) {
8240
8606
  import {
8241
8607
  ErrorOutlineIcon as ErrorOutlineIcon3,
8242
8608
  SearchIcon as SearchIcon7,
8243
- WarningOutlineIcon as WarningOutlineIcon4,
8609
+ WarningOutlineIcon as WarningOutlineIcon3,
8244
8610
  BoltIcon as BoltIcon2,
8245
8611
  ArrowDownIcon as ArrowDownIcon2
8246
8612
  } from "@sanity/icons";
8247
- import { Box as Box22, Flex as Flex25, Stack as Stack28, Text as Text33 } from "@sanity/ui";
8248
-
8249
- // src/components/report-detail/AreaScoreRow.tsx
8250
- import { WarningOutlineIcon as WarningOutlineIcon3 } from "@sanity/icons";
8251
- import { Box as Box21, Flex as Flex24, Stack as Stack27, Text as Text32 } from "@sanity/ui";
8252
- import { jsx as jsx43, jsxs as jsxs31 } from "react/jsx-runtime";
8253
- function AreaScoreRow({ area, showTrend }) {
8254
- return /* @__PURE__ */ jsx43(Box21, { style: { ...neutralCardStyle, padding: 20 }, children: /* @__PURE__ */ jsxs31(Stack27, { space: 4, children: [
8255
- /* @__PURE__ */ jsxs31(Flex24, { align: "flex-start", gap: 3, justify: "space-between", wrap: "wrap", children: [
8256
- /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 3, children: [
8257
- /* @__PURE__ */ jsx43(
8258
- HoverTip,
8259
- {
8260
- text: /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
8261
- /* @__PURE__ */ jsx43("span", { style: tipBold, children: area.feature }),
8262
- " composite score:",
8263
- " ",
8264
- /* @__PURE__ */ jsx43(
8265
- "span",
8266
- {
8267
- style: { ...tipValue, color: scoreColor(area.totalScore) },
8268
- children: Math.round(area.totalScore)
8269
- }
8270
- ),
8271
- /* @__PURE__ */ jsx43("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
8272
- ". ",
8273
- GLOSSARY.score
8274
- ] }),
8275
- children: /* @__PURE__ */ jsx43(Box21, { style: scoreBoxStyle(area.totalScore), children: /* @__PURE__ */ jsx43("span", { style: { fontSize: 20 }, children: Math.round(area.totalScore) }) })
8276
- }
8277
- ),
8278
- /* @__PURE__ */ jsxs31(Stack27, { space: 2, children: [
8279
- /* @__PURE__ */ jsxs31(Flex24, { align: "center", gap: 2, wrap: "wrap", children: [
8280
- /* @__PURE__ */ jsx43(Text32, { size: 3, weight: "semibold", children: area.feature }),
8281
- area.negativeDocLift && /* @__PURE__ */ jsx43(HoverTip, { text: GLOSSARY.docsHurt, children: /* @__PURE__ */ jsxs31(
8282
- "span",
8283
- {
8284
- style: {
8285
- alignItems: "center",
8286
- backgroundColor: "rgba(239,68,68,0.2)",
8287
- borderRadius: 4,
8288
- color: "#f87171",
8289
- display: "inline-flex",
8290
- fontSize: 13,
8291
- gap: 4,
8292
- padding: "3px 8px"
8293
- },
8294
- children: [
8295
- /* @__PURE__ */ jsx43(WarningOutlineIcon3, {}),
8296
- "Docs Hurt"
8297
- ]
8298
- }
8299
- ) }),
8300
- area.invertedRetrievalGap && /* @__PURE__ */ jsx43(HoverTip, { text: GLOSSARY.invertedRetGap, children: /* @__PURE__ */ jsx43(
8301
- "span",
8302
- {
8303
- style: {
8304
- backgroundColor: "rgba(245,158,11,0.2)",
8305
- borderRadius: 4,
8306
- color: "#fbbf24",
8307
- fontSize: 13,
8308
- padding: "3px 8px"
8309
- },
8310
- children: "Inverted Retrieval"
8311
- }
8312
- ) })
8313
- ] }),
8314
- /* @__PURE__ */ jsxs31(Text32, { muted: true, size: 2, children: [
8315
- area.testCount,
8316
- " test",
8317
- area.testCount === 1 ? "" : "s"
8318
- ] })
8319
- ] })
8320
- ] }),
8321
- showTrend && /* @__PURE__ */ jsx43(
8322
- "span",
8323
- {
8324
- style: {
8325
- backgroundColor: showTrend === "improved" ? "rgba(16,185,129,0.2)" : showTrend === "regressed" ? "rgba(239,68,68,0.2)" : "var(--card-muted-bg-color)",
8326
- borderRadius: 4,
8327
- color: showTrend === "improved" ? "#34d399" : showTrend === "regressed" ? "#f87171" : "var(--card-muted-fg-color)",
8328
- fontSize: 13,
8329
- fontWeight: 500,
8330
- padding: "4px 10px"
8331
- },
8332
- children: showTrend
8333
- }
8334
- )
8335
- ] }),
8336
- /* @__PURE__ */ jsxs31(
8337
- "div",
8338
- {
8339
- style: {
8340
- display: "grid",
8341
- gap: 16,
8342
- gridTemplateColumns: "1fr 1fr 1fr"
8343
- },
8344
- children: [
8345
- /* @__PURE__ */ jsx43(
8346
- DimBar,
8347
- {
8348
- label: "Task Completion",
8349
- tip: dimBarTip(
8350
- area.feature,
8351
- "Task Completion",
8352
- area.taskCompletion,
8353
- GLOSSARY.taskCompletion
8354
- ),
8355
- value: area.taskCompletion
8356
- }
8357
- ),
8358
- /* @__PURE__ */ jsx43(
8359
- DimBar,
8360
- {
8361
- label: "Code Correctness",
8362
- tip: dimBarTip(
8363
- area.feature,
8364
- "Code Correctness",
8365
- area.codeCorrectness,
8366
- GLOSSARY.codeCorrectness
8367
- ),
8368
- value: area.codeCorrectness
8369
- }
8370
- ),
8371
- /* @__PURE__ */ jsx43(
8372
- DimBar,
8373
- {
8374
- label: "Doc Coverage",
8375
- tip: dimBarTip(
8376
- area.feature,
8377
- "Doc Coverage",
8378
- area.docCoverage,
8379
- GLOSSARY.docCoverage
8380
- ),
8381
- value: area.docCoverage
8382
- }
8383
- )
8384
- ]
8385
- }
8386
- ),
8387
- /* @__PURE__ */ jsxs31(Flex24, { gap: 5, style: { ...dividerStyle, paddingTop: 12 }, wrap: "wrap", children: [
8388
- /* @__PURE__ */ jsx43(
8389
- MetricPair,
8390
- {
8391
- color: area.negativeDocLift ? "#f87171" : "#34d399",
8392
- label: "Doc Lift",
8393
- tip: metricTip(
8394
- area.feature,
8395
- "Doc Lift",
8396
- `${area.docLift > 0 ? "+" : ""}${area.docLift}`,
8397
- GLOSSARY.docLift
8398
- ),
8399
- value: `${area.docLift > 0 ? "+" : ""}${area.docLift}`
8400
- }
8401
- ),
8402
- /* @__PURE__ */ jsx43(
8403
- MetricPair,
8404
- {
8405
- label: "Ceiling",
8406
- tip: metricTip(
8407
- area.feature,
8408
- "Ceiling",
8409
- String(Math.round(area.ceilingScore ?? 0)),
8410
- GLOSSARY.ceiling
8411
- ),
8412
- value: String(Math.round(area.ceilingScore ?? 0))
8413
- }
8414
- ),
8415
- /* @__PURE__ */ jsx43(
8416
- MetricPair,
8417
- {
8418
- label: "Floor",
8419
- tip: metricTip(
8420
- area.feature,
8421
- "Floor",
8422
- String(Math.round(area.floorScore ?? 0)),
8423
- GLOSSARY.floor
8424
- ),
8425
- value: String(Math.round(area.floorScore ?? 0))
8426
- }
8427
- ),
8428
- area.actualScore != null && /* @__PURE__ */ jsx43(
8429
- MetricPair,
8430
- {
8431
- label: "Actual",
8432
- tip: metricTip(
8433
- area.feature,
8434
- "Actual",
8435
- String(Math.round(area.actualScore)),
8436
- GLOSSARY.actualScore
8437
- ),
8438
- value: String(Math.round(area.actualScore))
8439
- }
8440
- ),
8441
- area.infrastructureEfficiency != null && /* @__PURE__ */ jsx43(
8442
- MetricPair,
8443
- {
8444
- color: efficiencyColor(area.infrastructureEfficiency),
8445
- label: "Efficiency",
8446
- tip: metricTip(
8447
- area.feature,
8448
- "Efficiency",
8449
- formatPercent(area.infrastructureEfficiency),
8450
- GLOSSARY.infraEfficiency
8451
- ),
8452
- value: formatPercent(area.infrastructureEfficiency)
8453
- }
8454
- ),
8455
- area.retrievalGap != null && /* @__PURE__ */ jsx43(
8456
- MetricPair,
8457
- {
8458
- label: "Ret Gap",
8459
- tip: metricTip(
8460
- area.feature,
8461
- "Retrieval Gap",
8462
- String(area.retrievalGap),
8463
- GLOSSARY.retrievalGap
8464
- ),
8465
- value: String(area.retrievalGap)
8466
- }
8467
- )
8468
- ] })
8469
- ] }) });
8470
- }
8471
- var tipValue = {
8472
- fontFamily: "var(--font-code-size, monospace)",
8473
- fontWeight: 600
8474
- };
8475
- var tipBold = { fontWeight: 600 };
8476
- function dimBarTip(area, dim, score, description) {
8477
- return /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
8478
- /* @__PURE__ */ jsx43("span", { style: tipBold, children: area }),
8479
- " \u2192 ",
8480
- /* @__PURE__ */ jsx43("span", { style: tipBold, children: dim }),
8481
- ":",
8482
- " ",
8483
- /* @__PURE__ */ jsx43("span", { style: { ...tipValue, color: scoreColor(score) }, children: Math.round(score) }),
8484
- /* @__PURE__ */ jsx43("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
8485
- ".",
8486
- " ",
8487
- description
8488
- ] });
8489
- }
8490
- function DimBar({
8491
- label,
8492
- value,
8493
- tip
8494
- }) {
8495
- const bar = /* @__PURE__ */ jsxs31(Stack27, { space: 2, style: { flex: 1 }, children: [
8496
- /* @__PURE__ */ jsxs31(Flex24, { align: "center", justify: "space-between", children: [
8497
- /* @__PURE__ */ jsx43(Text32, { muted: true, size: 1, children: label }),
8498
- /* @__PURE__ */ jsx43(
8499
- "span",
8500
- {
8501
- style: {
8502
- color: scoreColor(value),
8503
- fontFamily: "var(--font-code-size, monospace)",
8504
- fontSize: 14,
8505
- fontWeight: 600
8506
- },
8507
- children: Math.round(value)
8508
- }
8509
- )
8510
- ] }),
8511
- /* @__PURE__ */ jsx43(
8512
- Box21,
8513
- {
8514
- style: {
8515
- backgroundColor: "var(--card-border-color)",
8516
- borderRadius: 999,
8517
- height: 6,
8518
- overflow: "hidden"
8519
- },
8520
- children: /* @__PURE__ */ jsx43(
8521
- Box21,
8522
- {
8523
- style: {
8524
- backgroundColor: barFillColor(value),
8525
- borderRadius: 999,
8526
- height: "100%",
8527
- transition: "width 0.3s",
8528
- width: `${Math.min(value, 100)}%`
8529
- }
8530
- }
8531
- )
8532
- }
8533
- )
8534
- ] });
8535
- if (tip) {
8536
- return /* @__PURE__ */ jsx43(HoverTip, { text: tip, children: bar });
8537
- }
8538
- return bar;
8539
- }
8540
- function metricTip(area, metric, displayValue, description) {
8541
- return /* @__PURE__ */ jsxs31(Text32, { size: 2, style: { lineHeight: 1.5 }, children: [
8542
- /* @__PURE__ */ jsx43("span", { style: tipBold, children: area }),
8543
- " \u2192",
8544
- " ",
8545
- /* @__PURE__ */ jsx43("span", { style: tipBold, children: metric }),
8546
- ":",
8547
- " ",
8548
- /* @__PURE__ */ jsx43("span", { style: tipValue, children: displayValue }),
8549
- ". ",
8550
- description
8551
- ] });
8552
- }
8553
- function MetricPair({
8554
- color,
8555
- label,
8556
- value,
8557
- tip
8558
- }) {
8559
- const content = /* @__PURE__ */ jsxs31(Text32, { muted: true, size: 1, children: [
8560
- label,
8561
- ":",
8562
- " ",
8563
- /* @__PURE__ */ jsx43(
8564
- "span",
8565
- {
8566
- style: {
8567
- color: color ?? "var(--card-fg-color)",
8568
- fontFamily: "var(--font-code-size, monospace)",
8569
- fontWeight: 500
8570
- },
8571
- children: value
8572
- }
8573
- )
8574
- ] });
8575
- if (tip) {
8576
- return /* @__PURE__ */ jsx43(HoverTip, { text: tip, children: content });
8577
- }
8578
- return content;
8579
- }
8580
-
8581
- // src/components/report-detail/WeaknessesList.tsx
8613
+ import { Box as Box21, Flex as Flex25, Stack as Stack27, Text as Text33 } from "@sanity/ui";
8582
8614
  import { jsx as jsx44, jsxs as jsxs32 } from "react/jsx-runtime";
8583
- function WeaknessesList({ scores, comparison }) {
8584
- const weakAreas = scores.filter((s) => s.totalScore < SCORE_CAUTION).sort((a, b) => a.totalScore - b.totalScore);
8615
+ function WeaknessesList({
8616
+ scores,
8617
+ comparison,
8618
+ perModel
8619
+ }) {
8620
+ const {
8621
+ selection,
8622
+ onSelectionChange,
8623
+ resolvedScores,
8624
+ hasModels,
8625
+ expandedPerModel
8626
+ } = useModelSelection({ scores, perModel });
8627
+ const weakFeatures = new Set(
8628
+ scores.filter((s) => s.totalScore < SCORE_CAUTION).map((s) => s.feature)
8629
+ );
8630
+ const weakAreas = resolvedScores.filter((s) => weakFeatures.has(s.feature)).sort((a, b) => a.totalScore - b.totalScore);
8585
8631
  const docsHurt = scores.filter((s) => s.negativeDocLift);
8586
8632
  const retrievalIssues = scores.filter(
8587
8633
  (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency < EFFICIENCY_CAUTION && !s.invertedRetrievalGap
8588
8634
  );
8589
8635
  const dimWeaknesses = scores.map((s) => ({ area: s, dims: getDimensionWeaknesses(s) })).filter(({ dims }) => dims.length > 0);
8590
8636
  const regressed = comparison?.regressed ?? [];
8591
- const improved = comparison?.improved ?? [];
8592
- const unchanged = comparison?.unchanged ?? [];
8593
8637
  const perArea = comparison?.deltas?.perArea;
8594
8638
  const efficiencyAnomalies = scores.filter(
8595
8639
  (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency > EFFICIENCY_ANOMALY
8596
8640
  );
8597
8641
  const hasContent = weakAreas.length > 0 || docsHurt.length > 0 || retrievalIssues.length > 0 || dimWeaknesses.length > 0 || regressed.length > 0 || efficiencyAnomalies.length > 0;
8598
8642
  if (!hasContent) return null;
8599
- const trendFor = (feature) => {
8600
- if (improved.includes(feature)) return "improved";
8601
- if (regressed.includes(feature)) return "regressed";
8602
- if (unchanged.includes(feature)) return "unchanged";
8603
- return null;
8604
- };
8605
- return /* @__PURE__ */ jsxs32(Stack28, { space: 5, children: [
8606
- weakAreas.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
8607
- /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
8643
+ return /* @__PURE__ */ jsxs32(Stack27, { space: 5, children: [
8644
+ weakAreas.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
8645
+ /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, wrap: "wrap", children: [
8608
8646
  /* @__PURE__ */ jsx44(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
8609
8647
  /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Weak Areas (<70)" }),
8610
- /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.weakAreas })
8648
+ /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.weakAreas }),
8649
+ hasModels && /* @__PURE__ */ jsx44(Box21, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx44(
8650
+ ModelSelector,
8651
+ {
8652
+ models: perModel,
8653
+ onChange: onSelectionChange,
8654
+ selection
8655
+ }
8656
+ ) })
8611
8657
  ] }),
8612
- /* @__PURE__ */ jsx44(Stack28, { space: 3, children: weakAreas.map((area) => /* @__PURE__ */ jsx44(
8613
- AreaScoreRow,
8658
+ /* @__PURE__ */ jsx44(
8659
+ AreaScoresGrid,
8614
8660
  {
8615
- area,
8616
- showTrend: trendFor(area.feature)
8617
- },
8618
- area.feature
8619
- )) })
8661
+ perArea,
8662
+ perModel: expandedPerModel,
8663
+ scores: weakAreas
8664
+ }
8665
+ )
8620
8666
  ] }),
8621
- docsHurt.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
8667
+ docsHurt.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
8622
8668
  /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
8623
8669
  /* @__PURE__ */ jsx44(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
8624
8670
  /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Docs Hurt Performance (Negative Doc Lift)" }),
8625
8671
  /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.docsHurt })
8626
8672
  ] }),
8627
- /* @__PURE__ */ jsx44(Box22, { style: sectionStyle("red"), children: docsHurt.map((area, i) => /* @__PURE__ */ jsxs32(
8628
- Box22,
8673
+ /* @__PURE__ */ jsx44(Box21, { style: sectionStyle("red"), children: docsHurt.map((area, i) => /* @__PURE__ */ jsxs32(
8674
+ Box21,
8629
8675
  {
8630
8676
  padding: 4,
8631
8677
  style: i > 0 ? { borderTop: "1px solid rgba(239,68,68,0.2)" } : void 0,
@@ -8661,7 +8707,7 @@ function WeaknessesList({ scores, comparison }) {
8661
8707
  }
8662
8708
  )
8663
8709
  ] }),
8664
- /* @__PURE__ */ jsx44(Box22, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
8710
+ /* @__PURE__ */ jsx44(Box21, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
8665
8711
  area.invertedRetrievalGap && /* @__PURE__ */ jsxs32("span", { style: { color: "#fbbf24" }, children: [
8666
8712
  "Agent does better by NOT finding these docs.",
8667
8713
  " "
@@ -8677,14 +8723,14 @@ function WeaknessesList({ scores, comparison }) {
8677
8723
  area.feature
8678
8724
  )) })
8679
8725
  ] }),
8680
- retrievalIssues.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
8726
+ retrievalIssues.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
8681
8727
  /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
8682
8728
  /* @__PURE__ */ jsx44(SearchIcon7, { style: { color: "#fbbf24" } }),
8683
8729
  /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Retrieval Issues (<70% efficiency)" }),
8684
8730
  /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.retrievalIssues })
8685
8731
  ] }),
8686
- /* @__PURE__ */ jsx44(Box22, { style: sectionStyle("amber"), children: retrievalIssues.map((area, i) => /* @__PURE__ */ jsxs32(
8687
- Box22,
8732
+ /* @__PURE__ */ jsx44(Box21, { style: sectionStyle("amber"), children: retrievalIssues.map((area, i) => /* @__PURE__ */ jsxs32(
8733
+ Box21,
8688
8734
  {
8689
8735
  padding: 4,
8690
8736
  style: i > 0 ? { borderTop: "1px solid rgba(245,158,11,0.2)" } : void 0,
@@ -8720,7 +8766,7 @@ function WeaknessesList({ scores, comparison }) {
8720
8766
  }
8721
8767
  )
8722
8768
  ] }),
8723
- /* @__PURE__ */ jsx44(Box22, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
8769
+ /* @__PURE__ */ jsx44(Box21, { paddingTop: 2, children: /* @__PURE__ */ jsxs32(Text33, { muted: true, size: 2, children: [
8724
8770
  "Actual score (",
8725
8771
  Math.round(area.actualScore ?? 0),
8726
8772
  ") is much lower than ceiling (",
@@ -8735,14 +8781,14 @@ function WeaknessesList({ scores, comparison }) {
8735
8781
  area.feature
8736
8782
  )) })
8737
8783
  ] }),
8738
- dimWeaknesses.length > 0 && /* @__PURE__ */ jsxs32(Stack28, { space: 3, children: [
8784
+ dimWeaknesses.length > 0 && /* @__PURE__ */ jsxs32(Stack27, { space: 3, children: [
8739
8785
  /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
8740
- /* @__PURE__ */ jsx44(WarningOutlineIcon4, { style: { color: "#fbbf24" } }),
8786
+ /* @__PURE__ */ jsx44(WarningOutlineIcon3, { style: { color: "#fbbf24" } }),
8741
8787
  /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Dimension Weaknesses (<50)" }),
8742
8788
  /* @__PURE__ */ jsx44(InfoTip, { text: GLOSSARY.dimWeaknesses })
8743
8789
  ] }),
8744
- /* @__PURE__ */ jsx44(Box22, { style: neutralCardStyle, children: dimWeaknesses.map(({ area, dims }, i) => /* @__PURE__ */ jsxs32(
8745
- Box22,
8790
+ /* @__PURE__ */ jsx44(Box21, { style: neutralCardStyle, children: dimWeaknesses.map(({ area, dims }, i) => /* @__PURE__ */ jsxs32(
8791
+ Box21,
8746
8792
  {
8747
8793
  padding: 4,
8748
8794
  style: i > 0 ? dividerStyle : void 0,
@@ -8786,9 +8832,9 @@ function WeaknessesList({ scores, comparison }) {
8786
8832
  area.feature
8787
8833
  )) })
8788
8834
  ] }),
8789
- regressed.length > 0 && /* @__PURE__ */ jsxs32(Box22, { style: neutralCardStyle, children: [
8835
+ regressed.length > 0 && /* @__PURE__ */ jsxs32(Box21, { style: neutralCardStyle, children: [
8790
8836
  /* @__PURE__ */ jsx44(
8791
- Box22,
8837
+ Box21,
8792
8838
  {
8793
8839
  padding: 4,
8794
8840
  style: { borderBottom: "1px solid var(--card-border-color)" },
@@ -8798,7 +8844,7 @@ function WeaknessesList({ scores, comparison }) {
8798
8844
  ] })
8799
8845
  }
8800
8846
  ),
8801
- /* @__PURE__ */ jsx44(Stack28, { children: regressed.map((featureName, i) => {
8847
+ /* @__PURE__ */ jsx44(Stack27, { children: regressed.map((featureName, i) => {
8802
8848
  const area = scores.find((s) => s.feature === featureName);
8803
8849
  const areaDelta = perArea?.[featureName];
8804
8850
  return /* @__PURE__ */ jsxs32(
@@ -8834,13 +8880,13 @@ function WeaknessesList({ scores, comparison }) {
8834
8880
  );
8835
8881
  }) })
8836
8882
  ] }),
8837
- efficiencyAnomalies.length > 0 && /* @__PURE__ */ jsxs32(Box22, { style: neutralCardStyle, children: [
8883
+ efficiencyAnomalies.length > 0 && /* @__PURE__ */ jsxs32(Box21, { style: neutralCardStyle, children: [
8838
8884
  /* @__PURE__ */ jsx44(
8839
- Box22,
8885
+ Box21,
8840
8886
  {
8841
8887
  padding: 4,
8842
8888
  style: { borderBottom: "1px solid var(--card-border-color)" },
8843
- children: /* @__PURE__ */ jsxs32(Stack28, { space: 2, children: [
8889
+ children: /* @__PURE__ */ jsxs32(Stack27, { space: 2, children: [
8844
8890
  /* @__PURE__ */ jsxs32(Flex25, { align: "center", gap: 2, children: [
8845
8891
  /* @__PURE__ */ jsx44(BoltIcon2, { style: { color: "#fbbf24" } }),
8846
8892
  /* @__PURE__ */ jsx44(Text33, { size: 2, weight: "medium", children: "Efficiency Anomalies (>100%)" }),
@@ -8850,7 +8896,7 @@ function WeaknessesList({ scores, comparison }) {
8850
8896
  ] })
8851
8897
  }
8852
8898
  ),
8853
- /* @__PURE__ */ jsx44(Stack28, { children: efficiencyAnomalies.map((area, i) => /* @__PURE__ */ jsxs32(
8899
+ /* @__PURE__ */ jsx44(Stack27, { children: efficiencyAnomalies.map((area, i) => /* @__PURE__ */ jsxs32(
8854
8900
  Flex25,
8855
8901
  {
8856
8902
  align: "center",
@@ -8878,7 +8924,7 @@ function WeaknessesList({ scores, comparison }) {
8878
8924
  ] })
8879
8925
  ] });
8880
8926
  }
8881
- var tipValue2 = {
8927
+ var tipValue = {
8882
8928
  color: "#f87171",
8883
8929
  fontFamily: "var(--font-code-size, monospace)",
8884
8930
  fontWeight: 600
@@ -8891,7 +8937,7 @@ function dimTip(area, dim, score, description) {
8891
8937
  /* @__PURE__ */ jsx44("span", { style: tipArea, children: area }),
8892
8938
  " scores",
8893
8939
  " ",
8894
- /* @__PURE__ */ jsx44("span", { style: tipValue2, children: score }),
8940
+ /* @__PURE__ */ jsx44("span", { style: tipValue, children: score }),
8895
8941
  /* @__PURE__ */ jsx44("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
8896
8942
  " on",
8897
8943
  " ",
@@ -8956,8 +9002,8 @@ function ReportDetail({
8956
9002
  subTab
8957
9003
  }) {
8958
9004
  const client = useClient10({ apiVersion: API_VERSION });
8959
- const [loading, setLoading] = useState18(true);
8960
- const [report, setReport] = useState18(null);
9005
+ const [loading, setLoading] = useState19(true);
9006
+ const [report, setReport] = useState19(null);
8961
9007
  useEffect9(() => {
8962
9008
  let cancelled = false;
8963
9009
  setLoading(true);
@@ -8983,22 +9029,22 @@ function ReportDetail({
8983
9029
  const hasAgentActivity = Boolean(
8984
9030
  summary?.agentBehavior && summary.agentBehavior.length > 0
8985
9031
  );
8986
- const tabs = useMemo7(
9032
+ const tabs = useMemo9(
8987
9033
  () => [OVERVIEW_TAB, DIAGNOSTICS_TAB, ACTIVITY_TAB],
8988
9034
  []
8989
9035
  );
8990
- const disabledTabs = useMemo7(() => {
9036
+ const disabledTabs = useMemo9(() => {
8991
9037
  const set2 = /* @__PURE__ */ new Set();
8992
9038
  if (!hasDiagnostics) set2.add("diagnostics");
8993
9039
  if (!hasAgentActivity) set2.add("activity");
8994
9040
  return set2;
8995
9041
  }, [hasDiagnostics, hasAgentActivity]);
8996
- const currentTab = useMemo7(() => {
9042
+ const currentTab = useMemo9(() => {
8997
9043
  const parsed = parseTab(activeTab);
8998
9044
  if (disabledTabs.has(parsed)) return "overview";
8999
9045
  return tabs.some((t) => t.id === parsed) ? parsed : "overview";
9000
9046
  }, [activeTab, disabledTabs, tabs]);
9001
- const handleTabClick = useCallback23(
9047
+ const handleTabClick = useCallback25(
9002
9048
  (tabId) => {
9003
9049
  onTabChange(tabId === "overview" ? null : tabId, null, null);
9004
9050
  },
@@ -9008,7 +9054,7 @@ function ReportDetail({
9008
9054
  return /* @__PURE__ */ jsx45(LoadingState, { message: "Loading report\u2026" });
9009
9055
  }
9010
9056
  if (!report || !summary) {
9011
- return /* @__PURE__ */ jsx45(Box23, { padding: 5, children: /* @__PURE__ */ jsxs33(Stack29, { space: 4, children: [
9057
+ return /* @__PURE__ */ jsx45(Box22, { padding: 5, children: /* @__PURE__ */ jsxs33(Stack28, { space: 4, children: [
9012
9058
  /* @__PURE__ */ jsx45(
9013
9059
  Button8,
9014
9060
  {
@@ -9023,7 +9069,7 @@ function ReportDetail({
9023
9069
  }
9024
9070
  const { comparison, provenance } = report;
9025
9071
  const totalTests = summary.scores.reduce((n, s) => n + s.testCount, 0);
9026
- return /* @__PURE__ */ jsx45(Box23, { padding: 4, children: /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
9072
+ return /* @__PURE__ */ jsx45(Box22, { padding: 4, children: /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
9027
9073
  /* @__PURE__ */ jsx45(
9028
9074
  ReportHeader,
9029
9075
  {
@@ -9051,7 +9097,7 @@ function ReportDetail({
9051
9097
  return isDisabled && tooltip ? /* @__PURE__ */ jsx45(
9052
9098
  Tooltip8,
9053
9099
  {
9054
- content: /* @__PURE__ */ jsx45(Box23, { padding: 2, style: { maxWidth: 280 }, children: tooltip }),
9100
+ content: /* @__PURE__ */ jsx45(Box22, { padding: 2, style: { maxWidth: 280 }, children: tooltip }),
9055
9101
  placement: "bottom",
9056
9102
  portal: true,
9057
9103
  children: /* @__PURE__ */ jsx45("span", { style: { display: "inline-block" }, children: tabElement })
@@ -9085,7 +9131,7 @@ function ReportDetail({
9085
9131
  "aria-labelledby": "tab-overview",
9086
9132
  hidden: currentTab !== "overview",
9087
9133
  id: "panel-overview",
9088
- children: /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
9134
+ children: /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
9089
9135
  /* @__PURE__ */ jsx45(
9090
9136
  DiagnosticsOverview,
9091
9137
  {
@@ -9108,6 +9154,7 @@ function ReportDetail({
9108
9154
  focus,
9109
9155
  judgments: summary.lowScoringJudgments,
9110
9156
  onNavigate: (newSubTab, newFocus) => onTabChange("diagnostics", newSubTab, newFocus),
9157
+ perModel: summary.perModel,
9111
9158
  recommendations: summary.recommendations,
9112
9159
  scores: summary.scores,
9113
9160
  subTab
@@ -9143,6 +9190,7 @@ function DiagnosticsPanel({
9143
9190
  focus,
9144
9191
  judgments,
9145
9192
  onNavigate,
9193
+ perModel,
9146
9194
  recommendations,
9147
9195
  scores,
9148
9196
  subTab: subTabParam
@@ -9151,7 +9199,7 @@ function DiagnosticsPanel({
9151
9199
  const issueCount = scores.filter((s) => s.totalScore < SCORE_CAUTION).length + scores.filter((s) => s.negativeDocLift).length + scores.filter(
9152
9200
  (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency < EFFICIENCY_CAUTION && !s.invertedRetrievalGap
9153
9201
  ).length;
9154
- return /* @__PURE__ */ jsx45(TabPanel, { "aria-labelledby": "tab-diagnostics", id: "panel-diagnostics", children: /* @__PURE__ */ jsxs33(Stack29, { space: 4, children: [
9202
+ return /* @__PURE__ */ jsx45(TabPanel, { "aria-labelledby": "tab-diagnostics", id: "panel-diagnostics", children: /* @__PURE__ */ jsxs33(Stack28, { space: 4, children: [
9155
9203
  /* @__PURE__ */ jsx45(
9156
9204
  Flex26,
9157
9205
  {
@@ -9202,9 +9250,23 @@ function DiagnosticsPanel({
9202
9250
  ))
9203
9251
  }
9204
9252
  ),
9205
- subTab === "strengths" && /* @__PURE__ */ jsx45(StrengthsList, { comparison, scores }),
9206
- subTab === "issues" && /* @__PURE__ */ jsxs33(Stack29, { space: 5, children: [
9207
- /* @__PURE__ */ jsx45(WeaknessesList, { comparison, scores }),
9253
+ subTab === "strengths" && /* @__PURE__ */ jsx45(
9254
+ StrengthsList,
9255
+ {
9256
+ comparison,
9257
+ perModel,
9258
+ scores
9259
+ }
9260
+ ),
9261
+ subTab === "issues" && /* @__PURE__ */ jsxs33(Stack28, { space: 5, children: [
9262
+ /* @__PURE__ */ jsx45(
9263
+ WeaknessesList,
9264
+ {
9265
+ comparison,
9266
+ perModel,
9267
+ scores
9268
+ }
9269
+ ),
9208
9270
  recommendations && recommendations.gaps.length > 0 && /* @__PURE__ */ jsx45(RecommendationsSection, { recommendations }),
9209
9271
  judgments && judgments.length > 0 && /* @__PURE__ */ jsx45(
9210
9272
  JudgmentList,
@@ -9256,9 +9318,14 @@ function getDisabledTabTooltip(tabId, summary) {
9256
9318
  }
9257
9319
  }
9258
9320
 
9321
+ // src/components/report-detail/AreaScoreRow.tsx
9322
+ import { WarningOutlineIcon as WarningOutlineIcon4 } from "@sanity/icons";
9323
+ import { Box as Box23, Flex as Flex27, Stack as Stack29, Text as Text35 } from "@sanity/ui";
9324
+ import { jsx as jsx46, jsxs as jsxs34 } from "react/jsx-runtime";
9325
+
9259
9326
  // src/components/report-detail/AreaScoreTable.tsx
9260
9327
  import React4 from "react";
9261
- import { Card as Card17, Stack as Stack30, Text as Text36 } from "@sanity/ui";
9328
+ import { Card as Card17, Stack as Stack30, Text as Text37 } from "@sanity/ui";
9262
9329
 
9263
9330
  // src/lib/scoring.ts
9264
9331
  var HEX_MAP = {
@@ -9275,30 +9342,30 @@ function scoreHex(score) {
9275
9342
  }
9276
9343
 
9277
9344
  // src/components/primitives/ScoreCell.tsx
9278
- import { Card as Card16, Text as Text35 } from "@sanity/ui";
9279
- import { jsx as jsx46 } from "react/jsx-runtime";
9345
+ import { Card as Card16, Text as Text36 } from "@sanity/ui";
9346
+ import { jsx as jsx47 } from "react/jsx-runtime";
9280
9347
 
9281
9348
  // src/components/report-detail/AreaScoreTable.tsx
9282
- import { jsx as jsx47, jsxs as jsxs34 } from "react/jsx-runtime";
9349
+ import { jsx as jsx48, jsxs as jsxs35 } from "react/jsx-runtime";
9283
9350
 
9284
9351
  // src/components/report-detail/ComparisonSummary.tsx
9285
- import { Badge as Badge8, Box as Box24, Card as Card18, Flex as Flex27, Grid as Grid4, Stack as Stack31, Text as Text37, Tooltip as Tooltip9 } from "@sanity/ui";
9286
- import { jsx as jsx48, jsxs as jsxs35 } from "react/jsx-runtime";
9352
+ import { Badge as Badge8, Box as Box24, Card as Card18, Flex as Flex28, Grid as Grid4, Stack as Stack31, Text as Text38, Tooltip as Tooltip9 } from "@sanity/ui";
9353
+ import { jsx as jsx49, jsxs as jsxs36 } from "react/jsx-runtime";
9287
9354
 
9288
9355
  // src/components/report-detail/OverviewStats.tsx
9289
9356
  import { Grid as Grid5 } from "@sanity/ui";
9290
- import { jsx as jsx49, jsxs as jsxs36 } from "react/jsx-runtime";
9357
+ import { jsx as jsx50, jsxs as jsxs37 } from "react/jsx-runtime";
9291
9358
 
9292
9359
  // src/components/report-detail/ThreeLayerTable.tsx
9293
9360
  import React5 from "react";
9294
- import { Badge as Badge9, Card as Card19, Flex as Flex28, Stack as Stack32, Text as Text38 } from "@sanity/ui";
9295
- import { jsx as jsx50, jsxs as jsxs37 } from "react/jsx-runtime";
9361
+ import { Badge as Badge9, Card as Card19, Flex as Flex29, Stack as Stack32, Text as Text39 } from "@sanity/ui";
9362
+ import { jsx as jsx51, jsxs as jsxs38 } from "react/jsx-runtime";
9296
9363
 
9297
9364
  // src/components/ScoreTimeline.tsx
9298
- import { Card as Card20, Flex as Flex29, Select as Select2, Stack as Stack33, Text as Text39 } from "@sanity/ui";
9299
- import { useCallback as useCallback24, useEffect as useEffect10, useMemo as useMemo8, useState as useState19 } from "react";
9365
+ import { Card as Card20, Flex as Flex30, Select as Select2, Stack as Stack33, Text as Text40 } from "@sanity/ui";
9366
+ import { useCallback as useCallback26, useEffect as useEffect10, useMemo as useMemo10, useState as useState20 } from "react";
9300
9367
  import { useClient as useClient11 } from "sanity";
9301
- import { jsx as jsx51, jsxs as jsxs38 } from "react/jsx-runtime";
9368
+ import { jsx as jsx52, jsxs as jsxs39 } from "react/jsx-runtime";
9302
9369
  var CHART_HEIGHT = 220;
9303
9370
  var CHART_WIDTH = 800;
9304
9371
  var PAD_BOTTOM = 30;
@@ -9333,11 +9400,11 @@ function scoreForPoint(point, area) {
9333
9400
  }
9334
9401
  function ScoreTimeline({ mode = null, source = null }) {
9335
9402
  const client = useClient11({ apiVersion: API_VERSION });
9336
- const [dataPoints, setDataPoints] = useState19([]);
9337
- const [loading, setLoading] = useState19(true);
9338
- const [rangeDays, setRangeDays] = useState19(30);
9339
- const [selectedArea, setSelectedArea] = useState19(null);
9340
- const areaNames = useMemo8(() => {
9403
+ const [dataPoints, setDataPoints] = useState20([]);
9404
+ const [loading, setLoading] = useState20(true);
9405
+ const [rangeDays, setRangeDays] = useState20(30);
9406
+ const [selectedArea, setSelectedArea] = useState20(null);
9407
+ const areaNames = useMemo10(() => {
9341
9408
  const names = /* @__PURE__ */ new Set();
9342
9409
  for (const dp of dataPoints) {
9343
9410
  for (const s of dp.scores) {
@@ -9346,7 +9413,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9346
9413
  }
9347
9414
  return Array.from(names).sort();
9348
9415
  }, [dataPoints]);
9349
- const fetchData = useCallback24(async () => {
9416
+ const fetchData = useCallback26(async () => {
9350
9417
  setLoading(true);
9351
9418
  try {
9352
9419
  const startDate = rangeDays ? daysAgo(rangeDays) : "1970-01-01T00:00:00Z";
@@ -9364,7 +9431,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9364
9431
  useEffect10(() => {
9365
9432
  void fetchData();
9366
9433
  }, [fetchData]);
9367
- const chartPoints = useMemo8(() => {
9434
+ const chartPoints = useMemo10(() => {
9368
9435
  const pts = [];
9369
9436
  const scored = dataPoints.map((dp) => ({
9370
9437
  date: dp.completedAt,
@@ -9378,18 +9445,18 @@ function ScoreTimeline({ mode = null, source = null }) {
9378
9445
  });
9379
9446
  return pts;
9380
9447
  }, [dataPoints, selectedArea]);
9381
- const avgScore = useMemo8(() => {
9448
+ const avgScore = useMemo10(() => {
9382
9449
  if (chartPoints.length === 0) return 0;
9383
9450
  return chartPoints.reduce((sum, p) => sum + p.score, 0) / chartPoints.length;
9384
9451
  }, [chartPoints]);
9385
- const handleRangeChange = useCallback24(
9452
+ const handleRangeChange = useCallback26(
9386
9453
  (e) => {
9387
9454
  const val = e.currentTarget.value;
9388
9455
  setRangeDays(val === "all" ? null : Number(val));
9389
9456
  },
9390
9457
  []
9391
9458
  );
9392
- const handleAreaChange = useCallback24(
9459
+ const handleAreaChange = useCallback26(
9393
9460
  (e) => {
9394
9461
  const val = e.currentTarget.value;
9395
9462
  setSelectedArea(val || null);
@@ -9397,22 +9464,22 @@ function ScoreTimeline({ mode = null, source = null }) {
9397
9464
  []
9398
9465
  );
9399
9466
  const polylinePoints = chartPoints.map((p) => `${p.x},${p.y}`).join(" ");
9400
- return /* @__PURE__ */ jsxs38(Stack33, { space: 4, children: [
9401
- /* @__PURE__ */ jsxs38(Flex29, { gap: 3, children: [
9402
- /* @__PURE__ */ jsx51(
9467
+ return /* @__PURE__ */ jsxs39(Stack33, { space: 4, children: [
9468
+ /* @__PURE__ */ jsxs39(Flex30, { gap: 3, children: [
9469
+ /* @__PURE__ */ jsx52(
9403
9470
  Select2,
9404
9471
  {
9405
9472
  onChange: handleRangeChange,
9406
9473
  value: rangeDays?.toString() ?? "all",
9407
- children: TIME_RANGES.map((r) => /* @__PURE__ */ jsx51("option", { value: r.days?.toString() ?? "all", children: r.label }, r.label))
9474
+ children: TIME_RANGES.map((r) => /* @__PURE__ */ jsx52("option", { value: r.days?.toString() ?? "all", children: r.label }, r.label))
9408
9475
  }
9409
9476
  ),
9410
- /* @__PURE__ */ jsxs38(Select2, { onChange: handleAreaChange, value: selectedArea ?? "", children: [
9411
- /* @__PURE__ */ jsx51("option", { value: "", children: "Overall" }),
9412
- areaNames.map((name) => /* @__PURE__ */ jsx51("option", { value: name, children: name }, name))
9477
+ /* @__PURE__ */ jsxs39(Select2, { onChange: handleAreaChange, value: selectedArea ?? "", children: [
9478
+ /* @__PURE__ */ jsx52("option", { value: "", children: "Overall" }),
9479
+ areaNames.map((name) => /* @__PURE__ */ jsx52("option", { value: name, children: name }, name))
9413
9480
  ] })
9414
9481
  ] }),
9415
- /* @__PURE__ */ jsx51(Card20, { padding: 3, radius: 2, shadow: 1, children: loading ? /* @__PURE__ */ jsx51(Flex29, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx51(Text39, { muted: true, size: 2, children: "Loading\u2026" }) }) : chartPoints.length === 0 ? /* @__PURE__ */ jsx51(Flex29, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx51(Text39, { muted: true, size: 2, children: "No reports found for this time range" }) }) : /* @__PURE__ */ jsxs38(
9482
+ /* @__PURE__ */ jsx52(Card20, { padding: 3, radius: 2, shadow: 1, children: loading ? /* @__PURE__ */ jsx52(Flex30, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "Loading\u2026" }) }) : chartPoints.length === 0 ? /* @__PURE__ */ jsx52(Flex30, { align: "center", justify: "center", style: { height: 200 }, children: /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "No reports found for this time range" }) }) : /* @__PURE__ */ jsxs39(
9416
9483
  "svg",
9417
9484
  {
9418
9485
  style: { display: "block", width: "100%" },
@@ -9420,8 +9487,8 @@ function ScoreTimeline({ mode = null, source = null }) {
9420
9487
  children: [
9421
9488
  Y_TICKS.map((tick) => {
9422
9489
  const y = PAD_TOP + PLOT_HEIGHT - tick / Y_MAX * PLOT_HEIGHT;
9423
- return /* @__PURE__ */ jsxs38("g", { children: [
9424
- /* @__PURE__ */ jsx51(
9490
+ return /* @__PURE__ */ jsxs39("g", { children: [
9491
+ /* @__PURE__ */ jsx52(
9425
9492
  "line",
9426
9493
  {
9427
9494
  stroke: "#ccc",
@@ -9432,7 +9499,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9432
9499
  y2: y
9433
9500
  }
9434
9501
  ),
9435
- /* @__PURE__ */ jsx51(
9502
+ /* @__PURE__ */ jsx52(
9436
9503
  "text",
9437
9504
  {
9438
9505
  dominantBaseline: "middle",
@@ -9452,7 +9519,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9452
9519
  chartPoints.length - 1
9453
9520
  ].map((idx) => {
9454
9521
  const p = chartPoints[idx];
9455
- return /* @__PURE__ */ jsx51(
9522
+ return /* @__PURE__ */ jsx52(
9456
9523
  "text",
9457
9524
  {
9458
9525
  fill: "#999",
@@ -9464,7 +9531,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9464
9531
  },
9465
9532
  idx
9466
9533
  );
9467
- }) : chartPoints.map((p, idx) => /* @__PURE__ */ jsx51(
9534
+ }) : chartPoints.map((p, idx) => /* @__PURE__ */ jsx52(
9468
9535
  "text",
9469
9536
  {
9470
9537
  fill: "#999",
@@ -9476,7 +9543,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9476
9543
  },
9477
9544
  idx
9478
9545
  )),
9479
- /* @__PURE__ */ jsx51(
9546
+ /* @__PURE__ */ jsx52(
9480
9547
  "polyline",
9481
9548
  {
9482
9549
  fill: "none",
@@ -9486,7 +9553,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9486
9553
  strokeWidth: 2.5
9487
9554
  }
9488
9555
  ),
9489
- chartPoints.map((p, idx) => /* @__PURE__ */ jsx51(
9556
+ chartPoints.map((p, idx) => /* @__PURE__ */ jsx52(
9490
9557
  "circle",
9491
9558
  {
9492
9559
  cx: p.x,
@@ -9495,7 +9562,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9495
9562
  r: 4,
9496
9563
  stroke: "#fff",
9497
9564
  strokeWidth: 1.5,
9498
- children: /* @__PURE__ */ jsxs38("title", { children: [
9565
+ children: /* @__PURE__ */ jsxs39("title", { children: [
9499
9566
  formatDate(p.date),
9500
9567
  ": ",
9501
9568
  Math.round(p.score)
@@ -9506,7 +9573,7 @@ function ScoreTimeline({ mode = null, source = null }) {
9506
9573
  ]
9507
9574
  }
9508
9575
  ) }),
9509
- /* @__PURE__ */ jsxs38(Text39, { muted: true, size: 2, children: [
9576
+ /* @__PURE__ */ jsxs39(Text40, { muted: true, size: 2, children: [
9510
9577
  chartPoints.length,
9511
9578
  " data point",
9512
9579
  chartPoints.length !== 1 ? "s" : ""
@@ -9516,15 +9583,15 @@ function ScoreTimeline({ mode = null, source = null }) {
9516
9583
  var ScoreTimeline_default = ScoreTimeline;
9517
9584
 
9518
9585
  // src/components/Dashboard.tsx
9519
- import { jsx as jsx52, jsxs as jsxs39 } from "react/jsx-runtime";
9586
+ import { jsx as jsx53, jsxs as jsxs40 } from "react/jsx-runtime";
9520
9587
  var VIEW_PARAM_MAP = {
9521
9588
  compare: "compare",
9522
9589
  timeline: "timeline"
9523
9590
  };
9524
9591
  function Dashboard() {
9525
- return /* @__PURE__ */ jsx52(HelpProvider, { children: /* @__PURE__ */ jsxs39(Flex30, { style: { height: "100%" }, children: [
9526
- /* @__PURE__ */ jsx52(Box25, { flex: 1, overflow: "auto", children: /* @__PURE__ */ jsx52(DashboardContent, {}) }),
9527
- /* @__PURE__ */ jsx52(HelpDrawer, {})
9592
+ return /* @__PURE__ */ jsx53(HelpProvider, { children: /* @__PURE__ */ jsxs40(Flex31, { style: { height: "100%" }, children: [
9593
+ /* @__PURE__ */ jsx53(Box25, { flex: 1, overflow: "auto", children: /* @__PURE__ */ jsx53(DashboardContent, {}) }),
9594
+ /* @__PURE__ */ jsx53(HelpDrawer, {})
9528
9595
  ] }) });
9529
9596
  }
9530
9597
  function DashboardContent() {
@@ -9535,7 +9602,7 @@ function DashboardContent() {
9535
9602
  const isDetail = reportId !== null;
9536
9603
  const activeTab = isDetail ? "latest" : VIEW_PARAM_MAP[routerState.view ?? ""] ?? "latest";
9537
9604
  const defaultTopic = deriveHelpTopic(routerState);
9538
- const navigateToTab = useCallback25(
9605
+ const navigateToTab = useCallback27(
9539
9606
  (tab) => {
9540
9607
  if (tab === "latest") {
9541
9608
  router.navigate({});
@@ -9545,13 +9612,13 @@ function DashboardContent() {
9545
9612
  },
9546
9613
  [router]
9547
9614
  );
9548
- const handleSelectReport = useCallback25(
9615
+ const handleSelectReport = useCallback27(
9549
9616
  (id) => {
9550
9617
  router.navigate({ reportId: id });
9551
9618
  },
9552
9619
  [router]
9553
9620
  );
9554
- const handleTabChange = useCallback25(
9621
+ const handleTabChange = useCallback27(
9555
9622
  (tab, subTab, focus) => {
9556
9623
  if (!routerState.reportId) return;
9557
9624
  const state = {
@@ -9564,19 +9631,19 @@ function DashboardContent() {
9564
9631
  },
9565
9632
  [router, routerState.reportId]
9566
9633
  );
9567
- const handleBack = useCallback25(() => {
9634
+ const handleBack = useCallback27(() => {
9568
9635
  router.navigate({});
9569
9636
  }, [router]);
9570
- const handleOpenHelp = useCallback25(() => {
9637
+ const handleOpenHelp = useCallback27(() => {
9571
9638
  openHelp(defaultTopic);
9572
9639
  }, [openHelp, defaultTopic]);
9573
- return /* @__PURE__ */ jsx52(Container, { width: 4, children: /* @__PURE__ */ jsxs39(Stack34, { padding: 4, space: 4, children: [
9574
- /* @__PURE__ */ jsxs39(Flex30, { align: "center", gap: 3, children: [
9575
- /* @__PURE__ */ jsxs39(Stack34, { flex: 1, space: 1, children: [
9576
- /* @__PURE__ */ jsx52(Text40, { size: 4, weight: "bold", children: "AI Literacy Framework" }),
9577
- /* @__PURE__ */ jsx52(Text40, { muted: true, size: 2, children: "Evaluation reports and score trends" })
9640
+ return /* @__PURE__ */ jsx53(Container, { width: 4, children: /* @__PURE__ */ jsxs40(Stack34, { padding: 4, space: 4, children: [
9641
+ /* @__PURE__ */ jsxs40(Flex31, { align: "center", gap: 3, children: [
9642
+ /* @__PURE__ */ jsxs40(Stack34, { flex: 1, space: 1, children: [
9643
+ /* @__PURE__ */ jsx53(Text41, { size: 4, weight: "bold", children: "AI Literacy Framework" }),
9644
+ /* @__PURE__ */ jsx53(Text41, { muted: true, size: 2, children: "Evaluation reports and score trends" })
9578
9645
  ] }),
9579
- /* @__PURE__ */ jsx52(
9646
+ /* @__PURE__ */ jsx53(
9580
9647
  Button9,
9581
9648
  {
9582
9649
  icon: HelpCircleIcon8,
@@ -9587,8 +9654,8 @@ function DashboardContent() {
9587
9654
  }
9588
9655
  )
9589
9656
  ] }),
9590
- !isDetail && /* @__PURE__ */ jsxs39(TabList2, { space: 1, children: [
9591
- /* @__PURE__ */ jsx52(
9657
+ !isDetail && /* @__PURE__ */ jsxs40(TabList2, { space: 1, children: [
9658
+ /* @__PURE__ */ jsx53(
9592
9659
  Tab2,
9593
9660
  {
9594
9661
  "aria-controls": "latest-panel",
@@ -9598,7 +9665,7 @@ function DashboardContent() {
9598
9665
  selected: activeTab === "latest"
9599
9666
  }
9600
9667
  ),
9601
- /* @__PURE__ */ jsx52(
9668
+ /* @__PURE__ */ jsx53(
9602
9669
  Tab2,
9603
9670
  {
9604
9671
  "aria-controls": "timeline-panel",
@@ -9608,7 +9675,7 @@ function DashboardContent() {
9608
9675
  selected: activeTab === "timeline"
9609
9676
  }
9610
9677
  ),
9611
- /* @__PURE__ */ jsx52(
9678
+ /* @__PURE__ */ jsx53(
9612
9679
  Tab2,
9613
9680
  {
9614
9681
  "aria-controls": "compare-panel",
@@ -9619,10 +9686,10 @@ function DashboardContent() {
9619
9686
  }
9620
9687
  )
9621
9688
  ] }),
9622
- !isDetail && activeTab === "latest" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "latest-tab", id: "latest-panel", children: /* @__PURE__ */ jsx52(LatestReports, { onSelectReport: handleSelectReport }) }),
9623
- !isDetail && activeTab === "timeline" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "timeline-tab", id: "timeline-panel", children: /* @__PURE__ */ jsx52(ScoreTimeline_default, {}) }),
9624
- !isDetail && activeTab === "compare" && /* @__PURE__ */ jsx52(TabPanel2, { "aria-labelledby": "compare-tab", id: "compare-panel", children: /* @__PURE__ */ jsx52(ComparisonView, {}) }),
9625
- isDetail && reportId && /* @__PURE__ */ jsx52(
9689
+ !isDetail && activeTab === "latest" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "latest-tab", id: "latest-panel", children: /* @__PURE__ */ jsx53(LatestReports, { onSelectReport: handleSelectReport }) }),
9690
+ !isDetail && activeTab === "timeline" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "timeline-tab", id: "timeline-panel", children: /* @__PURE__ */ jsx53(ScoreTimeline_default, {}) }),
9691
+ !isDetail && activeTab === "compare" && /* @__PURE__ */ jsx53(TabPanel2, { "aria-labelledby": "compare-tab", id: "compare-panel", children: /* @__PURE__ */ jsx53(ComparisonView, {}) }),
9692
+ isDetail && reportId && /* @__PURE__ */ jsx53(
9626
9693
  ReportDetail,
9627
9694
  {
9628
9695
  activeTab: routerState.tab ?? null,
@@ -9658,7 +9725,7 @@ function ailfTool(options = {}) {
9658
9725
  // src/actions/RunEvaluationAction.tsx
9659
9726
  import { BarChartIcon as BarChartIcon2 } from "@sanity/icons";
9660
9727
  import { useToast as useToast10 } from "@sanity/ui";
9661
- import { useCallback as useCallback26, useEffect as useEffect11, useRef as useRef6, useState as useState20 } from "react";
9728
+ import { useCallback as useCallback28, useEffect as useEffect11, useRef as useRef6, useState as useState21 } from "react";
9662
9729
  import {
9663
9730
  getReleaseIdFromReleaseDocumentId as getReleaseIdFromReleaseDocumentId3,
9664
9731
  useClient as useClient12,
@@ -9689,7 +9756,7 @@ function createRunEvaluationAction(options = {}) {
9689
9756
  const projectId = useProjectId2();
9690
9757
  const currentUser = useCurrentUser4();
9691
9758
  const toast = useToast10();
9692
- const [state, setState] = useState20({ status: "loading" });
9759
+ const [state, setState] = useState21({ status: "loading" });
9693
9760
  const requestedAtRef = useRef6(null);
9694
9761
  const perspectiveId = getReleaseIdFromReleaseDocumentId3(release._id);
9695
9762
  useEffect11(() => {
@@ -9785,7 +9852,7 @@ function createRunEvaluationAction(options = {}) {
9785
9852
  }, 15e3);
9786
9853
  return () => clearTimeout(timer);
9787
9854
  }, [client, perspectiveId, state]);
9788
- const handleRequest = useCallback26(async () => {
9855
+ const handleRequest = useCallback28(async () => {
9789
9856
  const releaseTitle = release.metadata?.title ?? perspectiveId ?? "release";
9790
9857
  const tag = `release-${slugify3(releaseTitle)}-${dateStamp3()}`;
9791
9858
  const now = Date.now();