@sanity/ailf-studio 1.15.0 → 1.16.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -622,6 +622,13 @@ interface RecommendationsData {
622
622
  */
623
623
  interface StoredTestResultData {
624
624
  area: string;
625
+ /**
626
+ * Documentation context the task expected the model to use.
627
+ * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
628
+ * Studio reader components fall back via `??`.
629
+ */
630
+ contextDocs?: DocumentRef[];
631
+ /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
625
632
  canonicalDocs?: DocumentRef[];
626
633
  compositeScore?: number;
627
634
  cost?: number;
@@ -646,7 +653,13 @@ interface StoredTestResultData {
646
653
  }
647
654
  /** A single low-scoring grader judgment stored in reports */
648
655
  interface JudgmentData {
649
- /** Docs the task expected the model to use */
656
+ /**
657
+ * Documentation context the task expected the model to use.
658
+ * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
659
+ * Studio reader components fall back via `??`.
660
+ */
661
+ contextDocs?: DocumentRef[];
662
+ /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
650
663
  canonicalDocs?: DocumentRef[];
651
664
  dimension: string;
652
665
  /**
@@ -926,9 +939,9 @@ declare const GLOSSARY: {
926
939
  readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
927
940
  readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
928
941
  readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
929
- readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
942
+ readonly failureMode: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).";
930
943
  readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
931
- readonly confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.";
944
+ readonly confidence: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.";
932
945
  readonly agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
933
946
  readonly searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
934
947
  readonly docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
package/dist/index.js CHANGED
@@ -1525,16 +1525,8 @@ var reportSchema = defineType4({
1525
1525
  }),
1526
1526
  defineField4({
1527
1527
  name: "failureMode",
1528
- options: {
1529
- list: [
1530
- "missing-docs",
1531
- "incorrect-docs",
1532
- "outdated-docs",
1533
- "poor-structure",
1534
- "model-limitation",
1535
- "unclassified"
1536
- ]
1537
- },
1528
+ description: "Per-dimension failure-mode tag \u2014 taxonomy lives in @sanity/ailf packages/eval/src/grader/ and is enumerated by the runtime grader prompt. The Studio dropdown is intentionally unconstrained so taxonomy edits don't require Studio schema PRs (Plan 03-02).",
1529
+ readOnly: true,
1538
1530
  title: "Failure Mode",
1539
1531
  type: "string"
1540
1532
  }),
@@ -1600,14 +1592,8 @@ var reportSchema = defineType4({
1600
1592
  type: "string"
1601
1593
  }),
1602
1594
  defineField4({
1595
+ description: "Free-form dimension slug \u2014 per-dimension taxonomies (Plan 03-02) ship kebab-case values beyond the closed literacy triple (e.g. factual-correctness, assertion-pass-rate, input-validation).",
1603
1596
  name: "dimension",
1604
- options: {
1605
- list: [
1606
- "task-completion",
1607
- "code-correctness",
1608
- "doc-coverage"
1609
- ]
1610
- },
1611
1597
  title: "Dimension",
1612
1598
  type: "string"
1613
1599
  }),
@@ -2326,8 +2312,17 @@ import {
2326
2312
  } from "sanity";
2327
2313
  import { jsx as jsx5, jsxs as jsxs5 } from "react/jsx-runtime";
2328
2314
  function CanonicalDocPreview(props) {
2329
- const { perspective, perspectiveTitle, reason, refType, slug, path } = props;
2315
+ const {
2316
+ docType,
2317
+ perspective,
2318
+ perspectiveTitle,
2319
+ reason,
2320
+ refType,
2321
+ slug,
2322
+ path
2323
+ } = props;
2330
2324
  const isPerspective = refType === "perspective";
2325
+ const isNonArticleDoc = !isPerspective && typeof docType === "string" && docType !== "article";
2331
2326
  const { data: activeReleases } = useActiveReleases();
2332
2327
  const { data: archivedReleases } = useArchivedReleases();
2333
2328
  const resolvedTitle = useMemo(() => {
@@ -2374,7 +2369,14 @@ function CanonicalDocPreview(props) {
2374
2369
  return /* @__PURE__ */ jsxs5(Flex3, { align: "center", gap: 2, padding: 2, children: [
2375
2370
  /* @__PURE__ */ jsx5(Text5, { size: 2, children: icon }),
2376
2371
  /* @__PURE__ */ jsxs5(Flex3, { direction: "column", gap: 1, flex: 1, children: [
2377
- /* @__PURE__ */ jsx5(Text5, { size: 2, weight: "semibold", children: resolvedTitle }),
2372
+ /* @__PURE__ */ jsxs5(Flex3, { align: "center", gap: 2, children: [
2373
+ /* @__PURE__ */ jsx5(Text5, { size: 2, weight: "semibold", children: resolvedTitle }),
2374
+ isNonArticleDoc && /* @__PURE__ */ jsxs5(Text5, { muted: true, size: 0, children: [
2375
+ "(",
2376
+ docType,
2377
+ ")"
2378
+ ] })
2379
+ ] }),
2378
2380
  subtitle && /* @__PURE__ */ jsx5(Text5, { muted: true, size: 1, children: subtitle })
2379
2381
  ] })
2380
2382
  ] });
@@ -2839,6 +2841,17 @@ var taskSchema = defineType5({
2839
2841
  select: { subtitle: "slug.current", title: "title" }
2840
2842
  },
2841
2843
  type: "article"
2844
+ },
2845
+ // typesReference — SDK / API type-definition documents.
2846
+ // The pipeline resolves these via the renderer registry
2847
+ // (see packages/eval/src/sanity/document-renderers.ts);
2848
+ // adding more curated types here is a one-line schema
2849
+ // change with no other code path required (W0195).
2850
+ {
2851
+ preview: {
2852
+ select: { subtitle: "slug.current", title: "title" }
2853
+ },
2854
+ type: "typesReference"
2842
2855
  }
2843
2856
  ],
2844
2857
  type: "crossDatasetReference",
@@ -2897,6 +2910,7 @@ var taskSchema = defineType5({
2897
2910
  name: "canonicalDocRef",
2898
2911
  preview: {
2899
2912
  select: {
2913
+ docType: "doc._type",
2900
2914
  path: "path",
2901
2915
  perspective: "perspective",
2902
2916
  perspectiveTitle: "perspectiveTitle",
@@ -2961,10 +2975,37 @@ var taskSchema = defineType5({
2961
2975
  type: "string"
2962
2976
  }),
2963
2977
  defineField5({
2964
- description: "Task-specific criteria bullets (only for llm-rubric type). Each item describes a specific aspect the grader should check.",
2978
+ description: "Task-specific criteria bullets (only for llm-rubric type). Each criterion has a stable id + text. The id auto-derives from text on first save.",
2965
2979
  hidden: ({ parent }) => parent?.type !== "llm-rubric",
2966
2980
  name: "criteria",
2967
- of: [{ type: "string" }],
2981
+ of: [
2982
+ {
2983
+ name: "criterion",
2984
+ type: "object",
2985
+ fields: [
2986
+ defineField5({
2987
+ name: "id",
2988
+ title: "Criterion ID",
2989
+ type: "slug",
2990
+ options: { source: "text", maxLength: 60 },
2991
+ validation: (rule) => rule.required().custom((value) => {
2992
+ if (!value?.current) return "ID is required";
2993
+ return /^[a-z0-9][a-z0-9-]*$/.test(value.current) || "Must be lowercase alphanumeric with hyphens";
2994
+ })
2995
+ }),
2996
+ defineField5({
2997
+ name: "text",
2998
+ title: "Criterion Text",
2999
+ type: "text",
3000
+ rows: 2,
3001
+ validation: (rule) => rule.required()
3002
+ })
3003
+ ],
3004
+ preview: {
3005
+ select: { title: "text", subtitle: "id.current" }
3006
+ }
3007
+ }
3008
+ ],
2968
3009
  title: "Criteria",
2969
3010
  type: "array"
2970
3011
  }),
@@ -3871,7 +3912,7 @@ Click into any report for the full breakdown: per-area scores, diagnostics, and
3871
3912
  {
3872
3913
  "id": "glossary",
3873
3914
  "title": "Glossary",
3874
- "body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. 
Low = weak signals only.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
3915
+ "body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). 
High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. 
Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. 
Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
3875
3916
  "source": "packages/studio/src/glossary.ts",
3876
3917
  "tags": [
3877
3918
  "reference",
@@ -3970,9 +4011,9 @@ var GLOSSARY = {
3970
4011
  // -- Recommendations / gap analysis ----------------------------------------
3971
4012
  recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.",
3972
4013
  totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.",
3973
- failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).",
4014
+ failureMode: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).",
3974
4015
  estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.",
3975
- confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.",
4016
+ confidence: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.",
3976
4017
  // -- Agent behavior --------------------------------------------------------
3977
4018
  agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.",
3978
4019
  searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.",
@@ -4088,6 +4129,20 @@ function getDimensionValue(score, key) {
4088
4129
  }
4089
4130
 
4090
4131
  // src/lib/comparison.ts
4132
/**
 * Coerce a comparison's `perArea` deltas into a uniform array of
 * `{ area, delta }` entries so callers can always use array methods on it.
 *
 * Two input shapes are accepted:
 *  - an array of entries: only object elements with a string `area` and a
 *    finite numeric `delta` are kept (the original element objects are
 *    returned, not copies);
 *  - an object map of `area -> delta`: each own enumerable key with a finite
 *    numeric value becomes an `{ area, delta }` entry.
 *
 * @param perArea Raw per-area delta data; may be null/undefined or malformed.
 * @returns An array of `{ area, delta }` entries. Always an array, and empty
 *   when the input is null, undefined, or neither an array nor an object.
 */
function normalizePerArea(perArea) {
  if (perArea == null) return [];
  if (Array.isArray(perArea)) {
    // Number.isFinite is false for non-numbers as well as NaN / ±Infinity,
    // so it rejects every value that cannot be used as a score delta.
    return perArea.filter(
      (e) => typeof e === "object" && e !== null && typeof e.area === "string" && Number.isFinite(e.delta)
    );
  }
  if (typeof perArea === "object") {
    return Object.entries(perArea)
      .filter((entry) => Number.isFinite(entry[1]))
      .map(([area, delta]) => ({ area, delta }));
  }
  // Primitives (strings, numbers, booleans, ...) carry no per-area data.
  return [];
}
4091
4146
  function scoreMap(summary) {
4092
4147
  return new Map(summary.scores.map((s) => [s.feature, s]));
4093
4148
  }
@@ -9431,7 +9486,8 @@ function matchesQuery(j, query) {
9431
9486
  if (j.taskId.toLowerCase().includes(q)) return true;
9432
9487
  if (j.reason.toLowerCase().includes(q)) return true;
9433
9488
  if (j.modelId.toLowerCase().includes(q)) return true;
9434
- return j.canonicalDocs?.some(
9489
+ const docs = j.contextDocs ?? j.canonicalDocs;
9490
+ return docs?.some(
9435
9491
  (d) => (d.title ?? d.slug).toLowerCase().includes(q) || d.slug.toLowerCase().includes(q)
9436
9492
  ) ?? false;
9437
9493
  }
@@ -12058,7 +12114,7 @@ function StrengthsList({
12058
12114
  AreaScoresGrid,
12059
12115
  {
12060
12116
  mode,
12061
- perArea: comparison?.deltas?.perArea,
12117
+ perArea: normalizePerArea(comparison?.deltas?.perArea),
12062
12118
  perModel: expandedPerModel,
12063
12119
  scores: displayedScores
12064
12120
  }
@@ -12145,7 +12201,7 @@ function WeaknessesList({
12145
12201
  );
12146
12202
  const dimWeaknesses = scores.map((s) => ({ area: s, dims: getDimensionWeaknesses(s) })).filter(({ dims }) => dims.length > 0);
12147
12203
  const regressed = comparison?.regressed ?? [];
12148
- const perArea = comparison?.deltas?.perArea;
12204
+ const perArea = normalizePerArea(comparison?.deltas?.perArea);
12149
12205
  const efficiencyAnomalies = scores.filter(
12150
12206
  (s) => s.infrastructureEfficiency != null && s.infrastructureEfficiency > EFFICIENCY_ANOMALY
12151
12207
  );
@@ -12358,7 +12414,7 @@ function WeaknessesList({
12358
12414
  ),
12359
12415
  /* @__PURE__ */ jsx56(Stack31, { children: regressed.map((featureName, i) => {
12360
12416
  const area = scores.find((s) => s.feature === featureName);
12361
- const areaDelta = perArea?.find(
12417
+ const areaDelta = perArea.find(
12362
12418
  (p) => p.area === featureName
12363
12419
  )?.delta;
12364
12420
  return /* @__PURE__ */ jsxs40(
@@ -13300,10 +13356,13 @@ function JudgmentDetailDrawer({
13300
13356
  }
13301
13357
  )
13302
13358
  ] }),
13303
- judgment.canonicalDocs && judgment.canonicalDocs.length > 0 && /* @__PURE__ */ jsx65(Box33, { padding: 4, style: RELATED_DOCS_STYLE, children: /* @__PURE__ */ jsxs48(Flex35, { align: "center", gap: 2, wrap: "wrap", children: [
13304
- /* @__PURE__ */ jsx65("span", { style: SECTION_LABEL_STYLE, children: "Related docs" }),
13305
- judgment.canonicalDocs.map((doc) => /* @__PURE__ */ jsx65(DocBadge, { doc }, doc.slug))
13306
- ] }) })
13359
+ (() => {
13360
+ const docs = judgment.contextDocs ?? judgment.canonicalDocs;
13361
+ return docs && docs.length > 0 ? /* @__PURE__ */ jsx65(Box33, { padding: 4, style: RELATED_DOCS_STYLE, children: /* @__PURE__ */ jsxs48(Flex35, { align: "center", gap: 2, wrap: "wrap", children: [
13362
+ /* @__PURE__ */ jsx65("span", { style: SECTION_LABEL_STYLE, children: "Related docs" }),
13363
+ docs.map((doc) => /* @__PURE__ */ jsx65(DocBadge, { doc }, doc.slug))
13364
+ ] }) }) : null;
13365
+ })()
13307
13366
  ]
13308
13367
  }
13309
13368
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf-studio",
3
- "version": "1.15.0",
3
+ "version": "1.16.0",
4
4
  "description": "AI Literacy Framework — Sanity Studio dashboard plugin",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -71,8 +71,7 @@
71
71
  "prebuild": "pnpm extract-help",
72
72
  "build": "tsup",
73
73
  "dev": "tsup --watch",
74
- "test": "tsx --test src/__tests__/**/*.test.ts && vitest run",
75
- "test:vitest": "vitest run",
76
- "test:vitest:watch": "vitest"
74
+ "test": "vitest run",
75
+ "test:watch": "vitest"
77
76
  }
78
77
  }