@sanity/ailf-studio 1.0.0 → 1.1.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -466,6 +466,18 @@ interface OverallAgentBehaviorData {
466
466
  totalUniqueDocSlugs: number;
467
467
  totalUniqueSearchQueries: number;
468
468
  }
469
+ /** Per-model score breakdown stored in summary.perModel */
470
+ interface PerModelData {
471
+ modelId: string;
472
+ label: string;
473
+ overall: {
474
+ avgScore: number;
475
+ avgDocLift: number;
476
+ cost?: null | number;
477
+ testCount: number;
478
+ };
479
+ scores: ScoreItem[];
480
+ }
469
481
  /** Summary data as stored in Sanity */
470
482
  interface SummaryData {
471
483
  /** Per-feature agent behavior data (only present when agentic mode ran) */
@@ -489,6 +501,8 @@ interface SummaryData {
489
501
  };
490
502
  /** Low-scoring grader judgments — the raw "red text" explaining failures */
491
503
  lowScoringJudgments: JudgmentData[] | null;
504
+ /** Per-model score breakdown (one entry per LLM model evaluated) */
505
+ perModel?: PerModelData[] | null;
492
506
  /** Gap analysis recommendations (when gap analysis was run) */
493
507
  recommendations: null | RecommendationsData;
494
508
  scores: ScoreItem[];
@@ -600,21 +614,21 @@ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
600
614
  * @see docs/ARCHITECTURE.md (scoring model)
601
615
  */
602
616
  declare const GLOSSARY: {
603
- readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
604
- readonly docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.";
617
+ readonly overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
618
+ readonly docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
605
619
  readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
606
620
  readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
607
621
  readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
608
- readonly floor: "Score without any documentation. This tells you what the model already knows from its training data.";
622
+ readonly floor: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
609
623
  readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
610
624
  readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
611
625
  readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
612
626
  readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
613
627
  readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
614
- readonly score: "Weighted score for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
628
+ readonly score: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%. The floor uses a different profile (Task × 60% + Code × 40%, no Doc Coverage).";
615
629
  readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
616
630
  readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
617
- readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
631
+ readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
618
632
  readonly tests: "Number of test cases in this feature area.";
619
633
  readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
620
634
  readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
@@ -637,6 +651,7 @@ declare const GLOSSARY: {
637
651
  readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
638
652
  readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
639
653
  readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
654
+ readonly modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.";
640
655
  readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
641
656
  readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
642
657
  readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
@@ -659,7 +674,7 @@ declare const GLOSSARY: {
659
674
  readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
660
675
  readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
661
676
  readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
662
- readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
677
+ readonly reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
663
678
  readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
664
679
  readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
665
680
  readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
@@ -967,4 +982,4 @@ declare function ailfTool(options?: AilfToolOptions): Tool;
967
982
  */
968
983
  declare const ailfPlugin: sanity.Plugin<void>;
969
984
 
970
- export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };
985
+ export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };