npm - @sanity/ailf-studio - Versions diffs - 1.0.0 → 1.1.1 - Mend

@sanity/ailf-studio 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED

@@ -466,6 +466,18 @@ interface OverallAgentBehaviorData {
     totalUniqueDocSlugs: number;
     totalUniqueSearchQueries: number;
 }
+/** Per-model score breakdown stored in summary.perModel */
+interface PerModelData {
+    modelId: string;
+    label: string;
+    overall: {
+        avgScore: number;
+        avgDocLift: number;
+        cost?: null | number;
+        testCount: number;
+    };
+    scores: ScoreItem[];
+}
 /** Summary data as stored in Sanity */
 interface SummaryData {
     /** Per-feature agent behavior data (only present when agentic mode ran) */
@@ -489,6 +501,8 @@ interface SummaryData {
     };
     /** Low-scoring grader judgments — the raw "red text" explaining failures */
     lowScoringJudgments: JudgmentData[] | null;
+    /** Per-model score breakdown (one entry per LLM model evaluated) */
+    perModel?: PerModelData[] | null;
     /** Gap analysis recommendations (when gap analysis was run) */
     recommendations: null | RecommendationsData;
     scores: ScoreItem[];
@@ -600,21 +614,21 @@ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
  * @see docs/ARCHITECTURE.md (scoring model)
  */
 declare const GLOSSARY: {
-    readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
-    readonly docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.";
+    readonly overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
+    readonly docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
     readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
     readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
     readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
-    readonly floor: "Score without any documentation. This tells you what the model already knows from its training data.";
+    readonly floor: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
     readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
     readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
     readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
     readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
     readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
-    readonly score: "Weighted score for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
+    readonly score: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%. The floor uses a different profile (Task × 60% + Code × 40%, no Doc Coverage).";
     readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
     readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
-    readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
+    readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
     readonly tests: "Number of test cases in this feature area.";
     readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
     readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
@@ -637,6 +651,7 @@ declare const GLOSSARY: {
     readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
     readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
     readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
+    readonly modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.";
     readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
     readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
     readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
@@ -659,7 +674,7 @@ declare const GLOSSARY: {
     readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
     readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
     readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
-    readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
+    readonly reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
     readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
     readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
     readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
@@ -967,4 +982,4 @@ declare function ailfTool(options?: AilfToolOptions): Tool;
  */
 declare const ailfPlugin: sanity.Plugin<void>;
-export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };
+export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };