@sanity/ailf-studio 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +22 -7
- package/dist/index.js +762 -695
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -466,6 +466,18 @@ interface OverallAgentBehaviorData {
|
|
|
466
466
|
totalUniqueDocSlugs: number;
|
|
467
467
|
totalUniqueSearchQueries: number;
|
|
468
468
|
}
|
|
469
|
+
/** Per-model score breakdown stored in summary.perModel */
|
|
470
|
+
interface PerModelData {
|
|
471
|
+
modelId: string;
|
|
472
|
+
label: string;
|
|
473
|
+
overall: {
|
|
474
|
+
avgScore: number;
|
|
475
|
+
avgDocLift: number;
|
|
476
|
+
cost?: null | number;
|
|
477
|
+
testCount: number;
|
|
478
|
+
};
|
|
479
|
+
scores: ScoreItem[];
|
|
480
|
+
}
|
|
469
481
|
/** Summary data as stored in Sanity */
|
|
470
482
|
interface SummaryData {
|
|
471
483
|
/** Per-feature agent behavior data (only present when agentic mode ran) */
|
|
@@ -489,6 +501,8 @@ interface SummaryData {
|
|
|
489
501
|
};
|
|
490
502
|
/** Low-scoring grader judgments — the raw "red text" explaining failures */
|
|
491
503
|
lowScoringJudgments: JudgmentData[] | null;
|
|
504
|
+
/** Per-model score breakdown (one entry per LLM model evaluated) */
|
|
505
|
+
perModel?: PerModelData[] | null;
|
|
492
506
|
/** Gap analysis recommendations (when gap analysis was run) */
|
|
493
507
|
recommendations: null | RecommendationsData;
|
|
494
508
|
scores: ScoreItem[];
|
|
@@ -600,21 +614,21 @@ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
|
|
|
600
614
|
* @see docs/ARCHITECTURE.md (scoring model)
|
|
601
615
|
*/
|
|
602
616
|
declare const GLOSSARY: {
|
|
603
|
-
readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
|
|
604
|
-
readonly docLift: "How much the docs help, compared to the model's training data alone.
|
|
617
|
+
readonly overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
|
|
618
|
+
readonly docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
|
|
605
619
|
readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
|
|
606
620
|
readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
|
|
607
621
|
readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
|
|
608
|
-
readonly floor: "
|
|
622
|
+
readonly floor: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
|
|
609
623
|
readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
|
|
610
624
|
readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
|
|
611
625
|
readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
|
|
612
626
|
readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
|
|
613
627
|
readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
|
|
614
|
-
readonly score: "
|
|
628
|
+
readonly score: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%. The floor uses a different profile (Task × 60% + Code × 40%, no Doc Coverage).";
|
|
615
629
|
readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
|
|
616
630
|
readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
|
|
617
|
-
readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
|
|
631
|
+
readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
|
|
618
632
|
readonly tests: "Number of test cases in this feature area.";
|
|
619
633
|
readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
|
|
620
634
|
readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
|
|
@@ -637,6 +651,7 @@ declare const GLOSSARY: {
|
|
|
637
651
|
readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
|
|
638
652
|
readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
|
|
639
653
|
readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
|
|
654
|
+
readonly modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.";
|
|
640
655
|
readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
|
|
641
656
|
readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
|
|
642
657
|
readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
|
|
@@ -659,7 +674,7 @@ declare const GLOSSARY: {
|
|
|
659
674
|
readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
|
|
660
675
|
readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
|
|
661
676
|
readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
|
|
662
|
-
readonly reportScore: "The overall
|
|
677
|
+
readonly reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
|
|
663
678
|
readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
|
|
664
679
|
readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
|
|
665
680
|
readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
|
|
@@ -967,4 +982,4 @@ declare function ailfTool(options?: AilfToolOptions): Tool;
|
|
|
967
982
|
*/
|
|
968
983
|
declare const ailfPlugin: sanity.Plugin<void>;
|
|
969
984
|
|
|
970
|
-
export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };
|
|
985
|
+
export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };
|