@sanity/ailf-studio 0.1.27 → 0.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sanity.io
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/dist/index.d.ts CHANGED
@@ -249,6 +249,18 @@ declare const GLOSSARY: {
249
249
  readonly change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
250
250
  readonly lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.";
251
251
  readonly judgmentReason: "The grading model's natural language explanation of what went wrong.";
252
+ readonly healthStrong: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.";
253
+ readonly healthAttention: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.";
254
+ readonly healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
255
+ readonly negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
256
+ readonly weakAreas: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
257
+ readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.";
258
+ readonly retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
259
+ readonly dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
260
+ readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
261
+ readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
262
+ readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
263
+ readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
252
264
  readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
253
265
  readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
254
266
  readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
@@ -266,6 +278,23 @@ declare const GLOSSARY: {
266
278
  readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
267
279
  readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
268
280
  readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
281
+ readonly areaDelta: "Score change for this area compared to the previous evaluation run.";
282
+ readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
283
+ readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
284
+ readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
285
+ readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
286
+ readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
287
+ readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
288
+ readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
289
+ readonly modeFull: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
290
+ readonly modeAgentic: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
291
+ readonly modeObserved: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
292
+ readonly modeDebug: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
293
+ readonly triggerManual: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
294
+ readonly triggerCi: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
295
+ readonly triggerSchedule: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
296
+ readonly triggerWebhook: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
297
+ readonly triggerCrossRepo: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
269
298
  };
270
299
 
271
300
  /**
@@ -569,6 +598,8 @@ interface ComparisonData {
569
598
  docLift: number;
570
599
  overall: number;
571
600
  actualDelta?: number;
601
+ /** Per-area score deltas — e.g. `{ "GROQ": 5.2, "Functions": -2.1 }` */
602
+ perArea?: Record<string, number>;
572
603
  retrievalGapDelta?: number;
573
604
  infrastructureEfficiencyDelta?: number;
574
605
  };