@sanity/ailf-studio 0.2.0 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/index.d.ts +19 -0
  2. package/dist/index.js +1414 -1242
  3. package/package.json +1 -1
package/dist/index.d.ts CHANGED
@@ -278,6 +278,23 @@ declare const GLOSSARY: {
278
278
  readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
279
279
  readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
280
280
  readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
281
+ readonly areaDelta: "Score change for this area compared to the previous evaluation run.";
282
+ readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
283
+ readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
284
+ readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
285
+ readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
286
+ readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
287
+ readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
288
+ readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
289
+ readonly modeFull: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
290
+ readonly modeAgentic: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
291
+ readonly modeObserved: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
292
+ readonly modeDebug: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
293
+ readonly triggerManual: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
294
+ readonly triggerCi: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
295
+ readonly triggerSchedule: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
296
+ readonly triggerWebhook: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
297
+ readonly triggerCrossRepo: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
281
298
  };
282
299
 
283
300
  /**
@@ -581,6 +598,8 @@ interface ComparisonData {
581
598
  docLift: number;
582
599
  overall: number;
583
600
  actualDelta?: number;
601
+ /** Per-area score deltas — e.g. `{ "GROQ": 5.2, "Functions": -2.1 }` */
602
+ perArea?: Record<string, number>;
584
603
  retrievalGapDelta?: number;
585
604
  infrastructureEfficiencyDelta?: number;
586
605
  };