@sanity/ailf-studio 1.19.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/dist/index.d.ts +580 -672
  2. package/dist/index.js +792 -562
  3. package/package.json +2 -3
package/dist/index.d.ts CHANGED
@@ -4,6 +4,8 @@ import { StructureResolver, StructureBuilder } from 'sanity/structure';
4
4
  import * as react_jsx_runtime from 'react/jsx-runtime';
5
5
  import * as react from 'react';
6
6
  import { ReactNode } from 'react';
7
+ import { HelpTopic } from './help-topics.js';
8
+ export { HelpTopic } from './help-topics.js';
7
9
  import { DocumentRef } from './document-ref.js';
8
10
 
9
11
  /**
@@ -307,656 +309,59 @@ declare function HelpProvider({ children, defaultTopicId }: HelpProviderProps):
307
309
  declare function useHelp(): HelpContextValue;
308
310
 
309
311
  /**
310
- * types.ts
311
- *
312
- * Shared types for the AILF Studio dashboard plugin.
312
+ * lib/help-topics.ts
313
313
  *
314
- * These mirror the shapes returned by the GROQ queries in queries.ts.
315
- * They're kept separate from the eval package types to avoid a build
316
- * dependency — the Studio plugin reads from Sanity directly.
314
+ * Utility functions for help topic lookup and context-aware topic
315
+ * derivation. Used by the HelpDrawer and HelpContext to determine
316
+ * which topic to show based on the current router state.
317
317
  *
318
- * Cross-package contract types (DocumentRef, ScoreGrade, scoreGrade) are
319
- * imported from @sanity/ailf-shared — the single source of truth.
318
+ * @see docs/design-docs/contextual-help-sidebar.md
320
319
  */
321
320
 
322
- /** Comparison data as stored in Sanity */
323
- interface ComparisonData {
324
- deltas: {
325
- docLift: number;
326
- overall: number;
327
- actualDelta?: number;
328
- /** Per-area score deltas — e.g. `[{ area: "GROQ", delta: 5.2 }, …]`
329
- *
330
- * Reshaped from a keyed map to an array in W0137 / D0041 to stop minting
331
- * a new Sanity attribute path for every feature area. Use
332
- * `perArea.find((p) => p.area === name)?.delta` to look up a value. */
333
- perArea?: {
334
- area: string;
335
- delta: number;
336
- }[];
337
- retrievalGapDelta?: number;
338
- infrastructureEfficiencyDelta?: number;
339
- };
340
- generatedAt: string;
341
- improved: string[];
342
- noiseThreshold: number;
343
- notEvaluated: string[];
344
- regressed: string[];
345
- unchanged: string[];
346
- }
347
- /** Shape returned by contentImpactQuery and recentDocumentEvalsQuery */
348
- interface ContentImpactItem {
349
- _id: string;
350
- areas: null | string[];
351
- comparisonDelta: null | number;
352
- completedAt: string;
353
- durationMs: number;
354
- improved: null | string[];
355
- mode: string;
356
- models: null | string[];
357
- overall: number;
358
- perspective: null | string;
359
- regressed: null | string[];
360
- reportId: string;
361
- scores: null | {
362
- actualScore?: number;
363
- docLift: number;
364
- feature: string;
365
- totalScore: number;
366
- }[];
367
- source: string;
368
- tag: null | string;
369
- targetDocuments: null | string[];
370
- title: null | string;
371
- trigger: null | string;
372
- }
373
- /** Provenance data as stored in Sanity */
374
- interface ProvenanceData {
375
- areas: string[];
376
- /** How this run is treated for reporting and trend tracking (D0037). */
377
- classification?: string;
378
- contextHash?: string;
379
- /** Who/what actually invoked the run (D0037). */
380
- executor?: {
381
- email?: string;
382
- githubActor?: string;
383
- name?: string;
384
- runId?: string;
385
- surface?: string;
386
- type?: string;
387
- workflow?: string;
388
- };
389
- git?: {
390
- branch: string;
391
- prNumber?: number;
392
- repo: string;
393
- sha: string;
394
- };
395
- graderModel: string;
396
- /** Platform and CI-provider metadata (D0037). */
397
- host?: {
398
- arch?: string;
399
- ci?: string;
400
- platform?: string;
401
- };
402
- /** Free-form searchable tags (D0037). */
403
- labels?: string[];
404
- lineage?: {
405
- comparedAgainst?: string;
406
- parentJobId?: string;
407
- rerunOf?: string;
408
- };
409
- mode: string;
410
- models: {
411
- id: string;
412
- label: string;
413
- }[];
414
- /** Team and (optionally) individual this run is attributable to (D0037). */
415
- owner?: {
416
- individual?: string;
417
- team?: string;
418
- };
419
- /** Identity of the pipeline run that produced this report (D0032) */
420
- runId: string;
421
- /** @deprecated Use `promptfooUrls` when available */
422
- promptfooUrl?: string;
423
- /** Per-mode Promptfoo share URLs (one per sub-eval) */
424
- promptfooUrls?: {
425
- mode: string;
426
- url: string;
427
- }[];
428
- /** Human-authored "why I ran this" (D0037). */
429
- purpose?: string;
430
- source: {
431
- baseUrl: string;
432
- dataset?: string;
433
- name: string;
434
- perspective?: string;
435
- projectId?: string;
436
- };
437
- targetDocuments?: string[];
438
- taskIds?: string[];
439
- /** AILF/Node version metadata (D0037). */
440
- tool?: {
441
- ailfVersion?: string;
442
- nodeVersion?: string;
443
- };
444
- trigger: {
445
- callerRef?: string;
446
- callerRepo?: string;
447
- documentId?: string;
448
- runId?: string;
449
- schedule?: string;
450
- source?: string;
451
- type: string;
452
- workflow?: string;
453
- };
454
- }
455
- /** Shape returned by reportDetailQuery */
456
- interface ReportDetail {
457
- _id: string;
458
- comparison: ComparisonData | null;
459
- completedAt: string;
460
- durationMs: number;
461
- provenance: ProvenanceData;
462
- reportId: string;
463
- summary: SummaryData;
464
- tag: null | string;
465
- title: null | string;
466
- }
467
- /** Shape returned by latestReportsQuery */
468
- interface ReportListItem {
469
- _id: string;
470
- actualScore?: number | null;
471
- areas: string[];
472
- /** Run classification (D0037) — projected from provenance.classification. */
473
- classification?: null | string;
474
- comparisonDelta: null | number;
475
- completedAt: string;
476
- docLift: number;
477
- durationMs: number;
478
- evaluationMode?: string | null;
479
- /** Executor identity name (D0037). */
480
- executorName?: null | string;
481
- /** Origin surface of the executor (cli/studio/api) — D0037. */
482
- executorSurface?: null | string;
483
- /** Executor discriminator (user/system) — D0037. */
484
- executorType?: null | string;
485
- git: null | {
486
- branch: string;
487
- prNumber?: number;
488
- repo: string;
489
- sha: string;
490
- };
491
- improved: null | string[];
492
- /** Free-form labels (D0037). */
493
- labels?: null | string[];
494
- mode: string;
495
- models: string[];
496
- overall: number;
497
- /** Individual attributable for the run (D0037). */
498
- ownerIndividual?: null | string;
499
- /** Owner team slug (D0037). */
500
- ownerTeam?: null | string;
501
- /** Content release perspective ID (when evaluated with --sanity-perspective) */
502
- perspective?: null | string;
503
- promptfooUrl: null | string;
504
- promptfooUrls: null | {
505
- mode: string;
506
- url: string;
507
- }[];
508
- regressed: null | string[];
509
- reportId: string;
510
- retrievalGap?: number | null;
511
- scores: ScoreItem[];
512
- source: string;
513
- tag: null | string;
514
- title: null | string;
515
- /** Target document slugs (when evaluated with --changed-docs) */
516
- targetDocuments?: null | string[];
517
- trigger: string;
518
- }
519
- /** Per-area score (shared between list and detail views) */
520
- interface ScoreItem {
521
- codeCorrectness: number;
522
- docCoverage: number;
523
- /**
524
- * Generic dimension scores map — all dimensions by camelCase key (0–100).
525
- *
526
- * Non-literacy modes (agent-harness, mcp-server) store their actual
527
- * dimensions here (e.g., agentOutput, toolUsage). Literacy mode may
528
- * also populate this alongside the three legacy named fields above.
529
- *
530
- * UI components should read from this map via `resolveDimensions()` in
531
- * `lib/dimensions.ts` rather than hardcoding the three named fields.
532
- */
533
- dimensions?: Record<string, number>;
534
- docLift: number;
535
- /** Sanity documents used for this feature area's evaluation */
536
- documents?: DocumentRef[];
537
- feature: string;
538
- /** Grouping strategy — "task" for agent-harness, "feature" for literacy */
539
- groupType?: "aggregate" | "feature" | "task";
540
- /** True when floor > ceiling (docs hurt performance) */
541
- negativeDocLift?: boolean;
542
- taskCompletion: number;
543
- testCount: number;
544
- totalScore: number;
545
- /** Score from agent-retrieved docs (only in full-mode reports) */
546
- actualScore?: number;
547
- /** Ceiling − actual: quality lost to discoverability (only in full-mode reports) */
548
- retrievalGap?: number;
549
- /** Actual / ceiling (0–1): agent effectiveness (only in full-mode reports) */
550
- infrastructureEfficiency?: number | null;
551
- /** True when agents outperform by not finding bad docs */
552
- invertedRetrievalGap?: boolean;
553
- /** Floor score — model knowledge alone */
554
- floorScore?: number;
555
- /** Ceiling score — gold-standard docs injected */
556
- ceilingScore?: number;
321
+ /** Router state shape (matches Dashboard.tsx) */
322
+ interface RouterState {
323
+ focus?: string;
324
+ reportId?: string;
325
+ subTab?: string;
326
+ tab?: string;
327
+ view?: string;
557
328
  }
558
329
  /**
559
- * A single row in `StudioArtifactRef.entries[]`. W0051 adds optional `preview` +
560
- * `association` + `truncated` so list-view renderers can consume the
561
- * descriptor-extracted preview without fetching the external payload.
562
- * Older manifests (pre-W0051) carry only `{ key, bytes }`; readers treat
563
- * missing fields as absent data, not as errors.
330
+ * Derive the default help topic from the current router state.
331
+ * This is the "help finds you" mechanism — the drawer shows a
332
+ * relevant topic based on what the user is currently viewing.
564
333
  */
565
- interface StudioArtifactRefEntry {
566
- key: string;
567
- bytes: number;
568
- association?: Record<string, string | number>;
569
- truncated?: boolean;
570
- preview?: unknown;
571
- }
572
- /** Reference to an artifact stored in an external object store. */
573
- interface StudioArtifactRef {
574
- store: "gcs" | "local";
575
- bucket: string;
576
- path: string;
577
- bytes?: number;
578
- entryCount?: number;
579
- /**
580
- * Added in W0047 / D0032. Missing on pre-W0047 legacy refs, in which case
581
- * dispatchers must treat it as `"bulk"` (the only layout that existed then).
582
- */
583
- layout?: "bulk" | "per-entry";
584
- /** Per-entry index (populated for `layout: "per-entry"` refs only). */
585
- entries?: StudioArtifactRefEntry[];
586
- truncated?: boolean;
587
- preview?: unknown;
588
- /**
589
- * D0040 / W0135 — when present, this ref's bytes physically live under a
590
- * different run's GCS prefix. `path` is already authoritative for
591
- * resolution; this field is a lineage marker only. Studio mirror uses
592
- * `string` rather than the branded `RunId` since Studio types deliberately
593
- * avoid pulling in core's branded-IDs module.
594
- */
595
- sourceRunId?: string;
596
- }
334
+ declare function deriveHelpTopic(routerState: RouterState): string;
597
335
  /**
598
- * Per-test result stored in reports for drill-down and audit.
599
- * Mirrors StoredTestResult from @sanity/ailf-core.
600
- *
601
- * Per D0030, new reports omit `responseOutput` / `responseOutputTruncated`
602
- * inline — the full output lives in the `testOutputs` GCS artifact and is
603
- * fetched via `useArtifactCache`. Both fields remain optional so the
604
- * reader path tolerates legacy reports that were published before W0045.
336
+ * Find a help topic by ID. Returns undefined if not found.
605
337
  */
606
- interface StoredTestResultData {
607
- area: string;
608
- /**
609
- * Documentation context the task expected the model to use.
610
- * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
611
- * Studio reader components fall back via `??`.
612
- */
613
- contextDocs?: DocumentRef[];
614
- /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
615
- canonicalDocs?: DocumentRef[];
616
- compositeScore?: number;
617
- cost?: number;
618
- dimensions: {
619
- dimension: string;
620
- reason: string;
621
- score: number;
622
- }[];
623
- latencyMs?: number;
624
- modelId: string;
625
- outputFailure?: boolean;
626
- responseOutput?: string;
627
- responseOutputTruncated?: boolean;
628
- taskId: string;
629
- tokenUsage?: {
630
- cached?: number;
631
- completion?: number;
632
- prompt?: number;
633
- total?: number;
634
- };
635
- variant: "baseline" | "gold";
636
- }
637
- /** A single low-scoring grader judgment stored in reports */
638
- interface JudgmentData {
639
- /**
640
- * Documentation context the task expected the model to use.
641
- * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
642
- * Studio reader components fall back via `??`.
643
- */
644
- contextDocs?: DocumentRef[];
645
- /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
646
- canonicalDocs?: DocumentRef[];
647
- dimension: string;
648
- /**
649
- * `graderJudgments` manifest entry key = `formatEntryKey({mode, task,
650
- * model, grader})` from the slim-report publisher. Present on reports
651
- * published under W0051+. Consumed by `useArtifactDetail` in
652
- * `JudgmentDetailDrawer` to hydrate the full reasoning from GCS, and by
653
- * `JudgmentList` to drive hover-prefetch and the list row's data hook.
654
- * Optional so legacy reports (pre-W0051) still type-check.
655
- */
656
- id?: string;
657
- modelId: string;
658
- /** True when the model failed to produce output (empty response, API error, refusal) */
659
- outputFailure?: boolean;
660
- reason: string;
661
- score: number;
662
- taskId: string;
663
- }
338
+ declare function findTopic(topicId: string): HelpTopic | undefined;
664
339
  /**
665
- * Per-feature agent behavior data — how agents interacted with docs.
666
- *
667
- * W0051 slimmed the full `searchQueries` / `docSlugsVisited` arrays out
668
- * of the Report summary and replaced them with `*Sample` (bounded first-N)
669
- * + `*Count` (distinct total). Older reports still carry the full arrays
670
- * under the legacy names; both shapes are optional here so the
671
- * `AgentBehaviorCard` renders either fluidly.
340
+ * Simple client-side text search across help topics.
341
+ * Matches against title and body (case-insensitive substring).
342
+ * Returns all topics if query is empty.
672
343
  */
673
- interface FeatureAgentBehaviorData {
674
- avgDocPagesVisited: number;
675
- avgNetworkTimeMs: number;
676
- avgSearchesPerformed: number;
677
- externalDomains: string[];
678
- feature: string;
679
- tasksWithBehaviorData: number;
680
- /** W0051 slim: bounded sample of unique search queries (first ~5). */
681
- searchQueriesSample?: string[];
682
- /** W0051 slim: count of distinct queries in the full traces artifact. */
683
- searchQueriesCount?: number;
684
- /** W0051 slim: bounded sample of unique doc slugs visited. */
685
- docSlugsVisitedSample?: string[];
686
- /** W0051 slim: count of distinct slugs in the full traces artifact. */
687
- docSlugsVisitedCount?: number;
688
- /** Legacy (pre-W0051): the full arrays inlined on the Report. */
689
- searchQueries?: string[];
690
- docSlugsVisited?: string[];
691
- }
692
- /** Overall agent behavior stats (aggregated across all features) */
693
- interface OverallAgentBehaviorData {
694
- avgDocPagesVisited: number;
695
- avgNetworkTimeMs: number;
696
- avgSearchesPerformed: number;
697
- testsWithBehaviorData: number;
698
- totalUniqueDocSlugs: number;
699
- totalUniqueSearchQueries: number;
700
- }
701
- /** Per-model score breakdown stored in summary.perModel */
702
- interface PerModelData {
703
- modelId: string;
704
- label: string;
705
- overall: {
706
- avgScore: number;
707
- avgDocLift: number;
708
- cost?: null | number;
709
- testCount: number;
710
- };
711
- scores: ScoreItem[];
712
- }
713
- /** Summary data as stored in Sanity */
714
- interface SummaryData {
715
- /** Per-feature agent behavior data (only present when agentic mode ran) */
716
- agentBehavior?: FeatureAgentBehaviorData[] | null;
717
- /** External artifact references — present when pipeline uploads to GCS (D0032) */
718
- artifactManifest?: {
719
- testOutputs?: StudioArtifactRef;
720
- renderedPrompts?: StudioArtifactRef;
721
- rawResults?: StudioArtifactRef;
722
- graderPrompts?: StudioArtifactRef;
723
- traces?: StudioArtifactRef;
724
- pipelineContext?: StudioArtifactRef;
725
- diagnosis?: StudioArtifactRef;
726
- };
727
- belowCritical: string[];
728
- /** All Sanity documents used across the entire evaluation */
729
- documentManifest?: DocumentRef[];
730
- evaluationMode?: string;
731
- lowestArea: string;
732
- lowestScore: number;
733
- overall: {
734
- /** Aggregate agent behavior stats (only present when agentic mode ran) */
735
- agentBehavior?: OverallAgentBehaviorData;
736
- avgDocLift: number;
737
- avgScore: number;
738
- avgCeilingScore?: number;
739
- avgFloorScore?: number;
740
- avgActualScore?: number;
741
- avgRetrievalGap?: number;
742
- avgInfrastructureEfficiency?: number;
743
- };
744
- /** Low-scoring grader judgments — the raw "red text" explaining failures */
745
- lowScoringJudgments: JudgmentData[] | null;
746
- /** Per-model score breakdown (one entry per LLM model evaluated) */
747
- perModel?: PerModelData[] | null;
748
- /**
749
- * Slim failure-mode summary (W0051). `topTitles[*]` carry the
750
- * `graderJudgments`-era `id = formatEntryKey({mode, category})` so the
751
- * FailureModesPanel can resolve each row to its per-category manifest
752
- * entry via `useFailureModeArtifact`.
753
- */
754
- failureModes?: {
755
- counts: Record<string, number>;
756
- topTitles: {
757
- id: string;
758
- category: string;
759
- severity: "low" | "medium" | "high" | "critical";
760
- title: string;
761
- count: number;
762
- }[];
763
- totalJudgments: number;
764
- classificationRate: number;
765
- } | null;
766
- scores: ScoreItem[];
767
- /** Per-test results with model output and metadata (D0029) */
768
- testResults?: StoredTestResultData[] | null;
769
- timestamp: string;
770
- }
771
- /** Shape returned by scoreTimelineQuery */
772
- interface TimelineDataPoint {
773
- _id: string;
774
- actualScore?: number | null;
775
- areas?: null | string[];
776
- completedAt: string;
777
- durationMs?: null | number;
778
- mode: string;
779
- models?: null | string[];
780
- overall: number;
781
- ownerTeam?: null | string;
782
- reportId: null | string;
783
- scores: {
784
- feature: string;
785
- totalScore: number;
786
- actualScore?: number;
787
- }[];
788
- source: string;
789
- tag: null | string;
790
- title: null | string;
791
- trigger?: null | string;
792
- }
344
+ declare function searchTopics(query: string): HelpTopic[];
345
+
793
346
  /**
794
- * A single help topic extracted from markdown docs at build time.
347
+ * HelpDrawer.tsx
795
348
  *
796
- * Topics are authored using `:::help` remark container directives in markdown
797
- * files under `docs/`. The extraction script (`scripts/extract-help.ts`) reads
798
- * these directives and emits a generated TypeScript module that the HelpDrawer
799
- * component imports.
349
+ * Sliding help panel for the AILF Studio dashboard. Anchored to the
350
+ * right edge of the tool viewport, pushes main content left (non-modal).
351
+ *
352
+ * The drawer:
353
+ * - Reads current topic from HelpContext
354
+ * - Renders markdown body via the shared <Markdown> component
355
+ * - Shows "See also" links for related topics
356
+ * - Includes a search bar for topic discovery
357
+ * - Supports back navigation through topic history
800
358
  *
801
359
  * @see docs/design-docs/contextual-help-sidebar.md
802
360
  */
803
- interface HelpTopic {
804
- /** URL-safe identifier — matches the #id in the :::help directive */
805
- id: string;
806
- /** Display title shown in the drawer header */
807
- title: string;
808
- /** Markdown body content (rendered in the drawer) */
809
- body: string;
810
- /** Source file path (for debugging / "Edit this page" links) */
811
- source: string;
812
- /** Related topic IDs — rendered as "See also" links */
813
- related?: string[];
814
- /** Tags for search/filtering */
815
- tags?: string[];
816
- }
361
+ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
817
362
 
818
363
  /**
819
- * lib/help-topics.ts
820
- *
821
- * Utility functions for help topic lookup and context-aware topic
822
- * derivation. Used by the HelpDrawer and HelpContext to determine
823
- * which topic to show based on the current router state.
824
- *
825
- * @see docs/design-docs/contextual-help-sidebar.md
826
- */
827
-
828
- /** Router state shape (matches Dashboard.tsx) */
829
- interface RouterState {
830
- focus?: string;
831
- reportId?: string;
832
- subTab?: string;
833
- tab?: string;
834
- view?: string;
835
- }
836
- /**
837
- * Derive the default help topic from the current router state.
838
- * This is the "help finds you" mechanism — the drawer shows a
839
- * relevant topic based on what the user is currently viewing.
840
- */
841
- declare function deriveHelpTopic(routerState: RouterState): string;
842
- /**
843
- * Find a help topic by ID. Returns undefined if not found.
844
- */
845
- declare function findTopic(topicId: string): HelpTopic | undefined;
846
- /**
847
- * Simple client-side text search across help topics.
848
- * Matches against title and body (case-insensitive substring).
849
- * Returns all topics if query is empty.
850
- */
851
- declare function searchTopics(query: string): HelpTopic[];
852
-
853
- /**
854
- * HelpDrawer.tsx
855
- *
856
- * Sliding help panel for the AILF Studio dashboard. Anchored to the
857
- * right edge of the tool viewport, pushes main content left (non-modal).
858
- *
859
- * The drawer:
860
- * - Reads current topic from HelpContext
861
- * - Renders markdown body via the shared <Markdown> component
862
- * - Shows "See also" links for related topics
863
- * - Includes a search bar for topic discovery
864
- * - Supports back navigation through topic history
865
- *
866
- * @see docs/design-docs/contextual-help-sidebar.md
867
- */
868
- declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
869
-
870
- /**
871
- * glossary.ts
872
- *
873
- * Centralized tooltip descriptions for all evaluation metrics.
874
- *
875
- * Every user-facing metric label in the Studio dashboard should use
876
- * a description from this file. This ensures consistent wording across
877
- * stat cards, table headers, and comparison views.
878
- *
879
- * @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
880
- * @see docs/architecture.md (scoring model)
881
- */
882
- declare const GLOSSARY: {
883
- readonly overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
884
- readonly docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.";
885
- readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
886
- readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
887
- readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
888
- readonly floor: "Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.";
889
- readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
890
- readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
891
- readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
892
- readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
893
- readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
894
- readonly score: "Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
895
- readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
896
- readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
897
- readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.";
898
- readonly tests: "Number of test cases in this feature area.";
899
- readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
900
- readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
901
- readonly retGapDelta: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.";
902
- readonly efficiencyDelta: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.";
903
- readonly baseline: "The reference run you're comparing against.";
904
- readonly experiment: "The new run you're evaluating.";
905
- readonly delta: "Difference between experiment and baseline. Positive means improvement, negative means regression.";
906
- readonly change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
907
- readonly lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.";
908
- readonly judgmentReason: "The grading model's natural language explanation of what went wrong.";
909
- readonly healthStrong: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.";
910
- readonly healthAttention: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.";
911
- readonly healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
912
- readonly negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
913
- readonly weakAreas: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
914
- readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.";
915
- readonly retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
916
- readonly dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
917
- readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
918
- readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
919
- readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
920
- readonly modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.";
921
- readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
922
- readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
923
- readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
924
- readonly failureMode: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).";
925
- readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
926
- readonly confidence: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.";
927
- readonly agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
928
- readonly searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
929
- readonly docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
930
- readonly externalDomains: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.";
931
- readonly avgDocPagesVisited: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.";
932
- readonly avgSearchesPerformed: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.";
933
- readonly avgNetworkTimeMs: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.";
934
- readonly totalRequests: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.";
935
- readonly totalBytesDownloaded: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.";
936
- readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
937
- readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
938
- readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
939
- readonly areaDelta: "Score change for this area compared to the previous evaluation run.";
940
- readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
941
- readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
942
- readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
943
- readonly reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
944
- readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
945
- readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
946
- readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
947
- readonly modeFull: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
948
- readonly modeAgentic: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
949
- readonly modeObserved: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
950
- readonly modeDebug: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
951
- readonly triggerManual: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
952
- readonly triggerCi: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
953
- readonly triggerSchedule: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
954
- readonly triggerWebhook: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
955
- readonly triggerCrossRepo: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
956
- };
957
-
958
- /**
959
- * queries.ts
364
+ * queries.ts
960
365
  *
961
366
  * GROQ queries for the AILF Studio dashboard.
962
367
  *
@@ -1042,24 +447,42 @@ declare const distinctTriggersQuery = "\n array::unique(*[_type == \"ailf.repor
1042
447
  * Sanity document schema for `ailf.evalRequest` — an intent document that
1043
448
  * requests an evaluation pipeline run.
1044
449
  *
1045
- * The Studio creates this document programmatically (e.g. from the release
1046
- * action component). A Sanity webhook watches for new `ailf.evalRequest`
1047
- * documents with `status == "pending"` and dispatches a GitHub Actions
1048
- * workflow. The webhook handler updates `status` to "dispatched", and a
1049
- * callback from the pipeline sets it to "completed" or "failed".
450
+ * The Studio and dashboard create this document programmatically. A Sanity
451
+ * webhook watches for new `ailf.evalRequest` documents with
452
+ * `status == "pending"` and dispatches a GitHub Actions workflow. The
453
+ * webhook handler updates `status` to "dispatched", and a callback from
454
+ * the pipeline sets it to "completed" or "failed".
455
+ *
456
+ * ## Shape
457
+ *
458
+ * Two layers:
459
+ *
460
+ * 1. **Status + identity** — `dataset`, `projectId`, `requestedAt`,
461
+ * `requestedBy`, `status`, `dispatchedAt`, `completedAt`, `error`,
462
+ * `reportId`, `sourceReportId`. Required by the lifecycle the webhook
463
+ * handler manages.
464
+ * 2. **`pipelineRequest`** — a JSON-serialized canonical `PipelineRequest`
465
+ * (the universal pipeline invocation contract defined in
466
+ * `@sanity/ailf-core`). Source of truth for what the webhook handler
467
+ * dispatches. The schema does not enumerate individual request fields
468
+ * — adding a `PipelineRequest` field costs zero Studio schema work.
1050
469
  *
1051
470
  * Intent documents are immutable — all fields are `readOnly: true`. The
1052
471
  * document is created once and only updated server-side by the webhook
1053
472
  * handler or pipeline callback.
473
+ *
474
+ * Studio's eval-request preview is intentionally generic ("Evaluation
475
+ * Request · {status}") — Studio's reporting/eval-request UI is being
476
+ * deprecated in favor of the App SDK dashboard (parent: W0238). Anyone
477
+ * needing request-level detail opens the document and reads
478
+ * `pipelineRequest`.
1054
479
  */
1055
480
  declare const evalRequestSchema: {
1056
481
  type: "document";
1057
482
  name: "ailf.evalRequest";
1058
483
  } & Omit<sanity.DocumentDefinition, "preview"> & {
1059
484
  preview?: sanity.PreviewConfig<{
1060
- perspective: string;
1061
485
  status: string;
1062
- tasks: string;
1063
486
  }, Record<string, unknown>> | undefined;
1064
487
  };
1065
488
 
@@ -1204,44 +627,529 @@ declare const webhookConfigSchema: {
1204
627
  };
1205
628
 
1206
629
  /**
1207
- * tool.tsx
1208
- *
1209
- * Sanity Studio tool definition for the AILF dashboard.
1210
- *
1211
- * Registers as a top-level Studio tool accessible from the sidebar.
1212
- * Defines URL-based routing so each view is bookmarkable and
1213
- * supports browser back/forward navigation.
1214
- *
1215
- * Route structure (URLs shown with the default tool name `dashboard`;
1216
- * override via `ailfTool({ name })` to change the segment):
1217
- * /dashboard → Latest Reports (home)
1218
- * /dashboard/report/:reportId → Report Detail
1219
- * /dashboard/timeline → Score Timeline
630
+ * tool.tsx
631
+ *
632
+ * Sanity Studio tool definition for the AILF dashboard.
633
+ *
634
+ * Registers as a top-level Studio tool accessible from the sidebar.
635
+ * Defines URL-based routing so each view is bookmarkable and
636
+ * supports browser back/forward navigation.
637
+ *
638
+ * Route structure (URLs shown with the default tool name `dashboard`;
639
+ * override via `ailfTool({ name })` to change the segment):
640
+ * /dashboard → Latest Reports (home)
641
+ * /dashboard/report/:reportId → Report Detail
642
+ * /dashboard/timeline → Score Timeline
643
+ *
644
+ * The Compare view (`/dashboard/compare`) is temporarily hidden from
645
+ * the dashboard tab UI pending more implementation. The router still
646
+ * accepts `view: "compare"` so any existing bookmarks degrade
647
+ * gracefully to the Latest Reports tab.
648
+ */
649
+
650
+ /**
651
+ * AILF Dashboard tool configuration.
652
+ *
653
+ * Add to your sanity.config.ts:
654
+ * ```ts
655
+ * import { ailfTool } from "@sanity/ailf-studio"
656
+ *
657
+ * export default defineConfig({
658
+ * // ...
659
+ * tools: [ailfTool()],
660
+ * })
661
+ * ```
662
+ */
663
+ interface AilfToolOptions {
664
+ name?: string;
665
+ title?: string;
666
+ }
667
+ declare function ailfTool(options?: AilfToolOptions): Tool;
668
+
669
+ /**
670
+ * types.ts
671
+ *
672
+ * Shared types for the AILF Studio dashboard plugin.
673
+ *
674
+ * These mirror the shapes returned by the GROQ queries in queries.ts.
675
+ * They're kept separate from the eval package types to avoid a build
676
+ * dependency — the Studio plugin reads from Sanity directly.
677
+ *
678
+ * Cross-package contract types (DocumentRef, ScoreGrade, scoreGrade) are
679
+ * imported from @sanity/ailf-shared — the single source of truth.
680
+ */
681
+
682
+ /** Comparison data as stored in Sanity */
683
+ interface ComparisonData {
684
+ deltas: {
685
+ docLift: number;
686
+ overall: number;
687
+ actualDelta?: number;
688
+ /** Per-area score deltas — e.g. `[{ area: "GROQ", delta: 5.2 }, …]`
689
+ *
690
+ * Reshaped from a keyed map to an array in W0137 / D0041 to stop minting
691
+ * a new Sanity attribute path for every feature area. Use
692
+ * `perArea.find((p) => p.area === name)?.delta` to look up a value. */
693
+ perArea?: {
694
+ area: string;
695
+ delta: number;
696
+ }[];
697
+ retrievalGapDelta?: number;
698
+ infrastructureEfficiencyDelta?: number;
699
+ };
700
+ generatedAt: string;
701
+ improved: string[];
702
+ noiseThreshold: number;
703
+ notEvaluated: string[];
704
+ regressed: string[];
705
+ unchanged: string[];
706
+ }
707
+ /** Shape returned by contentImpactQuery and recentDocumentEvalsQuery */
708
+ interface ContentImpactItem {
709
+ _id: string;
710
+ areas: null | string[];
711
+ comparisonDelta: null | number;
712
+ completedAt: string;
713
+ durationMs: number;
714
+ improved: null | string[];
715
+ mode: string;
716
+ models: null | string[];
717
+ overall: number;
718
+ perspective: null | string;
719
+ regressed: null | string[];
720
+ reportId: string;
721
+ scores: null | {
722
+ actualScore?: number;
723
+ docLift: number;
724
+ feature: string;
725
+ totalScore: number;
726
+ }[];
727
+ source: string;
728
+ tag: null | string;
729
+ targetDocuments: null | string[];
730
+ title: null | string;
731
+ trigger: null | string;
732
+ }
733
+ /** Provenance data as stored in Sanity */
734
+ interface ProvenanceData {
735
+ areas: string[];
736
+ /** How this run is treated for reporting and trend tracking (D0037). */
737
+ classification?: string;
738
+ contextHash?: string;
739
+ /** Who/what actually invoked the run (D0037). */
740
+ executor?: {
741
+ email?: string;
742
+ githubActor?: string;
743
+ name?: string;
744
+ runId?: string;
745
+ surface?: string;
746
+ type?: string;
747
+ workflow?: string;
748
+ };
749
+ git?: {
750
+ branch: string;
751
+ prNumber?: number;
752
+ repo: string;
753
+ sha: string;
754
+ };
755
+ graderModel: string;
756
+ /** Platform and CI-provider metadata (D0037). */
757
+ host?: {
758
+ arch?: string;
759
+ ci?: string;
760
+ platform?: string;
761
+ };
762
+ /** Free-form searchable tags (D0037). */
763
+ labels?: string[];
764
+ lineage?: {
765
+ comparedAgainst?: string;
766
+ parentJobId?: string;
767
+ rerunOf?: string;
768
+ };
769
+ mode: string;
770
+ models: {
771
+ id: string;
772
+ label: string;
773
+ }[];
774
+ /** Team and (optionally) individual this run is attributable to (D0037). */
775
+ owner?: {
776
+ individual?: string;
777
+ team?: string;
778
+ };
779
+ /** Identity of the pipeline run that produced this report (D0032) */
780
+ runId: string;
781
+ /** @deprecated Use `promptfooUrls` when available */
782
+ promptfooUrl?: string;
783
+ /** Per-mode Promptfoo share URLs (one per sub-eval) */
784
+ promptfooUrls?: {
785
+ mode: string;
786
+ url: string;
787
+ }[];
788
+ /** Human-authored "why I ran this" (D0037). */
789
+ purpose?: string;
790
+ source: {
791
+ baseUrl: string;
792
+ dataset?: string;
793
+ name: string;
794
+ perspective?: string;
795
+ projectId?: string;
796
+ };
797
+ targetDocuments?: string[];
798
+ taskIds?: string[];
799
+ /** AILF/Node version metadata (D0037). */
800
+ tool?: {
801
+ ailfVersion?: string;
802
+ nodeVersion?: string;
803
+ };
804
+ trigger: {
805
+ callerRef?: string;
806
+ callerRepo?: string;
807
+ documentId?: string;
808
+ runId?: string;
809
+ schedule?: string;
810
+ source?: string;
811
+ type: string;
812
+ workflow?: string;
813
+ };
814
+ }
815
+ /** Shape returned by reportDetailQuery */
816
+ interface ReportDetail {
817
+ _id: string;
818
+ comparison: ComparisonData | null;
819
+ completedAt: string;
820
+ durationMs: number;
821
+ provenance: ProvenanceData;
822
+ reportId: string;
823
+ summary: SummaryData;
824
+ tag: null | string;
825
+ title: null | string;
826
+ }
827
+ /** Shape returned by latestReportsQuery */
828
+ interface ReportListItem {
829
+ _id: string;
830
+ actualScore?: number | null;
831
+ areas: string[];
832
+ /** Run classification (D0037) — projected from provenance.classification. */
833
+ classification?: null | string;
834
+ comparisonDelta: null | number;
835
+ completedAt: string;
836
+ docLift: number;
837
+ durationMs: number;
838
+ evaluationMode?: string | null;
839
+ /** Executor identity name (D0037). */
840
+ executorName?: null | string;
841
+ /** Origin surface of the executor (cli/studio/api) — D0037. */
842
+ executorSurface?: null | string;
843
+ /** Executor discriminator (user/system) — D0037. */
844
+ executorType?: null | string;
845
+ git: null | {
846
+ branch: string;
847
+ prNumber?: number;
848
+ repo: string;
849
+ sha: string;
850
+ };
851
+ improved: null | string[];
852
+ /** Free-form labels (D0037). */
853
+ labels?: null | string[];
854
+ mode: string;
855
+ models: string[];
856
+ overall: number;
857
+ /** Individual attributable for the run (D0037). */
858
+ ownerIndividual?: null | string;
859
+ /** Owner team slug (D0037). */
860
+ ownerTeam?: null | string;
861
+ /** Content release perspective ID (when evaluated with --sanity-perspective) */
862
+ perspective?: null | string;
863
+ promptfooUrl: null | string;
864
+ promptfooUrls: null | {
865
+ mode: string;
866
+ url: string;
867
+ }[];
868
+ regressed: null | string[];
869
+ reportId: string;
870
+ retrievalGap?: number | null;
871
+ scores: ScoreItem[];
872
+ source: string;
873
+ tag: null | string;
874
+ title: null | string;
875
+ /** Target document slugs (when evaluated with --changed-docs) */
876
+ targetDocuments?: null | string[];
877
+ trigger: string;
878
+ }
879
+ /** Per-area score (shared between list and detail views) */
880
+ interface ScoreItem {
881
+ codeCorrectness: number;
882
+ docCoverage: number;
883
+ /**
884
+ * Generic dimension scores map — all dimensions by camelCase key (0–100).
885
+ *
886
+ * Non-literacy modes (agent-harness, mcp-server) store their actual
887
+ * dimensions here (e.g., agentOutput, toolUsage). Literacy mode may
888
+ * also populate this alongside the three legacy named fields above.
889
+ *
890
+ * UI components should read from this map via `resolveDimensions()` in
891
+ * `lib/dimensions.ts` rather than hardcoding the three named fields.
892
+ */
893
+ dimensions?: Record<string, number>;
894
+ docLift: number;
895
+ /** Sanity documents used for this feature area's evaluation */
896
+ documents?: DocumentRef[];
897
+ feature: string;
898
+ /** Grouping strategy — "task" for agent-harness, "feature" for literacy */
899
+ groupType?: "aggregate" | "feature" | "task";
900
+ /** True when floor > ceiling (docs hurt performance) */
901
+ negativeDocLift?: boolean;
902
+ taskCompletion: number;
903
+ testCount: number;
904
+ totalScore: number;
905
+ /** Score from agent-retrieved docs (only in full-mode reports) */
906
+ actualScore?: number;
907
+ /** Ceiling − actual: quality lost to discoverability (only in full-mode reports) */
908
+ retrievalGap?: number;
909
+ /** Actual / ceiling (0–1): agent effectiveness (only in full-mode reports) */
910
+ infrastructureEfficiency?: number | null;
911
+ /** True when agents outperform by not finding bad docs */
912
+ invertedRetrievalGap?: boolean;
913
+ /** Floor score — model knowledge alone */
914
+ floorScore?: number;
915
+ /** Ceiling score — gold-standard docs injected */
916
+ ceilingScore?: number;
917
+ }
918
+ /**
919
+ * A single row in `StudioArtifactRef.entries[]`. W0051 adds optional `preview` +
920
+ * `association` + `truncated` so list-view renderers can consume the
921
+ * descriptor-extracted preview without fetching the external payload.
922
+ * Older manifests (pre-W0051) carry only `{ key, bytes }`; readers treat
923
+ * missing fields as absent data, not as errors.
924
+ */
925
+ interface StudioArtifactRefEntry {
926
+ key: string;
927
+ bytes: number;
928
+ association?: Record<string, string | number>;
929
+ truncated?: boolean;
930
+ preview?: unknown;
931
+ }
932
+ /** Reference to an artifact stored in an external object store. */
933
+ interface StudioArtifactRef {
934
+ store: "gcs" | "local";
935
+ bucket: string;
936
+ path: string;
937
+ bytes?: number;
938
+ entryCount?: number;
939
+ /**
940
+ * Added in W0047 / D0032. Missing on pre-W0047 legacy refs, in which case
941
+ * dispatchers must treat it as `"bulk"` (the only layout that existed then).
942
+ */
943
+ layout?: "bulk" | "per-entry";
944
+ /** Per-entry index (populated for `layout: "per-entry"` refs only). */
945
+ entries?: StudioArtifactRefEntry[];
946
+ truncated?: boolean;
947
+ preview?: unknown;
948
+ /**
949
+ * D0040 / W0135 — when present, this ref's bytes physically live under a
950
+ * different run's GCS prefix. `path` is already authoritative for
951
+ * resolution; this field is a lineage marker only. Studio mirror uses
952
+ * `string` rather than the branded `RunId` since Studio types deliberately
953
+ * avoid pulling in core's branded-IDs module.
954
+ */
955
+ sourceRunId?: string;
956
+ }
957
+ /**
958
+ * Per-test result stored in reports for drill-down and audit.
959
+ * Mirrors StoredTestResult from @sanity/ailf-core.
1220
960
  *
1221
- * The Compare view (`/dashboard/compare`) is temporarily hidden from
1222
- * the dashboard tab UI pending more implementation. The router still
1223
- * accepts `view: "compare"` so any existing bookmarks degrade
1224
- * gracefully to the Latest Reports tab.
961
+ * Per D0030, new reports omit `responseOutput` / `responseOutputTruncated`
962
+ * inline — the full output lives in the `testOutputs` GCS artifact and is
963
+ * fetched via `useArtifactCache`. Both fields remain optional so the
964
+ * reader path tolerates legacy reports that were published before W0045.
1225
965
  */
1226
-
966
+ interface StoredTestResultData {
967
+ area: string;
968
+ /**
969
+ * Documentation context the task expected the model to use.
970
+ * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
971
+ * Studio reader components fall back via `??`.
972
+ */
973
+ contextDocs?: DocumentRef[];
974
+ /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
975
+ canonicalDocs?: DocumentRef[];
976
+ compositeScore?: number;
977
+ cost?: number;
978
+ dimensions: {
979
+ dimension: string;
980
+ reason: string;
981
+ score: number;
982
+ }[];
983
+ latencyMs?: number;
984
+ modelId: string;
985
+ outputFailure?: boolean;
986
+ responseOutput?: string;
987
+ responseOutputTruncated?: boolean;
988
+ taskId: string;
989
+ tokenUsage?: {
990
+ cached?: number;
991
+ completion?: number;
992
+ prompt?: number;
993
+ total?: number;
994
+ };
995
+ variant: "baseline" | "gold";
996
+ }
997
+ /** A single low-scoring grader judgment stored in reports */
998
+ interface JudgmentData {
999
+ /**
1000
+ * Documentation context the task expected the model to use.
1001
+ * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports;
1002
+ * Studio reader components fall back via `??`.
1003
+ */
1004
+ contextDocs?: DocumentRef[];
1005
+ /** @deprecated legacy alias for contextDocs on pre-Phase-2 reports */
1006
+ canonicalDocs?: DocumentRef[];
1007
+ dimension: string;
1008
+ /**
1009
+ * `graderJudgments` manifest entry key = `formatEntryKey({mode, task,
1010
+ * model, grader})` from the slim-report publisher. Present on reports
1011
+ * published under W0051+. Consumed by `useArtifactDetail` in
1012
+ * `JudgmentDetailDrawer` to hydrate the full reasoning from GCS, and by
1013
+ * `JudgmentList` to drive hover-prefetch and the list row's data hook.
1014
+ * Optional so legacy reports (pre-W0051) still type-check.
1015
+ */
1016
+ id?: string;
1017
+ modelId: string;
1018
+ /** True when the model failed to produce output (empty response, API error, refusal) */
1019
+ outputFailure?: boolean;
1020
+ reason: string;
1021
+ score: number;
1022
+ taskId: string;
1023
+ }
1227
1024
  /**
1228
- * AILF Dashboard tool configuration.
1229
- *
1230
- * Add to your sanity.config.ts:
1231
- * ```ts
1232
- * import { ailfTool } from "@sanity/ailf-studio"
1025
+ * Per-feature agent behavior data — how agents interacted with docs.
1233
1026
  *
1234
- * export default defineConfig({
1235
- * // ...
1236
- * tools: [ailfTool()],
1237
- * })
1238
- * ```
1027
+ * W0051 slimmed the full `searchQueries` / `docSlugsVisited` arrays out
1028
+ * of the Report summary and replaced them with `*Sample` (bounded first-N)
1029
+ * + `*Count` (distinct total). Older reports still carry the full arrays
1030
+ * under the legacy names; both shapes are optional here so the
1031
+ * `AgentBehaviorCard` renders either fluidly.
1239
1032
  */
1240
- interface AilfToolOptions {
1241
- name?: string;
1242
- title?: string;
1033
+ interface FeatureAgentBehaviorData {
1034
+ avgDocPagesVisited: number;
1035
+ avgNetworkTimeMs: number;
1036
+ avgSearchesPerformed: number;
1037
+ externalDomains: string[];
1038
+ feature: string;
1039
+ tasksWithBehaviorData: number;
1040
+ /** W0051 slim: bounded sample of unique search queries (first ~5). */
1041
+ searchQueriesSample?: string[];
1042
+ /** W0051 slim: count of distinct queries in the full traces artifact. */
1043
+ searchQueriesCount?: number;
1044
+ /** W0051 slim: bounded sample of unique doc slugs visited. */
1045
+ docSlugsVisitedSample?: string[];
1046
+ /** W0051 slim: count of distinct slugs in the full traces artifact. */
1047
+ docSlugsVisitedCount?: number;
1048
+ /** Legacy (pre-W0051): the full arrays inlined on the Report. */
1049
+ searchQueries?: string[];
1050
+ docSlugsVisited?: string[];
1051
+ }
1052
+ /** Overall agent behavior stats (aggregated across all features) */
1053
+ interface OverallAgentBehaviorData {
1054
+ avgDocPagesVisited: number;
1055
+ avgNetworkTimeMs: number;
1056
+ avgSearchesPerformed: number;
1057
+ testsWithBehaviorData: number;
1058
+ totalUniqueDocSlugs: number;
1059
+ totalUniqueSearchQueries: number;
1060
+ }
1061
+ /** Per-model score breakdown stored in summary.perModel */
1062
+ interface PerModelData {
1063
+ modelId: string;
1064
+ label: string;
1065
+ overall: {
1066
+ avgScore: number;
1067
+ avgDocLift: number;
1068
+ cost?: null | number;
1069
+ testCount: number;
1070
+ };
1071
+ scores: ScoreItem[];
1072
+ }
1073
+ /** Summary data as stored in Sanity */
1074
+ interface SummaryData {
1075
+ /** Per-feature agent behavior data (only present when agentic mode ran) */
1076
+ agentBehavior?: FeatureAgentBehaviorData[] | null;
1077
+ /** External artifact references — present when pipeline uploads to GCS (D0032) */
1078
+ artifactManifest?: {
1079
+ testOutputs?: StudioArtifactRef;
1080
+ renderedPrompts?: StudioArtifactRef;
1081
+ rawResults?: StudioArtifactRef;
1082
+ graderPrompts?: StudioArtifactRef;
1083
+ traces?: StudioArtifactRef;
1084
+ pipelineContext?: StudioArtifactRef;
1085
+ diagnosis?: StudioArtifactRef;
1086
+ };
1087
+ belowCritical: string[];
1088
+ /** All Sanity documents used across the entire evaluation */
1089
+ documentManifest?: DocumentRef[];
1090
+ evaluationMode?: string;
1091
+ lowestArea: string;
1092
+ lowestScore: number;
1093
+ overall: {
1094
+ /** Aggregate agent behavior stats (only present when agentic mode ran) */
1095
+ agentBehavior?: OverallAgentBehaviorData;
1096
+ avgDocLift: number;
1097
+ avgScore: number;
1098
+ avgCeilingScore?: number;
1099
+ avgFloorScore?: number;
1100
+ avgActualScore?: number;
1101
+ avgRetrievalGap?: number;
1102
+ avgInfrastructureEfficiency?: number;
1103
+ };
1104
+ /** Low-scoring grader judgments — the raw "red text" explaining failures */
1105
+ lowScoringJudgments: JudgmentData[] | null;
1106
+ /** Per-model score breakdown (one entry per LLM model evaluated) */
1107
+ perModel?: PerModelData[] | null;
1108
+ /**
1109
+ * Slim failure-mode summary (W0051). `topTitles[*]` carry the
1110
+ * `graderJudgments`-era `id = formatEntryKey({mode, category})` so the
1111
+ * FailureModesPanel can resolve each row to its per-category manifest
1112
+ * entry via `useFailureModeArtifact`.
1113
+ */
1114
+ failureModes?: {
1115
+ counts: Record<string, number>;
1116
+ topTitles: {
1117
+ id: string;
1118
+ category: string;
1119
+ severity: "low" | "medium" | "high" | "critical";
1120
+ title: string;
1121
+ count: number;
1122
+ }[];
1123
+ totalJudgments: number;
1124
+ classificationRate: number;
1125
+ } | null;
1126
+ scores: ScoreItem[];
1127
+ /** Per-test results with model output and metadata (D0029) */
1128
+ testResults?: StoredTestResultData[] | null;
1129
+ timestamp: string;
1130
+ }
1131
+ /** Shape returned by scoreTimelineQuery */
1132
+ interface TimelineDataPoint {
1133
+ _id: string;
1134
+ actualScore?: number | null;
1135
+ areas?: null | string[];
1136
+ completedAt: string;
1137
+ durationMs?: null | number;
1138
+ mode: string;
1139
+ models?: null | string[];
1140
+ overall: number;
1141
+ ownerTeam?: null | string;
1142
+ reportId: null | string;
1143
+ scores: {
1144
+ feature: string;
1145
+ totalScore: number;
1146
+ actualScore?: number;
1147
+ }[];
1148
+ source: string;
1149
+ tag: null | string;
1150
+ title: null | string;
1151
+ trigger?: null | string;
1243
1152
  }
1244
- declare function ailfTool(options?: AilfToolOptions): Tool;
1245
1153
 
1246
1154
  /**
1247
1155
  * Options for `ailfPlugin()`.
@@ -1271,4 +1179,4 @@ interface AilfPluginOptions {
1271
1179
  */
1272
1180
  declare const ailfPlugin: sanity.Plugin<void | AilfPluginOptions>;
1273
1181
 
1274
- export { type AilfPluginOptions, ArchiveTaskAction, AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, RestoreTaskAction, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfStructure, ailfTaskStructureItem, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };
1182
+ export { type AilfPluginOptions, ArchiveTaskAction, AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GraduateToNativeAction, HelpDrawer, HelpProvider, MirrorBanner, type PerModelData, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, RestoreTaskAction, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfStructure, ailfTaskStructureItem, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };