@sanity/ailf-studio 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/index.d.ts +480 -373
  2. package/dist/index.js +1669 -850
  3. package/package.json +8 -1
package/dist/index.d.ts CHANGED
@@ -1,6 +1,8 @@
1
1
  import * as sanity from 'sanity';
2
2
  import { DocumentActionComponent, ObjectInputProps, StringInputProps, ReleaseActionComponent, Tool } from 'sanity';
3
3
  import * as react_jsx_runtime from 'react/jsx-runtime';
4
+ import * as react from 'react';
5
+ import { ReactNode } from 'react';
4
6
  import { DocumentRef } from './document-ref.js';
5
7
 
6
8
  /**
@@ -210,386 +212,46 @@ interface RunEvaluationActionOptions {
210
212
  */
211
213
  declare function createRunEvaluationAction(options?: RunEvaluationActionOptions): ReleaseActionComponent;
212
214
 
215
+ interface HelpContextValue {
216
+ /** Whether the drawer is open */
217
+ open: boolean;
218
+ /** Current topic ID */
219
+ topicId: string | null;
220
+ /** Open the drawer to a specific topic, or the context-derived default */
221
+ openHelp: (topicId?: string) => void;
222
+ /** Close the drawer */
223
+ closeHelp: () => void;
224
+ /** Navigate to a specific topic (adds to history) */
225
+ navigateTo: (topicId: string) => void;
226
+ /** Go back to the previous topic in history */
227
+ goBack: () => void;
228
+ /** Whether there's history to go back to */
229
+ canGoBack: boolean;
230
+ }
231
+ interface HelpProviderProps {
232
+ children: ReactNode;
233
+ /** Fallback topic when `openHelp()` is called without an argument */
234
+ defaultTopicId?: string;
235
+ }
236
+ declare function HelpProvider({ children, defaultTopicId }: HelpProviderProps): react.FunctionComponentElement<react.ProviderProps<HelpContextValue | null>>;
213
237
  /**
214
- * glossary.ts
215
- *
216
- * Centralized tooltip descriptions for all evaluation metrics.
217
- *
218
- * Every user-facing metric label in the Studio dashboard should use
219
- * a description from this file. This ensures consistent wording across
220
- * stat cards, table headers, and comparison views.
221
- *
222
- * @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
223
- * @see docs/ARCHITECTURE.md (scoring model)
224
- */
225
- declare const GLOSSARY: {
226
- readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
227
- readonly docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.";
228
- readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
229
- readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
230
- readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
231
- readonly floor: "Score without any documentation. This tells you what the model already knows from its training data.";
232
- readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
233
- readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
234
- readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
235
- readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
236
- readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
237
- readonly score: "Weighted score for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
238
- readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
239
- readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
240
- readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
241
- readonly tests: "Number of test cases in this feature area.";
242
- readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
243
- readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
244
- readonly retGapDelta: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.";
245
- readonly efficiencyDelta: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.";
246
- readonly baseline: "The reference run you're comparing against.";
247
- readonly experiment: "The new run you're evaluating.";
248
- readonly delta: "Difference between experiment and baseline. Positive means improvement, negative means regression.";
249
- readonly change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
250
- readonly lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.";
251
- readonly judgmentReason: "The grading model's natural language explanation of what went wrong.";
252
- readonly healthStrong: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.";
253
- readonly healthAttention: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.";
254
- readonly healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
255
- readonly negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
256
- readonly weakAreas: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
257
- readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.";
258
- readonly retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
259
- readonly dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
260
- readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
261
- readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
262
- readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
263
- readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
264
- readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
265
- readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
266
- readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
267
- readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
268
- readonly confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.";
269
- readonly agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
270
- readonly searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
271
- readonly docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
272
- readonly externalDomains: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.";
273
- readonly avgDocPagesVisited: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.";
274
- readonly avgSearchesPerformed: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.";
275
- readonly avgNetworkTimeMs: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.";
276
- readonly totalRequests: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.";
277
- readonly totalBytesDownloaded: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.";
278
- readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
279
- readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
280
- readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
281
- readonly areaDelta: "Score change for this area compared to the previous evaluation run.";
282
- readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
283
- readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
284
- readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
285
- readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
286
- readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
287
- readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
288
- readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
289
- readonly modeFull: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
290
- readonly modeAgentic: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
291
- readonly modeObserved: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
292
- readonly modeDebug: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
293
- readonly triggerManual: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
294
- readonly triggerCi: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
295
- readonly triggerSchedule: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
296
- readonly triggerWebhook: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
297
- readonly triggerCrossRepo: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
298
- };
299
-
300
- /**
301
- * queries.ts
302
- *
303
- * GROQ queries for the AILF Studio dashboard.
304
- *
305
- * All dashboard views are powered by GROQ — no backend needed.
306
- * These queries run directly against the Sanity Content Lake using
307
- * the Studio's built-in client.
308
- *
309
- * @see docs/design-docs/report-store/architecture.md — Query capabilities
310
- */
311
- /**
312
- * Fetch the N most recent reports, optionally filtered by source and/or mode.
313
- *
314
- * Used by: LatestReports view, Dashboard overview
315
- */
316
- declare const latestReportsQuery: string;
317
- /**
318
- * Fetch score data points for a time range, projected into a chart-friendly shape.
319
- *
320
- * Used by: ScoreTimeline view
321
- */
322
- declare const scoreTimelineQuery: string;
323
- /**
324
- * Fetch a single report by ID with full detail.
325
- *
326
- * Used by: ReportDetail view
327
- */
328
- declare const reportDetailQuery = "\n *[_type == \"ailf.report\" && reportId == $reportId][0] {\n _id,\n reportId,\n completedAt,\n durationMs,\n tag,\n provenance,\n summary,\n comparison\n }\n";
329
- /**
330
- * Find all reports that evaluated a specific Sanity document or perspective.
331
- *
332
- * Used by: ContentImpact view (answer: "what did my edit do to scores?")
333
- *
334
- * Supports optional source/mode filtering via the shared filter helpers.
335
- * When $documentId and $perspective are both null, the filter clause
336
- * `(null in [] || ...)` evaluates to false — callers should use
337
- * `recentDocumentEvalsQuery` for the browse-mode (no search) case.
338
- */
339
- declare const contentImpactQuery: string;
340
- /**
341
- * Browse recent reports that have document-level targeting or perspectives.
342
- *
343
- * Used by: ContentImpact view browse mode (no search active).
344
- * Shows the most recent document-scoped evaluations to help users discover
345
- * what content has been evaluated recently.
346
- */
347
- declare const recentDocumentEvalsQuery: string;
348
- /** All unique targetDocuments across reports (for autocomplete) */
349
- declare const distinctTargetDocumentsQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.targetDocuments)].provenance.targetDocuments[])\n";
350
- /**
351
- * Search articles by title, slug, or _id.
352
- *
353
- * Used by: ContentImpact document search autocomplete.
354
- * Returns a lightweight projection for the dropdown — title, slug, section path, and _id.
355
- * The `score()` function ranks title matches highest, then slug, then _id.
356
- *
357
- * Includes all document versions (published, drafts, perspectives) so the UI
358
- * can show provenance badges. The `_id` prefix determines the version type:
359
- * - No prefix → published (production)
360
- * - `drafts.` → unpublished draft
361
- * - `versions.<perspectiveId>.` → content release perspective
362
- */
363
- declare const articleSearchQuery = "\n *[_type == \"article\"\n && (\n title match $query + \"*\"\n || slug.current match $query + \"*\"\n || _id match $query + \"*\"\n )\n ] | score(\n boost(title match $query + \"*\", 3),\n boost(slug.current match $query + \"*\", 2),\n boost(_id match $query + \"*\", 1)\n ) [0...40] {\n _id,\n title,\n \"slug\": slug.current,\n \"section\": primarySection->{ \"slug\": slug.current, \"title\": title }\n }\n";
364
- /** All unique perspectives across reports (for autocomplete) */
365
- declare const distinctPerspectivesQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.source.perspective)].provenance.source.perspective)\n";
366
- /**
367
- * Fetch two reports by their IDs for comparison.
368
- *
369
- * Used by: ComparisonView — user selects two reports to compare
370
- */
371
- declare const comparisonPairQuery = "\n *[_type == \"ailf.report\" && reportId in [$baselineId, $experimentId]] {\n _id,\n reportId,\n completedAt,\n tag,\n provenance,\n summary\n }\n";
372
- /** All unique source names (for filter dropdowns) */
373
- declare const distinctSourcesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.source.name)\n";
374
- /** All unique modes (for filter dropdowns) */
375
- declare const distinctModesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.mode)\n";
376
- /** All unique feature areas (for filter dropdowns) */
377
- declare const distinctAreasQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.areas[])\n";
378
- /** All unique trigger types (for filter dropdowns) */
379
- declare const distinctTriggersQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.trigger.type)\n";
380
-
381
- /**
382
- * schema/eval-request.ts
383
- *
384
- * Sanity document schema for `ailf.evalRequest` — an intent document that
385
- * requests an evaluation pipeline run.
386
- *
387
- * The Studio creates this document programmatically (e.g. from the release
388
- * action component). A Sanity webhook watches for new `ailf.evalRequest`
389
- * documents with `status == "pending"` and dispatches a GitHub Actions
390
- * workflow. The webhook handler updates `status` to "dispatched", and a
391
- * callback from the pipeline sets it to "completed" or "failed".
392
- *
393
- * Intent documents are immutable — all fields are `readOnly: true`. The
394
- * document is created once and only updated server-side by the webhook
395
- * handler or pipeline callback.
396
- */
397
- declare const evalRequestSchema: {
398
- type: "document";
399
- name: "ailf.evalRequest";
400
- } & Omit<sanity.DocumentDefinition, "preview"> & {
401
- preview?: sanity.PreviewConfig<{
402
- perspective: string;
403
- status: string;
404
- tasks: string;
405
- }, Record<string, unknown>> | undefined;
406
- };
407
-
408
- /**
409
- * schema/feature-area.ts
410
- *
411
- * Sanity document schema for `ailf.featureArea` — a feature area that groups
412
- * related evaluation tasks for score aggregation and filtering.
413
- *
414
- * Feature areas are lightweight metadata documents. They exist primarily to
415
- * provide referential integrity (tasks reference areas by document reference
416
- * instead of plain strings) and to enable Studio-based browsing/filtering.
417
- *
418
- * Initial areas (migrated from YAML filenames): groq, frameworks, functions,
419
- * nextjs-live, studio-setup, visual-editing.
238
+ * Access the help drawer state and actions.
420
239
  *
421
- * @see docs/design-docs/tasks-as-content.md
240
+ * @throws {Error} If called outside of a {@link HelpProvider}.
422
241
  */
423
- declare const featureAreaSchema: {
424
- type: "document";
425
- name: "ailf.featureArea";
426
- } & Omit<sanity.DocumentDefinition, "preview"> & {
427
- preview?: sanity.PreviewConfig<{
428
- areaId: string;
429
- description: string;
430
- }, Record<string, unknown>> | undefined;
431
- };
242
+ declare function useHelp(): HelpContextValue;
432
243
 
433
244
  /**
434
- * schema/reference-solution.ts
245
+ * types.ts
435
246
  *
436
- * Sanity document schema for `ailf.referenceSolution` — a gold-standard
437
- * implementation that demonstrates the correct approach for a task.
247
+ * Shared types for the AILF Studio dashboard plugin.
438
248
  *
439
- * Reference solutions contain code blocks and prose explaining why the
440
- * approach is correct. They are referenced by `ailf.task` documents.
249
+ * These mirror the shapes returned by the GROQ queries in queries.ts.
250
+ * They're kept separate from the eval package types to avoid a build
251
+ * dependency — the Studio plugin reads from Sanity directly.
441
252
  *
442
- * @see docs/design-docs/tasks-as-content.md
443
- */
444
- declare const referenceSolutionSchema: {
445
- type: "document";
446
- name: "ailf.referenceSolution";
447
- } & Omit<sanity.DocumentDefinition, "preview"> & {
448
- preview?: sanity.PreviewConfig<{
449
- title: string;
450
- }, Record<"title", any>> | undefined;
451
- };
452
-
453
- /**
454
- * schema/report.ts
455
- *
456
- * Sanity document schema for `ailf.report` — the persisted evaluation report.
457
- *
458
- * This schema defines how reports appear in Sanity Studio and enables
459
- * GROQ queries for the dashboard. The document shape mirrors the
460
- * `Report` type in `packages/eval/src/pipeline/types.ts`.
461
- *
462
- * Reports are immutable events (P1) — once created, they should not be
463
- * edited. The schema uses `readOnly: true` on all fields to enforce this.
464
- *
465
- * @see docs/design-docs/report-store/domain-model.md
466
- * @see docs/design-docs/report-store/architecture.md
467
- */
468
- declare const reportSchema: {
469
- type: "document";
470
- name: "ailf.report";
471
- } & Omit<sanity.DocumentDefinition, "preview"> & {
472
- preview?: sanity.PreviewConfig<{
473
- completedAt: string;
474
- mode: string;
475
- overall: string;
476
- tag: string;
477
- }, Record<string, unknown>> | undefined;
478
- };
479
-
480
- /**
481
- * schema/task.ts
482
- *
483
- * Sanity document schema for `ailf.task` — an evaluation task definition.
484
- *
485
- * This is the core unit of the AI Literacy Framework. A task defines:
486
- * - What the LLM should implement (the task prompt)
487
- * - Which docs are relevant (canonical doc references)
488
- * - How to grade the output (assertions with rubric templates)
489
- * - A gold-standard implementation (reference solution)
490
- * - When/how the task runs (execution controls)
491
- *
492
- * ## Execution paths
493
- *
494
- * Published tasks are automatically discovered by the pipeline — no
495
- * registration step needed. There are four ways to execute a task:
496
- *
497
- * 1. **Run Task Eval** — click ▶ on any ailf.task document in Studio.
498
- * Creates an ailf.evalRequest scoped to this task. Webhook dispatches
499
- * the pipeline; score appears inline when complete (~10–15 min).
500
- * 2. **Run AI Eval** — click on a content release page. Auto-scopes to
501
- * tasks whose canonical docs are in the release.
502
- * 3. **CLI** — `ailf pipeline --task <id>` or `ailf pipeline --area <area>`.
503
- * 4. **Scheduled** — GitHub Actions cron (daily baseline, weekly full).
504
- *
505
- * Tasks can be authored natively in Studio or mirrored from external
506
- * repositories. Mirrored tasks have a read-only `origin` block that
507
- * tracks their source repo provenance.
508
- *
509
- * @see docs/CONTRIBUTING_TASKS.md#running-your-task — full execution guide
510
- * @see docs/design-docs/tasks-as-content.md
511
- * @see docs/design-docs/tasks-as-content.md#decision-8-domain-specific-assertion-types-not-a-promptfoo-subset
512
- */
513
- declare const taskSchema: {
514
- type: "document";
515
- name: "ailf.task";
516
- } & Omit<sanity.DocumentDefinition, "preview"> & {
517
- preview?: sanity.PreviewConfig<{
518
- area: string;
519
- description: string;
520
- id: string;
521
- origin: string;
522
- ownership: string;
523
- status: string;
524
- }, Record<string, unknown>> | undefined;
525
- };
526
-
527
- /**
528
- * schema/webhook-config.ts
529
- *
530
- * Sanity document schema for `ailf.webhookConfig` — the "evaluate on publish"
531
- * toggle and webhook-triggered evaluation settings.
532
- *
533
- * This is a singleton document (only one should exist) that controls
534
- * whether content changes automatically trigger evaluation pipelines.
535
- *
536
- * @see docs/design-docs/report-store/visibility-workflows.md
537
- */
538
- declare const webhookConfigSchema: {
539
- type: "document";
540
- name: "ailf.webhookConfig";
541
- } & Omit<sanity.DocumentDefinition, "preview"> & {
542
- preview?: sanity.PreviewConfig<{
543
- enabled: string;
544
- }, Record<string, unknown>> | undefined;
545
- };
546
-
547
- /**
548
- * tool.tsx
549
- *
550
- * Sanity Studio tool definition for the AILF dashboard.
551
- *
552
- * Registers as a top-level Studio tool accessible from the sidebar.
553
- * Defines URL-based routing so each view is bookmarkable and
554
- * supports browser back/forward navigation.
555
- *
556
- * Route structure:
557
- * /ailf → Latest Reports (home)
558
- * /ailf/report/:reportId → Report Detail
559
- * /ailf/timeline → Score Timeline
560
- * /ailf/compare → Compare
561
- */
562
-
563
- /**
564
- * AILF Dashboard tool configuration.
565
- *
566
- * Add to your sanity.config.ts:
567
- * ```ts
568
- * import { ailfTool } from "@sanity/ailf-studio"
569
- *
570
- * export default defineConfig({
571
- * // ...
572
- * tools: [ailfTool()],
573
- * })
574
- * ```
575
- */
576
- interface AilfToolOptions {
577
- name?: string;
578
- title?: string;
579
- }
580
- declare function ailfTool(options?: AilfToolOptions): Tool;
581
-
582
- /**
583
- * types.ts
584
- *
585
- * Shared types for the AILF Studio dashboard plugin.
586
- *
587
- * These mirror the shapes returned by the GROQ queries in queries.ts.
588
- * They're kept separate from the eval package types to avoid a build
589
- * dependency — the Studio plugin reads from Sanity directly.
590
- *
591
- * Cross-package contract types (DocumentRef, ScoreGrade, scoreGrade) are
592
- * imported from @sanity/ailf-shared — the single source of truth.
253
+ * Cross-package contract types (DocumentRef, ScoreGrade, scoreGrade) are
254
+ * imported from @sanity/ailf-shared — the single source of truth.
593
255
  */
594
256
 
595
257
  /** Comparison data as stored in Sanity */
@@ -844,6 +506,451 @@ interface TimelineDataPoint {
844
506
  source: string;
845
507
  tag: null | string;
846
508
  }
509
+ /**
510
+ * A single help topic extracted from markdown docs at build time.
511
+ *
512
+ * Topics are authored using `:::help` remark container directives in markdown
513
+ * files under `docs/`. The extraction script (`scripts/extract-help.ts`) reads
514
+ * these directives and emits a generated TypeScript module that the HelpDrawer
515
+ * component imports.
516
+ *
517
+ * @see docs/design-docs/contextual-help-sidebar.md
518
+ */
519
+ interface HelpTopic {
520
+ /** URL-safe identifier — matches the #id in the :::help directive */
521
+ id: string;
522
+ /** Display title shown in the drawer header */
523
+ title: string;
524
+ /** Markdown body content (rendered in the drawer) */
525
+ body: string;
526
+ /** Source file path (for debugging / "Edit this page" links) */
527
+ source: string;
528
+ /** Related topic IDs — rendered as "See also" links */
529
+ related?: string[];
530
+ /** Tags for search/filtering */
531
+ tags?: string[];
532
+ }
533
+
534
+ /**
535
+ * lib/help-topics.ts
536
+ *
537
+ * Utility functions for help topic lookup and context-aware topic
538
+ * derivation. Used by the HelpDrawer and HelpContext to determine
539
+ * which topic to show based on the current router state.
540
+ *
541
+ * @see docs/design-docs/contextual-help-sidebar.md
542
+ */
543
+
544
+ /** Router state shape (matches Dashboard.tsx) */
545
+ interface RouterState {
546
+ focus?: string;
547
+ reportId?: string;
548
+ subTab?: string;
549
+ tab?: string;
550
+ view?: string;
551
+ }
552
+ /**
553
+ * Derive the default help topic from the current router state.
554
+ * This is the "help finds you" mechanism — the drawer shows a
555
+ * relevant topic based on what the user is currently viewing.
556
+ */
557
+ declare function deriveHelpTopic(routerState: RouterState): string;
558
+ /**
559
+ * Find a help topic by ID. Returns undefined if not found.
560
+ */
561
+ declare function findTopic(topicId: string): HelpTopic | undefined;
562
+ /**
563
+ * Simple client-side text search across help topics.
564
+ * Matches against title and body (case-insensitive substring).
565
+ * Returns all topics if query is empty.
566
+ */
567
+ declare function searchTopics(query: string): HelpTopic[];
568
+
569
+ /**
570
+ * HelpDrawer.tsx
571
+ *
572
+ * Sliding help panel for the AILF Studio dashboard. Anchored to the
573
+ * right edge of the tool viewport, pushes main content left (non-modal).
574
+ *
575
+ * The drawer:
576
+ * - Reads current topic from HelpContext
577
+ * - Renders markdown body via HelpMarkdown
578
+ * - Shows "See also" links for related topics
579
+ * - Includes a search bar for topic discovery
580
+ * - Supports back navigation through topic history
581
+ *
582
+ * @see docs/design-docs/contextual-help-sidebar.md
583
+ */
584
+ declare function HelpDrawer(): react_jsx_runtime.JSX.Element | null;
585
+
586
+ /**
587
+ * glossary.ts
588
+ *
589
+ * Centralized tooltip descriptions for all evaluation metrics.
590
+ *
591
+ * Every user-facing metric label in the Studio dashboard should use
592
+ * a description from this file. This ensures consistent wording across
593
+ * stat cards, table headers, and comparison views.
594
+ *
595
+ * @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
596
+ * @see docs/ARCHITECTURE.md (scoring model)
597
+ */
598
+ declare const GLOSSARY: {
599
+ readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
600
+ readonly docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.";
601
+ readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
602
+ readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
603
+ readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
604
+ readonly floor: "Score without any documentation. This tells you what the model already knows from its training data.";
605
+ readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
606
+ readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
607
+ readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
608
+ readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
609
+ readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
610
+ readonly score: "Weighted score for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
611
+ readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
612
+ readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
613
+ readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
614
+ readonly tests: "Number of test cases in this feature area.";
615
+ readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
616
+ readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
617
+ readonly retGapDelta: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.";
618
+ readonly efficiencyDelta: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.";
619
+ readonly baseline: "The reference run you're comparing against.";
620
+ readonly experiment: "The new run you're evaluating.";
621
+ readonly delta: "Difference between experiment and baseline. Positive means improvement, negative means regression.";
622
+ readonly change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
623
+ readonly lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.";
624
+ readonly judgmentReason: "The grading model's natural language explanation of what went wrong.";
625
+ readonly healthStrong: "Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.";
626
+ readonly healthAttention: "Feature areas scoring 70–79. These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.";
627
+ readonly healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.";
628
+ readonly negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.";
629
+ readonly weakAreas: "Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.";
630
+ readonly docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation is actively misleading the model. These docs need to be rewritten or removed.";
631
+ readonly retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.";
632
+ readonly dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).";
633
+ readonly efficiencyAnomalies: "Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.";
634
+ readonly docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.";
635
+ readonly retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.";
636
+ readonly strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.";
637
+ readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
638
+ readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
639
+ readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
640
+ readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
641
+ readonly confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.";
642
+ readonly agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
643
+ readonly searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
644
+ readonly docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
645
+ readonly externalDomains: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.";
646
+ readonly avgDocPagesVisited: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.";
647
+ readonly avgSearchesPerformed: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.";
648
+ readonly avgNetworkTimeMs: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.";
649
+ readonly totalRequests: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.";
650
+ readonly totalBytesDownloaded: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.";
651
+ readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
652
+ readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
653
+ readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
654
+ readonly areaDelta: "Score change for this area compared to the previous evaluation run.";
655
+ readonly sourceProduction: "Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.";
656
+ readonly sourceBranch: "Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.";
657
+ readonly sourceLocal: "Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.";
658
+ readonly reportScore: "The overall weighted score for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.";
659
+ readonly reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.";
660
+ readonly reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.";
661
+ readonly modeBaseline: "Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).";
662
+ readonly modeFull: "Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.";
663
+ readonly modeAgentic: "Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?";
664
+ readonly modeObserved: "Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.";
665
+ readonly modeDebug: "Debug mode — a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.";
666
+ readonly triggerManual: "Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.";
667
+ readonly triggerCi: "CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.";
668
+ readonly triggerSchedule: "Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.";
669
+ readonly triggerWebhook: "Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.";
670
+ readonly triggerCrossRepo: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
671
+ };
672
+
673
+ /**
674
+ * queries.ts
675
+ *
676
+ * GROQ queries for the AILF Studio dashboard.
677
+ *
678
+ * All dashboard views are powered by GROQ — no backend needed.
679
+ * These queries run directly against the Sanity Content Lake using
680
+ * the Studio's built-in client.
681
+ *
682
+ * @see docs/design-docs/report-store/architecture.md — Query capabilities
683
+ */
684
+ /**
685
+ * Fetch the N most recent reports, optionally filtered by source and/or mode.
686
+ *
687
+ * Used by: LatestReports view, Dashboard overview
688
+ */
689
+ declare const latestReportsQuery: string;
690
+ /**
691
+ * Fetch score data points for a time range, projected into a chart-friendly shape.
692
+ *
693
+ * Used by: ScoreTimeline view
694
+ */
695
+ declare const scoreTimelineQuery: string;
696
+ /**
697
+ * Fetch a single report by ID with full detail.
698
+ *
699
+ * Used by: ReportDetail view
700
+ */
701
+ declare const reportDetailQuery = "\n *[_type == \"ailf.report\" && reportId == $reportId][0] {\n _id,\n reportId,\n completedAt,\n durationMs,\n tag,\n provenance,\n summary,\n comparison\n }\n";
702
+ /**
703
+ * Find all reports that evaluated a specific Sanity document or perspective.
704
+ *
705
+ * Used by: ContentImpact view (answer: "what did my edit do to scores?")
706
+ *
707
+ * Supports optional source/mode filtering via the shared filter helpers.
708
+ * When $documentId and $perspective are both null, the filter clause
709
+ * `(null in [] || ...)` evaluates to false — callers should use
710
+ * `recentDocumentEvalsQuery` for the browse-mode (no search) case.
711
+ */
712
+ declare const contentImpactQuery: string;
713
+ /**
714
+ * Browse recent reports that have document-level targeting or perspectives.
715
+ *
716
+ * Used by: ContentImpact view browse mode (no search active).
717
+ * Shows the most recent document-scoped evaluations to help users discover
718
+ * what content has been evaluated recently.
719
+ */
720
+ declare const recentDocumentEvalsQuery: string;
721
+ /** All unique targetDocuments across reports (for autocomplete) */
722
+ declare const distinctTargetDocumentsQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.targetDocuments)].provenance.targetDocuments[])\n";
723
+ /**
724
+ * Search articles by title, slug, or _id.
725
+ *
726
+ * Used by: ContentImpact document search autocomplete.
727
+ * Returns a lightweight projection for the dropdown — title, slug, section path, and _id.
728
+ * The `score()` function ranks title matches highest, then slug, then _id.
729
+ *
730
+ * Includes all document versions (published, drafts, perspectives) so the UI
731
+ * can show provenance badges. The `_id` prefix determines the version type:
732
+ * - No prefix → published (production)
733
+ * - `drafts.` → unpublished draft
734
+ * - `versions.<perspectiveId>.` → content release perspective
735
+ */
736
+ declare const articleSearchQuery = "\n *[_type == \"article\"\n && (\n title match $query + \"*\"\n || slug.current match $query + \"*\"\n || _id match $query + \"*\"\n )\n ] | score(\n boost(title match $query + \"*\", 3),\n boost(slug.current match $query + \"*\", 2),\n boost(_id match $query + \"*\", 1)\n ) [0...40] {\n _id,\n title,\n \"slug\": slug.current,\n \"section\": primarySection->{ \"slug\": slug.current, \"title\": title }\n }\n";
737
+ /** All unique perspectives across reports (for autocomplete) */
738
+ declare const distinctPerspectivesQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.source.perspective)].provenance.source.perspective)\n";
739
+ /**
740
+ * Fetch two reports by their IDs for comparison.
741
+ *
742
+ * Used by: ComparisonView — user selects two reports to compare
743
+ */
744
+ declare const comparisonPairQuery = "\n *[_type == \"ailf.report\" && reportId in [$baselineId, $experimentId]] {\n _id,\n reportId,\n completedAt,\n tag,\n provenance,\n summary\n }\n";
745
+ /** All unique source names (for filter dropdowns) */
746
+ declare const distinctSourcesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.source.name)\n";
747
+ /** All unique modes (for filter dropdowns) */
748
+ declare const distinctModesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.mode)\n";
749
+ /** All unique feature areas (for filter dropdowns) */
750
+ declare const distinctAreasQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.areas[])\n";
751
+ /** All unique trigger types (for filter dropdowns) */
752
+ declare const distinctTriggersQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.trigger.type)\n";
753
+
754
+ /**
755
+ * schema/eval-request.ts
756
+ *
757
+ * Sanity document schema for `ailf.evalRequest` — an intent document that
758
+ * requests an evaluation pipeline run.
759
+ *
760
+ * The Studio creates this document programmatically (e.g. from the release
761
+ * action component). A Sanity webhook watches for new `ailf.evalRequest`
762
+ * documents with `status == "pending"` and dispatches a GitHub Actions
763
+ * workflow. The webhook handler updates `status` to "dispatched", and a
764
+ * callback from the pipeline sets it to "completed" or "failed".
765
+ *
766
+ * Intent documents are immutable — all fields are `readOnly: true`. The
767
+ * document is created once and only updated server-side by the webhook
768
+ * handler or pipeline callback.
769
+ */
770
+ declare const evalRequestSchema: {
771
+ type: "document";
772
+ name: "ailf.evalRequest";
773
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
774
+ preview?: sanity.PreviewConfig<{
775
+ perspective: string;
776
+ status: string;
777
+ tasks: string;
778
+ }, Record<string, unknown>> | undefined;
779
+ };
780
+
781
+ /**
782
+ * schema/feature-area.ts
783
+ *
784
+ * Sanity document schema for `ailf.featureArea` — a feature area that groups
785
+ * related evaluation tasks for score aggregation and filtering.
786
+ *
787
+ * Feature areas are lightweight metadata documents. They exist primarily to
788
+ * provide referential integrity (tasks reference areas by document reference
789
+ * instead of plain strings) and to enable Studio-based browsing/filtering.
790
+ *
791
+ * Initial areas (migrated from YAML filenames): groq, frameworks, functions,
792
+ * nextjs-live, studio-setup, visual-editing.
793
+ *
794
+ * @see docs/design-docs/tasks-as-content.md
795
+ */
796
+ declare const featureAreaSchema: {
797
+ type: "document";
798
+ name: "ailf.featureArea";
799
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
800
+ preview?: sanity.PreviewConfig<{
801
+ areaId: string;
802
+ description: string;
803
+ }, Record<string, unknown>> | undefined;
804
+ };
805
+
806
+ /**
807
+ * schema/reference-solution.ts
808
+ *
809
+ * Sanity document schema for `ailf.referenceSolution` — a gold-standard
810
+ * implementation that demonstrates the correct approach for a task.
811
+ *
812
+ * Reference solutions contain code blocks and prose explaining why the
813
+ * approach is correct. They are referenced by `ailf.task` documents.
814
+ *
815
+ * @see docs/design-docs/tasks-as-content.md
816
+ */
817
+ declare const referenceSolutionSchema: {
818
+ type: "document";
819
+ name: "ailf.referenceSolution";
820
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
821
+ preview?: sanity.PreviewConfig<{
822
+ title: string;
823
+ }, Record<"title", any>> | undefined;
824
+ };
825
+
826
+ /**
827
+ * schema/report.ts
828
+ *
829
+ * Sanity document schema for `ailf.report` — the persisted evaluation report.
830
+ *
831
+ * This schema defines how reports appear in Sanity Studio and enables
832
+ * GROQ queries for the dashboard. The document shape mirrors the
833
+ * `Report` type in `packages/eval/src/pipeline/types.ts`.
834
+ *
835
+ * Reports are immutable events (P1) — once created, they should not be
836
+ * edited. The schema uses `readOnly: true` on all fields to enforce this.
837
+ *
838
+ * @see docs/design-docs/report-store/domain-model.md
839
+ * @see docs/design-docs/report-store/architecture.md
840
+ */
841
+ declare const reportSchema: {
842
+ type: "document";
843
+ name: "ailf.report";
844
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
845
+ preview?: sanity.PreviewConfig<{
846
+ completedAt: string;
847
+ mode: string;
848
+ overall: string;
849
+ tag: string;
850
+ }, Record<string, unknown>> | undefined;
851
+ };
852
+
853
+ /**
854
+ * schema/task.ts
855
+ *
856
+ * Sanity document schema for `ailf.task` — an evaluation task definition.
857
+ *
858
+ * This is the core unit of the AI Literacy Framework. A task defines:
859
+ * - What the LLM should implement (the task prompt)
860
+ * - Which docs are relevant (canonical doc references)
861
+ * - How to grade the output (assertions with rubric templates)
862
+ * - A gold-standard implementation (reference solution)
863
+ * - When/how the task runs (execution controls)
864
+ *
865
+ * ## Execution paths
866
+ *
867
+ * Published tasks are automatically discovered by the pipeline — no
868
+ * registration step needed. There are four ways to execute a task:
869
+ *
870
+ * 1. **Run Task Eval** — click ▶ on any ailf.task document in Studio.
871
+ * Creates an ailf.evalRequest scoped to this task. Webhook dispatches
872
+ * the pipeline; score appears inline when complete (~10–15 min).
873
+ * 2. **Run AI Eval** — click on a content release page. Auto-scopes to
874
+ * tasks whose canonical docs are in the release.
875
+ * 3. **CLI** — `ailf pipeline --task <id>` or `ailf pipeline --area <area>`.
876
+ * 4. **Scheduled** — GitHub Actions cron (daily baseline, weekly full).
877
+ *
878
+ * Tasks can be authored natively in Studio or mirrored from external
879
+ * repositories. Mirrored tasks have a read-only `origin` block that
880
+ * tracks their source repo provenance.
881
+ *
882
+ * @see docs/CONTRIBUTING_TASKS.md#running-your-task — full execution guide
883
+ * @see docs/design-docs/tasks-as-content.md
884
+ * @see docs/design-docs/tasks-as-content.md#decision-8-domain-specific-assertion-types-not-a-promptfoo-subset
885
+ */
886
+ declare const taskSchema: {
887
+ type: "document";
888
+ name: "ailf.task";
889
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
890
+ preview?: sanity.PreviewConfig<{
891
+ area: string;
892
+ description: string;
893
+ id: string;
894
+ origin: string;
895
+ ownership: string;
896
+ status: string;
897
+ }, Record<string, unknown>> | undefined;
898
+ };
899
+
900
+ /**
901
+ * schema/webhook-config.ts
902
+ *
903
+ * Sanity document schema for `ailf.webhookConfig` — the "evaluate on publish"
904
+ * toggle and webhook-triggered evaluation settings.
905
+ *
906
+ * This is a singleton document (only one should exist) that controls
907
+ * whether content changes automatically trigger evaluation pipelines.
908
+ *
909
+ * @see docs/design-docs/report-store/visibility-workflows.md
910
+ */
911
+ declare const webhookConfigSchema: {
912
+ type: "document";
913
+ name: "ailf.webhookConfig";
914
+ } & Omit<sanity.DocumentDefinition, "preview"> & {
915
+ preview?: sanity.PreviewConfig<{
916
+ enabled: string;
917
+ }, Record<string, unknown>> | undefined;
918
+ };
919
+
920
+ /**
921
+ * tool.tsx
922
+ *
923
+ * Sanity Studio tool definition for the AILF dashboard.
924
+ *
925
+ * Registers as a top-level Studio tool accessible from the sidebar.
926
+ * Defines URL-based routing so each view is bookmarkable and
927
+ * supports browser back/forward navigation.
928
+ *
929
+ * Route structure:
930
+ * /ailf → Latest Reports (home)
931
+ * /ailf/report/:reportId → Report Detail
932
+ * /ailf/timeline → Score Timeline
933
+ * /ailf/compare → Compare
934
+ */
935
+
936
+ /**
937
+ * AILF Dashboard tool configuration.
938
+ *
939
+ * Add to your sanity.config.ts:
940
+ * ```ts
941
+ * import { ailfTool } from "@sanity/ailf-studio"
942
+ *
943
+ * export default defineConfig({
944
+ * // ...
945
+ * tools: [ailfTool()],
946
+ * })
947
+ * ```
948
+ */
949
+ interface AilfToolOptions {
950
+ name?: string;
951
+ title?: string;
952
+ }
953
+ declare function ailfTool(options?: AilfToolOptions): Tool;
847
954
 
848
955
  /**
849
956
  * AILF Studio plugin — registers the report schema, dashboard tool,
@@ -855,4 +962,4 @@ interface TimelineDataPoint {
855
962
  */
856
963
  declare const ailfPlugin: sanity.Plugin<void>;
857
964
 
858
- export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, MirrorBanner, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, taskSchema, webhookConfigSchema };
965
+ export { AssertionInput, CanonicalDocInput, type ComparisonData, type ContentImpactItem, GLOSSARY, GraduateToNativeAction, HelpDrawer, HelpProvider, type HelpTopic, MirrorBanner, type ProvenanceData, ReleasePicker, type ReportDetail, type ReportListItem, type RunEvaluationActionOptions, RunTaskEvaluationAction, type ScoreItem, type SummaryData, SyncStatusBadge, type TimelineDataPoint, ailfPlugin, ailfTool, articleSearchQuery, comparisonPairQuery, contentImpactQuery, createRunEvaluationAction, deriveHelpTopic, distinctAreasQuery, distinctModesQuery, distinctPerspectivesQuery, distinctSourcesQuery, distinctTargetDocumentsQuery, distinctTriggersQuery, evalRequestSchema, featureAreaSchema, findTopic, latestReportsQuery, recentDocumentEvalsQuery, referenceSolutionSchema, reportDetailQuery, reportSchema, scoreTimelineQuery, searchTopics, taskSchema, useHelp, webhookConfigSchema };