@sanity/ailf-studio 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +255 -0
- package/dist/index.d.ts +703 -0
- package/dist/index.js +5452 -0
- package/package.json +61 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
import * as sanity from 'sanity';
|
|
2
|
+
import { DocumentActionComponent, ReleaseActionComponent, Tool } from 'sanity';
|
|
3
|
+
import * as react_jsx_runtime from 'react/jsx-runtime';
|
|
4
|
+
import { DocumentRef } from './document-ref.js';
|
|
5
|
+
|
|
6
|
+
/**
 * actions/GraduateToNativeAction.tsx
 *
 * Sanity Studio document action that "graduates" a mirrored task to
 * a native task by removing the `origin` field.
 *
 * This is a one-way, irreversible operation. Once graduated:
 * - The task becomes fully editable in Studio
 * - Future pipeline mirror syncs will NOT overwrite it (the mirror
 *   uses createOrReplace with the same _id, but since origin is gone,
 *   the document-level readOnly check returns false and the task
 *   behaves like any other native task)
 * - The task's _id is unchanged — it keeps the mirror prefix
 *   (ailf.task.mirror.*) but that's just an ID string, not a
 *   behavioral marker
 *
 * The action only appears on ailf.task documents that have an `origin`
 * field (i.e., mirrored tasks). Native tasks never see it.
 *
 * @see docs/exec-plans/active/tasks-as-content/phase-5-content-lake-mirroring.md
 */
|
|
27
|
+
|
|
28
|
+
/**
 * Document action that "graduates" a mirrored `ailf.task` to a native task
 * by removing its `origin` field. The operation is one-way and irreversible,
 * and the action is only offered on tasks that still have an `origin` field.
 */
declare const GraduateToNativeAction: DocumentActionComponent;
|
|
29
|
+
|
|
30
|
+
/**
 * components/MirrorBanner.tsx
 *
 * Informational banner shown at the top of mirrored task documents.
 * Communicates that the task is managed in an external repo and links
 * to the source file on GitHub.
 *
 * Paired with `SyncStatusBadge` to show both the source and freshness
 * of the mirror.
 *
 * Props for `MirrorBanner`: a single `origin` object whose fields are all
 * optional.
 *
 * @see docs/exec-plans/active/tasks-as-content/phase-5-content-lake-mirroring.md
 */
interface MirrorBannerProps {
  /** Source-repo provenance of the mirrored task. */
  origin: {
    // NOTE(review): `repo` vs. `repoOwner`/`repoName` — presumably the
    // combined and split forms of the same repository; confirm.
    repo?: string;
    repoOwner?: string;
    repoName?: string;
    path?: string;
    branch?: string;
    commitSha?: string;
    // NOTE(review): presumably the same ISO 8601 timestamp that
    // `SyncStatusBadgeProps.lastSyncedAt` documents; confirm.
    lastSyncedAt?: string;
  };
}
|
|
53
|
+
/**
 * Renders the informational banner for a mirrored task document.
 *
 * @param props - Component props; `props.origin` carries the source-repo details.
 * @returns The banner as a JSX element.
 */
declare function MirrorBanner({ origin }: MirrorBannerProps): react_jsx_runtime.JSX.Element;
|
|
54
|
+
|
|
55
|
+
/**
 * components/SyncStatusBadge.tsx
 *
 * Displays the sync freshness of a mirrored task as a colored badge.
 * Green = recently synced, yellow = getting stale, red = outdated.
 *
 * Uses the `origin.lastSyncedAt` timestamp that the pipeline sets
 * on every mirror upsert (Phase 5a).
 *
 * Props for `SyncStatusBadge`.
 *
 * @see docs/exec-plans/active/tasks-as-content/phase-5-content-lake-mirroring.md
 */
interface SyncStatusBadgeProps {
  /** ISO 8601 timestamp from origin.lastSyncedAt */
  lastSyncedAt: string;
  /** Optional: show the commit SHA alongside the badge */
  commitSha?: string;
  /** Font size (default: 0 for compact list view) */
  fontSize?: 0 | 1;
}
|
|
74
|
+
/**
 * Renders the sync-freshness badge for a mirrored task.
 *
 * @param props - See `SyncStatusBadgeProps`: the required `lastSyncedAt`
 *   timestamp plus the optional `commitSha` and `fontSize`.
 * @returns The badge as a JSX element.
 */
declare function SyncStatusBadge({ lastSyncedAt, commitSha, fontSize, }: SyncStatusBadgeProps): react_jsx_runtime.JSX.Element;
|
|
75
|
+
|
|
76
|
+
/**
 * actions/RunEvaluationAction.tsx
 *
 * Sanity Studio release action that triggers an AILF evaluation
 * for a content release. Appears as a button in the release detail
 * page's action bar.
 *
 * Uses the `releases.actions` extension point (Sanity v5) — follows
 * the same composable pattern as `document.actions`.
 *
 * ## Security model
 *
 * This component does NOT hold any secrets. Instead of calling
 * GitHub Actions directly (which would require a PAT in the browser
 * bundle), it creates an `ailf.evalRequest` document in the Content
 * Lake. A server-side Sanity webhook watches for these documents and
 * dispatches the GitHub Actions pipeline — the GitHub token lives
 * only on the server.
 *
 * ## Feedback model
 *
 * - On mount, queries for any existing report for this perspective
 * - If a previous result exists, shows score in the button label
 * - After creating the request, watches the eval request doc for
 *   status changes and polls for the resulting report
 * - Score persists in the button label (no auto-reset)
 * - Tooltip provides full context at every stage
 *
 * @see packages/eval/src/webhook/eval-request-handler.ts
 * @see .github/workflows/external-eval.yml
 */
|
|
107
|
+
|
|
108
|
+
/** Options accepted by `createRunEvaluationAction`. */
interface RunEvaluationActionOptions {
  /** Evaluation mode (default: "baseline") */
  mode?: "agentic" | "baseline" | "full" | "observed";
}
|
|
112
|
+
/**
 * Create a release action component that requests AILF evaluations.
 *
 * When the user clicks the button, the action creates an `ailf.evalRequest`
 * document in the Content Lake. A server-side Sanity webhook picks up
 * the document and dispatches a GitHub Actions pipeline — no secrets
 * are needed in the browser.
 *
 * The action automatically reads the dataset and project ID from the
 * Studio's workspace context, so the pipeline queries the same Content
 * Lake the editor is working in.
 *
 * On mount it queries for the most recent report matching this release's
 * perspective. If one exists, the button shows the score immediately —
 * the user can see the current AI literacy score before deciding whether
 * to re-run.
 *
 * Usage in `sanity.config.ts`:
 * ```ts
 * import { createRunEvaluationAction } from "@sanity/ailf-studio"
 *
 * export default defineConfig({
 *   // ...
 *   releases: {
 *     actions: (prev) => [
 *       ...prev,
 *       createRunEvaluationAction(),
 *     ],
 *   },
 * })
 * ```
 *
 * @param options - Optional settings; `options.mode` selects the evaluation
 *   mode (documented default: "baseline").
 * @returns A release action component for the `releases.actions` array.
 */
declare function createRunEvaluationAction(options?: RunEvaluationActionOptions): ReleaseActionComponent;
|
|
145
|
+
|
|
146
|
+
/**
 * glossary.ts
 *
 * Centralized tooltip descriptions for all evaluation metrics.
 *
 * Every user-facing metric label in the Studio dashboard should use
 * a description from this file. This ensures consistent wording across
 * stat cards, table headers, and comparison views.
 *
 * Each property is typed as its exact string literal, so the wording is
 * visible to consumers at the type level.
 *
 * @see docs/design-docs/scenario-matrix/evaluation-ceiling.md (three reference points)
 * @see docs/ARCHITECTURE.md (scoring model)
 */
declare const GLOSSARY: {
  readonly overallScore: "A weighted average across all feature areas: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).";
  readonly docLift: "How much the docs help, compared to the model's training data alone. This is the score with docs minus the score without. Higher is better.";
  readonly actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.";
  readonly retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.";
  readonly infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.";
  readonly floor: "Score without any documentation. This tells you what the model already knows from its training data.";
  readonly ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.";
  readonly actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.";
  readonly retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.";
  readonly efficiency: "What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).";
  readonly invertedRetGap: "⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.";
  readonly score: "Weighted score for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.";
  readonly taskCompletion: "Can the LLM implement the requested feature? Graded 0–100.";
  readonly codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0–100.";
  readonly docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0–100.";
  readonly tests: "Number of test cases in this feature area.";
  readonly overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.";
  readonly actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.";
  readonly retGapDelta: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.";
  readonly efficiencyDelta: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.";
  readonly baseline: "The reference run you're comparing against.";
  readonly experiment: "The new run you're evaluating.";
  readonly delta: "Difference between experiment and baseline. Positive means improvement, negative means regression.";
  readonly change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).";
  readonly lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.";
  readonly judgmentReason: "The grading model's natural language explanation of what went wrong.";
  readonly recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.";
  readonly totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.";
  readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
  readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
  readonly confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.";
  readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
  readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
  readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
};
|
|
194
|
+
|
|
195
|
+
/**
 * queries.ts
 *
 * GROQ queries for the AILF Studio dashboard.
 *
 * All dashboard views are powered by GROQ — no backend needed.
 * These queries run directly against the Sanity Content Lake using
 * the Studio's built-in client.
 *
 * @see docs/design-docs/report-store/architecture.md — Query capabilities
 */
|
|
206
|
+
/**
 * Fetch the N most recent reports, optionally filtered by source and/or mode.
 *
 * Used by: LatestReports view, Dashboard overview
 */
declare const latestReportsQuery: string;
|
|
212
|
+
/**
 * Fetch score data points for a time range, projected into a chart-friendly shape.
 *
 * Used by: ScoreTimeline view
 */
declare const scoreTimelineQuery: string;
|
|
218
|
+
/**
 * Fetch a single report by ID with full detail.
 *
 * Takes one GROQ parameter, `$reportId`, and returns the first matching
 * `ailf.report` document.
 *
 * Used by: ReportDetail view
 */
declare const reportDetailQuery = "\n *[_type == \"ailf.report\" && reportId == $reportId][0] {\n _id,\n reportId,\n completedAt,\n durationMs,\n tag,\n provenance,\n summary,\n comparison\n }\n";
|
|
224
|
+
/**
 * Find all reports that evaluated a specific Sanity document or perspective.
 *
 * Used by: ContentImpact view (answer: "what did my edit do to scores?")
 *
 * Supports optional source/mode filtering via the shared filter helpers.
 * When $documentId and $perspective are both null, the filter clause
 * `(null in [] || ...)` evaluates to false — callers should use
 * `recentDocumentEvalsQuery` for the browse-mode (no search) case.
 */
declare const contentImpactQuery: string;
|
|
235
|
+
/**
 * Browse recent reports that have document-level targeting or perspectives.
 *
 * Used by: ContentImpact view browse mode (no search active).
 * Shows the most recent document-scoped evaluations to help users discover
 * what content has been evaluated recently.
 */
declare const recentDocumentEvalsQuery: string;
|
|
243
|
+
/** All unique targetDocuments across reports (for autocomplete) */
declare const distinctTargetDocumentsQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.targetDocuments)].provenance.targetDocuments[])\n";
|
|
245
|
+
/**
 * Search articles by title, slug, or _id.
 *
 * Takes one GROQ parameter, `$query`, which is prefix-matched
 * (`match $query + "*"`); the result is sliced to `[0...40]`.
 *
 * Used by: ContentImpact document search autocomplete.
 * Returns a lightweight projection for the dropdown — title, slug, section path, and _id.
 * The `score()` function ranks title matches highest, then slug, then _id.
 *
 * Includes all document versions (published, drafts, perspectives) so the UI
 * can show provenance badges. The `_id` prefix determines the version type:
 * - No prefix → published (production)
 * - `drafts.` → unpublished draft
 * - `versions.<perspectiveId>.` → content release perspective
 */
declare const articleSearchQuery = "\n *[_type == \"article\"\n && (\n title match $query + \"*\"\n || slug.current match $query + \"*\"\n || _id match $query + \"*\"\n )\n ] | score(\n boost(title match $query + \"*\", 3),\n boost(slug.current match $query + \"*\", 2),\n boost(_id match $query + \"*\", 1)\n ) [0...40] {\n _id,\n title,\n \"slug\": slug.current,\n \"section\": primarySection->{ \"slug\": slug.current, \"title\": title }\n }\n";
|
|
259
|
+
/** All unique perspectives across reports (for autocomplete) */
declare const distinctPerspectivesQuery = "\n array::unique(*[_type == \"ailf.report\" && defined(provenance.source.perspective)].provenance.source.perspective)\n";
|
|
261
|
+
/**
 * Fetch two reports by their IDs for comparison.
 *
 * Takes two GROQ parameters, `$baselineId` and `$experimentId`, and matches
 * reports whose `reportId` is either one.
 *
 * Used by: ComparisonView — user selects two reports to compare
 */
declare const comparisonPairQuery = "\n *[_type == \"ailf.report\" && reportId in [$baselineId, $experimentId]] {\n _id,\n reportId,\n completedAt,\n tag,\n provenance,\n summary\n }\n";
|
|
267
|
+
/** All unique source names (for filter dropdowns) */
declare const distinctSourcesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.source.name)\n";
|
|
269
|
+
/** All unique modes (for filter dropdowns) */
declare const distinctModesQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.mode)\n";
|
|
271
|
+
/** All unique feature areas (for filter dropdowns) */
declare const distinctAreasQuery = "\n array::unique(*[_type == \"ailf.report\"].provenance.areas[])\n";
|
|
273
|
+
|
|
274
|
+
/**
 * schema/eval-request.ts
 *
 * Sanity document schema for `ailf.evalRequest` — an intent document that
 * requests an evaluation pipeline run.
 *
 * The Studio creates this document programmatically (e.g. from the release
 * action component). A Sanity webhook watches for new `ailf.evalRequest`
 * documents with `status == "pending"` and dispatches a GitHub Actions
 * workflow. The webhook handler updates `status` to "dispatched", and a
 * callback from the pipeline sets it to "completed" or "failed".
 *
 * Intent documents are immutable — all fields are `readOnly: true`. The
 * document is created once and only updated server-side by the webhook
 * handler or pipeline callback.
 */
declare const evalRequestSchema: {
  type: "document";
  name: "ailf.evalRequest";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `perspective` and `status`.
  preview?: sanity.PreviewConfig<{
    perspective: string;
    status: string;
  }, Record<string, unknown>> | undefined;
};
|
|
299
|
+
|
|
300
|
+
/**
 * schema/feature-area.ts
 *
 * Sanity document schema for `ailf.featureArea` — a feature area that groups
 * related evaluation tasks for score aggregation and filtering.
 *
 * Feature areas are lightweight metadata documents. They exist primarily to
 * provide referential integrity (tasks reference areas by document reference
 * instead of plain strings) and to enable Studio-based browsing/filtering.
 *
 * Initial areas (migrated from YAML filenames): groq, frameworks, functions,
 * nextjs-live, studio-setup, visual-editing.
 *
 * @see docs/design-docs/tasks-as-content.md
 */
declare const featureAreaSchema: {
  type: "document";
  name: "ailf.featureArea";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `areaId` and `description`.
  preview?: sanity.PreviewConfig<{
    areaId: string;
    description: string;
  }, Record<string, unknown>> | undefined;
};
|
|
324
|
+
|
|
325
|
+
/**
 * schema/reference-solution.ts
 *
 * Sanity document schema for `ailf.referenceSolution` — a gold-standard
 * implementation that demonstrates the correct approach for a task.
 *
 * Reference solutions contain code blocks and prose explaining why the
 * approach is correct. They are referenced by `ailf.task` documents.
 *
 * @see docs/design-docs/tasks-as-content.md
 */
declare const referenceSolutionSchema: {
  type: "document";
  name: "ailf.referenceSolution";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `language` and `title`.
  preview?: sanity.PreviewConfig<{
    language: string;
    title: string;
  }, Record<string, unknown>> | undefined;
};
|
|
345
|
+
|
|
346
|
+
/**
 * schema/report.ts
 *
 * Sanity document schema for `ailf.report` — the persisted evaluation report.
 *
 * This schema defines how reports appear in Sanity Studio and enables
 * GROQ queries for the dashboard. The document shape mirrors the
 * `Report` type in `packages/eval/src/pipeline/types.ts`.
 *
 * Reports are immutable events (P1) — once created, they should not be
 * edited. The schema uses `readOnly: true` on all fields to enforce this.
 *
 * @see docs/design-docs/report-store/domain-model.md
 * @see docs/design-docs/report-store/architecture.md
 */
declare const reportSchema: {
  type: "document";
  name: "ailf.report";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `completedAt`, `mode`, `overall`, and `tag`.
  preview?: sanity.PreviewConfig<{
    completedAt: string;
    mode: string;
    overall: string;
    tag: string;
  }, Record<string, unknown>> | undefined;
};
|
|
372
|
+
|
|
373
|
+
/**
 * schema/task.ts
 *
 * Sanity document schema for `ailf.task` — an evaluation task definition.
 *
 * This is the core unit of the AI Literacy Framework. A task defines:
 * - What the LLM should implement (the task prompt)
 * - Which docs are relevant (canonical doc references)
 * - How to grade the output (assertions with rubric templates)
 * - A gold-standard implementation (reference solution)
 * - When/how the task runs (execution controls)
 *
 * Tasks can be authored natively in Studio or mirrored from external
 * repositories. Mirrored tasks have a read-only `origin` block that
 * tracks their source repo provenance.
 *
 * @see docs/design-docs/tasks-as-content.md
 * @see docs/design-docs/tasks-as-content.md#decision-8-domain-specific-assertion-types-not-a-promptfoo-subset
 */
declare const taskSchema: {
  type: "document";
  name: "ailf.task";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `area`, `description`, `id`, and `origin`.
  preview?: sanity.PreviewConfig<{
    area: string;
    description: string;
    id: string;
    origin: string;
  }, Record<string, unknown>> | undefined;
};
|
|
403
|
+
|
|
404
|
+
/**
 * schema/webhook-config.ts
 *
 * Sanity document schema for `ailf.webhookConfig` — the "evaluate on publish"
 * toggle and webhook-triggered evaluation settings.
 *
 * This is a singleton document (only one should exist) that controls
 * whether content changes automatically trigger evaluation pipelines.
 *
 * @see docs/design-docs/report-store/visibility-workflows.md
 */
declare const webhookConfigSchema: {
  type: "document";
  name: "ailf.webhookConfig";
} & Omit<sanity.DocumentDefinition, "preview"> & {
  // Preview selection exposes `enabled`.
  preview?: sanity.PreviewConfig<{
    enabled: string;
  }, Record<string, unknown>> | undefined;
};
|
|
423
|
+
|
|
424
|
+
/**
 * tool.tsx
 *
 * Sanity Studio tool definition for the AILF dashboard.
 *
 * Registers as a top-level Studio tool accessible from the sidebar.
 * Defines URL-based routing so each view is bookmarkable and
 * supports browser back/forward navigation.
 *
 * Route structure:
 *   /ai-literacy                   → Latest Reports (home)
 *   /ai-literacy/report/:reportId  → Report Detail
 *   /ai-literacy/timeline          → Score Timeline
 *   /ai-literacy/compare           → Compare
 */
|
|
439
|
+
|
|
440
|
+
/**
 * AILF Dashboard tool configuration.
 *
 * Add to your sanity.config.ts:
 * ```ts
 * import { ailfTool } from "@sanity/ailf-studio"
 *
 * export default defineConfig({
 *   // ...
 *   tools: [ailfTool()],
 * })
 * ```
 */
interface AilfToolOptions {
  // NOTE(review): presumably overrides the tool's `name` — confirm in tool.tsx.
  name?: string;
  // NOTE(review): presumably overrides the tool's display title — confirm in tool.tsx.
  title?: string;
}
|
|
457
|
+
/**
 * Creates the AILF dashboard tool definition, for use in the `tools` array
 * of `sanity.config.ts` (`tools: [ailfTool()]`).
 *
 * @param options - Optional `name` / `title` settings (see `AilfToolOptions`).
 * @returns A Sanity `Tool`.
 */
declare function ailfTool(options?: AilfToolOptions): Tool;
|
|
458
|
+
|
|
459
|
+
/**
 * types.ts
 *
 * Shared types for the AILF Studio dashboard plugin.
 *
 * These mirror the shapes returned by the GROQ queries in queries.ts.
 * They're kept separate from the eval package types to avoid a build
 * dependency — the Studio plugin reads from Sanity directly.
 *
 * Cross-package contract types (DocumentRef, ScoreGrade, scoreGrade) are
 * imported from @sanity/ailf-shared — the single source of truth.
 */
|
|
471
|
+
|
|
472
|
+
/** Comparison data as stored in Sanity */
interface ComparisonData {
  /** Score deltas; only `docLift` and `overall` are always present. */
  deltas: {
    docLift: number;
    overall: number;
    actualDelta?: number;
    retrievalGapDelta?: number;
    infrastructureEfficiencyDelta?: number;
  };
  generatedAt: string;
  improved: string[];
  noiseThreshold: number;
  regressed: string[];
  unchanged: string[];
}
|
|
487
|
+
/** Shape returned by contentImpactQuery and recentDocumentEvalsQuery */
interface ContentImpactItem {
  _id: string;
  areas: null | string[];
  comparisonDelta: null | number;
  completedAt: string;
  durationMs: number;
  improved: null | string[];
  mode: string;
  models: null | string[];
  overall: number;
  perspective: null | string;
  regressed: null | string[];
  reportId: string;
  /** Trimmed per-area scores, or `null`. */
  scores: null | {
    actualScore?: number;
    docLift: number;
    feature: string;
    totalScore: number;
  }[];
  source: string;
  tag: null | string;
  targetDocuments: null | string[];
  trigger: null | string;
}
|
|
512
|
+
/** Provenance data as stored in Sanity */
interface ProvenanceData {
  areas: string[];
  contextHash?: string;
  git?: {
    branch: string;
    prNumber?: number;
    repo: string;
    sha: string;
  };
  graderModel: string;
  mode: string;
  models: {
    id: string;
    label: string;
  }[];
  /** @deprecated Use `promptfooUrls` when available */
  promptfooUrl?: string;
  /** Per-mode Promptfoo share URLs (one per sub-eval) */
  promptfooUrls?: {
    mode: string;
    url: string;
  }[];
  source: {
    baseUrl: string;
    dataset?: string;
    name: string;
    perspective?: string;
    projectId?: string;
  };
  targetDocuments?: string[];
  taskIds?: string[];
  trigger: {
    callerRef?: string;
    callerRepo?: string;
    documentId?: string;
    runId?: string;
    schedule?: string;
    source?: string;
    type: string;
    workflow?: string;
  };
}
|
|
555
|
+
/** Shape returned by reportDetailQuery */
interface ReportDetail {
  _id: string;
  comparison: ComparisonData | null;
  completedAt: string;
  durationMs: number;
  provenance: ProvenanceData;
  reportId: string;
  summary: SummaryData;
  tag: null | string;
}
|
|
566
|
+
/** Shape returned by latestReportsQuery */
interface ReportListItem {
  _id: string;
  actualScore?: number | null;
  areas: string[];
  comparisonDelta: null | number;
  completedAt: string;
  docLift: number;
  durationMs: number;
  evaluationMode?: string | null;
  git: null | {
    branch: string;
    prNumber?: number;
    repo: string;
    sha: string;
  };
  improved: null | string[];
  mode: string;
  models: string[];
  overall: number;
  /** Content release perspective ID (when evaluated with --sanity-perspective) */
  perspective?: null | string;
  promptfooUrl: null | string;
  promptfooUrls: null | {
    mode: string;
    url: string;
  }[];
  regressed: null | string[];
  reportId: string;
  retrievalGap?: number | null;
  scores: ScoreItem[];
  source: string;
  tag: null | string;
  /** Target document slugs (when evaluated with --changed-docs) */
  targetDocuments?: null | string[];
  trigger: string;
}
|
|
603
|
+
/** Per-area score (shared between list and detail views) */
interface ScoreItem {
  codeCorrectness: number;
  docCoverage: number;
  docLift: number;
  /** Sanity documents used for this feature area's evaluation */
  documents?: DocumentRef[];
  feature: string;
  taskCompletion: number;
  testCount: number;
  totalScore: number;
  /** Score from agent-retrieved docs (only in full-mode reports) */
  actualScore?: number;
  /** Ceiling − actual: quality lost to discoverability (only in full-mode reports) */
  retrievalGap?: number;
  /** Actual / ceiling (0–1): agent effectiveness (only in full-mode reports) */
  infrastructureEfficiency?: number | null;
  /** True when agents outperform by not finding bad docs */
  invertedRetrievalGap?: boolean;
  /** Floor score — model knowledge alone */
  floorScore?: number;
  /** Ceiling score — gold-standard docs injected */
  ceilingScore?: number;
}
|
|
627
|
+
/** A single gap/recommendation from gap analysis */
interface RecommendationGap {
  affectedTaskIds: string[];
  area: string;
  bottleneckDimensions: string[];
  confidence: "high" | "low" | "medium";
  estimatedLift: number;
  failureMode: string;
  priority: number;
  remediation: string;
}
|
|
638
|
+
/** Gap analysis recommendations stored in Sanity */
interface RecommendationsData {
  gaps: RecommendationGap[];
  generatedAt: string;
  totalPotentialLift: number;
}
|
|
644
|
+
/** A single low-scoring grader judgment stored in reports */
interface JudgmentData {
  /** Docs the task expected the model to use */
  canonicalDocs?: DocumentRef[];
  dimension: string;
  modelId: string;
  reason: string;
  score: number;
  taskId: string;
}
|
|
654
|
+
/** Summary data as stored in Sanity */
interface SummaryData {
  belowCritical: string[];
  /** All Sanity documents used across the entire evaluation */
  documentManifest?: DocumentRef[];
  evaluationMode?: string;
  lowestArea: string;
  lowestScore: number;
  /** Averages; only `avgDocLift` and `avgScore` are always present. */
  overall: {
    avgDocLift: number;
    avgScore: number;
    avgCeilingScore?: number;
    avgFloorScore?: number;
    avgActualScore?: number;
    avgRetrievalGap?: number;
    avgInfrastructureEfficiency?: number;
  };
  /** Low-scoring grader judgments — the raw "red text" explaining failures */
  lowScoringJudgments: JudgmentData[] | null;
  /** Gap analysis recommendations (when gap analysis was run) */
  recommendations: null | RecommendationsData;
  scores: ScoreItem[];
  timestamp: string;
}
|
|
678
|
+
/** Shape returned by scoreTimelineQuery */
interface TimelineDataPoint {
  _id: string;
  actualScore?: number | null;
  completedAt: string;
  mode: string;
  overall: number;
  /** Per-feature scores for this data point. */
  scores: {
    feature: string;
    totalScore: number;
    actualScore?: number;
  }[];
  source: string;
  tag: null | string;
}
|
|
693
|
+
|
|
694
|
+
/**
 * AILF Studio plugin — registers the report schema, dashboard tool,
 * and document actions (Graduate to Native for mirrored tasks).
 *
 * This is the recommended way to install the plugin. It registers
 * schemas, the dashboard tool, and document-level actions in one call.
 */
declare const ailfPlugin: sanity.Plugin<void>;
|
|
702
|
+
|
|
703
|
+
// Public surface of the package. Names carrying the `type` modifier are
// type-only exports; the rest are value exports.
export {
  type ComparisonData,
  type ContentImpactItem,
  GLOSSARY,
  GraduateToNativeAction,
  MirrorBanner,
  type ProvenanceData,
  type ReportDetail,
  type ReportListItem,
  type RunEvaluationActionOptions,
  type ScoreItem,
  type SummaryData,
  SyncStatusBadge,
  type TimelineDataPoint,
  ailfPlugin,
  ailfTool,
  articleSearchQuery,
  comparisonPairQuery,
  contentImpactQuery,
  createRunEvaluationAction,
  distinctAreasQuery,
  distinctModesQuery,
  distinctPerspectivesQuery,
  distinctSourcesQuery,
  distinctTargetDocumentsQuery,
  evalRequestSchema,
  featureAreaSchema,
  latestReportsQuery,
  recentDocumentEvalsQuery,
  referenceSolutionSchema,
  reportDetailQuery,
  reportSchema,
  scoreTimelineQuery,
  taskSchema,
  webhookConfigSchema,
};
|