@sanity/ailf-studio 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -64
- package/dist/index.d.ts +33 -0
- package/dist/index.js +801 -360
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -13,28 +13,13 @@ AILF reports are stored.
|
|
|
13
13
|
|
|
14
14
|
### 1. Add the dependency
|
|
15
15
|
|
|
16
|
-
#### Continuous releases (recommended for external projects)
|
|
17
|
-
|
|
18
|
-
Every merge to `main` that touches `packages/studio/` automatically publishes
|
|
19
|
-
via [pkg.pr.new](https://pkg.pr.new). Install the latest main build:
|
|
20
|
-
|
|
21
|
-
```bash
|
|
22
|
-
pnpm add https://pkg.pr.new/sanity-labs/ai-literacy-framework/@sanity/ailf-studio@main
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
Or pin to a specific commit:
|
|
26
|
-
|
|
27
16
|
```bash
|
|
28
|
-
pnpm add
|
|
17
|
+
pnpm add @sanity/ailf-studio
|
|
29
18
|
```
|
|
30
19
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#### PR preview packages
|
|
35
|
-
|
|
36
|
-
PRs labeled `trigger: preview` also publish preview packages. Install URLs are
|
|
37
|
-
posted as PR comments automatically.
|
|
20
|
+
> **Note:** The package is published with `restricted` access to the `@sanity`
|
|
21
|
+
> npm scope. You need an npm token with read access — see the root
|
|
22
|
+
> [README](../../README.md#obtain-secrets) for how to obtain one.
|
|
38
23
|
|
|
39
24
|
#### Within the monorepo
|
|
40
25
|
|
|
@@ -120,7 +105,8 @@ export default defineConfig({
|
|
|
120
105
|
|
|
121
106
|
## Dashboard Views
|
|
122
107
|
|
|
123
|
-
The plugin provides
|
|
108
|
+
The plugin provides three tab views plus a detail drill-down, accessible from
|
|
109
|
+
the **AI Literacy Framework** tool in the Studio sidebar.
|
|
124
110
|
|
|
125
111
|
### Latest Reports
|
|
126
112
|
|
|
@@ -128,11 +114,14 @@ A card list of the most recent evaluation reports. Each card shows:
|
|
|
128
114
|
|
|
129
115
|
- Overall score, doc lift, and lowest-scoring area
|
|
130
116
|
- Evaluation mode, source, and trigger type
|
|
131
|
-
- Git metadata (branch, PR number) when available
|
|
117
|
+
- Git metadata (branch, PR number, origin repo) when available
|
|
132
118
|
- Auto-comparison delta against the previous run
|
|
133
119
|
|
|
134
120
|
Click any card to navigate to the Report Detail view.
|
|
135
121
|
|
|
122
|
+
The view includes a **search bar** for filtering reports by document slug, area,
|
|
123
|
+
or content release perspective.
|
|
124
|
+
|
|
136
125
|
### Score Timeline
|
|
137
126
|
|
|
138
127
|
A line chart of overall and per-area scores over time. Filterable by:
|
|
@@ -153,25 +142,22 @@ report from dropdowns, then view:
|
|
|
153
142
|
- Per-model deltas (when both reports include per-model breakdowns)
|
|
154
143
|
- Noise threshold classification
|
|
155
144
|
|
|
156
|
-
### Content Impact
|
|
157
|
-
|
|
158
|
-
Find all evaluation reports related to a specific Sanity document. Enter a
|
|
159
|
-
document ID to see:
|
|
160
|
-
|
|
161
|
-
- Which evaluations included that document in their target set
|
|
162
|
-
- Score trends for that document's feature area over time
|
|
163
|
-
- Whether edits to the document improved or regressed scores
|
|
164
|
-
|
|
165
145
|
### Report Detail
|
|
166
146
|
|
|
167
|
-
Full drill-down into a single report
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
- Per-
|
|
172
|
-
|
|
173
|
-
-
|
|
174
|
-
|
|
147
|
+
Full drill-down into a single report (navigated from Latest Reports or a direct
|
|
148
|
+
URL):
|
|
149
|
+
|
|
150
|
+
- **Overview stats** — composite score, doc lift, cost, duration
|
|
151
|
+
- **Per-area score table** with all dimensions (task completion, code
|
|
152
|
+
correctness, doc coverage, lift from docs)
|
|
153
|
+
- **Three-layer table** — floor / ceiling / actual decomposition (when
|
|
154
|
+
available)
|
|
155
|
+
- **Per-model breakdowns** with cost-per-quality-point
|
|
156
|
+
- **Judgment list** — individual grader verdicts with reasoning
|
|
157
|
+
- **Recommendations** — gap analysis remediation suggestions (when available)
|
|
158
|
+
- **Provenance card** — trigger, git info (branch, PR, origin repo), grader
|
|
159
|
+
model, context hash, eval fingerprint
|
|
160
|
+
- **Auto-comparison summary** against the previous comparable run
|
|
175
161
|
|
|
176
162
|
## Filtering
|
|
177
163
|
|
|
@@ -198,13 +184,13 @@ export default defineConfig({
|
|
|
198
184
|
})
|
|
199
185
|
```
|
|
200
186
|
|
|
201
|
-
Reports are written by the evaluation pipeline (`
|
|
202
|
-
|
|
203
|
-
|
|
187
|
+
Reports are written by the evaluation pipeline (`ailf pipeline --publish`). See
|
|
188
|
+
the [report store design docs](../../docs/design-docs/report-store/index.md) for
|
|
189
|
+
the full architecture.
|
|
204
190
|
|
|
205
191
|
## Exported API
|
|
206
192
|
|
|
207
|
-
The plugin exports building blocks for custom views or extensions
|
|
193
|
+
The plugin exports building blocks for custom views or extensions.
|
|
208
194
|
|
|
209
195
|
### Plugin & Tool
|
|
210
196
|
|
|
@@ -230,6 +216,7 @@ The plugin exports building blocks for custom views or extensions:
|
|
|
230
216
|
| ------------------- | --------------------------------------------------------------------------------------------- |
|
|
231
217
|
| `AssertionInput` | Custom input for task assertions with contextual type descriptions and monospace code styling |
|
|
232
218
|
| `CanonicalDocInput` | Custom input for canonical doc references with polymorphic resolution type help |
|
|
219
|
+
| `ReleasePicker` | Content release perspective picker for evaluation scoping |
|
|
233
220
|
| `MirrorBanner` | Banner showing repo source, sync status, and provenance for mirrored tasks |
|
|
234
221
|
| `SyncStatusBadge` | Colored badge (green/yellow/red) showing sync freshness of mirrored tasks |
|
|
235
222
|
|
|
@@ -240,31 +227,58 @@ The plugin exports building blocks for custom views or extensions:
|
|
|
240
227
|
| `GraduateToNativeAction` | Converts a mirrored (read-only) task to a native (editable) task by removing origin |
|
|
241
228
|
| `createRunEvaluationAction` | Factory for creating a Studio action that triggers evaluations |
|
|
242
229
|
|
|
230
|
+
### Glossary
|
|
231
|
+
|
|
232
|
+
| Export | Description |
|
|
233
|
+
| ---------- | ------------------------------------------------------------------------ |
|
|
234
|
+
| `GLOSSARY` | Centralized tooltip descriptions for all evaluation metrics and concepts |
|
|
235
|
+
|
|
243
236
|
### GROQ Queries
|
|
244
237
|
|
|
245
|
-
| Export
|
|
246
|
-
|
|
|
247
|
-
| `latestReportsQuery`
|
|
248
|
-
| `scoreTimelineQuery`
|
|
249
|
-
| `reportDetailQuery`
|
|
250
|
-
| `comparisonPairQuery`
|
|
251
|
-
| `contentImpactQuery`
|
|
252
|
-
| `
|
|
253
|
-
| `
|
|
254
|
-
| `
|
|
238
|
+
| Export | Description |
|
|
239
|
+
| ------------------------------ | ------------------------------------------ |
|
|
240
|
+
| `latestReportsQuery` | N most recent reports (filterable) |
|
|
241
|
+
| `scoreTimelineQuery` | Score data points over time |
|
|
242
|
+
| `reportDetailQuery` | Full report with all fields |
|
|
243
|
+
| `comparisonPairQuery` | Two reports for side-by-side comparison |
|
|
244
|
+
| `contentImpactQuery` | Reports related to a document ID |
|
|
245
|
+
| `recentDocumentEvalsQuery` | Recent evaluations for a specific document |
|
|
246
|
+
| `articleSearchQuery` | Full-text search across article documents |
|
|
247
|
+
| `distinctSourcesQuery` | All unique source names |
|
|
248
|
+
| `distinctModesQuery` | All unique evaluation modes |
|
|
249
|
+
| `distinctAreasQuery` | All unique feature areas |
|
|
250
|
+
| `distinctModelsQuery` | All unique model identifiers |
|
|
251
|
+
| `distinctPerspectivesQuery` | All unique content release perspectives |
|
|
252
|
+
| `distinctTargetDocumentsQuery` | All unique target document slugs |
|
|
255
253
|
|
|
256
254
|
### Types
|
|
257
255
|
|
|
258
|
-
| Export
|
|
259
|
-
|
|
|
260
|
-
| `ReportListItem`
|
|
261
|
-
| `ReportDetail`
|
|
262
|
-
| `TimelineDataPoint`
|
|
263
|
-
| `ComparisonData`
|
|
264
|
-
| `ContentImpactItem`
|
|
265
|
-
| `ProvenanceData`
|
|
266
|
-
| `SummaryData`
|
|
267
|
-
| `ScoreItem`
|
|
256
|
+
| Export | Description |
|
|
257
|
+
| ---------------------------- | --------------------------------------------------------------------- |
|
|
258
|
+
| `ReportListItem` | Shape returned by `latestReportsQuery` |
|
|
259
|
+
| `ReportDetail` | Shape returned by `reportDetailQuery` |
|
|
260
|
+
| `TimelineDataPoint` | Shape returned by `scoreTimelineQuery` |
|
|
261
|
+
| `ComparisonData` | Auto-comparison data embedded in reports |
|
|
262
|
+
| `ContentImpactItem` | Shape returned by `contentImpactQuery` |
|
|
263
|
+
| `ProvenanceData` | Report provenance metadata |
|
|
264
|
+
| `SummaryData` | Score summary (overall + per-area + per-model) |
|
|
265
|
+
| `ScoreItem` | Individual area score entry |
|
|
266
|
+
| `RecommendationGap` | Single gap analysis recommendation |
|
|
267
|
+
| `RecommendationsData` | Full recommendations payload |
|
|
268
|
+
| `JudgmentData` | Individual grader judgment with reasoning |
|
|
269
|
+
| `DocumentRef` | Canonical document reference (re-exported from `@sanity/ailf-shared`) |
|
|
270
|
+
| `ScoreGrade` | Letter grade type (re-exported from `@sanity/ailf-shared`) |
|
|
271
|
+
| `scoreGrade` | Function to compute letter grade from numeric score |
|
|
272
|
+
| `RunEvaluationActionOptions` | Options for `createRunEvaluationAction` factory |
|
|
273
|
+
|
|
274
|
+
### Utility Functions
|
|
275
|
+
|
|
276
|
+
| Export | Description |
|
|
277
|
+
| -------------------- | --------------------------------------------------------- |
|
|
278
|
+
| `formatPercent` | Format a number as a percentage string |
|
|
279
|
+
| `formatRelativeTime` | Format an ISO timestamp as relative time (e.g., "2h ago") |
|
|
280
|
+
| `formatDelta` | Format a score delta with +/− sign |
|
|
281
|
+
| `formatDuration` | Format milliseconds as human-readable duration |
|
|
268
282
|
|
|
269
283
|
## Development
|
|
270
284
|
|
|
@@ -279,8 +293,8 @@ pnpm --filter @sanity/ailf-studio dev
|
|
|
279
293
|
turbo build
|
|
280
294
|
```
|
|
281
295
|
|
|
282
|
-
The plugin
|
|
283
|
-
Studio's bundler (Vite) handles the final bundle.
|
|
296
|
+
The plugin uses [tsup](https://github.com/egoist/tsup) for bundling. The
|
|
297
|
+
consuming Studio's bundler (Vite) handles the final bundle.
|
|
284
298
|
|
|
285
299
|
## Related Documentation
|
|
286
300
|
|
package/dist/index.d.ts
CHANGED
|
@@ -193,6 +193,15 @@ declare const GLOSSARY: {
|
|
|
193
193
|
readonly failureMode: "The type of documentation problem: missing-docs (functionality not covered), incorrect-docs (factual errors), outdated-docs (stale API/patterns), or poor-structure (hard to find/understand).";
|
|
194
194
|
readonly estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.";
|
|
195
195
|
readonly confidence: "How confident the classifier is in this diagnosis. High = strong keyword + structural signal agreement. Medium = partial agreement. Low = weak signals only.";
|
|
196
|
+
readonly agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.";
|
|
197
|
+
readonly searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.";
|
|
198
|
+
readonly docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.";
|
|
199
|
+
readonly externalDomains: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.";
|
|
200
|
+
readonly avgDocPagesVisited: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.";
|
|
201
|
+
readonly avgSearchesPerformed: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.";
|
|
202
|
+
readonly avgNetworkTimeMs: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.";
|
|
203
|
+
readonly totalRequests: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.";
|
|
204
|
+
readonly totalBytesDownloaded: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.";
|
|
196
205
|
readonly dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.";
|
|
197
206
|
readonly dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.";
|
|
198
207
|
readonly dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.";
|
|
@@ -656,8 +665,30 @@ interface JudgmentData {
|
|
|
656
665
|
score: number;
|
|
657
666
|
taskId: string;
|
|
658
667
|
}
|
|
668
|
+
/** Per-feature agent behavior data — how agents interacted with docs */
|
|
669
|
+
interface FeatureAgentBehaviorData {
|
|
670
|
+
avgDocPagesVisited: number;
|
|
671
|
+
avgNetworkTimeMs: number;
|
|
672
|
+
avgSearchesPerformed: number;
|
|
673
|
+
docSlugsVisited: string[];
|
|
674
|
+
externalDomains: string[];
|
|
675
|
+
feature: string;
|
|
676
|
+
searchQueries: string[];
|
|
677
|
+
tasksWithBehaviorData: number;
|
|
678
|
+
}
|
|
679
|
+
/** Overall agent behavior stats (aggregated across all features) */
|
|
680
|
+
interface OverallAgentBehaviorData {
|
|
681
|
+
avgDocPagesVisited: number;
|
|
682
|
+
avgNetworkTimeMs: number;
|
|
683
|
+
avgSearchesPerformed: number;
|
|
684
|
+
testsWithBehaviorData: number;
|
|
685
|
+
totalUniqueDocSlugs: number;
|
|
686
|
+
totalUniqueSearchQueries: number;
|
|
687
|
+
}
|
|
659
688
|
/** Summary data as stored in Sanity */
|
|
660
689
|
interface SummaryData {
|
|
690
|
+
/** Per-feature agent behavior data (only present when agentic mode ran; may also be explicitly null) */
|
|
691
|
+
agentBehavior?: FeatureAgentBehaviorData[] | null;
|
|
661
692
|
belowCritical: string[];
|
|
662
693
|
/** All Sanity documents used across the entire evaluation */
|
|
663
694
|
documentManifest?: DocumentRef[];
|
|
@@ -665,6 +696,8 @@ interface SummaryData {
|
|
|
665
696
|
lowestArea: string;
|
|
666
697
|
lowestScore: number;
|
|
667
698
|
overall: {
|
|
699
|
+
/** Aggregate agent behavior stats (only present when agentic mode ran) */
|
|
700
|
+
agentBehavior?: OverallAgentBehaviorData;
|
|
668
701
|
avgDocLift: number;
|
|
669
702
|
avgScore: number;
|
|
670
703
|
avgCeilingScore?: number;
|