@sanity/ailf-studio 1.19.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +580 -672
- package/dist/index.js +794 -564
- package/package.json +2 -3
package/dist/index.js
CHANGED
|
@@ -9,9 +9,9 @@ import { useClient } from "sanity";
|
|
|
9
9
|
|
|
10
10
|
// src/lib/constants.ts
|
|
11
11
|
// API version identifier, written as a YYYY-MM-DD date string.
// NOTE(review): presumably the `apiVersion` handed to the Sanity client — confirm at the call sites.
var API_VERSION = "2026-03-11";
|
|
12
|
-
var isProduction = process.env.NODE_ENV === "production";
|
|
13
|
-
var ARTIFACT_API_BASE_URL = isProduction ? "https://ailf-api.sanity.build/v1" : "http://localhost:3000/v1";
|
|
14
12
|
// Environment lookup table: `process.env` when a `process` global exposing `env`
// exists, otherwise an empty object so that property reads yield `undefined`
// instead of throwing.
// NOTE(review): bundlers that inline `process.env.X` by static text replacement
// presumably do not rewrite `ENV.X` lookups — confirm that the SANITY_STUDIO_AILF_*
// overrides actually reach browser builds.
var ENV = globalThis.process?.env;
if (ENV === null || ENV === void 0) {
  ENV = {};
}
|
|
13
|
+
// True only when NODE_ENV is exactly "production".
// `process.env.NODE_ENV` is kept as a literal expression (rather than reading it
// through ENV) so that bundlers which statically inline that expression keep
// matching it. The try/catch covers runtimes that have neither a `process`
// global nor such inlining: there the bare reference would throw a
// ReferenceError while the module is loading, so the flag falls back to false.
var isProduction = (() => {
  try {
    return process.env.NODE_ENV === "production";
  } catch {
    return false;
  }
})();
|
|
14
|
+
// Base URL stored in ARTIFACT_API_BASE_URL.
// Precedence: a non-nullish SANITY_STUDIO_AILF_API_BASE_URL from ENV is used
// verbatim (an empty string counts as set); otherwise the hosted endpoint is
// chosen when isProduction is true and the localhost:3000 endpoint when it is not.
var ARTIFACT_API_BASE_URL = ENV.SANITY_STUDIO_AILF_API_BASE_URL;
if (ARTIFACT_API_BASE_URL === null || ARTIFACT_API_BASE_URL === void 0) {
  if (isProduction) {
    ARTIFACT_API_BASE_URL = "https://ailf-api.sanity.build/v1";
  } else {
    ARTIFACT_API_BASE_URL = "http://localhost:3000/v1";
  }
}
|
|
15
15
|
// REFERENCE_DATASET: taken from SANITY_STUDIO_AILF_REF_DATASET when that is
// non-nullish in ENV, otherwise defaults to "next".
var REFERENCE_DATASET = ENV.SANITY_STUDIO_AILF_REF_DATASET ?? "next";
|
|
16
16
|
// REFERENCE_WORKSPACE: taken from SANITY_STUDIO_AILF_REF_WORKSPACE when that is
// non-nullish in ENV, otherwise defaults to "editorial".
var REFERENCE_WORKSPACE = ENV.SANITY_STUDIO_AILF_REF_WORKSPACE ?? "editorial";
|
|
17
17
|
// AILF_DATASET: taken from SANITY_STUDIO_AILF_DATASET when that is non-nullish
// in ENV, otherwise defaults to "ailf-prod-private".
var AILF_DATASET = ENV.SANITY_STUDIO_AILF_DATASET ?? "ailf-prod-private";
|
|
@@ -209,6 +209,529 @@ var FEATURE_FLAGS = {
|
|
|
209
209
|
}
|
|
210
210
|
};
|
|
211
211
|
|
|
212
|
+
// ../shared/dist/glossary.js
|
|
213
|
+
var GLOSSARY = {
|
|
214
|
+
// -- Overview stats -------------------------------------------------------
|
|
215
|
+
overallScore: {
|
|
216
|
+
label: "Overall Score",
|
|
217
|
+
long: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%)."
|
|
218
|
+
},
|
|
219
|
+
docLift: {
|
|
220
|
+
label: "Doc Lift",
|
|
221
|
+
long: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better."
|
|
222
|
+
},
|
|
223
|
+
actualScore: {
|
|
224
|
+
label: "Actual Score",
|
|
225
|
+
long: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode."
|
|
226
|
+
},
|
|
227
|
+
retrievalGap: {
|
|
228
|
+
label: "Retrieval Gap",
|
|
229
|
+
long: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything."
|
|
230
|
+
},
|
|
231
|
+
infraEfficiency: {
|
|
232
|
+
label: "Infra Efficiency",
|
|
233
|
+
long: "What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly."
|
|
234
|
+
},
|
|
235
|
+
// -- Three-layer decomposition columns ------------------------------------
|
|
236
|
+
floor: {
|
|
237
|
+
label: "Floor",
|
|
238
|
+
long: "Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data."
|
|
239
|
+
},
|
|
240
|
+
ceiling: {
|
|
241
|
+
label: "Ceiling",
|
|
242
|
+
long: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do."
|
|
243
|
+
},
|
|
244
|
+
actual: {
|
|
245
|
+
label: "Actual",
|
|
246
|
+
long: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience."
|
|
247
|
+
},
|
|
248
|
+
retGap: {
|
|
249
|
+
label: "Ret. Gap",
|
|
250
|
+
long: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get."
|
|
251
|
+
},
|
|
252
|
+
efficiency: {
|
|
253
|
+
label: "Efficiency",
|
|
254
|
+
long: "What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage)."
|
|
255
|
+
},
|
|
256
|
+
invertedRetGap: {
|
|
257
|
+
label: "Inverted Retrieval Gap",
|
|
258
|
+
long: "\u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem."
|
|
259
|
+
},
|
|
260
|
+
// -- Per-area score columns -----------------------------------------------
|
|
261
|
+
score: {
|
|
262
|
+
label: "Score",
|
|
263
|
+
long: "Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%."
|
|
264
|
+
},
|
|
265
|
+
taskCompletion: {
|
|
266
|
+
label: "Task Completion",
|
|
267
|
+
long: "Can the LLM implement the requested feature? Graded 0\u2013100."
|
|
268
|
+
},
|
|
269
|
+
codeCorrectness: {
|
|
270
|
+
label: "Code Correctness",
|
|
271
|
+
long: "Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100."
|
|
272
|
+
},
|
|
273
|
+
docCoverage: {
|
|
274
|
+
label: "Doc Coverage",
|
|
275
|
+
long: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation."
|
|
276
|
+
},
|
|
277
|
+
tests: {
|
|
278
|
+
label: "Tests",
|
|
279
|
+
long: "Number of test cases in this feature area."
|
|
280
|
+
},
|
|
281
|
+
// -- Comparison deltas ----------------------------------------------------
|
|
282
|
+
overallDelta: {
|
|
283
|
+
label: "Overall \u0394",
|
|
284
|
+
long: "Change in overall score between the two runs. Positive means the experiment scored higher."
|
|
285
|
+
},
|
|
286
|
+
actualDelta: {
|
|
287
|
+
label: "Actual \u0394",
|
|
288
|
+
long: "Change in actual (agent-retrieved) score between runs. Positive means agents did better."
|
|
289
|
+
},
|
|
290
|
+
retGapDelta: {
|
|
291
|
+
label: "Ret. Gap \u0394",
|
|
292
|
+
long: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs."
|
|
293
|
+
},
|
|
294
|
+
efficiencyDelta: {
|
|
295
|
+
label: "Efficiency \u0394",
|
|
296
|
+
long: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential."
|
|
297
|
+
},
|
|
298
|
+
// -- Comparison table columns ---------------------------------------------
|
|
299
|
+
baseline: {
|
|
300
|
+
label: "Baseline",
|
|
301
|
+
long: "The reference run you're comparing against."
|
|
302
|
+
},
|
|
303
|
+
experiment: {
|
|
304
|
+
label: "Experiment",
|
|
305
|
+
long: "The new run you're evaluating."
|
|
306
|
+
},
|
|
307
|
+
delta: {
|
|
308
|
+
label: "Delta",
|
|
309
|
+
long: "Difference between experiment and baseline. Positive means improvement, negative means regression."
|
|
310
|
+
},
|
|
311
|
+
change: {
|
|
312
|
+
label: "Change",
|
|
313
|
+
long: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold)."
|
|
314
|
+
},
|
|
315
|
+
// -- Grader judgments ------------------------------------------------------
|
|
316
|
+
lowScoringJudgments: {
|
|
317
|
+
label: "Low-Scoring Judgments",
|
|
318
|
+
long: "The grading model's explanations for tests that scored below 70/100."
|
|
319
|
+
},
|
|
320
|
+
judgmentReason: {
|
|
321
|
+
label: "Judgment Reason",
|
|
322
|
+
long: "The grading model's natural language explanation of what went wrong."
|
|
323
|
+
},
|
|
324
|
+
// -- Diagnostics overview ---------------------------------------------------
|
|
325
|
+
healthStrong: {
|
|
326
|
+
label: "Strong (80+)",
|
|
327
|
+
long: "Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations."
|
|
328
|
+
},
|
|
329
|
+
healthAttention: {
|
|
330
|
+
label: "Needs Attention (70\u201379)",
|
|
331
|
+
long: "Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness."
|
|
332
|
+
},
|
|
333
|
+
healthWeak: {
|
|
334
|
+
label: "Weak (<70)",
|
|
335
|
+
long: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly."
|
|
336
|
+
},
|
|
337
|
+
negativeDocLiftMetric: {
|
|
338
|
+
label: "Negative Doc Lift",
|
|
339
|
+
long: "Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples."
|
|
340
|
+
},
|
|
341
|
+
weakAreas: {
|
|
342
|
+
label: "Weak Areas",
|
|
343
|
+
long: "Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features."
|
|
344
|
+
},
|
|
345
|
+
docsHurt: {
|
|
346
|
+
label: "Docs Hurt Performance",
|
|
347
|
+
long: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed."
|
|
348
|
+
},
|
|
349
|
+
retrievalIssues: {
|
|
350
|
+
label: "Retrieval Issues",
|
|
351
|
+
long: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing."
|
|
352
|
+
},
|
|
353
|
+
dimWeaknesses: {
|
|
354
|
+
label: "Dimension Weaknesses",
|
|
355
|
+
long: "Individual grading dimensions scoring below 50 within an area. These are the specific dimensions where results are weakest \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did the docs provide the information needed?)."
|
|
356
|
+
},
|
|
357
|
+
efficiencyAnomalies: {
|
|
358
|
+
label: "Efficiency Anomalies",
|
|
359
|
+
long: "Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization."
|
|
360
|
+
},
|
|
361
|
+
docLiftWins: {
|
|
362
|
+
label: "Doc Lift Wins",
|
|
363
|
+
long: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know."
|
|
364
|
+
},
|
|
365
|
+
retrievalExcellence: {
|
|
366
|
+
label: "Retrieval Excellence",
|
|
367
|
+
long: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover."
|
|
368
|
+
},
|
|
369
|
+
// -- Model breakdown --------------------------------------------------------
|
|
370
|
+
modelBreakdown: {
|
|
371
|
+
label: "Model Breakdown",
|
|
372
|
+
long: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas."
|
|
373
|
+
},
|
|
374
|
+
// -- Strengths (positive diagnostics) ---------------------------------------
|
|
375
|
+
strengths: {
|
|
376
|
+
label: "Strengths",
|
|
377
|
+
long: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation."
|
|
378
|
+
},
|
|
379
|
+
// -- Recommendations / gap analysis ----------------------------------------
|
|
380
|
+
recommendations: {
|
|
381
|
+
label: "Recommendations",
|
|
382
|
+
long: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it."
|
|
383
|
+
},
|
|
384
|
+
totalPotentialLift: {
|
|
385
|
+
label: "Total Potential Lift",
|
|
386
|
+
long: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100."
|
|
387
|
+
},
|
|
388
|
+
failureMode: {
|
|
389
|
+
label: "Failure Mode",
|
|
390
|
+
long: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination)."
|
|
391
|
+
},
|
|
392
|
+
estimatedLift: {
|
|
393
|
+
label: "Estimated Lift",
|
|
394
|
+
long: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions."
|
|
395
|
+
},
|
|
396
|
+
confidence: {
|
|
397
|
+
label: "Confidence",
|
|
398
|
+
long: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent."
|
|
399
|
+
},
|
|
400
|
+
// -- Agent behavior --------------------------------------------------------
|
|
401
|
+
agentBehaviorOverview: {
|
|
402
|
+
label: "Agent Behavior",
|
|
403
|
+
long: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests."
|
|
404
|
+
},
|
|
405
|
+
searchQueries: {
|
|
406
|
+
label: "Search Queries",
|
|
407
|
+
long: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries."
|
|
408
|
+
},
|
|
409
|
+
docSlugsVisited: {
|
|
410
|
+
label: "Unique Doc Slugs",
|
|
411
|
+
long: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages."
|
|
412
|
+
},
|
|
413
|
+
externalDomains: {
|
|
414
|
+
label: "External Domains",
|
|
415
|
+
long: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs."
|
|
416
|
+
},
|
|
417
|
+
avgDocPagesVisited: {
|
|
418
|
+
label: "Avg Pages Visited",
|
|
419
|
+
long: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly."
|
|
420
|
+
},
|
|
421
|
+
avgSearchesPerformed: {
|
|
422
|
+
label: "Avg Searches",
|
|
423
|
+
long: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines."
|
|
424
|
+
},
|
|
425
|
+
avgNetworkTimeMs: {
|
|
426
|
+
label: "Avg Network Time",
|
|
427
|
+
long: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls."
|
|
428
|
+
},
|
|
429
|
+
totalRequests: {
|
|
430
|
+
label: "Total Requests",
|
|
431
|
+
long: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls."
|
|
432
|
+
},
|
|
433
|
+
totalBytesDownloaded: {
|
|
434
|
+
label: "Total Bytes Downloaded",
|
|
435
|
+
long: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents."
|
|
436
|
+
},
|
|
437
|
+
// -- Dimension deltas -----------------------------------------------------
|
|
438
|
+
dimTaskCompletion: {
|
|
439
|
+
label: "Task Completion \u0394",
|
|
440
|
+
long: "Change in task completion between runs. Positive means implementations are more complete."
|
|
441
|
+
},
|
|
442
|
+
dimCodeCorrectness: {
|
|
443
|
+
label: "Code Correctness \u0394",
|
|
444
|
+
long: "Change in code correctness between runs. Positive means better code quality."
|
|
445
|
+
},
|
|
446
|
+
dimDocCoverage: {
|
|
447
|
+
label: "Doc Coverage \u0394",
|
|
448
|
+
long: "Change in doc coverage between runs. Positive means the docs are providing more useful information."
|
|
449
|
+
},
|
|
450
|
+
// -- Per-area trend delta ----------------------------------------------------
|
|
451
|
+
areaDelta: {
|
|
452
|
+
label: "Area \u0394",
|
|
453
|
+
long: "Score change for this area compared to the previous evaluation run."
|
|
454
|
+
},
|
|
455
|
+
// -- Source values -----------------------------------------------------------
|
|
456
|
+
sourceProduction: {
|
|
457
|
+
label: "Production",
|
|
458
|
+
long: "Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today."
|
|
459
|
+
},
|
|
460
|
+
sourceBranch: {
|
|
461
|
+
label: "Branch",
|
|
462
|
+
long: "Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing."
|
|
463
|
+
},
|
|
464
|
+
sourceLocal: {
|
|
465
|
+
label: "Local",
|
|
466
|
+
long: "Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing."
|
|
467
|
+
},
|
|
468
|
+
// -- Report list columns ----------------------------------------------------
|
|
469
|
+
reportScore: {
|
|
470
|
+
label: "Score",
|
|
471
|
+
long: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas."
|
|
472
|
+
},
|
|
473
|
+
reportMode: {
|
|
474
|
+
label: "Mode",
|
|
475
|
+
long: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation."
|
|
476
|
+
},
|
|
477
|
+
reportTrigger: {
|
|
478
|
+
label: "Trigger",
|
|
479
|
+
long: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check."
|
|
480
|
+
},
|
|
481
|
+
// -- Mode values -----------------------------------------------------------
|
|
482
|
+
modeBaseline: {
|
|
483
|
+
label: "Baseline",
|
|
484
|
+
long: "Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do)."
|
|
485
|
+
},
|
|
486
|
+
modeFull: {
|
|
487
|
+
label: "Full",
|
|
488
|
+
long: "Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency."
|
|
489
|
+
},
|
|
490
|
+
modeAgentic: {
|
|
491
|
+
label: "Agentic",
|
|
492
|
+
long: "Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?"
|
|
493
|
+
},
|
|
494
|
+
modeObserved: {
|
|
495
|
+
label: "Observed",
|
|
496
|
+
long: "Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis."
|
|
497
|
+
},
|
|
498
|
+
modeDebug: {
|
|
499
|
+
label: "Debug",
|
|
500
|
+
long: "Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets."
|
|
501
|
+
},
|
|
502
|
+
// -- Trigger values --------------------------------------------------------
|
|
503
|
+
triggerManual: {
|
|
504
|
+
label: "Manual",
|
|
505
|
+
long: "Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI."
|
|
506
|
+
},
|
|
507
|
+
triggerCi: {
|
|
508
|
+
label: "CI",
|
|
509
|
+
long: "CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline."
|
|
510
|
+
},
|
|
511
|
+
triggerSchedule: {
|
|
512
|
+
label: "Scheduled",
|
|
513
|
+
long: "Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time."
|
|
514
|
+
},
|
|
515
|
+
triggerWebhook: {
|
|
516
|
+
label: "Webhook",
|
|
517
|
+
long: "Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early."
|
|
518
|
+
},
|
|
519
|
+
triggerCrossRepo: {
|
|
520
|
+
label: "Cross-Repo",
|
|
521
|
+
long: "Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks."
|
|
522
|
+
}
|
|
523
|
+
};
|
|
524
|
+
|
|
525
|
+
// ../shared/dist/generated/help-content.js
|
|
526
|
+
var HELP_TOPICS = [
|
|
527
|
+
{
|
|
528
|
+
"id": "negative-doc-lift",
|
|
529
|
+
"title": "When Docs Hurt: Negative Doc Lift",
|
|
530
|
+
"body": `The ceiling score is _lower_ than the floor score. The canonical docs actively
|
|
531
|
+
hurt the model's performance. This is a negative Doc Lift: the model produces
|
|
532
|
+
better output from its training data alone than when given the "correct"
|
|
533
|
+
documentation.
|
|
534
|
+
|
|
535
|
+
This will happen. It is not an error condition \u2014 it is a high-signal diagnostic
|
|
536
|
+
that demands investigation. A negative Doc Lift means one or more of the
|
|
537
|
+
following:
|
|
538
|
+
|
|
539
|
+
- **Outdated documentation** \u2014 the docs describe an older API version or
|
|
540
|
+
pattern, and the model's training data has absorbed a newer version. The docs
|
|
541
|
+
are actively teaching the model the wrong thing. _Action: update the docs to
|
|
542
|
+
reflect the current API._
|
|
543
|
+
|
|
544
|
+
- **Misleading documentation** \u2014 the docs are technically accurate but
|
|
545
|
+
structured in a way that leads the model to an incorrect implementation path.
|
|
546
|
+
Ambiguous phrasing, conflicting code examples, or poor organization can cause
|
|
547
|
+
a model to produce worse code than its baseline intuition. _Action:
|
|
548
|
+
restructure or rewrite the docs for clarity._
|
|
549
|
+
|
|
550
|
+
- **Adversarial context dilution** \u2014 the canonical doc set is too large or
|
|
551
|
+
contains too much tangential content, and the relevant signal gets buried in
|
|
552
|
+
noise. The model performs better with no docs because fewer tokens means less
|
|
553
|
+
distraction. _Action: trim the canonical doc set or restructure docs to
|
|
554
|
+
front-load the most relevant information._
|
|
555
|
+
|
|
556
|
+
- **Model training data superiority** \u2014 for mature, well-known features (e.g.,
|
|
557
|
+
basic GROQ queries, standard Studio setup), the model may have absorbed
|
|
558
|
+
high-quality examples from training data (blog posts, Stack Overflow, open
|
|
559
|
+
source code) that are simply better than the official documentation. _Action:
|
|
560
|
+
improve the docs to meet or exceed the quality of community content the model
|
|
561
|
+
learned from._`,
|
|
562
|
+
"source": "docs/design-docs/scenario-matrix/evaluation-ceiling.md",
|
|
563
|
+
"related": [
|
|
564
|
+
"doc-lift",
|
|
565
|
+
"three-layer"
|
|
566
|
+
]
|
|
567
|
+
},
|
|
568
|
+
{
|
|
569
|
+
"id": "three-layer",
|
|
570
|
+
"title": "Floor, Ceiling, and Actual Scores",
|
|
571
|
+
"body": "- **Doc Lift** = `ceiling score \u2212 floor score`. Positive when docs help,\n negative when docs hurt. This is the fundamental documentation value metric.\n When negative, it triggers the interference investigation described above\n- **Retrieval gap** = `ceiling score \u2212 agentic score`. The cost of imperfect\n discovery. Should be minimized via infrastructure investment (`llms.txt`,\n `.md` endpoints, better navigation, SEO). Note: this metric is only meaningful\n when Doc Lift is positive \u2014 if the docs themselves are harmful, a large\n retrieval gap might paradoxically be _protective_ (agents that can't find the\n bad docs may outperform agents that can)\n- **Doc quality gap** = `100 \u2212 ceiling score`. The ceiling itself is below\n perfect. This is the documentation team's problem \u2014 the docs need improvement\n regardless of how they're delivered\n- **Infrastructure efficiency** = `agentic score / ceiling score`. What fraction\n of the documentation's potential value actually reaches agents? A ratio of\n 0.90 means infrastructure delivers 90% of the doc quality. A ratio of 0.50\n means half the documentation value is lost in delivery. 
When the ceiling is\n below the floor, this ratio is undefined \u2014 report it as \"N/A (negative Doc\n Lift)\" rather than producing a misleading number\n\n### Decomposition: the healthy case (positive Doc Lift)\n\n```\nfloor score \u2192 model's inherent knowledge\n + Doc Lift (ceiling \u2212 floor) \u2192 documentation quality contribution\n \u2212 retrieval gap (ceiling \u2212 agentic) \u2192 discovery/infrastructure loss\n = agentic score \u2192 what users actually experience\n```\n\n### Decomposition: the interference case (negative Doc Lift)\n\n```\nfloor score \u2192 model's inherent knowledge\n \u2212 |Doc Lift| (floor \u2212 ceiling) \u2192 documentation interference penalty\n \u2212 retrieval gap (ceiling \u2212 agentic) \u2192 discovery/infrastructure loss\n = agentic score \u2192 what users actually experience\n\n but note: agentic score may exceed ceiling score here, because\n agents that fail to find the interfering docs avoid the penalty.\n In this case the \"retrieval gap\" inverts \u2014 poor retrieval is a\n net positive. The framework should flag this explicitly:\n \"Retrieval failure is masking a documentation quality problem.\"\n```\n\nBoth decompositions are valid expressions of the **core measurement model**.\nEvery scenario in the matrix is asking about one or more of these three layers:\ninherent knowledge, documentation quality, or discovery effectiveness. The sign\nof the Doc Lift determines which layer needs urgent attention.",
|
|
572
|
+
"source": "docs/design-docs/scenario-matrix/evaluation-ceiling.md",
|
|
573
|
+
"related": [
|
|
574
|
+
"doc-lift",
|
|
575
|
+
"negative-doc-lift",
|
|
576
|
+
"retrieval-gap",
|
|
577
|
+
"scoring-model"
|
|
578
|
+
]
|
|
579
|
+
},
|
|
580
|
+
{
|
|
581
|
+
"id": "comparing-runs",
|
|
582
|
+
"title": "Comparing Evaluation Runs",
|
|
583
|
+
"body": "## How comparison works\n\nThe Compare view lets you select any two evaluation reports and see a detailed\nside-by-side breakdown. One report is the **baseline** (your reference point)\nand the other is the **experiment** (what you're evaluating).\n\n## Reading the comparison\n\nFor each feature area, you'll see:\n\n- **Delta** \u2014 The score difference (experiment minus baseline). Positive means\n the experiment scored higher.\n- **Change** \u2014 Whether the delta is meaningful: **improved**, **regressed**, or\n **unchanged**. This accounts for the noise threshold \u2014 small deltas within the\n noise band are marked unchanged even if they're non-zero.\n\n## The noise threshold\n\nNot every score change is real. LLM responses vary between runs, and the grader\nmodel has its own variance. The comparison applies a noise threshold (currently\nconfigured per-evaluation) to filter out statistical noise. Only changes that\nexceed this threshold are classified as improvements or regressions.\n\n## What to compare\n\n- **Before and after a doc change** \u2014 Did your edits actually improve scores?\n- **Production vs. branch** \u2014 Will publishing this content release help or hurt?\n- **Different models** \u2014 How does Claude compare to GPT on the same docs?\n- **Baseline vs. full mode** \u2014 Is the retrieval gap shrinking over time?\n\n## Per-area deltas\n\nThe most actionable part of the comparison is the per-area breakdown. If your\noverall score improved but one area regressed, the per-area deltas tell you\nexactly which area needs attention.",
|
|
584
|
+
"source": "docs/help/comparing-runs.md",
|
|
585
|
+
"related": [
|
|
586
|
+
"reading-score-trends",
|
|
587
|
+
"scoring-model"
|
|
588
|
+
]
|
|
589
|
+
},
|
|
590
|
+
{
|
|
591
|
+
"id": "doc-lift",
|
|
592
|
+
"title": "Doc Lift: Do Your Docs Help?",
|
|
593
|
+
"body": "## What is doc lift?\n\n**Doc lift** is the difference between how an AI performs _with_ your\ndocumentation and how it performs _without_ it:\n\n```\nDoc lift = ceiling score \u2212 floor score\n```\n\n- **Ceiling score** \u2014 the AI's score when gold-standard docs are injected\n directly into its prompt.\n- **Floor score** \u2014 the AI's score with no documentation at all, relying only on\n its training data.\n\nA positive doc lift means your docs are helping. The higher the number, the more\nvalue your documentation provides beyond what the model already knows.\n\n## What good doc lift looks like\n\n- **Doc lift of 15+** \u2014 Your docs are providing crucial information the model\n doesn't already know. This is a strong signal that the docs are worth\n maintaining and improving.\n- **Doc lift of 5\u201315** \u2014 Docs are helping, but the model's training data covers\n a lot of the ground already. The docs are adding incremental value.\n- **Doc lift near zero** \u2014 The docs aren't adding much. Either the model already\n knows the material, or the docs aren't providing useful implementation\n guidance.\n\n## Negative doc lift\n\nA **negative** doc lift means the documentation is actively hurting the AI's\nperformance \u2014 the model produces _better_ code without your docs than with them.\nThis is never ignorable. See the \"When Docs Hurt\" help topic for causes and\nremediation.\n\n## Where you see it\n\nDoc lift appears in several places in the dashboard:\n\n- **Overview stats** \u2014 The aggregate doc lift across all areas.\n- **Per-area score table** \u2014 Doc lift for each feature area, so you can see\n which docs help most and which help least.\n- **Comparison view** \u2014 Doc lift deltas between two runs, showing whether your\n doc changes increased or decreased documentation value.",
|
|
594
|
+
"source": "docs/help/doc-lift.md",
|
|
595
|
+
"related": [
|
|
596
|
+
"three-layer",
|
|
597
|
+
"negative-doc-lift",
|
|
598
|
+
"scoring-model"
|
|
599
|
+
]
|
|
600
|
+
},
|
|
601
|
+
{
|
|
602
|
+
"id": "getting-started",
|
|
603
|
+
"title": "Getting Started",
|
|
604
|
+
"body": `## What does AILF measure?
|
|
605
|
+
|
|
606
|
+
The AI Literacy Framework measures how well your documentation helps AI coding
|
|
607
|
+
tools (like Claude Code, Cursor, ChatGPT, and Copilot) implement features
|
|
608
|
+
correctly. When a developer asks an AI agent "set up a webhook in Sanity," the
|
|
609
|
+
agent needs to find the right docs and produce working code. AILF scores how
|
|
610
|
+
well that goes.
|
|
611
|
+
|
|
612
|
+
## The key number: AI Literacy Score
|
|
613
|
+
|
|
614
|
+
Every evaluation produces a score from 0\u2013100, composed of three dimensions:
|
|
615
|
+
|
|
616
|
+
- **Task Completion (50%)** \u2014 Can the AI implement the feature at all?
|
|
617
|
+
- **Code Correctness (25%)** \u2014 Is the generated code correct and idiomatic?
|
|
618
|
+
- **Doc Coverage (25%)** \u2014 Did the docs provide the information needed?
|
|
619
|
+
|
|
620
|
+
Higher is better. Scores above 80 mean the docs are working well for that
|
|
621
|
+
feature. Scores below 70 need attention.
|
|
622
|
+
|
|
623
|
+
## What you see in the dashboard
|
|
624
|
+
|
|
625
|
+
- **Latest Reports** \u2014 The most recent evaluation runs with scores, areas
|
|
626
|
+
tested, and trend indicators.
|
|
627
|
+
- **Score Timeline** \u2014 How scores change over time. Look for upward trends after
|
|
628
|
+
doc improvements.
|
|
629
|
+
- **Compare** \u2014 Side-by-side comparison of any two runs to see what improved or
|
|
630
|
+
regressed.
|
|
631
|
+
|
|
632
|
+
Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
633
|
+
(for full-mode runs) how AI agents actually navigated your docs.
|
|
634
|
+
|
|
635
|
+
## Quick orientation
|
|
636
|
+
|
|
637
|
+
| If you see\u2026 | It means\u2026 |
|
|
638
|
+
| ------------------- | ---------------------------------------------------- |
|
|
639
|
+
| A score above 80 | Docs are working well for this area |
|
|
640
|
+
| A score below 70 | Docs need improvement \u2014 AI agents struggle here |
|
|
641
|
+
| Positive doc lift | Docs are helping (good!) |
|
|
642
|
+
| Negative doc lift | Docs are hurting \u2014 they're worse than no docs at all |
|
|
643
|
+
| Large retrieval gap | Good docs exist but agents can't find them |`,
|
|
644
|
+
"source": "docs/help/getting-started.md",
|
|
645
|
+
"related": [
|
|
646
|
+
"scoring-model",
|
|
647
|
+
"eval-modes"
|
|
648
|
+
]
|
|
649
|
+
},
|
|
650
|
+
{
|
|
651
|
+
"id": "interpreting-diagnostics",
|
|
652
|
+
"title": "Interpreting Diagnostics",
|
|
653
|
+
"body": "## The diagnostics tab\n\nWhen you open a report and click the **Diagnostics** tab, you see a health\nsummary of your documentation across all feature areas. This is the most\nactionable view in the dashboard \u2014 it tells you exactly where to focus your doc\nimprovement efforts.\n\n## Health categories\n\nFeature areas are grouped into three health bands:\n\n- **Strong (80+)** \u2014 Docs are working well. AI agents produce correct, complete\n implementations. No action needed unless you see regression.\n- **Needs Attention (70\u201379)** \u2014 Docs are okay but have gaps. There may be\n specific dimensions (like code correctness or doc coverage) dragging the score\n down. Worth investigating.\n- **Weak (below 70)** \u2014 Docs are not providing enough support. AI agents\n consistently struggle with these features. These need priority attention.\n\n## Strengths vs. Issues\n\nThe diagnostics tab has two sub-views:\n\n**Strengths** highlights what's working: high-scoring areas, strong dimensions,\nand areas where agents successfully find and use your docs. 
Use this to\nunderstand what good looks like in your docs \u2014 and replicate it elsewhere.\n\n**Issues** lists the problems: weak areas, dimensions scoring below 50, negative\ndoc lift, retrieval problems, and (if gap analysis was run) specific\nrecommendations with estimated score lift.\n\n## Key diagnostic signals\n\n| Signal | What it means | What to do |\n| ------------------------------ | ------------------------------------------ | ---------------------------------------- |\n| **Negative doc lift** | Docs are worse than no docs | Rewrite or remove the offending docs |\n| **Large retrieval gap** | Good docs exist but agents can't find them | Improve page titles, metadata, SEO |\n| **Low code correctness** | Agents find the docs but produce bad code | Add or fix code examples |\n| **Low doc coverage** | The docs don't cover what the task needs | Write new documentation |\n| **Efficiency anomaly (>100%)** | Agents do better without gold docs | Injected docs may be confusing the model |",
|
|
654
|
+
"source": "docs/help/interpreting-diagnostics.md",
|
|
655
|
+
"related": [
|
|
656
|
+
"scoring-model",
|
|
657
|
+
"weaknesses-recommendations"
|
|
658
|
+
]
|
|
659
|
+
},
|
|
660
|
+
{
|
|
661
|
+
"id": "reading-score-trends",
|
|
662
|
+
"title": "Reading Score Trends",
|
|
663
|
+
"body": "## What the timeline shows\n\nThe Score Timeline view plots your AI Literacy Score over time. Each point is an\nevaluation run \u2014 a snapshot of how well your docs support AI agents at that\nmoment.\n\n## What to look for\n\n**Upward trends** after doc changes confirm that your improvements are working.\nIf you rewrote a GROQ guide and the GROQ area score climbs in the next run,\nthat's direct evidence of impact.\n\n**Sudden drops** usually mean something changed: a doc was deleted, an API\nchanged without a doc update, or a new task was added that exposes a gap.\n\n**Flat lines** mean stability \u2014 neither improving nor regressing. This is fine\nfor mature areas but concerning for areas you're actively working on.\n\n## Meaningful change vs. noise\n\nSmall fluctuations (\xB12\u20133 points) between runs are normal \u2014 they come from LLM\nnon-determinism and grader variance. Focus on changes of **5+ points** sustained\nacross multiple runs. The comparison view applies a noise threshold to help\ndistinguish real changes from statistical noise.\n\n## Filtering the timeline\n\nUse the filters to focus on specific evaluation modes (baseline vs. full),\nspecific doc sources (production vs. branch), or specific feature areas.\nComparing the same area across modes reveals whether a problem is in the docs\nthemselves (baseline score) or in how agents find them (agentic score).",
|
|
664
|
+
"source": "docs/help/reading-score-trends.md",
|
|
665
|
+
"related": [
|
|
666
|
+
"scoring-model",
|
|
667
|
+
"comparing-runs"
|
|
668
|
+
]
|
|
669
|
+
},
|
|
670
|
+
{
|
|
671
|
+
"id": "retrieval-gap",
|
|
672
|
+
"title": "Retrieval Gap & Infrastructure Efficiency",
|
|
673
|
+
"body": "## What is the retrieval gap?\n\nThe **retrieval gap** is the difference between the **ceiling score** (docs\ninjected directly into the AI's prompt) and the **actual score** (the AI agent\nfinds docs on its own via web search). It measures how much documentation\nquality is lost because agents can't find the right pages.\n\n```\nRetrieval gap = ceiling score \u2212 actual score\n```\n\nA retrieval gap of zero means agents find everything. A gap of 20 means 20\npoints of doc quality never reach the agents.\n\n## What is infrastructure efficiency?\n\n**Infrastructure efficiency** expresses the retrieval gap as a ratio:\n\n```\nInfrastructure efficiency = actual score / ceiling score\n```\n\nAn efficiency of 90% means agents capture 90% of your docs' potential. An\nefficiency of 50% means half the documentation value is lost to discoverability\nproblems.\n\n## What causes a large retrieval gap?\n\n- **Poor page titles** \u2014 Agents search by keyword. If your page title doesn't\n match what a developer would ask, agents won't find it.\n- **Missing from search indexes** \u2014 Pages that aren't indexed by search engines\n are invisible to agents that rely on web search.\n- **No `llms.txt`** \u2014 An `llms.txt` file gives agents a table of contents.\n Without it, they rely entirely on search queries.\n- **No `.md` endpoints** \u2014 Agents that can fetch clean markdown directly\n (instead of parsing HTML) get better context with less noise.\n- **Content spread across many pages** \u2014 If implementing a feature requires\n reading 5 different pages, agents are less likely to find all of them.\n\n## How to shrink the retrieval gap\n\n1. **Add clear, keyword-rich page titles** that match how developers phrase\n their questions.\n2. **Ensure pages are indexed** by search engines (no `noindex` meta tags on doc\n pages).\n3. **Provide `llms.txt`** at your docs root so agents can browse a structured\n table of contents.\n4. 
**Provide `.md` endpoints** so agents can fetch clean markdown instead of\n parsing JavaScript-rendered HTML.\n5. **Consolidate related content** \u2014 fewer, more comprehensive pages are easier\n for agents to find than many small fragments.\n\n## When the retrieval gap inverts\n\nIn rare cases, the actual score exceeds the ceiling score \u2014 agents that can't\nfind the docs perform _better_ than agents with gold-standard docs injected.\nThis means the docs themselves are hurting performance (negative doc lift), and\nagents that fail to find them accidentally avoid the damage. The dashboard flags\nthis as an **inverted retrieval gap** \u2014 it's a documentation quality problem,\nnot a discoverability win.",
|
|
674
|
+
"source": "docs/help/retrieval-gap.md",
|
|
675
|
+
"related": [
|
|
676
|
+
"three-layer",
|
|
677
|
+
"eval-modes",
|
|
678
|
+
"how-agents-work"
|
|
679
|
+
]
|
|
680
|
+
},
|
|
681
|
+
{
|
|
682
|
+
"id": "scoring-model",
|
|
683
|
+
"title": "Understanding Scores",
|
|
684
|
+
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs): Total = Task \xD7 0.50 + Code \xD7 0.25 + Docs \xD7 0.25\nBaseline (no docs): Total = Task \xD7 0.60 + Code \xD7 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling \u2212 floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100** | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379** | Needs attention \u2014 there may be gaps in specific dimensions |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. 
This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
|
|
685
|
+
"source": "docs/help/scoring-model.md",
|
|
686
|
+
"related": [
|
|
687
|
+
"three-layer",
|
|
688
|
+
"doc-lift",
|
|
689
|
+
"eval-modes"
|
|
690
|
+
]
|
|
691
|
+
},
|
|
692
|
+
{
|
|
693
|
+
"id": "weaknesses-recommendations",
|
|
694
|
+
"title": "Weaknesses & Recommendations",
|
|
695
|
+
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** \u2014 Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** \u2014 Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** \u2014 How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** \u2014 specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** \u2014 The type of doc problem identified:\n - `missing-docs` \u2014 The functionality isn't documented at all.\n - `incorrect-docs` \u2014 The docs contain factual errors.\n - `outdated-docs` \u2014 The docs describe an old API version or pattern.\n - `poor-structure` \u2014 The docs exist but are hard to find or understand.\n- **Estimated lift** \u2014 How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate \u2014 actual improvement may be higher.\n- **Confidence** \u2014 How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** \u2014 Which specific evaluation tasks exposed this gap.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong \u2014 missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
696
|
+
"source": "docs/help/weaknesses-recommendations.md",
|
|
697
|
+
"related": [
|
|
698
|
+
"interpreting-diagnostics",
|
|
699
|
+
"scoring-model",
|
|
700
|
+
"negative-doc-lift"
|
|
701
|
+
]
|
|
702
|
+
},
|
|
703
|
+
{
|
|
704
|
+
"id": "how-agents-work",
|
|
705
|
+
"title": "How AI Agents Find Documentation",
|
|
706
|
+
"body": "Understanding how popular AI coding agents retrieve and use documentation is\ncentral to the ai-literacy-framework evaluation framework. This document\nexplains the mechanisms used by common agents and how our test modes simulate\nthem.\n\n## The Documentation Access Problem\n\nWhen a developer asks an AI coding assistant \"Set up a Sanity Studio with a\ncustom blog schema,\" the agent needs to find and read the relevant Sanity\ndocumentation. But different agents do this in fundamentally different ways, and\nthose differences directly impact the quality of the response.\n\nThe framework measures this impact through four evaluation modes: **full**\n(default \u2014 runs baseline + agentic together), **baseline** (docs in prompt),\n**agentic** (tool-calling with real web access), and **observed** (instrumented\nsingle-call).\n\n## How Popular Agents Work\n\n### Claude Code (Anthropic)\n\nClaude Code has built-in tools including `WebSearchTool` and `WebFetchTool`.\nWhen a user asks a Sanity question:\n\n1. The model decides whether to search the web\n2. If so, it calls `WebSearchTool` with a query string\n3. Search results come back as structured data (titles, URLs, snippets)\n4. The model may call `WebFetchTool` to read specific pages\n5. The fetched content is returned as **rendered text** \u2014 Claude Code's fetch\n tool handles JavaScript rendering internally, so even SPA pages return\n readable content\n6. The model synthesizes the fetched docs with its training data to produce an\n answer\n\n**Key characteristic**: Claude Code sees the web as rendered, readable text. It\ndoesn't get raw HTML soup. But it also doesn't know about agent-friendly\nendpoints like `.md` files or `llms.txt` \u2014 it fetches the same HTML pages a\nbrowser would load.\n\n### ChatGPT (OpenAI)\n\nChatGPT's browsing capability uses Bing search under the hood:\n\n1. The model decides to search (users can also explicitly ask it to browse)\n2. 
It searches via Bing, getting ranked results\n3. It can \"click\" on results to read page content\n4. Pages are rendered server-side and returned as text\n5. The model reads relevant sections and synthesizes an answer\n\n**Key characteristic**: ChatGPT's browsing is similar to Claude Code \u2014 it gets\nrendered content. The URLs visited are returned in citations. It also has no\nawareness of `llms.txt` or `.md` endpoints.\n\n### Cursor\n\nCursor takes a different approach:\n\n1. It maintains a pre-built index of popular documentation sites (`@docs`)\n2. Users can manually add documentation sources\n3. It also has web search capability for unknown topics\n4. Codebase context is injected automatically from the project\n\n**Key characteristic**: Cursor's `@docs` feature means it may have indexed\nSanity docs already, but the index may be outdated. For unknown topics, it falls\nback to web search like the other agents.\n\n### GitHub Copilot\n\nCopilot primarily relies on:\n\n1. The model's training data (parametric knowledge)\n2. Codebase context from the current project\n3. Bing search for `@workspace` queries in newer versions\n\n**Key characteristic**: Copilot historically had no web access, relying entirely\non training data. Newer versions can search, but the experience is similar to\nChatGPT.\n\n## The JavaScript SPA Problem\n\nSanity's documentation site (`sanity.io/docs`) is built with Next.js \u2014 a\nJavaScript single-page application. When an agent makes a raw HTTP request:\n\n```\nGET https://www.sanity.io/docs/schema-types\n\u2192 Returns ~125KB of HTML that is mostly:\n - <script> tags for Next.js bundles\n - React hydration data\n - Navigation chrome\n - Very little actual documentation text\n```\n\nReal agents handle this differently than a raw `fetch()`:\n\n| Agent | Raw fetch? | Gets readable content? | How? 
|\n| ------------- | ---------- | ---------------------- | ---------------------------------- |\n| Claude Code | No | Yes | Built-in rendering in WebFetchTool |\n| ChatGPT | No | Yes | Server-side rendering via Bing |\n| Cursor | No | Yes | Pre-built doc index |\n| Raw `fetch()` | Yes | **No** | Gets HTML soup |\n\nThis is why the agentic provider uses **Jina Reader** (`r.jina.ai`) as a\nreadability proxy in \"naive\" mode \u2014 it simulates the rendering capability that\nreal agents have built in.\n\n## Sanity's Agent-Friendly Endpoints\n\nSanity has invested in making their documentation accessible to AI agents\nthrough special endpoints:\n\n### `.md` endpoint\n\nAppending `.md` to any docs URL returns pure markdown:\n\n```\nGET https://www.sanity.io/docs/schema-types.md\nContent-Type: text/markdown;charset=UTF-8\n\n# Schema types\nSchema types are used to define the shape of your content...\n```\n\nThis returns **clean markdown** \u2014 no HTML, no JavaScript, no navigation. Just\nthe documentation content. 
Typical response size: 2-10KB (vs 125KB for the HTML\npage).\n\n### `llms.txt`\n\nSanity provides an `llms.txt` file at `https://www.sanity.io/docs/llms.txt` \u2014 a\nstructured listing of all documentation pages designed for AI agent consumption:\n\n```\n# Sanity\n## Docs\n- [Manage Sanity with code](https://www.sanity.io/docs/blueprints)\n- [Introduction](https://www.sanity.io/docs/blueprints-introduction)\n- [Deploy with GitHub Actions](https://www.sanity.io/docs/blueprints/blueprint-action)\n...\n```\n\nThis follows the emerging [llms.txt standard](https://llmstxt.org/) \u2014 a\nmachine-readable table of contents that tells agents what documentation is\navailable and where to find it.\n\n### Impact on Agent Performance\n\nOur smoke tests show the dramatic difference these endpoints make:\n\n| Metric | Naive Agent (Jina) | Optimized Agent (.md) |\n| ---------------- | ------------------ | --------------------- |\n| Result | \u274C FAIL | \u2705 PASS |\n| Latency | 57.9s | 15.2s (3.8\xD7 faster) |\n| Bytes downloaded | 108 KB | 59 KB (45% less) |\n| Total requests | 9 | 6 (33% fewer) |\n| Search queries | 3 | 0 (used llms.txt) |\n\nThe optimized agent skips search entirely \u2014 it calls `list_docs(\"sanity.io\")` to\nget the `llms.txt` table of contents, identifies the relevant pages, and fetches\nthem directly as `.md`. 
No search round-trips, no proxy overhead, no content\ncleaning needed.\n\n## How Test Modes Map to Real Agents\n\n| Mode | Config | Simulates | Documentation Access |\n| -------------------------- | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------- |\n| `eval` (baseline) | `promptfooconfig.yaml` | No agent \u2014 docs in prompt | Docs are injected directly into the prompt context, with and without variants |\n| `eval:observed` | `promptfooconfig.observed.yaml` | Non-agentic API call | Single OpenAI API call, records the HTTP request but model doesn't browse |\n| `eval:agentic` (naive) | `promptfooconfig.agentic.yaml` | Claude Code, ChatGPT today | Model has `web_search` + `fetch_page` tools; pages fetched via Jina Reader (simulates JS rendering) |\n| `eval:agentic` (optimized) | `promptfooconfig.agentic.yaml` | Ideal future agent | Model has `web_search` + `fetch_page` + `list_docs` tools; fetches `.md` endpoints directly, uses `llms.txt` for discovery |\n| `agent-harness` | compiled via compiler | Real agent in sandbox | Agent harness mode evaluates real agent behavior in a sandboxed environment (Docker, tempdir, git-worktree) |\n\n### Why Both Naive and Optimized?\n\nThe comparison between naive and optimized modes answers a critical business\nquestion:\n\n> **\"How much does investing in agent-friendly documentation endpoints (`.md`,\n> `llms.txt`) improve the AI developer experience?\"**\n\nIf the optimized agent significantly outperforms the naive agent, it validates\nthe investment in these endpoints. The data from our tests provides concrete\nevidence for this.\n\n## Limitations of the Simulation\n\nWhile the agentic provider faithfully simulates agent behavior, there are\ndifferences from real agents:\n\n1. **Search quality**: We use DuckDuckGo via Jina as a search fallback. 
Real\n agents use Bing (ChatGPT) or their own search (Claude Code). Search result\n quality varies.\n\n2. **Page rendering**: Jina Reader is a good proxy for JS rendering, but may\n produce slightly different output than what Claude Code or ChatGPT's internal\n renderers produce.\n\n3. **Context window management**: Real agents have sophisticated context\n management \u2014 they may truncate long pages, summarize content, or use sliding\n windows. Our provider returns content up to a fixed limit (12KB).\n\n4. **Codebase context**: Real agents (especially Cursor and Copilot) inject the\n developer's current codebase into context. Our eval doesn't simulate this \u2014\n it only tests documentation retrieval.\n\n5. **Multi-turn interactions**: A real developer might have a conversation with\n their agent, refining the request. Our eval tests single-turn interactions.\n\n## Future Directions\n\nThe architecture overhaul (Phase 4: agent harness mode) addressed several of\nthese goals \u2014 real agents can now be evaluated in sandboxed environments with\nfixture provisioning, tool manifests, and process-quality scoring. Remaining\ndirections:\n\n- **Subprocess agents** _(partially addressed by agent harness mode)_: The\n harness supports running agents via entrypoints in Docker, tempdir, or\n git-worktree sandboxes. Real `claude` CLI or other agent CLIs can be\n configured as harness entrypoints.\n- **Anthropic/OpenAI native tools**: Use Claude's built-in `web_search` tool or\n OpenAI's `web_search_preview` in the Responses API for more faithful\n simulation of the agentic mode\n- **Agent-specific configs**: The compiler's mode handler system makes it\n straightforward to create per-agent configurations\n- **Codebase context injection** _(partially addressed by fixture\n provisioning)_: The agent harness fixture provisioner can inject project\n workspaces, dependency manifests, and code contexts into sandbox environments",
|
|
707
|
+
"source": "docs/how-agents-work.md",
|
|
708
|
+
"related": [
|
|
709
|
+
"eval-modes",
|
|
710
|
+
"retrieval-gap"
|
|
711
|
+
]
|
|
712
|
+
},
|
|
713
|
+
{
|
|
714
|
+
"id": "eval-modes",
|
|
715
|
+
"title": "Evaluation Modes",
|
|
716
|
+
"body": '> **This guide is for:** Anyone using AILF who wants to understand what modes\n> exist and when to use each one.\n\nAILF supports five canonical evaluation modes. Each mode measures a different\naspect of AI tool effectiveness.\n\n## Mode overview\n\n| Mode | What it measures | When to use it |\n| ------------------- | ---------------------------------------------------- | ------------------------------------- |\n| **literacy** | Can AI agents implement features using your docs? | Testing documentation quality |\n| **mcp-server** | Can an LLM correctly use your MCP server\'s tools? | Testing MCP server implementations |\n| **knowledge-probe** | What does the model know without any docs? | Measuring baseline model knowledge |\n| **agent-harness** | Can an autonomous agent complete tasks in a sandbox? | Testing agent capabilities end-to-end |\n| **custom** | Whatever you define | Building your own evaluation type |\n\n## Choosing a mode\n\n```\nWhat do you want to test?\n \u2502\n \u251C\u2500\u2500 "Are our docs helping AI agents?" \u2500\u2500\u2500\u2500\u2500\u2500\u2192 literacy\n \u251C\u2500\u2500 "Does our MCP server work correctly?" \u2500\u2500\u2192 mcp-server\n \u251C\u2500\u2500 "What does the model already know?" \u2500\u2500\u2500\u2500\u2192 knowledge-probe\n \u251C\u2500\u2500 "Can an agent complete real tasks?" \u2500\u2500\u2500\u2500\u2192 agent-harness\n \u2514\u2500\u2500 "Something else entirely" \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2192 custom\n```',
|
|
717
|
+
"source": "docs/modes.md",
|
|
718
|
+
"related": [
|
|
719
|
+
"scoring-model",
|
|
720
|
+
"three-layer"
|
|
721
|
+
]
|
|
722
|
+
},
|
|
723
|
+
{
|
|
724
|
+
"id": "glossary",
|
|
725
|
+
"title": "Glossary",
|
|
726
|
+
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall \u0394**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual \u0394**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap \u0394**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency \u0394**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Needs Attention (70\u201379)**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). 
High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion \u0394**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness \u0394**\n: Change in code correctness between runs. 
Positive means better code quality.\n\n**Doc Coverage \u0394**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area \u0394**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
727
|
+
"source": "packages/shared/src/glossary.ts",
|
|
728
|
+
"tags": [
|
|
729
|
+
"reference",
|
|
730
|
+
"glossary"
|
|
731
|
+
]
|
|
732
|
+
}
|
|
733
|
+
];
|
|
734
|
+
|
|
212
735
|
// ../shared/dist/score-grades.js
|
|
213
736
|
var GRADE_BOUNDARIES = {
|
|
214
737
|
good: 80,
|
|
@@ -322,6 +845,21 @@ var EVAL_REQUEST_STATUS_QUERY = (
|
|
|
322
845
|
}
|
|
323
846
|
`
|
|
324
847
|
);
|
|
848
|
+
/**
 * Build the pipeline request object for a single-task evaluation.
 *
 * The caller serializes the result with `JSON.stringify` and stores it on the
 * eval request document as `pipelineRequest`, so property order here is the
 * order that ends up in the stored JSON.
 *
 * @param {object} args
 * @param {string} [args.areaId] - When truthy, emitted as `areas: [areaId]`.
 * @param {string} args.dataset - Copied through as `dataset`.
 * @param {string} [args.projectId] - When truthy, emitted as `projectId`.
 * @param {string} args.tag - Emitted as `publishTag`.
 * @param {string} args.taskId - The one task to run; emitted as `tasks: [taskId]`.
 * @returns {object} The request payload (plain, JSON-serializable object).
 * @throws {TypeError} If `taskId` is missing or empty.
 */
function buildTaskEvalPipelineRequest(args) {
  const { areaId, dataset, projectId, tag, taskId } = args;
  // Fail fast: `[undefined]` would serialize to `[null]` and produce a request
  // that names no task at all.
  if (!taskId) {
    throw new TypeError(
      "buildTaskEvalPipelineRequest: taskId is required"
    );
  }
  return {
    dataset,
    mode: "literacy",
    // Optional keys are omitted entirely (not set to undefined) when absent.
    ...projectId ? { projectId } : {},
    publish: true,
    publishTag: tag,
    source: "production",
    taskMode: "content-lake",
    tasks: [taskId],
    variant: "full",
    ...areaId ? { areas: [areaId] } : {}
  };
}
|
|
325
863
|
var TASK_REPORT_QUERY = (
|
|
326
864
|
/* groq */
|
|
327
865
|
`
|
|
@@ -454,18 +992,22 @@ var RunTaskEvaluationAction = (props) => {
|
|
|
454
992
|
if (!taskId) return;
|
|
455
993
|
const tag = `task-${slugify(taskId)}-${dateStamp()}`;
|
|
456
994
|
const now = Date.now();
|
|
995
|
+
const pipelineRequest = buildTaskEvalPipelineRequest({
|
|
996
|
+
areaId,
|
|
997
|
+
dataset,
|
|
998
|
+
projectId,
|
|
999
|
+
tag,
|
|
1000
|
+
taskId
|
|
1001
|
+
});
|
|
457
1002
|
try {
|
|
458
1003
|
const evalRequest = await client.create({
|
|
459
1004
|
_type: EVAL_REQUEST_TYPE,
|
|
460
1005
|
dataset,
|
|
461
|
-
|
|
1006
|
+
pipelineRequest: JSON.stringify(pipelineRequest),
|
|
462
1007
|
projectId,
|
|
463
1008
|
requestedAt: new Date(now).toISOString(),
|
|
464
1009
|
requestedBy: currentUser?.id ?? "unknown",
|
|
465
|
-
status: "pending"
|
|
466
|
-
tag,
|
|
467
|
-
tasks: [taskId],
|
|
468
|
-
...areaId ? { areas: [areaId] } : {}
|
|
1010
|
+
status: "pending"
|
|
469
1011
|
});
|
|
470
1012
|
requestedAtRef.current = now;
|
|
471
1013
|
setState({
|
|
@@ -583,55 +1125,13 @@ var evalRequestSchema = defineType({
|
|
|
583
1125
|
type: "string"
|
|
584
1126
|
}),
|
|
585
1127
|
defineField({
|
|
586
|
-
description: "
|
|
587
|
-
group: ["main", "all-fields"],
|
|
588
|
-
initialValue: "baseline",
|
|
589
|
-
name: "mode",
|
|
590
|
-
options: {
|
|
591
|
-
list: [
|
|
592
|
-
{ title: "Baseline", value: "baseline" },
|
|
593
|
-
{ title: "Full", value: "full" },
|
|
594
|
-
{ title: "Observed", value: "observed" },
|
|
595
|
-
{ title: "Agentic", value: "agentic" }
|
|
596
|
-
]
|
|
597
|
-
},
|
|
598
|
-
readOnly: true,
|
|
599
|
-
title: "Mode",
|
|
600
|
-
type: "string"
|
|
601
|
-
}),
|
|
602
|
-
defineField({
|
|
603
|
-
description: "Content release perspective ID (required for release evals)",
|
|
1128
|
+
description: "Canonical PipelineRequest JSON. Source of truth for what the webhook handler dispatches. Parses against PipelineRequestSchema from @sanity/ailf-core.",
|
|
604
1129
|
group: ["main", "all-fields"],
|
|
605
|
-
name: "
|
|
606
|
-
readOnly: true,
|
|
607
|
-
title: "Perspective",
|
|
608
|
-
type: "string"
|
|
609
|
-
}),
|
|
610
|
-
defineField({
|
|
611
|
-
description: "Specific task IDs to evaluate (for task-scoped evals). When set, the pipeline only runs these tasks instead of the full suite.",
|
|
612
|
-
group: ["optional", "all-fields"],
|
|
613
|
-
name: "tasks",
|
|
614
|
-
of: [{ type: "string" }],
|
|
615
|
-
readOnly: true,
|
|
616
|
-
title: "Tasks",
|
|
617
|
-
type: "array"
|
|
618
|
-
}),
|
|
619
|
-
defineField({
|
|
620
|
-
description: "Feature areas to scope the evaluation. When set together with tasks, provides additional context for the pipeline.",
|
|
621
|
-
group: ["optional", "all-fields"],
|
|
622
|
-
name: "areas",
|
|
623
|
-
of: [{ type: "string" }],
|
|
624
|
-
readOnly: true,
|
|
625
|
-
title: "Areas",
|
|
626
|
-
type: "array"
|
|
627
|
-
}),
|
|
628
|
-
defineField({
|
|
629
|
-
description: "Run in debug mode (limits to a subset of test cases for fast feedback).",
|
|
630
|
-
group: ["optional", "all-fields"],
|
|
631
|
-
name: "debug",
|
|
1130
|
+
name: "pipelineRequest",
|
|
632
1131
|
readOnly: true,
|
|
633
|
-
title: "
|
|
634
|
-
type: "
|
|
1132
|
+
title: "Pipeline Request (JSON)",
|
|
1133
|
+
type: "text",
|
|
1134
|
+
validation: (rule) => rule.required()
|
|
635
1135
|
}),
|
|
636
1136
|
defineField({
|
|
637
1137
|
description: "Sanity project ID",
|
|
@@ -676,7 +1176,7 @@ var evalRequestSchema = defineType({
|
|
|
676
1176
|
type: "string"
|
|
677
1177
|
}),
|
|
678
1178
|
defineField({
|
|
679
|
-
description: "Request lifecycle: pending \u2192 dispatched \u2192 completed \u2192 failed. Only 'pending' is set by Studio; the webhook handler and pipeline callback update subsequent states.",
|
|
1179
|
+
description: "Request lifecycle: pending \u2192 dispatched \u2192 completed \u2192 failed. Only 'pending' is set by Studio/dashboard; the webhook handler and pipeline callback update subsequent states.",
|
|
680
1180
|
group: ["main", "all-fields"],
|
|
681
1181
|
initialValue: "pending",
|
|
682
1182
|
name: "status",
|
|
@@ -692,34 +1192,16 @@ var evalRequestSchema = defineType({
|
|
|
692
1192
|
title: "Status",
|
|
693
1193
|
type: "string",
|
|
694
1194
|
validation: (rule) => rule.required()
|
|
695
|
-
}),
|
|
696
|
-
defineField({
|
|
697
|
-
description: 'Publish tag for the report (e.g. "release-my-release-2026-03-16")',
|
|
698
|
-
group: ["optional", "all-fields"],
|
|
699
|
-
name: "tag",
|
|
700
|
-
readOnly: true,
|
|
701
|
-
title: "Tag",
|
|
702
|
-
type: "string"
|
|
703
1195
|
})
|
|
704
1196
|
],
|
|
705
1197
|
name: "ailf.evalRequest",
|
|
706
1198
|
preview: {
|
|
707
|
-
prepare({
|
|
1199
|
+
prepare({ status }) {
|
|
708
1200
|
const statusStr = typeof status === "string" ? status : "unknown";
|
|
709
|
-
|
|
710
|
-
if (typeof perspective === "string") {
|
|
711
|
-
title = perspective;
|
|
712
|
-
} else if (Array.isArray(tasks) && tasks.length > 0) {
|
|
713
|
-
title = tasks.length === 1 ? `Task: ${tasks[0]}` : `Tasks: ${tasks.slice(0, 3).join(", ")}${tasks.length > 3 ? "\u2026" : ""}`;
|
|
714
|
-
} else {
|
|
715
|
-
title = "Evaluation Request";
|
|
716
|
-
}
|
|
717
|
-
return { subtitle: statusStr, title };
|
|
1201
|
+
return { subtitle: statusStr, title: "Evaluation Request" };
|
|
718
1202
|
},
|
|
719
1203
|
select: {
|
|
720
|
-
|
|
721
|
-
status: "status",
|
|
722
|
-
tasks: "tasks"
|
|
1204
|
+
status: "status"
|
|
723
1205
|
}
|
|
724
1206
|
},
|
|
725
1207
|
title: "AILF Evaluation Request",
|
|
@@ -3621,318 +4103,108 @@ import {
|
|
|
3621
4103
|
Tab as Tab3,
|
|
3622
4104
|
TabList as TabList3,
|
|
3623
4105
|
TabPanel as TabPanel3,
|
|
3624
|
-
Text as Text54
|
|
3625
|
-
} from "@sanity/ui";
|
|
3626
|
-
import { useCallback as useCallback43, useEffect as useEffect18 } from "react";
|
|
3627
|
-
import { useRouter as useRouter5 } from "sanity/router";
|
|
3628
|
-
|
|
3629
|
-
// src/lib/help-context.ts
|
|
3630
|
-
import {
|
|
3631
|
-
createElement,
|
|
3632
|
-
createContext,
|
|
3633
|
-
useCallback as useCallback8,
|
|
3634
|
-
useContext,
|
|
3635
|
-
useState as useState4
|
|
3636
|
-
} from "react";
|
|
3637
|
-
var HelpContext = createContext(null);
|
|
3638
|
-
function HelpProvider({ children, defaultTopicId }) {
|
|
3639
|
-
const [state, setState] = useState4({
|
|
3640
|
-
open: false,
|
|
3641
|
-
topicId: defaultTopicId ?? null,
|
|
3642
|
-
history: []
|
|
3643
|
-
});
|
|
3644
|
-
const openHelp = useCallback8(
|
|
3645
|
-
(topicId) => {
|
|
3646
|
-
setState((prev) => ({
|
|
3647
|
-
open: true,
|
|
3648
|
-
topicId: topicId ?? prev.topicId ?? defaultTopicId ?? null,
|
|
3649
|
-
history: []
|
|
3650
|
-
}));
|
|
3651
|
-
},
|
|
3652
|
-
[defaultTopicId]
|
|
3653
|
-
);
|
|
3654
|
-
const closeHelp = useCallback8(() => {
|
|
3655
|
-
setState((prev) => ({ ...prev, open: false }));
|
|
3656
|
-
}, []);
|
|
3657
|
-
const navigateTo = useCallback8((topicId) => {
|
|
3658
|
-
setState((prev) => ({
|
|
3659
|
-
...prev,
|
|
3660
|
-
history: prev.topicId ? [...prev.history, prev.topicId] : prev.history,
|
|
3661
|
-
topicId
|
|
3662
|
-
}));
|
|
3663
|
-
}, []);
|
|
3664
|
-
const goBack = useCallback8(() => {
|
|
3665
|
-
setState((prev) => {
|
|
3666
|
-
if (prev.history.length === 0) return prev;
|
|
3667
|
-
const history = [...prev.history];
|
|
3668
|
-
const topicId = history.pop();
|
|
3669
|
-
return { ...prev, topicId, history };
|
|
3670
|
-
});
|
|
3671
|
-
}, []);
|
|
3672
|
-
const value = {
|
|
3673
|
-
open: state.open,
|
|
3674
|
-
topicId: state.topicId,
|
|
3675
|
-
openHelp,
|
|
3676
|
-
closeHelp,
|
|
3677
|
-
navigateTo,
|
|
3678
|
-
goBack,
|
|
3679
|
-
canGoBack: state.history.length > 0
|
|
3680
|
-
};
|
|
3681
|
-
return createElement(HelpContext.Provider, { value }, children);
|
|
3682
|
-
}
|
|
3683
|
-
function useHelp() {
|
|
3684
|
-
const ctx = useContext(HelpContext);
|
|
3685
|
-
if (!ctx) {
|
|
3686
|
-
throw new Error("useHelp must be used within a <HelpProvider>");
|
|
3687
|
-
}
|
|
3688
|
-
return ctx;
|
|
3689
|
-
}
|
|
3690
|
-
|
|
3691
|
-
// src/lib/judgment-drawer-context.tsx
|
|
3692
|
-
import {
|
|
3693
|
-
createContext as createContext2,
|
|
3694
|
-
useCallback as useCallback9,
|
|
3695
|
-
useContext as useContext2,
|
|
3696
|
-
useMemo as useMemo3,
|
|
3697
|
-
useState as useState5
|
|
3698
|
-
} from "react";
|
|
3699
|
-
import { jsx as jsx10 } from "react/jsx-runtime";
|
|
3700
|
-
var JudgmentDrawerContext = createContext2(
|
|
3701
|
-
null
|
|
3702
|
-
);
|
|
3703
|
-
function JudgmentDrawerProvider({ children }) {
|
|
3704
|
-
const [active, setActive] = useState5(null);
|
|
3705
|
-
const open = useCallback9((payload) => {
|
|
3706
|
-
setActive(payload);
|
|
3707
|
-
}, []);
|
|
3708
|
-
const close = useCallback9(() => {
|
|
3709
|
-
setActive(null);
|
|
3710
|
-
}, []);
|
|
3711
|
-
const value = useMemo3(
|
|
3712
|
-
() => ({ active, close, isOpen: active != null, open }),
|
|
3713
|
-
[active, close, open]
|
|
3714
|
-
);
|
|
3715
|
-
return /* @__PURE__ */ jsx10(JudgmentDrawerContext.Provider, { value, children });
|
|
3716
|
-
}
|
|
3717
|
-
function useJudgmentDrawer() {
|
|
3718
|
-
const ctx = useContext2(JudgmentDrawerContext);
|
|
3719
|
-
if (!ctx) {
|
|
3720
|
-
throw new Error(
|
|
3721
|
-
"useJudgmentDrawer must be called inside a <JudgmentDrawerProvider>"
|
|
3722
|
-
);
|
|
3723
|
-
}
|
|
3724
|
-
return ctx;
|
|
3725
|
-
}
|
|
3726
|
-
|
|
3727
|
-
// src/generated/help-content.ts
|
|
3728
|
-
var HELP_TOPICS = [
|
|
3729
|
-
{
|
|
3730
|
-
"id": "negative-doc-lift",
|
|
3731
|
-
"title": "When Docs Hurt: Negative Doc Lift",
|
|
3732
|
-
"body": `The ceiling score is _lower_ than the floor score. The canonical docs actively
|
|
3733
|
-
hurt the model's performance. This is a negative Doc Lift: the model produces
|
|
3734
|
-
better output from its training data alone than when given the "correct"
|
|
3735
|
-
documentation.
|
|
3736
|
-
|
|
3737
|
-
This will happen. It is not an error condition \u2014 it is a high-signal diagnostic
|
|
3738
|
-
that demands investigation. A negative Doc Lift means one or more of the
|
|
3739
|
-
following:
|
|
3740
|
-
|
|
3741
|
-
- **Outdated documentation** \u2014 the docs describe an older API version or
|
|
3742
|
-
pattern, and the model's training data has absorbed a newer version. The docs
|
|
3743
|
-
are actively teaching the model the wrong thing. _Action: update the docs to
|
|
3744
|
-
reflect the current API._
|
|
3745
|
-
|
|
3746
|
-
- **Misleading documentation** \u2014 the docs are technically accurate but
|
|
3747
|
-
structured in a way that leads the model to an incorrect implementation path.
|
|
3748
|
-
Ambiguous phrasing, conflicting code examples, or poor organization can cause
|
|
3749
|
-
a model to produce worse code than its baseline intuition. _Action:
|
|
3750
|
-
restructure or rewrite the docs for clarity._
|
|
3751
|
-
|
|
3752
|
-
- **Adversarial context dilution** \u2014 the canonical doc set is too large or
|
|
3753
|
-
contains too much tangential content, and the relevant signal gets buried in
|
|
3754
|
-
noise. The model performs better with no docs because fewer tokens means less
|
|
3755
|
-
distraction. _Action: trim the canonical doc set or restructure docs to
|
|
3756
|
-
front-load the most relevant information._
|
|
3757
|
-
|
|
3758
|
-
- **Model training data superiority** \u2014 for mature, well-known features (e.g.,
|
|
3759
|
-
basic GROQ queries, standard Studio setup), the model may have absorbed
|
|
3760
|
-
high-quality examples from training data (blog posts, Stack Overflow, open
|
|
3761
|
-
source code) that are simply better than the official documentation. _Action:
|
|
3762
|
-
improve the docs to meet or exceed the quality of community content the model
|
|
3763
|
-
learned from._`,
|
|
3764
|
-
"source": "docs/design-docs/scenario-matrix/evaluation-ceiling.md",
|
|
3765
|
-
"related": [
|
|
3766
|
-
"doc-lift",
|
|
3767
|
-
"three-layer"
|
|
3768
|
-
]
|
|
3769
|
-
},
|
|
3770
|
-
{
|
|
3771
|
-
"id": "three-layer",
|
|
3772
|
-
"title": "Floor, Ceiling, and Actual Scores",
|
|
3773
|
-
"body": "- **Doc Lift** = `ceiling score \u2212 floor score`. Positive when docs help,\n negative when docs hurt. This is the fundamental documentation value metric.\n When negative, it triggers the interference investigation described above\n- **Retrieval gap** = `ceiling score \u2212 agentic score`. The cost of imperfect\n discovery. Should be minimized via infrastructure investment (`llms.txt`,\n `.md` endpoints, better navigation, SEO). Note: this metric is only meaningful\n when Doc Lift is positive \u2014 if the docs themselves are harmful, a large\n retrieval gap might paradoxically be _protective_ (agents that can't find the\n bad docs may outperform agents that can)\n- **Doc quality gap** = `100 \u2212 ceiling score`. The ceiling itself is below\n perfect. This is the documentation team's problem \u2014 the docs need improvement\n regardless of how they're delivered\n- **Infrastructure efficiency** = `agentic score / ceiling score`. What fraction\n of the documentation's potential value actually reaches agents? A ratio of\n 0.90 means infrastructure delivers 90% of the doc quality. A ratio of 0.50\n means half the documentation value is lost in delivery. 
When the ceiling is\n below the floor, this ratio is undefined \u2014 report it as \"N/A (negative Doc\n Lift)\" rather than producing a misleading number\n\n### Decomposition: the healthy case (positive Doc Lift)\n\n```\nfloor score \u2192 model's inherent knowledge\n + Doc Lift (ceiling \u2212 floor) \u2192 documentation quality contribution\n \u2212 retrieval gap (ceiling \u2212 agentic) \u2192 discovery/infrastructure loss\n = agentic score \u2192 what users actually experience\n```\n\n### Decomposition: the interference case (negative Doc Lift)\n\n```\nfloor score \u2192 model's inherent knowledge\n \u2212 |Doc Lift| (floor \u2212 ceiling) \u2192 documentation interference penalty\n \u2212 retrieval gap (ceiling \u2212 agentic) \u2192 discovery/infrastructure loss\n = agentic score \u2192 what users actually experience\n\n but note: agentic score may exceed ceiling score here, because\n agents that fail to find the interfering docs avoid the penalty.\n In this case the \"retrieval gap\" inverts \u2014 poor retrieval is a\n net positive. The framework should flag this explicitly:\n \"Retrieval failure is masking a documentation quality problem.\"\n```\n\nBoth decompositions are valid expressions of the **core measurement model**.\nEvery scenario in the matrix is asking about one or more of these three layers:\ninherent knowledge, documentation quality, or discovery effectiveness. The sign\nof the Doc Lift determines which layer needs urgent attention.",
|
|
3774
|
-
"source": "docs/design-docs/scenario-matrix/evaluation-ceiling.md",
|
|
3775
|
-
"related": [
|
|
3776
|
-
"doc-lift",
|
|
3777
|
-
"negative-doc-lift",
|
|
3778
|
-
"retrieval-gap",
|
|
3779
|
-
"scoring-model"
|
|
3780
|
-
]
|
|
3781
|
-
},
|
|
3782
|
-
{
|
|
3783
|
-
"id": "comparing-runs",
|
|
3784
|
-
"title": "Comparing Evaluation Runs",
|
|
3785
|
-
"body": "## How comparison works\n\nThe Compare view lets you select any two evaluation reports and see a detailed\nside-by-side breakdown. One report is the **baseline** (your reference point)\nand the other is the **experiment** (what you're evaluating).\n\n## Reading the comparison\n\nFor each feature area, you'll see:\n\n- **Delta** \u2014 The score difference (experiment minus baseline). Positive means\n the experiment scored higher.\n- **Change** \u2014 Whether the delta is meaningful: **improved**, **regressed**, or\n **unchanged**. This accounts for the noise threshold \u2014 small deltas within the\n noise band are marked unchanged even if they're non-zero.\n\n## The noise threshold\n\nNot every score change is real. LLM responses vary between runs, and the grader\nmodel has its own variance. The comparison applies a noise threshold (currently\nconfigured per-evaluation) to filter out statistical noise. Only changes that\nexceed this threshold are classified as improvements or regressions.\n\n## What to compare\n\n- **Before and after a doc change** \u2014 Did your edits actually improve scores?\n- **Production vs. branch** \u2014 Will publishing this content release help or hurt?\n- **Different models** \u2014 How does Claude compare to GPT on the same docs?\n- **Baseline vs. full mode** \u2014 Is the retrieval gap shrinking over time?\n\n## Per-area deltas\n\nThe most actionable part of the comparison is the per-area breakdown. If your\noverall score improved but one area regressed, the per-area deltas tell you\nexactly which area needs attention.",
|
|
3786
|
-
"source": "docs/help/comparing-runs.md",
|
|
3787
|
-
"related": [
|
|
3788
|
-
"reading-score-trends",
|
|
3789
|
-
"scoring-model"
|
|
3790
|
-
]
|
|
3791
|
-
},
|
|
3792
|
-
{
|
|
3793
|
-
"id": "doc-lift",
|
|
3794
|
-
"title": "Doc Lift: Do Your Docs Help?",
|
|
3795
|
-
"body": "## What is doc lift?\n\n**Doc lift** is the difference between how an AI performs _with_ your\ndocumentation and how it performs _without_ it:\n\n```\nDoc lift = ceiling score \u2212 floor score\n```\n\n- **Ceiling score** \u2014 the AI's score when gold-standard docs are injected\n directly into its prompt.\n- **Floor score** \u2014 the AI's score with no documentation at all, relying only on\n its training data.\n\nA positive doc lift means your docs are helping. The higher the number, the more\nvalue your documentation provides beyond what the model already knows.\n\n## What good doc lift looks like\n\n- **Doc lift of 15+** \u2014 Your docs are providing crucial information the model\n doesn't already know. This is a strong signal that the docs are worth\n maintaining and improving.\n- **Doc lift of 5\u201315** \u2014 Docs are helping, but the model's training data covers\n a lot of the ground already. The docs are adding incremental value.\n- **Doc lift near zero** \u2014 The docs aren't adding much. Either the model already\n knows the material, or the docs aren't providing useful implementation\n guidance.\n\n## Negative doc lift\n\nA **negative** doc lift means the documentation is actively hurting the AI's\nperformance \u2014 the model produces _better_ code without your docs than with them.\nThis is never ignorable. See the \"When Docs Hurt\" help topic for causes and\nremediation.\n\n## Where you see it\n\nDoc lift appears in several places in the dashboard:\n\n- **Overview stats** \u2014 The aggregate doc lift across all areas.\n- **Per-area score table** \u2014 Doc lift for each feature area, so you can see\n which docs help most and which help least.\n- **Comparison view** \u2014 Doc lift deltas between two runs, showing whether your\n doc changes increased or decreased documentation value.",
|
|
3796
|
-
"source": "docs/help/doc-lift.md",
|
|
3797
|
-
"related": [
|
|
3798
|
-
"three-layer",
|
|
3799
|
-
"negative-doc-lift",
|
|
3800
|
-
"scoring-model"
|
|
3801
|
-
]
|
|
3802
|
-
},
|
|
3803
|
-
{
|
|
3804
|
-
"id": "getting-started",
|
|
3805
|
-
"title": "Getting Started",
|
|
3806
|
-
"body": `## What does AILF measure?
|
|
3807
|
-
|
|
3808
|
-
The AI Literacy Framework measures how well your documentation helps AI coding
|
|
3809
|
-
tools (like Claude Code, Cursor, ChatGPT, and Copilot) implement features
|
|
3810
|
-
correctly. When a developer asks an AI agent "set up a webhook in Sanity," the
|
|
3811
|
-
agent needs to find the right docs and produce working code. AILF scores how
|
|
3812
|
-
well that goes.
|
|
3813
|
-
|
|
3814
|
-
## The key number: AI Literacy Score
|
|
3815
|
-
|
|
3816
|
-
Every evaluation produces a score from 0\u2013100, composed of three dimensions:
|
|
3817
|
-
|
|
3818
|
-
- **Task Completion (50%)** \u2014 Can the AI implement the feature at all?
|
|
3819
|
-
- **Code Correctness (25%)** \u2014 Is the generated code correct and idiomatic?
|
|
3820
|
-
- **Doc Coverage (25%)** \u2014 Did the docs provide the information needed?
|
|
3821
|
-
|
|
3822
|
-
Higher is better. Scores above 80 mean the docs are working well for that
|
|
3823
|
-
feature. Scores below 70 need attention.
|
|
3824
|
-
|
|
3825
|
-
## What you see in the dashboard
|
|
3826
|
-
|
|
3827
|
-
- **Latest Reports** \u2014 The most recent evaluation runs with scores, areas
|
|
3828
|
-
tested, and trend indicators.
|
|
3829
|
-
- **Score Timeline** \u2014 How scores change over time. Look for upward trends after
|
|
3830
|
-
doc improvements.
|
|
3831
|
-
- **Compare** \u2014 Side-by-side comparison of any two runs to see what improved or
|
|
3832
|
-
regressed.
|
|
3833
|
-
|
|
3834
|
-
Click into any report for the full breakdown: per-area scores, diagnostics, and
|
|
3835
|
-
(for full-mode runs) how AI agents actually navigated your docs.
|
|
4106
|
+
Text as Text54
|
|
4107
|
+
} from "@sanity/ui";
|
|
4108
|
+
import { useCallback as useCallback43, useEffect as useEffect18 } from "react";
|
|
4109
|
+
import { useRouter as useRouter5 } from "sanity/router";
|
|
3836
4110
|
|
|
3837
|
-
|
|
4111
|
+
// src/lib/help-context.ts
|
|
4112
|
+
import {
|
|
4113
|
+
createElement,
|
|
4114
|
+
createContext,
|
|
4115
|
+
useCallback as useCallback8,
|
|
4116
|
+
useContext,
|
|
4117
|
+
useState as useState4
|
|
4118
|
+
} from "react";
|
|
4119
|
+
var HelpContext = createContext(null);
|
|
4120
|
+
/**
 * Context provider holding the help panel state: whether it is open, the
 * current topic id, and a back-navigation history of previously shown topics.
 *
 * @param {object} props
 * @param {*} props.children - Passed through as the provider's children.
 * @param {string} [props.defaultTopicId] - Initial topic, and the fallback
 *   used by `openHelp` when no topic is requested and none is current.
 */
function HelpProvider({ children, defaultTopicId }) {
  const [helpState, updateHelpState] = useState4({
    open: false,
    topicId: defaultTopicId ?? null,
    history: []
  });
  // Opening always starts a fresh navigation history.
  const openHelp = useCallback8(
    (requestedTopicId) => {
      updateHelpState((current) => {
        const nextTopicId = requestedTopicId ?? current.topicId ?? defaultTopicId ?? null;
        return { open: true, topicId: nextTopicId, history: [] };
      });
    },
    [defaultTopicId]
  );
  // Closing keeps the topic and history so only `open` flips.
  const closeHelp = useCallback8(() => {
    updateHelpState((current) => ({ ...current, open: false }));
  }, []);
  // Push the current topic (if any) onto the history, then switch topics.
  const navigateTo = useCallback8((nextTopicId) => {
    updateHelpState((current) => {
      const history = current.topicId ? [...current.history, current.topicId] : current.history;
      return { ...current, history, topicId: nextTopicId };
    });
  }, []);
  // Pop the most recent history entry; no-op when there is nothing to go back to.
  const goBack = useCallback8(() => {
    updateHelpState((current) => {
      if (current.history.length === 0) return current;
      const topicId = current.history[current.history.length - 1];
      const history = current.history.slice(0, -1);
      return { ...current, topicId, history };
    });
  }, []);
  const { open, topicId, history } = helpState;
  return createElement(
    HelpContext.Provider,
    {
      value: {
        open,
        topicId,
        openHelp,
        closeHelp,
        navigateTo,
        goBack,
        canGoBack: history.length > 0
      }
    },
    children
  );
}
|
|
4165
|
+
/**
 * Read the help context value.
 *
 * @returns {object} The value supplied by the nearest `HelpContext.Provider`.
 * @throws {Error} When the context value is falsy, i.e. no provider is mounted
 *   above the caller (the context default is `null`).
 */
function useHelp() {
  const helpApi = useContext(HelpContext);
  if (helpApi) {
    return helpApi;
  }
  throw new Error("useHelp must be used within a <HelpProvider>");
}
|
|
3838
4172
|
|
|
3839
|
-
|
|
3840
|
-
|
|
3841
|
-
|
|
3842
|
-
|
|
3843
|
-
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
3850
|
-
|
|
3851
|
-
|
|
3852
|
-
|
|
3853
|
-
|
|
3854
|
-
|
|
3855
|
-
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
|
|
3860
|
-
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
|
|
3865
|
-
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
"
|
|
3870
|
-
|
|
3871
|
-
},
|
|
3872
|
-
{
|
|
3873
|
-
"id": "retrieval-gap",
|
|
3874
|
-
"title": "Retrieval Gap & Infrastructure Efficiency",
|
|
3875
|
-
"body": "## What is the retrieval gap?\n\nThe **retrieval gap** is the difference between the **ceiling score** (docs\ninjected directly into the AI's prompt) and the **actual score** (the AI agent\nfinds docs on its own via web search). It measures how much documentation\nquality is lost because agents can't find the right pages.\n\n```\nRetrieval gap = ceiling score \u2212 actual score\n```\n\nA retrieval gap of zero means agents find everything. A gap of 20 means 20\npoints of doc quality never reach the agents.\n\n## What is infrastructure efficiency?\n\n**Infrastructure efficiency** expresses the retrieval gap as a ratio:\n\n```\nInfrastructure efficiency = actual score / ceiling score\n```\n\nAn efficiency of 90% means agents capture 90% of your docs' potential. An\nefficiency of 50% means half the documentation value is lost to discoverability\nproblems.\n\n## What causes a large retrieval gap?\n\n- **Poor page titles** \u2014 Agents search by keyword. If your page title doesn't\n match what a developer would ask, agents won't find it.\n- **Missing from search indexes** \u2014 Pages that aren't indexed by search engines\n are invisible to agents that rely on web search.\n- **No `llms.txt`** \u2014 An `llms.txt` file gives agents a table of contents.\n Without it, they rely entirely on search queries.\n- **No `.md` endpoints** \u2014 Agents that can fetch clean markdown directly\n (instead of parsing HTML) get better context with less noise.\n- **Content spread across many pages** \u2014 If implementing a feature requires\n reading 5 different pages, agents are less likely to find all of them.\n\n## How to shrink the retrieval gap\n\n1. **Add clear, keyword-rich page titles** that match how developers phrase\n their questions.\n2. **Ensure pages are indexed** by search engines (no `noindex` meta tags on doc\n pages).\n3. **Provide `llms.txt`** at your docs root so agents can browse a structured\n table of contents.\n4. 
**Provide `.md` endpoints** so agents can fetch clean markdown instead of\n parsing JavaScript-rendered HTML.\n5. **Consolidate related content** \u2014 fewer, more comprehensive pages are easier\n for agents to find than many small fragments.\n\n## When the retrieval gap inverts\n\nIn rare cases, the actual score exceeds the ceiling score \u2014 agents that can't\nfind the docs perform _better_ than agents with gold-standard docs injected.\nThis means the docs themselves are hurting performance (negative doc lift), and\nagents that fail to find them accidentally avoid the damage. The dashboard flags\nthis as an **inverted retrieval gap** \u2014 it's a documentation quality problem,\nnot a discoverability win.",
|
|
3876
|
-
"source": "docs/help/retrieval-gap.md",
|
|
3877
|
-
"related": [
|
|
3878
|
-
"three-layer",
|
|
3879
|
-
"eval-modes",
|
|
3880
|
-
"how-agents-work"
|
|
3881
|
-
]
|
|
3882
|
-
},
|
|
3883
|
-
{
|
|
3884
|
-
"id": "scoring-model",
|
|
3885
|
-
"title": "Understanding Scores",
|
|
3886
|
-
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0\u2013100:\n\n- **Task Completion (50% weight)** \u2014 Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** \u2014 Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** \u2014 Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs): Total = Task \xD7 0.50 + Code \xD7 0.25 + Docs \xD7 0.25\nBaseline (no docs): Total = Task \xD7 0.60 + Code \xD7 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling \u2212 floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0\u2013100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80\u2013100** | Docs are working well \u2014 AI agents produce correct implementations |\n| **70\u201379** | Needs attention \u2014 there may be gaps in specific dimensions |\n| **Below 70** | Weak \u2014 AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice \u2014 with and without\ndocumentation. 
This produces:\n\n- **Floor score** \u2014 Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** \u2014 Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** \u2014 Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** \u2014 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement \u2014 what happens when AI agents find docs on\ntheir own:\n\n- **Floor** \u2014 No docs (parametric knowledge only)\n- **Ceiling** \u2014 Gold-standard docs injected (best the docs can do)\n- **Actual** \u2014 Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** \u2014 Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** \u2014 Actual \xF7 ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** \u2014 Token usage for generating implementations\n- **Grader cost** \u2014 Token usage for the grading model's assessments\n- **Total cost** \u2014 Both combined, reported in the score summary",
|
|
3887
|
-
"source": "docs/help/scoring-model.md",
|
|
3888
|
-
"related": [
|
|
3889
|
-
"three-layer",
|
|
3890
|
-
"doc-lift",
|
|
3891
|
-
"eval-modes"
|
|
3892
|
-
]
|
|
3893
|
-
},
|
|
3894
|
-
{
|
|
3895
|
-
"id": "weaknesses-recommendations",
|
|
3896
|
-
"title": "Weaknesses & Recommendations",
|
|
3897
|
-
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** \u2014 Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** \u2014 Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** \u2014 How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** \u2014 specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** \u2014 The type of doc problem identified:\n - `missing-docs` \u2014 The functionality isn't documented at all.\n - `incorrect-docs` \u2014 The docs contain factual errors.\n - `outdated-docs` \u2014 The docs describe an old API version or pattern.\n - `poor-structure` \u2014 The docs exist but are hard to find or understand.\n- **Estimated lift** \u2014 How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate \u2014 actual improvement may be higher.\n- **Confidence** \u2014 How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** \u2014 Which specific evaluation tasks exposed this gap.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong \u2014 missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
3898
|
-
"source": "docs/help/weaknesses-recommendations.md",
|
|
3899
|
-
"related": [
|
|
3900
|
-
"interpreting-diagnostics",
|
|
3901
|
-
"scoring-model",
|
|
3902
|
-
"negative-doc-lift"
|
|
3903
|
-
]
|
|
3904
|
-
},
|
|
3905
|
-
{
|
|
3906
|
-
"id": "how-agents-work",
|
|
3907
|
-
"title": "How AI Agents Find Documentation",
|
|
3908
|
-
"body": "Understanding how popular AI coding agents retrieve and use documentation is\ncentral to the ai-literacy-framework evaluation framework. This document\nexplains the mechanisms used by common agents and how our test modes simulate\nthem.\n\n## The Documentation Access Problem\n\nWhen a developer asks an AI coding assistant \"Set up a Sanity Studio with a\ncustom blog schema,\" the agent needs to find and read the relevant Sanity\ndocumentation. But different agents do this in fundamentally different ways, and\nthose differences directly impact the quality of the response.\n\nThe framework measures this impact through four evaluation modes: **full**\n(default \u2014 runs baseline + agentic together), **baseline** (docs in prompt),\n**agentic** (tool-calling with real web access), and **observed** (instrumented\nsingle-call).\n\n## How Popular Agents Work\n\n### Claude Code (Anthropic)\n\nClaude Code has built-in tools including `WebSearchTool` and `WebFetchTool`.\nWhen a user asks a Sanity question:\n\n1. The model decides whether to search the web\n2. If so, it calls `WebSearchTool` with a query string\n3. Search results come back as structured data (titles, URLs, snippets)\n4. The model may call `WebFetchTool` to read specific pages\n5. The fetched content is returned as **rendered text** \u2014 Claude Code's fetch\n tool handles JavaScript rendering internally, so even SPA pages return\n readable content\n6. The model synthesizes the fetched docs with its training data to produce an\n answer\n\n**Key characteristic**: Claude Code sees the web as rendered, readable text. It\ndoesn't get raw HTML soup. But it also doesn't know about agent-friendly\nendpoints like `.md` files or `llms.txt` \u2014 it fetches the same HTML pages a\nbrowser would load.\n\n### ChatGPT (OpenAI)\n\nChatGPT's browsing capability uses Bing search under the hood:\n\n1. The model decides to search (users can also explicitly ask it to browse)\n2. 
It searches via Bing, getting ranked results\n3. It can \"click\" on results to read page content\n4. Pages are rendered server-side and returned as text\n5. The model reads relevant sections and synthesizes an answer\n\n**Key characteristic**: ChatGPT's browsing is similar to Claude Code \u2014 it gets\nrendered content. The URLs visited are returned in citations. It also has no\nawareness of `llms.txt` or `.md` endpoints.\n\n### Cursor\n\nCursor takes a different approach:\n\n1. It maintains a pre-built index of popular documentation sites (`@docs`)\n2. Users can manually add documentation sources\n3. It also has web search capability for unknown topics\n4. Codebase context is injected automatically from the project\n\n**Key characteristic**: Cursor's `@docs` feature means it may have indexed\nSanity docs already, but the index may be outdated. For unknown topics, it falls\nback to web search like the other agents.\n\n### GitHub Copilot\n\nCopilot primarily relies on:\n\n1. The model's training data (parametric knowledge)\n2. Codebase context from the current project\n3. Bing search for `@workspace` queries in newer versions\n\n**Key characteristic**: Copilot historically had no web access, relying entirely\non training data. Newer versions can search, but the experience is similar to\nChatGPT.\n\n## The JavaScript SPA Problem\n\nSanity's documentation site (`sanity.io/docs`) is built with Next.js \u2014 a\nJavaScript single-page application. When an agent makes a raw HTTP request:\n\n```\nGET https://www.sanity.io/docs/schema-types\n\u2192 Returns ~125KB of HTML that is mostly:\n - <script> tags for Next.js bundles\n - React hydration data\n - Navigation chrome\n - Very little actual documentation text\n```\n\nReal agents handle this differently than a raw `fetch()`:\n\n| Agent | Raw fetch? | Gets readable content? | How? 
|\n| ------------- | ---------- | ---------------------- | ---------------------------------- |\n| Claude Code | No | Yes | Built-in rendering in WebFetchTool |\n| ChatGPT | No | Yes | Server-side rendering via Bing |\n| Cursor | No | Yes | Pre-built doc index |\n| Raw `fetch()` | Yes | **No** | Gets HTML soup |\n\nThis is why the agentic provider uses **Jina Reader** (`r.jina.ai`) as a\nreadability proxy in \"naive\" mode \u2014 it simulates the rendering capability that\nreal agents have built in.\n\n## Sanity's Agent-Friendly Endpoints\n\nSanity has invested in making their documentation accessible to AI agents\nthrough special endpoints:\n\n### `.md` endpoint\n\nAppending `.md` to any docs URL returns pure markdown:\n\n```\nGET https://www.sanity.io/docs/schema-types.md\nContent-Type: text/markdown;charset=UTF-8\n\n# Schema types\nSchema types are used to define the shape of your content...\n```\n\nThis returns **clean markdown** \u2014 no HTML, no JavaScript, no navigation. Just\nthe documentation content. 
Typical response size: 2-10KB (vs 125KB for the HTML\npage).\n\n### `llms.txt`\n\nSanity provides an `llms.txt` file at `https://www.sanity.io/docs/llms.txt` \u2014 a\nstructured listing of all documentation pages designed for AI agent consumption:\n\n```\n# Sanity\n## Docs\n- [Manage Sanity with code](https://www.sanity.io/docs/blueprints)\n- [Introduction](https://www.sanity.io/docs/blueprints-introduction)\n- [Deploy with GitHub Actions](https://www.sanity.io/docs/blueprints/blueprint-action)\n...\n```\n\nThis follows the emerging [llms.txt standard](https://llmstxt.org/) \u2014 a\nmachine-readable table of contents that tells agents what documentation is\navailable and where to find it.\n\n### Impact on Agent Performance\n\nOur smoke tests show the dramatic difference these endpoints make:\n\n| Metric | Naive Agent (Jina) | Optimized Agent (.md) |\n| ---------------- | ------------------ | --------------------- |\n| Result | \u274C FAIL | \u2705 PASS |\n| Latency | 57.9s | 15.2s (3.8\xD7 faster) |\n| Bytes downloaded | 108 KB | 59 KB (45% less) |\n| Total requests | 9 | 6 (33% fewer) |\n| Search queries | 3 | 0 (used llms.txt) |\n\nThe optimized agent skips search entirely \u2014 it calls `list_docs(\"sanity.io\")` to\nget the `llms.txt` table of contents, identifies the relevant pages, and fetches\nthem directly as `.md`. 
No search round-trips, no proxy overhead, no content\ncleaning needed.\n\n## How Test Modes Map to Real Agents\n\n| Mode | Config | Simulates | Documentation Access |\n| -------------------------- | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------- |\n| `eval` (baseline) | `promptfooconfig.yaml` | No agent \u2014 docs in prompt | Docs are injected directly into the prompt context, with and without variants |\n| `eval:observed` | `promptfooconfig.observed.yaml` | Non-agentic API call | Single OpenAI API call, records the HTTP request but model doesn't browse |\n| `eval:agentic` (naive) | `promptfooconfig.agentic.yaml` | Claude Code, ChatGPT today | Model has `web_search` + `fetch_page` tools; pages fetched via Jina Reader (simulates JS rendering) |\n| `eval:agentic` (optimized) | `promptfooconfig.agentic.yaml` | Ideal future agent | Model has `web_search` + `fetch_page` + `list_docs` tools; fetches `.md` endpoints directly, uses `llms.txt` for discovery |\n| `agent-harness` | compiled via compiler | Real agent in sandbox | Agent harness mode evaluates real agent behavior in a sandboxed environment (Docker, tempdir, git-worktree) |\n\n### Why Both Naive and Optimized?\n\nThe comparison between naive and optimized modes answers a critical business\nquestion:\n\n> **\"How much does investing in agent-friendly documentation endpoints (`.md`,\n> `llms.txt`) improve the AI developer experience?\"**\n\nIf the optimized agent significantly outperforms the naive agent, it validates\nthe investment in these endpoints. The data from our tests provides concrete\nevidence for this.\n\n## Limitations of the Simulation\n\nWhile the agentic provider faithfully simulates agent behavior, there are\ndifferences from real agents:\n\n1. **Search quality**: We use DuckDuckGo via Jina as a search fallback. 
Real\n agents use Bing (ChatGPT) or their own search (Claude Code). Search result\n quality varies.\n\n2. **Page rendering**: Jina Reader is a good proxy for JS rendering, but may\n produce slightly different output than what Claude Code or ChatGPT's internal\n renderers produce.\n\n3. **Context window management**: Real agents have sophisticated context\n management \u2014 they may truncate long pages, summarize content, or use sliding\n windows. Our provider returns content up to a fixed limit (12KB).\n\n4. **Codebase context**: Real agents (especially Cursor and Copilot) inject the\n developer's current codebase into context. Our eval doesn't simulate this \u2014\n it only tests documentation retrieval.\n\n5. **Multi-turn interactions**: A real developer might have a conversation with\n their agent, refining the request. Our eval tests single-turn interactions.\n\n## Future Directions\n\nThe architecture overhaul (Phase 4: agent harness mode) addressed several of\nthese goals \u2014 real agents can now be evaluated in sandboxed environments with\nfixture provisioning, tool manifests, and process-quality scoring. Remaining\ndirections:\n\n- **Subprocess agents** _(partially addressed by agent harness mode)_: The\n harness supports running agents via entrypoints in Docker, tempdir, or\n git-worktree sandboxes. Real `claude` CLI or other agent CLIs can be\n configured as harness entrypoints.\n- **Anthropic/OpenAI native tools**: Use Claude's built-in `web_search` tool or\n OpenAI's `web_search_preview` in the Responses API for more faithful\n simulation of the agentic mode\n- **Agent-specific configs**: The compiler's mode handler system makes it\n straightforward to create per-agent configurations\n- **Codebase context injection** _(partially addressed by fixture\n provisioning)_: The agent harness fixture provisioner can inject project\n workspaces, dependency manifests, and code contexts into sandbox environments",
|
|
3909
|
-
"source": "docs/how-agents-work.md",
|
|
3910
|
-
"related": [
|
|
3911
|
-
"eval-modes",
|
|
3912
|
-
"retrieval-gap"
|
|
3913
|
-
]
|
|
3914
|
-
},
|
|
3915
|
-
{
|
|
3916
|
-
"id": "eval-modes",
|
|
3917
|
-
"title": "Evaluation Modes",
|
|
3918
|
-
"body": '> **This guide is for:** Anyone using AILF who wants to understand what modes\n> exist and when to use each one.\n\nAILF supports five canonical evaluation modes. Each mode measures a different\naspect of AI tool effectiveness.\n\n## Mode overview\n\n| Mode | What it measures | When to use it |\n| ------------------- | ---------------------------------------------------- | ------------------------------------- |\n| **literacy** | Can AI agents implement features using your docs? | Testing documentation quality |\n| **mcp-server** | Can an LLM correctly use your MCP server\'s tools? | Testing MCP server implementations |\n| **knowledge-probe** | What does the model know without any docs? | Measuring baseline model knowledge |\n| **agent-harness** | Can an autonomous agent complete tasks in a sandbox? | Testing agent capabilities end-to-end |\n| **custom** | Whatever you define | Building your own evaluation type |\n\n## Choosing a mode\n\n```\nWhat do you want to test?\n \u2502\n \u251C\u2500\u2500 "Are our docs helping AI agents?" \u2500\u2500\u2500\u2500\u2500\u2500\u2192 literacy\n \u251C\u2500\u2500 "Does our MCP server work correctly?" \u2500\u2500\u2192 mcp-server\n \u251C\u2500\u2500 "What does the model already know?" \u2500\u2500\u2500\u2500\u2192 knowledge-probe\n \u251C\u2500\u2500 "Can an agent complete real tasks?" \u2500\u2500\u2500\u2500\u2192 agent-harness\n \u2514\u2500\u2500 "Something else entirely" \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2192 custom\n```',
|
|
3919
|
-
"source": "docs/modes.md",
|
|
3920
|
-
"related": [
|
|
3921
|
-
"scoring-model",
|
|
3922
|
-
"three-layer"
|
|
3923
|
-
]
|
|
3924
|
-
},
|
|
3925
|
-
{
|
|
3926
|
-
"id": "glossary",
|
|
3927
|
-
"title": "Glossary",
|
|
3928
|
-
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).\n\n**Inverted Ret Gap**\n: \u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0\u2013100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Delta**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Delta**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret Gap Delta**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Delta**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Health Strong**\n: Feature areas scoring 80 or above. 
The docs are working well for these features \u2014 AI agents produce correct, complete implementations.\n\n**Health Attention**\n: Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Health Weak**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift Metric**\n: Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dim Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). 
High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior Overview**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Doc Slugs Visited**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Doc Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches Performed**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time Ms**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Dim Task Completion**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Dim Code Correctness**\n: Change in code correctness between runs. 
Positive means better code quality.\n\n**Dim Doc Coverage**\n: Change in doc coverage between runs. Positive means the docs are providing more useful information.\n\n**Area Delta**\n: Score change for this area compared to the previous evaluation run.\n\n**Source Production**\n: Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Source Branch**\n: Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Source Local**\n: Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Report Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Report Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Report Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Mode Baseline**\n: Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Mode Full**\n: Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Mode Agentic**\n: Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Mode Observed**\n: Observed mode \u2014 records how agents interact with docs without scoring. 
Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Mode Debug**\n: Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.\n\n**Trigger Manual**\n: Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**Trigger Ci**\n: CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Trigger Schedule**\n: Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Trigger Webhook**\n: Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Trigger Cross Repo**\n: Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
3929
|
-
"source": "packages/studio/src/glossary.ts",
|
|
3930
|
-
"tags": [
|
|
3931
|
-
"reference",
|
|
3932
|
-
"glossary"
|
|
3933
|
-
]
|
|
4173
|
+
// src/lib/judgment-drawer-context.tsx
|
|
4174
|
+
import {
|
|
4175
|
+
createContext as createContext2,
|
|
4176
|
+
useCallback as useCallback9,
|
|
4177
|
+
useContext as useContext2,
|
|
4178
|
+
useMemo as useMemo3,
|
|
4179
|
+
useState as useState5
|
|
4180
|
+
} from "react";
|
|
4181
|
+
import { jsx as jsx10 } from "react/jsx-runtime";
|
|
4182
|
+
// React context carrying the judgment drawer state. The default value is
// `null`; `useJudgmentDrawer` throws when it reads that default.
var JudgmentDrawerContext = createContext2(null);
|
|
4185
|
+
/**
 * Context provider that owns the judgment drawer's state.
 *
 * Keeps the currently active payload (`null` when nothing is open) and
 * publishes `{ active, close, isOpen, open }` through `JudgmentDrawerContext`.
 *
 * @param {object} props
 * @param {*} props.children - Content rendered inside the provider.
 * @returns The `JudgmentDrawerContext.Provider` element wrapping `children`.
 */
function JudgmentDrawerProvider({ children }) {
  // `null` means no payload is active, which is what `isOpen` reports as false.
  const [active, setActive] = useState5(null);
  // Both callbacks use an empty dependency list, so their identity is stable.
  const open = useCallback9((payload) => {
    setActive(payload);
  }, []);
  const close = useCallback9(() => {
    setActive(null);
  }, []);
  // Memoised so the context value is only rebuilt when `active` changes.
  const value = useMemo3(
    () => ({ active, close, isOpen: active != null, open }),
    [active, close, open]
  );
  return /* @__PURE__ */ jsx10(JudgmentDrawerContext.Provider, { value, children });
}
|
|
4199
|
+
/**
 * Reads the current value of `JudgmentDrawerContext`.
 *
 * @returns The context value supplied by the nearest provider.
 * @throws {Error} When the context value is falsy, i.e. the hook is used
 *   outside a `<JudgmentDrawerProvider>`.
 */
function useJudgmentDrawer() {
  const ctx = useContext2(JudgmentDrawerContext);
  if (!ctx) {
    throw new Error(
      "useJudgmentDrawer must be called inside a <JudgmentDrawerProvider>"
    );
  }
  return ctx;
}
|
|
3936
4208
|
|
|
3937
4209
|
// src/lib/help-topics.ts
|
|
3938
4210
|
function deriveHelpTopic(routerState) {
|
|
@@ -5296,100 +5568,6 @@ import {
|
|
|
5296
5568
|
Text as Text17
|
|
5297
5569
|
} from "@sanity/ui";
|
|
5298
5570
|
|
|
5299
|
-
// src/glossary.ts
|
|
5300
|
-
var GLOSSARY = {
|
|
5301
|
-
// -- Overview stats -------------------------------------------------------
|
|
5302
|
-
overallScore: "A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).",
|
|
5303
|
-
docLift: "How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.",
|
|
5304
|
-
actualScore: "How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.",
|
|
5305
|
-
retrievalGap: "The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.",
|
|
5306
|
-
infraEfficiency: "What percentage of the docs' potential quality actually reaches agents (actual \xF7 ceiling). 100% means agents find and use all relevant docs perfectly.",
|
|
5307
|
-
// -- Three-layer decomposition columns ------------------------------------
|
|
5308
|
-
floor: "Output-quality composite without documentation \u2014 Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.",
|
|
5309
|
-
ceiling: "Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.",
|
|
5310
|
-
actual: "Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.",
|
|
5311
|
-
retGap: "Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.",
|
|
5312
|
-
efficiency: "What fraction of the docs' quality reaches agents in practice (actual \xF7 ceiling, shown as a percentage).",
|
|
5313
|
-
invertedRetGap: "\u26A0\uFE0F Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. This usually means there's a doc quality problem.",
|
|
5314
|
-
// -- Per-area score columns -----------------------------------------------
|
|
5315
|
-
score: "Ceiling composite for this feature area: Task Completion \xD7 50% + Code Correctness \xD7 25% + Doc Coverage \xD7 25%.",
|
|
5316
|
-
taskCompletion: "Can the LLM implement the requested feature? Graded 0\u2013100.",
|
|
5317
|
-
codeCorrectness: "Is the generated code idiomatic, correct, and following best practices? Graded 0\u2013100.",
|
|
5318
|
-
docCoverage: "Did the docs provide the information needed to implement the feature? Graded 0\u2013100. This dimension only contributes to the ceiling composite (with docs) \u2014 it's excluded from the floor composite because it's undefined without documentation.",
|
|
5319
|
-
tests: "Number of test cases in this feature area.",
|
|
5320
|
-
// -- Comparison deltas ----------------------------------------------------
|
|
5321
|
-
overallDelta: "Change in overall score between the two runs. Positive means the experiment scored higher.",
|
|
5322
|
-
actualDelta: "Change in actual (agent-retrieved) score between runs. Positive means agents did better.",
|
|
5323
|
-
retGapDelta: "Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.",
|
|
5324
|
-
efficiencyDelta: "Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.",
|
|
5325
|
-
// -- Comparison table columns ---------------------------------------------
|
|
5326
|
-
baseline: "The reference run you're comparing against.",
|
|
5327
|
-
experiment: "The new run you're evaluating.",
|
|
5328
|
-
delta: "Difference between experiment and baseline. Positive means improvement, negative means regression.",
|
|
5329
|
-
change: "Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).",
|
|
5330
|
-
// -- Grader judgments ------------------------------------------------------
|
|
5331
|
-
lowScoringJudgments: "The grading model's explanations for tests that scored below 70/100.",
|
|
5332
|
-
judgmentReason: "The grading model's natural language explanation of what went wrong.",
|
|
5333
|
-
// -- Diagnostics overview ---------------------------------------------------
|
|
5334
|
-
healthStrong: "Feature areas scoring 80 or above. The docs are working well for these features \u2014 AI agents produce correct, complete implementations.",
|
|
5335
|
-
healthAttention: "Feature areas scoring 70\u201379. These are okay but could be improved \u2014 there may be gaps in specific dimensions like doc coverage or code correctness.",
|
|
5336
|
-
healthWeak: "Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.",
|
|
5337
|
-
negativeDocLiftMetric: "Number of areas where the documentation actually hurts AI performance \u2014 the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.",
|
|
5338
|
-
weakAreas: "Feature areas where the overall score is below 70. These need the most attention \u2014 low scores mean AI agents consistently struggle to implement these features.",
|
|
5339
|
-
docsHurt: "Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.",
|
|
5340
|
-
retrievalIssues: "Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.",
|
|
5341
|
-
dimWeaknesses: "Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most \u2014 task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).",
|
|
5342
|
-
efficiencyAnomalies: "Areas where agent efficiency exceeds 100% \u2014 meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.",
|
|
5343
|
-
docLiftWins: "Areas where documentation boosts AI performance by 5 or more points. Higher doc lift means the docs are providing crucial information that the model doesn't already know.",
|
|
5344
|
-
retrievalExcellence: "Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.",
|
|
5345
|
-
// -- Model breakdown --------------------------------------------------------
|
|
5346
|
-
modelBreakdown: "Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently \u2014 useful for spotting models that struggle with specific feature areas.",
|
|
5347
|
-
// -- Strengths (positive diagnostics) ---------------------------------------
|
|
5348
|
-
strengths: "What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.",
|
|
5349
|
-
// -- Recommendations / gap analysis ----------------------------------------
|
|
5350
|
-
recommendations: "Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.",
|
|
5351
|
-
totalPotentialLift: "Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate \u2014 each gap targets the median of non-bottlenecked dimensions, not 100.",
|
|
5352
|
-
failureMode: "The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).",
|
|
5353
|
-
estimatedLift: "Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.",
|
|
5354
|
-
confidence: "How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.",
|
|
5355
|
-
// -- Agent behavior --------------------------------------------------------
|
|
5356
|
-
agentBehaviorOverview: "How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.",
|
|
5357
|
-
searchQueries: "The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.",
|
|
5358
|
-
docSlugsVisited: "Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.",
|
|
5359
|
-
externalDomains: "Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.",
|
|
5360
|
-
avgDocPagesVisited: "Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.",
|
|
5361
|
-
avgSearchesPerformed: "Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.",
|
|
5362
|
-
avgNetworkTimeMs: "Average time spent on network requests per test. Includes page fetches, search queries, and API calls.",
|
|
5363
|
-
totalRequests: "Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.",
|
|
5364
|
-
totalBytesDownloaded: "Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.",
|
|
5365
|
-
// -- Dimension deltas -----------------------------------------------------
|
|
5366
|
-
dimTaskCompletion: "Change in task completion between runs. Positive means implementations are more complete.",
|
|
5367
|
-
dimCodeCorrectness: "Change in code correctness between runs. Positive means better code quality.",
|
|
5368
|
-
dimDocCoverage: "Change in doc coverage between runs. Positive means the docs are providing more useful information.",
|
|
5369
|
-
// -- Per-area trend delta ----------------------------------------------------
|
|
5370
|
-
areaDelta: "Score change for this area compared to the previous evaluation run.",
|
|
5371
|
-
// -- Source values -----------------------------------------------------------
|
|
5372
|
-
sourceProduction: "Production source \u2014 docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.",
|
|
5373
|
-
sourceBranch: "Branch source \u2014 docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.",
|
|
5374
|
-
sourceLocal: "Local source \u2014 docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.",
|
|
5375
|
-
// -- Report list columns ----------------------------------------------------
|
|
5376
|
-
reportScore: "The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.",
|
|
5377
|
-
reportMode: "The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.",
|
|
5378
|
-
reportTrigger: "What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.",
|
|
5379
|
-
// -- Mode values -----------------------------------------------------------
|
|
5380
|
-
modeBaseline: "Baseline mode \u2014 tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).",
|
|
5381
|
-
modeFull: "Full mode \u2014 runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.",
|
|
5382
|
-
modeAgentic: "Agentic mode \u2014 the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?",
|
|
5383
|
-
modeObserved: "Observed mode \u2014 records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.",
|
|
5384
|
-
modeDebug: "Debug mode \u2014 a diagnostic run for pipeline development. May use non-standard configurations or limited task sets.",
|
|
5385
|
-
// -- Trigger values --------------------------------------------------------
|
|
5386
|
-
triggerManual: "Manually triggered \u2014 someone ran the evaluation pipeline by hand, either locally or via the Studio UI.",
|
|
5387
|
-
triggerCi: "CI-triggered \u2014 the evaluation ran automatically as part of a pull request or merge pipeline.",
|
|
5388
|
-
triggerSchedule: "Scheduled \u2014 the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.",
|
|
5389
|
-
triggerWebhook: "Webhook-triggered \u2014 a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.",
|
|
5390
|
-
triggerCrossRepo: "Cross-repo \u2014 triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks."
|
|
5391
|
-
};
|
|
5392
|
-
|
|
5393
5571
|
// src/components/report-table/useContainerWidth.ts
|
|
5394
5572
|
import { useCallback as useCallback13, useEffect as useEffect5, useRef as useRef4, useState as useState9 } from "react";
|
|
5395
5573
|
var FULL_MIN = 900;
|
|
@@ -5703,8 +5881,14 @@ function ReportTable({
|
|
|
5703
5881
|
),
|
|
5704
5882
|
tier !== "narrow" && /* @__PURE__ */ jsx20(ColHeader, { label: "Change" }),
|
|
5705
5883
|
tier === "full" && /* @__PURE__ */ jsxs17(Fragment6, { children: [
|
|
5706
|
-
/* @__PURE__ */ jsx20(ColHeader, { infoTip: GLOSSARY.reportMode, label: "Mode" }),
|
|
5707
|
-
/* @__PURE__ */ jsx20(
|
|
5884
|
+
/* @__PURE__ */ jsx20(ColHeader, { infoTip: GLOSSARY.reportMode.long, label: "Mode" }),
|
|
5885
|
+
/* @__PURE__ */ jsx20(
|
|
5886
|
+
ColHeader,
|
|
5887
|
+
{
|
|
5888
|
+
infoTip: GLOSSARY.reportTrigger.long,
|
|
5889
|
+
label: "Trigger"
|
|
5890
|
+
}
|
|
5891
|
+
),
|
|
5708
5892
|
/* @__PURE__ */ jsx20(
|
|
5709
5893
|
ColHeader,
|
|
5710
5894
|
{
|
|
@@ -5914,18 +6098,18 @@ var TRIGGER_TONE = {
|
|
|
5914
6098
|
webhook: "positive"
|
|
5915
6099
|
};
|
|
5916
6100
|
// Long-form glossary text keyed by evaluation mode. Glossary entries are
// `{ label, long }` objects, so the `.long` string is read explicitly.
var MODE_TIP = {
  agentic: GLOSSARY.modeAgentic.long,
  baseline: GLOSSARY.modeBaseline.long,
  debug: GLOSSARY.modeDebug.long,
  full: GLOSSARY.modeFull.long,
  observed: GLOSSARY.modeObserved.long
};
|
|
5923
6107
|
// Long-form glossary text keyed by evaluation trigger. Glossary entries are
// `{ label, long }` objects, so the `.long` string is read explicitly.
var TRIGGER_TIP = {
  ci: GLOSSARY.triggerCi.long,
  "cross-repo": GLOSSARY.triggerCrossRepo.long,
  manual: GLOSSARY.triggerManual.long,
  schedule: GLOSSARY.triggerSchedule.long,
  webhook: GLOSSARY.triggerWebhook.long
};
|
|
5930
6114
|
function ModeBadge({ mode }) {
|
|
5931
6115
|
const badge = /* @__PURE__ */ jsx20(OutlineBadge, { tone: MODE_TONE[mode] ?? "default", children: mode });
|
|
@@ -6997,14 +7181,14 @@ function OverviewGrid({ overall }) {
|
|
|
6997
7181
|
return /* @__PURE__ */ jsx25(Card10, { padding: 4, radius: 2, shadow: 1, children: /* @__PURE__ */ jsxs20(Stack16, { space: 4, children: [
|
|
6998
7182
|
/* @__PURE__ */ jsxs20(Flex11, { align: "center", gap: 3, children: [
|
|
6999
7183
|
/* @__PURE__ */ jsx25(Text21, { size: 3, weight: "semibold", children: "Agent Activity Overview" }),
|
|
7000
|
-
/* @__PURE__ */ jsx25(GlossaryTip, { text: GLOSSARY.agentBehaviorOverview })
|
|
7184
|
+
/* @__PURE__ */ jsx25(GlossaryTip, { text: GLOSSARY.agentBehaviorOverview.long })
|
|
7001
7185
|
] }),
|
|
7002
7186
|
/* @__PURE__ */ jsxs20(Grid, { columns: [1, 2, 3], gap: 3, children: [
|
|
7003
7187
|
/* @__PURE__ */ jsx25(
|
|
7004
7188
|
StatCard,
|
|
7005
7189
|
{
|
|
7006
7190
|
label: "Avg Pages Visited",
|
|
7007
|
-
tooltip: GLOSSARY.avgDocPagesVisited,
|
|
7191
|
+
tooltip: GLOSSARY.avgDocPagesVisited.long,
|
|
7008
7192
|
value: overall.avgDocPagesVisited.toFixed(1)
|
|
7009
7193
|
}
|
|
7010
7194
|
),
|
|
@@ -7012,7 +7196,7 @@ function OverviewGrid({ overall }) {
|
|
|
7012
7196
|
StatCard,
|
|
7013
7197
|
{
|
|
7014
7198
|
label: "Avg Searches",
|
|
7015
|
-
tooltip: GLOSSARY.avgSearchesPerformed,
|
|
7199
|
+
tooltip: GLOSSARY.avgSearchesPerformed.long,
|
|
7016
7200
|
value: overall.avgSearchesPerformed.toFixed(1)
|
|
7017
7201
|
}
|
|
7018
7202
|
),
|
|
@@ -7020,7 +7204,7 @@ function OverviewGrid({ overall }) {
|
|
|
7020
7204
|
StatCard,
|
|
7021
7205
|
{
|
|
7022
7206
|
label: "Avg Network Time",
|
|
7023
|
-
tooltip: GLOSSARY.avgNetworkTimeMs,
|
|
7207
|
+
tooltip: GLOSSARY.avgNetworkTimeMs.long,
|
|
7024
7208
|
value: formatDuration(overall.avgNetworkTimeMs)
|
|
7025
7209
|
}
|
|
7026
7210
|
),
|
|
@@ -7028,7 +7212,7 @@ function OverviewGrid({ overall }) {
|
|
|
7028
7212
|
StatCard,
|
|
7029
7213
|
{
|
|
7030
7214
|
label: "Unique Doc Slugs",
|
|
7031
|
-
tooltip: GLOSSARY.docSlugsVisited,
|
|
7215
|
+
tooltip: GLOSSARY.docSlugsVisited.long,
|
|
7032
7216
|
value: String(overall.totalUniqueDocSlugs)
|
|
7033
7217
|
}
|
|
7034
7218
|
),
|
|
@@ -7036,7 +7220,7 @@ function OverviewGrid({ overall }) {
|
|
|
7036
7220
|
StatCard,
|
|
7037
7221
|
{
|
|
7038
7222
|
label: "Unique Search Queries",
|
|
7039
|
-
tooltip: GLOSSARY.searchQueries,
|
|
7223
|
+
tooltip: GLOSSARY.searchQueries.long,
|
|
7040
7224
|
value: String(overall.totalUniqueSearchQueries)
|
|
7041
7225
|
}
|
|
7042
7226
|
),
|
|
@@ -7109,7 +7293,7 @@ function FeatureActivityCard({
|
|
|
7109
7293
|
items: slugs.items,
|
|
7110
7294
|
label: "Pages Visited",
|
|
7111
7295
|
tone: "primary",
|
|
7112
|
-
tooltip: GLOSSARY.docSlugsVisited,
|
|
7296
|
+
tooltip: GLOSSARY.docSlugsVisited.long,
|
|
7113
7297
|
totalCount: slugs.totalCount,
|
|
7114
7298
|
truncated: slugs.truncated
|
|
7115
7299
|
}
|
|
@@ -7120,7 +7304,7 @@ function FeatureActivityCard({
|
|
|
7120
7304
|
items: behavior.externalDomains,
|
|
7121
7305
|
label: "External Domains",
|
|
7122
7306
|
tone: "caution",
|
|
7123
|
-
tooltip: GLOSSARY.externalDomains
|
|
7307
|
+
tooltip: GLOSSARY.externalDomains.long
|
|
7124
7308
|
}
|
|
7125
7309
|
)
|
|
7126
7310
|
] }) });
|
|
@@ -7151,7 +7335,7 @@ function SearchQueryList({
|
|
|
7151
7335
|
return /* @__PURE__ */ jsxs20(Stack16, { space: 3, children: [
|
|
7152
7336
|
/* @__PURE__ */ jsxs20(Flex11, { align: "center", gap: 3, wrap: "wrap", children: [
|
|
7153
7337
|
/* @__PURE__ */ jsx25(Text21, { size: 2, weight: "semibold", children: "Search Queries" }),
|
|
7154
|
-
/* @__PURE__ */ jsx25(GlossaryTip, { text: GLOSSARY.searchQueries }),
|
|
7338
|
+
/* @__PURE__ */ jsx25(GlossaryTip, { text: GLOSSARY.searchQueries.long }),
|
|
7155
7339
|
/* @__PURE__ */ jsx25(Badge4, { tone: "default", children: totalCount }),
|
|
7156
7340
|
truncated && /* @__PURE__ */ jsxs20(Text21, { muted: true, size: 1, children: [
|
|
7157
7341
|
"(showing ",
|
|
@@ -7746,7 +7930,7 @@ function DiagnosticsOverview({
|
|
|
7746
7930
|
gridTemplateColumns: showDocMetrics ? "repeat(3, 1fr)" : "repeat(2, 1fr)"
|
|
7747
7931
|
},
|
|
7748
7932
|
children: [
|
|
7749
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.overallScore, children: /* @__PURE__ */ jsx29(
|
|
7933
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.overallScore.long, children: /* @__PURE__ */ jsx29(
|
|
7750
7934
|
ScoreCard,
|
|
7751
7935
|
{
|
|
7752
7936
|
delta: comparison?.deltas.overall,
|
|
@@ -7756,7 +7940,7 @@ function DiagnosticsOverview({
|
|
|
7756
7940
|
value: Math.round(overall.avgScore)
|
|
7757
7941
|
}
|
|
7758
7942
|
) }),
|
|
7759
|
-
showDocMetrics && /* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.docLift, children: /* @__PURE__ */ jsx29(
|
|
7943
|
+
showDocMetrics && /* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.docLift.long, children: /* @__PURE__ */ jsx29(
|
|
7760
7944
|
ScoreCard,
|
|
7761
7945
|
{
|
|
7762
7946
|
delta: comparison?.deltas.docLift,
|
|
@@ -7766,7 +7950,7 @@ function DiagnosticsOverview({
|
|
|
7766
7950
|
value: Math.round(overall.avgDocLift)
|
|
7767
7951
|
}
|
|
7768
7952
|
) }),
|
|
7769
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.tests, children: /* @__PURE__ */ jsx29(
|
|
7953
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.tests.long, children: /* @__PURE__ */ jsx29(
|
|
7770
7954
|
ScoreCard,
|
|
7771
7955
|
{
|
|
7772
7956
|
label: "TESTS",
|
|
@@ -7786,14 +7970,21 @@ function DiagnosticsOverview({
|
|
|
7786
7970
|
gridTemplateColumns: showDocMetrics ? "repeat(2, 1fr)" : "1fr"
|
|
7787
7971
|
},
|
|
7788
7972
|
children: [
|
|
7789
|
-
showDocMetrics && /* @__PURE__ */ jsx29(
|
|
7790
|
-
|
|
7973
|
+
showDocMetrics && /* @__PURE__ */ jsx29(
|
|
7974
|
+
HoverTip,
|
|
7791
7975
|
{
|
|
7792
|
-
|
|
7793
|
-
|
|
7794
|
-
|
|
7976
|
+
display: "block",
|
|
7977
|
+
text: GLOSSARY.negativeDocLiftMetric.long,
|
|
7978
|
+
children: /* @__PURE__ */ jsx29(
|
|
7979
|
+
MetricCard,
|
|
7980
|
+
{
|
|
7981
|
+
label: "Negative Doc Lift",
|
|
7982
|
+
sentiment: negativeDocLiftSentiment(negativeDocLiftCount),
|
|
7983
|
+
value: `${negativeDocLiftCount} area${negativeDocLiftCount === 1 ? "" : "s"}`
|
|
7984
|
+
}
|
|
7985
|
+
)
|
|
7795
7986
|
}
|
|
7796
|
-
)
|
|
7987
|
+
),
|
|
7797
7988
|
durationMs != null && durationMs > 0 ? /* @__PURE__ */ jsx29(
|
|
7798
7989
|
HoverTip,
|
|
7799
7990
|
{
|
|
@@ -7824,7 +8015,7 @@ function DiagnosticsOverview({
|
|
|
7824
8015
|
gridTemplateColumns: "repeat(3, 1fr)"
|
|
7825
8016
|
},
|
|
7826
8017
|
children: [
|
|
7827
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.actualScore, children: /* @__PURE__ */ jsx29(
|
|
8018
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.actualScore.long, children: /* @__PURE__ */ jsx29(
|
|
7828
8019
|
ScoreCard,
|
|
7829
8020
|
{
|
|
7830
8021
|
delta: comparison?.deltas.actualDelta,
|
|
@@ -7834,7 +8025,7 @@ function DiagnosticsOverview({
|
|
|
7834
8025
|
value: Math.round(overall.avgActualScore)
|
|
7835
8026
|
}
|
|
7836
8027
|
) }),
|
|
7837
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.retrievalGap, children: /* @__PURE__ */ jsx29(
|
|
8028
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.retrievalGap.long, children: /* @__PURE__ */ jsx29(
|
|
7838
8029
|
ScoreCard,
|
|
7839
8030
|
{
|
|
7840
8031
|
label: "RETRIEVAL GAP",
|
|
@@ -7844,7 +8035,7 @@ function DiagnosticsOverview({
|
|
|
7844
8035
|
value: overall.avgRetrievalGap != null ? Math.round(overall.avgRetrievalGap) : 0
|
|
7845
8036
|
}
|
|
7846
8037
|
) }),
|
|
7847
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency, children: /* @__PURE__ */ jsx29(
|
|
8038
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.infraEfficiency.long, children: /* @__PURE__ */ jsx29(
|
|
7848
8039
|
ScoreCard,
|
|
7849
8040
|
{
|
|
7850
8041
|
label: "EFFICIENCY",
|
|
@@ -7869,7 +8060,7 @@ function DiagnosticsOverview({
|
|
|
7869
8060
|
gridTemplateColumns: "1fr 1fr 1fr"
|
|
7870
8061
|
},
|
|
7871
8062
|
children: [
|
|
7872
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthStrong, children: /* @__PURE__ */ jsx29(
|
|
8063
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthStrong.long, children: /* @__PURE__ */ jsx29(
|
|
7873
8064
|
HealthCard,
|
|
7874
8065
|
{
|
|
7875
8066
|
color: strong.length > 0 ? "emerald" : "muted",
|
|
@@ -7878,7 +8069,7 @@ function DiagnosticsOverview({
|
|
|
7878
8069
|
label: "Strong (80+)"
|
|
7879
8070
|
}
|
|
7880
8071
|
) }),
|
|
7881
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthAttention, children: /* @__PURE__ */ jsx29(
|
|
8072
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthAttention.long, children: /* @__PURE__ */ jsx29(
|
|
7882
8073
|
HealthCard,
|
|
7883
8074
|
{
|
|
7884
8075
|
color: attention.length === 0 ? "muted" : "amber",
|
|
@@ -7887,7 +8078,7 @@ function DiagnosticsOverview({
|
|
|
7887
8078
|
label: "Attention (70-79)"
|
|
7888
8079
|
}
|
|
7889
8080
|
) }),
|
|
7890
|
-
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthWeak, children: /* @__PURE__ */ jsx29(
|
|
8081
|
+
/* @__PURE__ */ jsx29(HoverTip, { display: "block", text: GLOSSARY.healthWeak.long, children: /* @__PURE__ */ jsx29(
|
|
7891
8082
|
HealthCard,
|
|
7892
8083
|
{
|
|
7893
8084
|
color: weak.length === 0 ? "muted" : "red",
|
|
@@ -9408,7 +9599,7 @@ function JudgmentList({
|
|
|
9408
9599
|
/* @__PURE__ */ jsx36(
|
|
9409
9600
|
Tooltip8,
|
|
9410
9601
|
{
|
|
9411
|
-
content: /* @__PURE__ */ jsx36(Box22, { padding: 2, style: { maxWidth: 260 }, children: /* @__PURE__ */ jsx36(Text30, { size: 2, children: GLOSSARY.lowScoringJudgments }) }),
|
|
9602
|
+
content: /* @__PURE__ */ jsx36(Box22, { padding: 2, style: { maxWidth: 260 }, children: /* @__PURE__ */ jsx36(Text30, { size: 2, children: GLOSSARY.lowScoringJudgments.long }) }),
|
|
9412
9603
|
placement: "bottom",
|
|
9413
9604
|
portal: true,
|
|
9414
9605
|
children: /* @__PURE__ */ jsx36(Text30, { muted: true, size: 1, children: /* @__PURE__ */ jsx36(HelpCircleIcon6, {}) })
|
|
@@ -10493,7 +10684,7 @@ var fmLabel = {
|
|
|
10493
10684
|
};
|
|
10494
10685
|
function failureModeTip(mode) {
|
|
10495
10686
|
const desc = FAILURE_MODE_DESCS[mode];
|
|
10496
|
-
if (!desc) return GLOSSARY.failureMode;
|
|
10687
|
+
if (!desc) return GLOSSARY.failureMode.long;
|
|
10497
10688
|
return /* @__PURE__ */ jsxs31(Text33, { size: 2, style: { lineHeight: 1.5 }, children: [
|
|
10498
10689
|
/* @__PURE__ */ jsx40("span", { style: fmLabel, children: mode }),
|
|
10499
10690
|
" \u2014 ",
|
|
@@ -10502,7 +10693,7 @@ function failureModeTip(mode) {
|
|
|
10502
10693
|
}
|
|
10503
10694
|
function confidenceTip(level) {
|
|
10504
10695
|
const desc = CONFIDENCE_DESCS[level];
|
|
10505
|
-
if (!desc) return GLOSSARY.confidence;
|
|
10696
|
+
if (!desc) return GLOSSARY.confidence.long;
|
|
10506
10697
|
const icons = { high: "\u{1F7E2}", medium: "\u{1F7E1}", low: "\u{1F534}" };
|
|
10507
10698
|
return /* @__PURE__ */ jsxs31(Text33, { size: 2, style: { lineHeight: 1.5 }, children: [
|
|
10508
10699
|
icons[level],
|
|
@@ -11126,27 +11317,42 @@ function dateStamp2() {
|
|
|
11126
11317
|
const d = /* @__PURE__ */ new Date();
|
|
11127
11318
|
return `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, "0")}-${String(d.getDate()).padStart(2, "0")}`;
|
|
11128
11319
|
}
|
|
11320
|
+
/**
 * Builds the pipeline request payload for re-running an evaluation.
 *
 * `publish`, `source` and `taskMode` are fixed to `true`, `"production"` and
 * `"content-lake"`. `perspective` is only included when `scope.perspective`
 * is truthy, and `tasks` only when `scope.taskIds` is a non-empty list.
 *
 * @param {object} args
 * @param {object} args.scope - Reads `areas`, `dataset`, `mode`,
 *   `perspective`, `projectId` and `taskIds`.
 * @param {string} args.sourceReportId - Report this rerun originates from.
 * @param {string} args.tag - Emitted as `publishTag`.
 * @returns {object} The pipeline request object.
 */
function buildRerunPipelineRequest(args) {
  const { scope, sourceReportId, tag } = args;
  // Key order is kept as-is: the caller serialises this object with
  // JSON.stringify, so reordering would change the stored string.
  return {
    areas: scope.areas,
    dataset: scope.dataset,
    mode: scope.mode,
    ...(scope.perspective ? { perspective: scope.perspective } : {}),
    projectId: scope.projectId,
    publish: true,
    publishTag: tag,
    source: "production",
    sourceReportId,
    taskMode: "content-lake",
    ...(scope.taskIds && scope.taskIds.length > 0 ? { tasks: scope.taskIds } : {})
  };
}
|
|
11129
11336
|
/**
 * Builds the evaluation-request document for re-running a report.
 *
 * The pipeline parameters are produced by `buildRerunPipelineRequest` and
 * stored as a JSON string in `pipelineRequest`; the document itself only
 * carries routing fields and request metadata.
 *
 * @param {object} scope - Evaluation scope. `dataset` and `projectId` are read
 *   here; the whole object is also passed to `buildRerunPipelineRequest`.
 * @param {object} meta - Reads `reportId` (source report) and `userId`
 *   (requester).
 * @returns {object} A document of type `EVAL_REQUEST_TYPE2` with status
 *   `"pending"`.
 */
function buildRequestDoc(scope, meta) {
  // Tag format: rerun-<slugified first 12 chars of the report id>-<date stamp>.
  const shortId = slugify2(meta.reportId.slice(0, 12));
  const tag = `rerun-${shortId}-${dateStamp2()}`;
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const pipelineRequest = buildRerunPipelineRequest({
    scope,
    sourceReportId: meta.reportId,
    tag
  });
  return {
    _type: EVAL_REQUEST_TYPE2,
    dataset: scope.dataset,
    pipelineRequest: JSON.stringify(pipelineRequest),
    projectId: scope.projectId,
    requestedAt: now,
    requestedBy: meta.userId,
    // Lineage — duplicated on the document for projection convenience.
    // Authoritative copy is on pipelineRequest.sourceReportId.
    sourceReportId: meta.reportId,
    status: "pending"
  };
}
|
|
11152
11358
|
function RerunEvaluationAction({
|
|
@@ -11495,9 +11701,9 @@ import { Box as Box28, Flex as Flex29, Stack as Stack31, Text as Text40 } from "
|
|
|
11495
11701
|
var DIMENSION_TOOLTIPS = {
|
|
11496
11702
|
agentOutput: "Quality and completeness of the agent's output. Graded 0\u2013100.",
|
|
11497
11703
|
assertionPassRate: "Fraction of structural assertions that passed. Graded 0\u2013100.",
|
|
11498
|
-
codeCorrectness: GLOSSARY.codeCorrectness,
|
|
11499
|
-
docCoverage: GLOSSARY.docCoverage,
|
|
11500
|
-
taskCompletion: GLOSSARY.taskCompletion,
|
|
11704
|
+
codeCorrectness: GLOSSARY.codeCorrectness.long,
|
|
11705
|
+
docCoverage: GLOSSARY.docCoverage.long,
|
|
11706
|
+
taskCompletion: GLOSSARY.taskCompletion.long,
|
|
11501
11707
|
toolUsage: "How effectively the agent used available tools (file read/write, shell, etc.). Graded 0\u2013100."
|
|
11502
11708
|
};
|
|
11503
11709
|
function dimensionLabel2(key) {
|
|
@@ -11595,9 +11801,9 @@ import { Fragment as Fragment14, jsx as jsx56, jsxs as jsxs39 } from "react/jsx-
|
|
|
11595
11801
|
var DIMENSION_TOOLTIPS2 = {
|
|
11596
11802
|
agentOutput: "Quality and completeness of the agent's output. Graded 0\u2013100.",
|
|
11597
11803
|
assertionPassRate: "Fraction of structural assertions that passed. Graded 0\u2013100.",
|
|
11598
|
-
codeCorrectness: GLOSSARY.codeCorrectness,
|
|
11599
|
-
docCoverage: GLOSSARY.docCoverage,
|
|
11600
|
-
taskCompletion: GLOSSARY.taskCompletion,
|
|
11804
|
+
codeCorrectness: GLOSSARY.codeCorrectness.long,
|
|
11805
|
+
docCoverage: GLOSSARY.docCoverage.long,
|
|
11806
|
+
taskCompletion: GLOSSARY.taskCompletion.long,
|
|
11601
11807
|
toolUsage: "How effectively the agent used available tools (file read/write, shell, etc.). Graded 0\u2013100."
|
|
11602
11808
|
};
|
|
11603
11809
|
function tableTier(width) {
|
|
@@ -11707,7 +11913,7 @@ function AreaScoresGrid({
|
|
|
11707
11913
|
direction: sortDir,
|
|
11708
11914
|
label: "Score",
|
|
11709
11915
|
onClick: () => handleSort("score"),
|
|
11710
|
-
tooltip: GLOSSARY.score
|
|
11916
|
+
tooltip: GLOSSARY.score.long
|
|
11711
11917
|
}
|
|
11712
11918
|
),
|
|
11713
11919
|
/* @__PURE__ */ jsx56(
|
|
@@ -11737,10 +11943,10 @@ function AreaScoresGrid({
|
|
|
11737
11943
|
direction: sortDir,
|
|
11738
11944
|
label: "Lift",
|
|
11739
11945
|
onClick: () => handleSort("lift"),
|
|
11740
|
-
tooltip: GLOSSARY.docLift
|
|
11946
|
+
tooltip: GLOSSARY.docLift.long
|
|
11741
11947
|
}
|
|
11742
11948
|
),
|
|
11743
|
-
tier === "full" && hasActual && /* @__PURE__ */ jsx56(ColHeader2, { label: "Actual", tooltip: GLOSSARY.actualScore })
|
|
11949
|
+
tier === "full" && hasActual && /* @__PURE__ */ jsx56(ColHeader2, { label: "Actual", tooltip: GLOSSARY.actualScore.long })
|
|
11744
11950
|
]
|
|
11745
11951
|
}
|
|
11746
11952
|
),
|
|
@@ -11919,7 +12125,7 @@ function AreaRow({
|
|
|
11919
12125
|
/* @__PURE__ */ jsx56("span", { style: { color: "var(--card-muted-fg-color)" }, children: "/100" }),
|
|
11920
12126
|
".",
|
|
11921
12127
|
" ",
|
|
11922
|
-
GLOSSARY.score
|
|
12128
|
+
GLOSSARY.score.long
|
|
11923
12129
|
] }),
|
|
11924
12130
|
children: /* @__PURE__ */ jsx56(
|
|
11925
12131
|
"div",
|
|
@@ -11942,11 +12148,11 @@ function AreaRow({
|
|
|
11942
12148
|
)
|
|
11943
12149
|
}
|
|
11944
12150
|
),
|
|
11945
|
-
!isNarrow && delta != null && delta !== 0 && /* @__PURE__ */ jsx56(HoverTip, { text: GLOSSARY.areaDelta, children: /* @__PURE__ */ jsx56(DeltaIndicator, { delta, icon: true, size: 1 }) })
|
|
12151
|
+
!isNarrow && delta != null && delta !== 0 && /* @__PURE__ */ jsx56(HoverTip, { text: GLOSSARY.areaDelta.long, children: /* @__PURE__ */ jsx56(DeltaIndicator, { delta, icon: true, size: 1 }) })
|
|
11946
12152
|
] }),
|
|
11947
12153
|
/* @__PURE__ */ jsxs39(Flex27, { align: "center", gap: 2, wrap: "wrap", children: [
|
|
11948
12154
|
/* @__PURE__ */ jsx56(Text38, { size: 2, weight: "medium", children: area.feature }),
|
|
11949
|
-
area.negativeDocLift && showLift && /* @__PURE__ */ jsx56(HoverTip, { text: GLOSSARY.docsHurt, children: /* @__PURE__ */ jsx56(
|
|
12155
|
+
area.negativeDocLift && showLift && /* @__PURE__ */ jsx56(HoverTip, { text: GLOSSARY.docsHurt.long, children: /* @__PURE__ */ jsx56(
|
|
11950
12156
|
"span",
|
|
11951
12157
|
{
|
|
11952
12158
|
style: {
|
|
@@ -11995,7 +12201,7 @@ function AreaRow({
|
|
|
11995
12201
|
),
|
|
11996
12202
|
" ",
|
|
11997
12203
|
"pts. ",
|
|
11998
|
-
GLOSSARY.docLift
|
|
12204
|
+
GLOSSARY.docLift.long
|
|
11999
12205
|
] }),
|
|
12000
12206
|
children: /* @__PURE__ */ jsxs39(
|
|
12001
12207
|
Text38,
|
|
@@ -12017,7 +12223,7 @@ function AreaRow({
|
|
|
12017
12223
|
tier === "full" && hasActual && /* @__PURE__ */ jsx56(
|
|
12018
12224
|
HoverTip,
|
|
12019
12225
|
{
|
|
12020
|
-
text: area.actualScore != null ? `${area.feature} actual score: ${Math.round(area.actualScore)}/100. ${GLOSSARY.actualScore}` : `No agentic data for ${area.feature}.`,
|
|
12226
|
+
text: area.actualScore != null ? `${area.feature} actual score: ${Math.round(area.actualScore)}/100. ${GLOSSARY.actualScore.long}` : `No agentic data for ${area.feature}.`,
|
|
12021
12227
|
children: /* @__PURE__ */ jsx56(
|
|
12022
12228
|
Text38,
|
|
12023
12229
|
{
|
|
@@ -12334,7 +12540,7 @@ function StrengthsList({
|
|
|
12334
12540
|
/* @__PURE__ */ jsxs41(Flex29, { align: "center", gap: 2, wrap: "wrap", children: [
|
|
12335
12541
|
/* @__PURE__ */ jsx58(CheckmarkCircleIcon2, { style: { color: "#34d399" } }),
|
|
12336
12542
|
/* @__PURE__ */ jsx58(Text40, { size: 2, weight: "medium", children: "Strong Areas (70+)" }),
|
|
12337
|
-
/* @__PURE__ */ jsx58(InfoTip, { text: GLOSSARY.strengths }),
|
|
12543
|
+
/* @__PURE__ */ jsx58(InfoTip, { text: GLOSSARY.strengths.long }),
|
|
12338
12544
|
hasModels && /* @__PURE__ */ jsx58(Box28, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx58(
|
|
12339
12545
|
ModelSelector,
|
|
12340
12546
|
{
|
|
@@ -12367,7 +12573,7 @@ function StrengthsList({
|
|
|
12367
12573
|
Math.round(EFFICIENCY_POSITIVE * 100),
|
|
12368
12574
|
"%+ efficiency)"
|
|
12369
12575
|
] }),
|
|
12370
|
-
/* @__PURE__ */ jsx58(InfoTip, { text: GLOSSARY.retrievalExcellence })
|
|
12576
|
+
/* @__PURE__ */ jsx58(InfoTip, { text: GLOSSARY.retrievalExcellence.long })
|
|
12371
12577
|
] })
|
|
12372
12578
|
}
|
|
12373
12579
|
),
|
|
@@ -12492,7 +12698,7 @@ function WeaknessesList({
|
|
|
12492
12698
|
/* @__PURE__ */ jsxs43(Flex31, { align: "center", gap: 2, wrap: "wrap", children: [
|
|
12493
12699
|
/* @__PURE__ */ jsx60(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
|
|
12494
12700
|
/* @__PURE__ */ jsx60(Text42, { size: 2, weight: "medium", children: "Weak Areas (<70)" }),
|
|
12495
|
-
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.weakAreas }),
|
|
12701
|
+
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.weakAreas.long }),
|
|
12496
12702
|
hasModels && /* @__PURE__ */ jsx60(Box30, { style: { marginLeft: "auto" }, children: /* @__PURE__ */ jsx60(
|
|
12497
12703
|
ModelSelector,
|
|
12498
12704
|
{
|
|
@@ -12516,7 +12722,7 @@ function WeaknessesList({
|
|
|
12516
12722
|
/* @__PURE__ */ jsxs43(Flex31, { align: "center", gap: 2, children: [
|
|
12517
12723
|
/* @__PURE__ */ jsx60(ErrorOutlineIcon3, { style: { color: "#f87171" } }),
|
|
12518
12724
|
/* @__PURE__ */ jsx60(Text42, { size: 2, weight: "medium", children: "Docs Hurt Performance (Negative Doc Lift)" }),
|
|
12519
|
-
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.docsHurt })
|
|
12725
|
+
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.docsHurt.long })
|
|
12520
12726
|
] }),
|
|
12521
12727
|
/* @__PURE__ */ jsx60(Box30, { style: sectionStyle("red"), children: docsHurt.map((area, i) => /* @__PURE__ */ jsxs43(
|
|
12522
12728
|
Box30,
|
|
@@ -12575,7 +12781,7 @@ function WeaknessesList({
|
|
|
12575
12781
|
/* @__PURE__ */ jsxs43(Flex31, { align: "center", gap: 2, children: [
|
|
12576
12782
|
/* @__PURE__ */ jsx60(SearchIcon9, { style: { color: "#fbbf24" } }),
|
|
12577
12783
|
/* @__PURE__ */ jsx60(Text42, { size: 2, weight: "medium", children: "Retrieval Issues (<70% efficiency)" }),
|
|
12578
|
-
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.retrievalIssues })
|
|
12784
|
+
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.retrievalIssues.long })
|
|
12579
12785
|
] }),
|
|
12580
12786
|
/* @__PURE__ */ jsx60(Box30, { style: sectionStyle("amber"), children: retrievalIssues.map((area, i) => /* @__PURE__ */ jsxs43(
|
|
12581
12787
|
Box30,
|
|
@@ -12633,7 +12839,7 @@ function WeaknessesList({
|
|
|
12633
12839
|
/* @__PURE__ */ jsxs43(Flex31, { align: "center", gap: 2, children: [
|
|
12634
12840
|
/* @__PURE__ */ jsx60(WarningOutlineIcon3, { style: { color: "#fbbf24" } }),
|
|
12635
12841
|
/* @__PURE__ */ jsx60(Text42, { size: 2, weight: "medium", children: "Dimension Weaknesses (<50)" }),
|
|
12636
|
-
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.dimWeaknesses })
|
|
12842
|
+
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.dimWeaknesses.long })
|
|
12637
12843
|
] }),
|
|
12638
12844
|
/* @__PURE__ */ jsx60(Box30, { style: neutralCardStyle, children: dimWeaknesses.map(({ area, dims }, i) => /* @__PURE__ */ jsxs43(
|
|
12639
12845
|
Box30,
|
|
@@ -12740,7 +12946,7 @@ function WeaknessesList({
|
|
|
12740
12946
|
/* @__PURE__ */ jsxs43(Flex31, { align: "center", gap: 2, children: [
|
|
12741
12947
|
/* @__PURE__ */ jsx60(BoltIcon2, { style: { color: "#fbbf24" } }),
|
|
12742
12948
|
/* @__PURE__ */ jsx60(Text42, { size: 2, weight: "medium", children: "Efficiency Anomalies (>100%)" }),
|
|
12743
|
-
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.efficiencyAnomalies })
|
|
12949
|
+
/* @__PURE__ */ jsx60(InfoTip, { text: GLOSSARY.efficiencyAnomalies.long })
|
|
12744
12950
|
] }),
|
|
12745
12951
|
/* @__PURE__ */ jsx60(Text42, { muted: true, size: 2, children: "Agent outperforms injected docs \u2014 may indicate doc quality issues or agent memorization." })
|
|
12746
12952
|
] })
|
|
@@ -12950,11 +13156,17 @@ function ReportDetail({
|
|
|
12950
13156
|
/* @__PURE__ */ jsx61(
|
|
12951
13157
|
HoverTip,
|
|
12952
13158
|
{
|
|
12953
|
-
text: SOURCE_TIP[provenance.source.name] ?? GLOSSARY.reportMode,
|
|
13159
|
+
text: SOURCE_TIP[provenance.source.name] ?? GLOSSARY.reportMode.long,
|
|
12954
13160
|
children: /* @__PURE__ */ jsx61(Badge9, { mode: "outline", tone: "default", children: provenance.source.name })
|
|
12955
13161
|
}
|
|
12956
13162
|
),
|
|
12957
|
-
/* @__PURE__ */ jsx61(
|
|
13163
|
+
/* @__PURE__ */ jsx61(
|
|
13164
|
+
HoverTip,
|
|
13165
|
+
{
|
|
13166
|
+
text: MODE_TIP2[provenance.mode] ?? GLOSSARY.reportMode.long,
|
|
13167
|
+
children: /* @__PURE__ */ jsx61(Badge9, { tone: "primary", children: provenance.mode })
|
|
13168
|
+
}
|
|
13169
|
+
),
|
|
12958
13170
|
/* @__PURE__ */ jsx61(
|
|
12959
13171
|
ReportActions,
|
|
12960
13172
|
{
|
|
@@ -13180,16 +13392,16 @@ function DiagnosticsPanel({
|
|
|
13180
13392
|
] }) });
|
|
13181
13393
|
}
|
|
13182
13394
|
var MODE_TIP2 = {
|
|
13183
|
-
agentic: GLOSSARY.modeAgentic,
|
|
13184
|
-
baseline: GLOSSARY.modeBaseline,
|
|
13185
|
-
debug: GLOSSARY.modeDebug,
|
|
13186
|
-
full: GLOSSARY.modeFull,
|
|
13187
|
-
observed: GLOSSARY.modeObserved
|
|
13395
|
+
agentic: GLOSSARY.modeAgentic.long,
|
|
13396
|
+
baseline: GLOSSARY.modeBaseline.long,
|
|
13397
|
+
debug: GLOSSARY.modeDebug.long,
|
|
13398
|
+
full: GLOSSARY.modeFull.long,
|
|
13399
|
+
observed: GLOSSARY.modeObserved.long
|
|
13188
13400
|
};
|
|
13189
13401
|
var SOURCE_TIP = {
|
|
13190
|
-
branch: GLOSSARY.sourceBranch,
|
|
13191
|
-
local: GLOSSARY.sourceLocal,
|
|
13192
|
-
production: GLOSSARY.sourceProduction
|
|
13402
|
+
branch: GLOSSARY.sourceBranch.long,
|
|
13403
|
+
local: GLOSSARY.sourceLocal.long,
|
|
13404
|
+
production: GLOSSARY.sourceProduction.long
|
|
13193
13405
|
};
|
|
13194
13406
|
var inlineCodeStyle = {
|
|
13195
13407
|
background: "var(--card-code-bg-color, rgba(255,255,255,0.06))",
|
|
@@ -15159,6 +15371,20 @@ import {
|
|
|
15159
15371
|
useDataset as useDataset2,
|
|
15160
15372
|
useProjectId as useProjectId2
|
|
15161
15373
|
} from "sanity";
|
|
15374
|
+
function buildReleaseEvalPipelineRequest(args) {
|
|
15375
|
+
const { mode, perspectiveId, projectId, sourceDataset, tag } = args;
|
|
15376
|
+
return {
|
|
15377
|
+
dataset: sourceDataset,
|
|
15378
|
+
mode: "literacy",
|
|
15379
|
+
perspective: perspectiveId,
|
|
15380
|
+
...projectId ? { projectId } : {},
|
|
15381
|
+
publish: true,
|
|
15382
|
+
publishTag: tag,
|
|
15383
|
+
source: "production",
|
|
15384
|
+
taskMode: "content-lake",
|
|
15385
|
+
variant: mode
|
|
15386
|
+
};
|
|
15387
|
+
}
|
|
15162
15388
|
var API_VERSION2 = "2026-03-11";
|
|
15163
15389
|
var EVAL_REQUEST_TYPE3 = "ailf.evalRequest";
|
|
15164
15390
|
var POLL_INTERVAL_MS2 = 3e4;
|
|
@@ -15286,17 +15512,22 @@ function createRunEvaluationAction(options = {}) {
|
|
|
15286
15512
|
const releaseTitle = release.metadata?.title ?? perspectiveId ?? "release";
|
|
15287
15513
|
const tag = `release-${slugify3(releaseTitle)}-${dateStamp3()}`;
|
|
15288
15514
|
const now = Date.now();
|
|
15515
|
+
const pipelineRequest = buildReleaseEvalPipelineRequest({
|
|
15516
|
+
mode,
|
|
15517
|
+
perspectiveId,
|
|
15518
|
+
projectId,
|
|
15519
|
+
sourceDataset,
|
|
15520
|
+
tag
|
|
15521
|
+
});
|
|
15289
15522
|
try {
|
|
15290
15523
|
const doc = await ailfClient.create({
|
|
15291
15524
|
_type: EVAL_REQUEST_TYPE3,
|
|
15292
15525
|
dataset: sourceDataset,
|
|
15293
|
-
|
|
15294
|
-
perspective: perspectiveId,
|
|
15526
|
+
pipelineRequest: JSON.stringify(pipelineRequest),
|
|
15295
15527
|
projectId,
|
|
15296
15528
|
requestedAt: new Date(now).toISOString(),
|
|
15297
15529
|
requestedBy: currentUser?.id ?? "unknown",
|
|
15298
|
-
status: "pending"
|
|
15299
|
-
tag
|
|
15530
|
+
status: "pending"
|
|
15300
15531
|
});
|
|
15301
15532
|
requestedAtRef.current = now;
|
|
15302
15533
|
setState({ requestId: doc._id, startedAt: now, status: "requested" });
|
|
@@ -15423,7 +15654,6 @@ export {
|
|
|
15423
15654
|
ArchiveTaskAction,
|
|
15424
15655
|
AssertionInput,
|
|
15425
15656
|
CanonicalDocInput,
|
|
15426
|
-
GLOSSARY,
|
|
15427
15657
|
GraduateToNativeAction,
|
|
15428
15658
|
HelpDrawer,
|
|
15429
15659
|
HelpProvider,
|