@sanity/ailf 4.0.7 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/bin/ailf.js +6 -1
  2. package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
  3. package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
  4. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  5. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
  7. package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
  8. package/dist/_vendor/ailf-core/schemas/report.js +235 -0
  9. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  10. package/dist/_vendor/ailf-core/services/index.js +1 -0
  11. package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
  12. package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
  13. package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
  14. package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
  15. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +20 -3
  16. package/dist/_vendor/ailf-core/types/index.d.ts +4 -1
  17. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
  18. package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
  19. package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
  20. package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
  21. package/dist/_vendor/ailf-shared/index.d.ts +7 -5
  22. package/dist/_vendor/ailf-shared/index.js +7 -5
  23. package/dist/adapters/api-client/types.d.ts +2 -5
  24. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +21 -5
  25. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +129 -25
  26. package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
  27. package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
  28. package/dist/adapters/task-sources/index.d.ts +1 -1
  29. package/dist/adapters/task-sources/index.js +1 -1
  30. package/dist/adapters/task-sources/repo-schemas.d.ts +19 -2
  31. package/dist/adapters/task-sources/repo-schemas.js +81 -2
  32. package/dist/adapters/task-sources/repo-task-source.js +11 -2
  33. package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
  34. package/dist/adapters/task-sources/repo-validation.js +1 -1
  35. package/dist/agent-observer/agentic-provider.d.ts +1 -0
  36. package/dist/agent-observer/agentic-provider.js +43 -36
  37. package/dist/agent-observer/config-schemas.d.ts +61 -0
  38. package/dist/agent-observer/config-schemas.js +65 -0
  39. package/dist/agent-observer/provider.d.ts +1 -0
  40. package/dist/agent-observer/provider.js +19 -17
  41. package/dist/cli.js +4 -4
  42. package/dist/commands/validate-tasks.js +10 -4
  43. package/dist/composition-root.js +4 -2
  44. package/dist/index.d.ts +1 -1
  45. package/dist/index.js +1 -1
  46. package/dist/job-store.js +2 -2
  47. package/dist/lib/dotenv-resolution.d.ts +21 -0
  48. package/dist/lib/dotenv-resolution.js +30 -0
  49. package/dist/orchestration/steps/mirror-repo-tasks-step.js +14 -3
  50. package/dist/orchestration/steps/run-eval-step.js +21 -3
  51. package/dist/pipeline/agent-behavior-report.d.ts +2 -8
  52. package/dist/pipeline/cache.d.ts +2 -2
  53. package/dist/pipeline/checks.d.ts +10 -2
  54. package/dist/pipeline/checks.js +14 -4
  55. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  56. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +0 -12
  57. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +0 -12
  58. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
  59. package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
  60. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
  61. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +44 -5
  62. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  63. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
  64. package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
  65. package/dist/pipeline/compiler/provider-assembler.js +33 -3
  66. package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
  67. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
  68. package/dist/pipeline/mirror-repo-tasks.js +16 -8
  69. package/dist/pipeline/pr-comment.d.ts +22 -9
  70. package/dist/pipeline/pr-comment.js +52 -472
  71. package/dist/pipeline/resolve-mappings.d.ts +8 -3
  72. package/dist/promptfoo-providers/mock-path.d.ts +12 -0
  73. package/dist/promptfoo-providers/mock-path.js +15 -0
  74. package/dist/report-store.d.ts +63 -1
  75. package/dist/report-store.js +111 -31
  76. package/dist/sanity/client.d.ts +58 -0
  77. package/dist/sanity/client.js +106 -0
  78. package/dist/sanity/document-renderers.d.ts +68 -0
  79. package/dist/sanity/document-renderers.js +221 -0
  80. package/dist/sanity/queries.d.ts +21 -0
  81. package/dist/sanity/queries.js +71 -0
  82. package/dist/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  83. package/dist/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  84. package/dist/tasks/literacy/content-lake.task.ts +4 -10
  85. package/dist/tasks/literacy/frameworks.task.ts +2 -8
  86. package/dist/tasks/literacy/functions.task.ts +1 -4
  87. package/dist/tasks/literacy/groq.task.ts +3 -12
  88. package/dist/tasks/literacy/image-handling.task.ts +1 -4
  89. package/dist/tasks/literacy/nextjs-live.task.ts +1 -4
  90. package/dist/tasks/literacy/portable-text.task.ts +2 -8
  91. package/dist/tasks/literacy/studio-setup.task.ts +2 -8
  92. package/dist/tasks/literacy/visual-editing.task.ts +2 -8
  93. package/package.json +8 -7
  94. package/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  95. package/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  96. package/tasks/literacy/content-lake.task.ts +4 -10
  97. package/tasks/literacy/frameworks.task.ts +2 -8
  98. package/tasks/literacy/functions.task.ts +1 -4
  99. package/tasks/literacy/groq.task.ts +3 -12
  100. package/tasks/literacy/image-handling.task.ts +1 -4
  101. package/tasks/literacy/nextjs-live.task.ts +1 -4
  102. package/tasks/literacy/portable-text.task.ts +2 -8
  103. package/tasks/literacy/studio-setup.task.ts +2 -8
  104. package/tasks/literacy/visual-editing.task.ts +2 -8
@@ -0,0 +1,696 @@
1
+ /**
2
+ * report-to-markdown.ts — Canonical PR-comment markdown renderer (W0150).
3
+ *
4
+ * Single source of truth for rendering an AILF report as PR-comment
5
+ * markdown. Used by:
6
+ * - the API gateway (`/v1/reports/:id/markdown`)
7
+ * - the eval pipeline (`ailf pr-comment` CLI / `pipeline/pr-comment.ts`)
8
+ *
9
+ * Operates on a structurally lenient `RenderableReport` shape so callers
10
+ * can pass either the persisted slim Report (Sanity doc) or an in-memory
11
+ * envelope built from `score-summary.json` + `comparison-report.json`.
12
+ *
13
+ * Canonical formatting decisions (W0150):
14
+ * - Header: level-1 (`# {emoji} AI Literacy Score Report`).
15
+ * - Footer: markdown link form `[view detailed results](url)`. The link
16
+ * URL comes from `provenance.promptfooUrls[0].url`.
17
+ * - Source verification block (sourceVerification + sourceIsolation) is
18
+ * rendered when present on the summary — preserves info from agentic
19
+ * / sandboxed local-mode runs without breaking remote-mode reports
20
+ * that don't carry those fields.
21
+ */
22
+ // ---------------------------------------------------------------------------
23
+ // Public API
24
+ // ---------------------------------------------------------------------------
25
/**
 * Render an AILF report as PR-comment markdown.
 *
 * Sections are emitted in a fixed order: header, subtitle, metadata table,
 * below-critical warning, environment, source verification, score table,
 * decomposition, per-model scores, comparison, cost breakdown, gap
 * recommendations, low-scoring judgments, recommendations and footer.
 * Each section renderer decides for itself whether it has data to show.
 *
 * @param {object} report - Report envelope. `summary`, `comparison` and
 *   `provenance` are read from it and may each be absent.
 * @returns {string} The markdown document, terminated by a newline.
 */
export function reportToMarkdown(report) {
    const md = new MarkdownBuilder();
    const summary = report.summary;
    const comparison = report.comparison;
    const provenance = report.provenance;
    const overall = summary?.overall;
    // Header. A null metric is treated like a missing one so it is omitted
    // instead of being rendered as a misleading 0.
    const avgScore = overall?.avgScore ?? undefined;
    md.heading(1, `${scoreEmoji(avgScore)} AI Literacy Score Report`);
    md.blank();
    // Subtitle line
    const parts = [];
    if (avgScore !== undefined) {
        parts.push(`**Overall: ${round(avgScore)}/100**`);
    }
    if (overall?.avgDocLift != null) {
        parts.push(`Doc Lift: ${signedNum(round(overall.avgDocLift))}`);
    }
    if (overall?.avgActualScore != null) {
        parts.push(`Actual: ${round(overall.avgActualScore)}/100`);
    }
    if (overall?.avgRetrievalGap != null) {
        parts.push(`Ret. Gap: ${round(overall.avgRetrievalGap)}`);
    }
    const scores = normalizeScores(summary?.scores);
    const testCount = scores.reduce((total, area) => total + (area.testCount ?? 0), 0);
    if (scores.length > 0) {
        parts.push(`${testCount} tests across ${scores.length} areas`);
    }
    const totalCost = computeTotalCost(scores, overall);
    if (totalCost > 0) {
        parts.push(`Cost: ${fmtCost(totalCost)}`);
    }
    if (parts.length > 0) {
        md.line(parts.join(" · "));
        md.blank();
    }
    renderMetadata(md, report);
    const belowCritical = summary?.belowCritical ?? [];
    if (belowCritical.length > 0) {
        const areas = belowCritical.map((area) => `\`${area}\``).join(", ");
        md.line(`> ⚠️ **Below critical threshold:** ${areas}`);
        md.blank();
    }
    renderSource(md, summary?.source);
    renderSourceVerification(md, summary?.sourceVerification, summary?.sourceIsolation);
    if (scores.length > 0) {
        renderScoreTable(md, scores);
    }
    // The three-layer view needs at least one area with an "actual" score;
    // otherwise fall back to the floor/ceiling-only view.
    const hasActual = scores.some((area) => area.actualScore != null);
    if (hasActual) {
        renderThreeLayerDecomposition(md, scores);
    }
    else if (scores.length > 0) {
        renderCeilingDecomposition(md, scores);
    }
    renderPerModel(md, summary?.perModel);
    if (comparison) {
        renderComparison(md, comparison);
    }
    renderCostBreakdown(md, scores, overall);
    renderGapRecommendations(md, summary?.recommendations);
    renderLowScoringJudgments(md, summary?.lowScoringJudgments);
    renderRecommendations(md, scores, belowCritical);
    renderFooter(md, report, provenance);
    return md.toString();
}
90
+ // ---------------------------------------------------------------------------
91
+ // Section renderers
92
+ // ---------------------------------------------------------------------------
93
/**
 * Emit the "Field / Value" metadata table for a report.
 *
 * Rows are added only for fields that are present: report id (linked to the
 * Studio URL), completion time, duration, tag and evaluation mode. Nothing
 * is written when no row qualifies.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} report - Report envelope.
 */
function renderMetadata(md, report) {
    const rows = [];
    const addRow = (label, value) => {
        rows.push([label, value]);
    };
    if (report.id) {
        addRow("Report ID", `[\`${report.id}\`](${buildStudioUrl(report.id)})`);
    }
    if (report.completedAt) {
        addRow("Completed", fmtDate(report.completedAt));
    }
    if (typeof report.durationMs === "number") {
        addRow("Duration", fmtDuration(report.durationMs));
    }
    if (report.tag) {
        addRow("Tag", report.tag);
    }
    const mode = report.summary?.evaluationMode;
    if (mode) {
        addRow("Mode", mode);
    }
    if (rows.length === 0) {
        return;
    }
    md.table(["Field", "Value"], rows);
    md.blank();
}
117
/**
 * Emit the collapsible "Environment" block describing the documentation
 * source (name, docs URL, dataset, project, perspective).
 *
 * Only truthy fields produce a row; nothing is written when `source` is
 * missing or has no populated field.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [source] - Source descriptor from the report summary.
 */
function renderSource(md, source) {
    if (!source) {
        return;
    }
    const candidates = [
        ["Source", source.name],
        ["Docs URL", source.baseUrl],
        ["Dataset", source.dataset],
        ["Project", source.projectId],
        ["Perspective", source.perspective],
    ];
    const rows = candidates.filter(([, value]) => Boolean(value));
    if (rows.length === 0) {
        return;
    }
    md.details("🔧 Environment", () => {
        md.table(["Setting", "Value"], rows);
    });
    md.blank();
}
138
/**
 * Emit the collapsible "Source verification" block.
 *
 * Combines the source-verification record (source, mode, sandbox origins,
 * search mode, URL-fetch results) with the agent-isolation record
 * (on/off-origin fetch counts). Nothing is written when both are absent.
 * List fields that are missing are treated as empty instead of throwing.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [verification] - `summary.sourceVerification`.
 * @param {object} [isolation] - `summary.sourceIsolation`.
 */
function renderSourceVerification(md, verification, isolation) {
    if (!verification && !isolation) {
        return;
    }
    // NOTE(review): one byte of this icon was lost in the garbled source;
    // the magnifying glass is assumed — confirm against the upstream file.
    md.details("🔍 Source verification", () => {
        const rows = [];
        if (verification) {
            rows.push(["Source", verification.source ?? "—"]);
            rows.push(["Mode", verification.mode ?? "—"]);
            if (verification.allowedOrigins) {
                rows.push(["Sandbox", verification.allowedOrigins.join(", ")]);
            }
            if (verification.searchMode) {
                rows.push(["Search", verification.searchMode]);
            }
        }
        if (isolation) {
            const pct = Math.round(isolation.isolationScore * 100);
            const icon = isolation.offOrigin === 0 ? "✅" : "⚠️";
            rows.push([
                "Agent isolation",
                `${icon} ${pct}% (${isolation.onOrigin}/${isolation.total} on-origin)`,
            ]);
            if (isolation.offOrigin > 0) {
                // Only the first five URLs are listed to keep the row short.
                const offOriginUrls = isolation.offOriginUrls ?? [];
                rows.push(["Off-origin fetches", offOriginUrls.slice(0, 5).join(", ")]);
            }
        }
        const urlFetch = verification?.urlFetch;
        if (urlFetch) {
            rows.push([
                "URL fetch",
                `${urlFetch.totalFetched} fetched, ${urlFetch.totalFailed} failed`,
            ]);
            for (const fetched of urlFetch.fetchedUrls ?? []) {
                rows.push(["", `✅ ${fetched.url} (via ${fetched.method})`]);
            }
            for (const failure of urlFetch.failures ?? []) {
                const reason = failure.error && failure.error.length > 0 ? failure.error : "unknown";
                rows.push(["", `⚠️ ${failure.url}: ${reason}`]);
            }
        }
        md.table(["Setting", "Value"], rows);
    });
    md.blank();
}
187
/**
 * Emit the "Scores by Feature Area" table, highest total score first,
 * followed by one callout per area flagged with `negativeDocLift`.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderScoreTable(md, scores) {
    // Sort a copy so the caller's array keeps its order.
    const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
    md.heading(3, "Scores by Feature Area");
    md.blank();
    const headers = ["Feature", "Score", "Grade", "Task", "Code", "Docs", "Doc Lift", "Tests"];
    const rows = sorted.map((area) => [
        area.feature ?? "—",
        `**${round(area.totalScore)}**`,
        `${gradeEmoji(area.totalScore)} ${gradeLetter(area.totalScore)}`,
        String(round(area.taskCompletion)),
        String(round(area.codeCorrectness)),
        String(round(area.docCoverage)),
        liftArrow(area.docLift),
        String(area.testCount ?? 0),
    ]);
    md.table(headers, rows);
    md.blank();
    const negativeAreas = sorted.filter((area) => area.negativeDocLift);
    for (const area of negativeAreas) {
        // Fall back to a placeholder so a sparse entry never prints "undefined".
        const feature = area.feature ?? "—";
        const lift = area.docLift ?? "—";
        const floor = area.floorScore ?? "—";
        const ceiling = area.ceilingScore ?? "—";
        md.line(`> 🚨 **Negative Doc Lift:** \`${feature}\` (${lift}) — docs hurt performance. Floor: ${floor}, Ceiling: ${ceiling}`);
    }
    if (negativeAreas.length > 0) {
        md.blank();
    }
}
220
/**
 * Emit the collapsible floor/ceiling decomposition table, used when no area
 * carries an "actual" score. Areas are listed highest total score first.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderCeilingDecomposition(md, scores) {
    const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
    const rows = sorted.map((area) => [
        area.feature ?? "—",
        String(area.floorScore ?? 0),
        String(area.ceilingScore ?? 0),
        liftArrow(area.docLift),
        String(area.docQualityGap ?? 0),
    ]);
    md.details("📊 Ceiling decomposition", () => {
        md.table(["Feature", "Floor", "Ceiling", "Doc Lift", "Quality Gap"], rows);
    });
    md.blank();
}
233
/**
 * Emit the "Three-Layer Decomposition" table (floor, ceiling, actual, doc
 * lift, retrieval gap, infrastructure efficiency) plus a collapsible legend.
 * Areas are listed highest total score first; metrics that are missing or
 * null render as a dash.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderThreeLayerDecomposition(md, scores) {
    const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
    md.heading(3, "🔬 Three-Layer Decomposition");
    md.blank();
    const headers = ["Feature", "Floor", "Ceiling", "Actual", "Doc Lift", "Retr. Gap", "Infra %"];
    const rows = sorted.map((area) => {
        // `!= null` so that null is shown as a dash, not as "null" or "0%".
        const actual = area.actualScore != null ? String(area.actualScore) : "—";
        const gap = area.retrievalGap != null ? signedNum(area.retrievalGap) : "—";
        const infra = area.infrastructureEfficiency != null
            ? `${Math.round(area.infrastructureEfficiency * 100)}%`
            : "—";
        const invertedFlag = area.invertedRetrievalGap ? " 🔄" : "";
        return [
            area.feature ?? "—",
            String(area.floorScore ?? 0),
            String(area.ceilingScore ?? 0),
            actual,
            liftArrow(area.docLift),
            `${gap}${invertedFlag}`,
            infra,
        ];
    });
    md.table(headers, rows);
    md.blank();
    md.details("📖 What do these numbers mean?", () => {
        md.line("- **Floor:** Model performance without any documentation (training data only)");
        md.line("- **Ceiling:** Model performance with perfect documentation (hand-picked, injected)");
        md.line("- **Actual:** Model performance when finding docs on its own (like real users)");
        md.line("- **Doc Lift:** Documentation quality contribution (Ceiling − Floor)");
        md.line("- **Retr. Gap:** Quality lost in discovery (Ceiling − Actual)");
        md.line("- **Infra %:** Fraction of doc quality reaching agents (Actual ÷ Ceiling)");
    });
    md.blank();
}
273
/**
 * Emit the collapsible per-model comparison: one summary table across
 * models (best average score first) and one per-feature table per model.
 * Skipped when there are fewer than two models.
 *
 * Entries with a missing `overall` or `scores` are rendered with defaults
 * instead of throwing.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} [perModel] - `summary.perModel` entries.
 */
function renderPerModel(md, perModel) {
    if (!perModel || perModel.length <= 1) {
        return;
    }
    const sorted = [...perModel].sort((a, b) => (b.overall?.avgScore ?? 0) - (a.overall?.avgScore ?? 0));
    md.details("🤖 Per-model scores", () => {
        md.table(["Model", "Score", "Doc Lift", "Tests", "Cost"], sorted.map((entry) => {
            const overall = entry.overall ?? {};
            return [
                entry.label || entry.modelId,
                `**${round(overall.avgScore)}**`,
                signedNum(round(overall.avgDocLift)),
                String(overall.testCount ?? 0),
                overall.cost ? fmtCost(overall.cost) : "—",
            ];
        }));
        md.blank();
        for (const entry of sorted) {
            const name = entry.label || entry.modelId;
            md.line(`**${name}** (\`${entry.modelId}\`):`);
            md.blank();
            // Values are rounded so this table matches the main score table.
            md.table(["Feature", "Score", "Task", "Code", "Docs", "Lift"], (entry.scores ?? []).map((area) => [
                area.feature ?? "—",
                `**${round(area.totalScore)}**`,
                String(round(area.taskCompletion)),
                String(round(area.codeCorrectness)),
                String(round(area.docCoverage)),
                area.docLift != null ? signedNum(round(area.docLift)) : "—",
            ]));
            md.blank();
        }
    });
    md.blank();
}
303
/**
 * Emit the "Score Comparison" section: the overall baseline → experiment
 * line, a per-area delta table, a one-line change summary and a collapsible
 * table of per-dimension deltas.
 *
 * The per-area table uses the actual / retrieval-gap / infra columns when
 * any area carries an `actualDelta`; otherwise it shows the task / code /
 * docs dimension deltas. Missing `deltas`, `areas` or `noiseThreshold` fall
 * back to empty / zero instead of throwing.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} cmp - Comparison report.
 */
function renderComparison(md, cmp) {
    const deltas = cmp.deltas ?? {};
    const areas = cmp.areas ?? [];
    const threshold = cmp.noiseThreshold ?? 0;
    const overallDelta = deltas.overall ?? 0;
    // Changes inside the noise threshold are reported as "flat".
    let icon = "➡️";
    if (overallDelta > threshold) {
        icon = "📈";
    }
    else if (overallDelta < -threshold) {
        icon = "📉";
    }
    // Signed, rounded delta; dash when the value is absent.
    const fmtDelta = (value) => (value != null ? signedNum(round(value)) : "—");
    // Fraction → signed percentage points, without ever printing "+0pp".
    const fmtPoints = (fraction) => {
        if (fraction == null) {
            return "—";
        }
        const points = Math.round(fraction * 100);
        return `${points > 0 ? "+" : ""}${points}pp`;
    };
    md.heading(3, "📊 Score Comparison");
    md.blank();
    const baselineScore = cmp.baseline?.overall?.avgScore;
    const experimentScore = cmp.experiment?.overall?.avgScore;
    if (baselineScore != null && experimentScore != null) {
        md.line(`**Overall: ${round(baselineScore)} → ${round(experimentScore)}** (${icon} ${signedNum(round(overallDelta))})`);
    }
    else {
        md.line(`**Overall delta: ${signedNum(round(overallDelta))}** (${icon}, threshold ±${threshold})`);
    }
    md.blank();
    const hasActualDeltas = areas.some((area) => area.actualDelta != null);
    if (hasActualDeltas) {
        md.table(["Feature", "Baseline", "Current", "Delta", "Actual Δ", "Ret. Gap Δ", "Infra Δ"], areas.map((area) => [
            area.area,
            String(area.baseline ?? "—"),
            String(area.experiment ?? "—"),
            `${changeEmoji(area.change)} ${signedNum(round(area.delta))}`,
            fmtDelta(area.actualDelta),
            fmtDelta(area.retrievalGapDelta),
            fmtPoints(area.infrastructureEfficiencyDelta),
        ]));
    }
    else {
        md.table(["Feature", "Baseline", "Current", "Delta", "Task", "Code", "Docs"], areas.map((area) => {
            // TODO(multi-mode): Literacy-specific dimension keys.
            const dims = area.dimensions;
            return [
                area.area,
                String(area.baseline ?? "—"),
                String(area.experiment ?? "—"),
                `${changeEmoji(area.change)} ${signedNum(round(area.delta))}`,
                fmtDelta(dims?.taskCompletion?.delta),
                fmtDelta(dims?.codeCorrectness?.delta),
                fmtDelta(dims?.docCoverage?.delta),
            ];
        }));
    }
    md.blank();
    const summaryParts = [];
    if (cmp.improved && cmp.improved.length > 0) {
        summaryParts.push(`📈 ${cmp.improved.length} improved`);
    }
    if (cmp.regressed && cmp.regressed.length > 0) {
        summaryParts.push(`📉 ${cmp.regressed.length} regressed`);
    }
    if (cmp.unchanged && cmp.unchanged.length > 0) {
        summaryParts.push(`➡️ ${cmp.unchanged.length} unchanged`);
    }
    if (cmp.notEvaluated && cmp.notEvaluated.length > 0) {
        summaryParts.push(`⏭️ ${cmp.notEvaluated.length} not evaluated`);
    }
    if (summaryParts.length > 0) {
        const thresholdNote = cmp.noiseThresholdEmpirical
            ? ` (empirical threshold: ±${Number(threshold).toFixed(1)})`
            : ` (threshold: ±${threshold})`;
        md.line(summaryParts.join(" · ") + thresholdNote);
        md.blank();
    }
    md.details("Dimension averages", () => {
        const rows = Object.entries(deltas.perDimension ?? {}).map(([key, value]) => [
            dimensionLabel(key),
            signedNum(round(value)),
        ]);
        rows.push(["Doc Lift", signedNum(round(deltas.docLift))]);
        md.table(["Dimension", "Delta"], rows);
    });
    md.blank();
}
401
/**
 * Emit the collapsible "Eval cost breakdown": provider vs. grader totals,
 * then provider cost per feature area (highest total score first).
 * Skipped when the combined cost is not positive.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries (`totalCost`, `testCount`).
 * @param {object} [overall] - `summary.overall`; `cost.graderTotal` and
 *   `cost.graderModel` are read from it.
 */
function renderCostBreakdown(md, scores, overall) {
    const providerCost = scores.reduce((sum, area) => sum + (area.totalCost ?? 0), 0);
    const graderCost = overall?.cost?.graderTotal ?? 0;
    const combinedCost = providerCost + graderCost;
    if (combinedCost <= 0) {
        return;
    }
    md.details("💰 Eval cost breakdown", () => {
        const rows = [["Provider (model inference)", fmtCost(providerCost)]];
        if (graderCost > 0) {
            const graderLabel = overall?.cost?.graderModel ?? "unknown";
            rows.push([`Grader (${graderLabel})`, fmtCost(graderCost)]);
        }
        rows.push(["**Total**", `**${fmtCost(combinedCost)}**`]);
        md.table(["Category", "Cost"], rows);
        md.blank();
        const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
        md.line("**Provider cost by feature area:**");
        md.blank();
        md.table(["Feature", "Tests", "Cost", "Avg/Test"], sorted.map((area) => {
            const tests = area.testCount ?? 0;
            const cost = area.totalCost ?? 0;
            // Avoid dividing by zero for areas that ran no tests.
            const avgCost = tests > 0 ? cost / tests : 0;
            return [area.feature ?? "—", String(tests), fmtCost(cost), fmtCost(avgCost)];
        }));
    });
    md.blank();
}
433
/**
 * Emit the collapsible gap-recommendations block: the top-priority gaps and
 * the gap count per area (largest first). Skipped when there is neither a
 * top list nor any counts.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [recommendations] - `summary.recommendations`; reads
 *   `top3`, `counts`, `totalGaps` and `totalPotentialLift`.
 */
function renderGapRecommendations(md, recommendations) {
    if (!recommendations) {
        return;
    }
    const top3 = recommendations.top3 ?? [];
    const counts = recommendations.counts ?? {};
    if (top3.length === 0 && Object.keys(counts).length === 0) {
        return;
    }
    const lift = recommendations.totalPotentialLift ?? 0;
    const totalGaps = recommendations.totalGaps ?? top3.length;
    const gapWord = totalGaps === 1 ? "gap" : "gaps";
    const liftSuffix = lift > 0 ? ` (+${lift.toFixed(1)} pts potential lift)` : "";
    md.details(`📋 Recommendations — ${totalGaps} ${gapWord}${liftSuffix}`, () => {
        md.blank();
        if (top3.length > 0) {
            md.line(`**Top ${top3.length} by priority:**`);
            md.blank();
            md.table(["#", "Area", "Failure Mode", "Priority"], top3.map((gap, index) => [
                String(index + 1),
                gap.area,
                gap.title,
                String(gap.priority),
            ]));
            md.blank();
        }
        const countEntries = Object.entries(counts).sort((a, b) => b[1] - a[1]);
        if (countEntries.length > 0) {
            md.line("**Gap counts by area:**");
            md.blank();
            md.table(["Area", "Gaps"], countEntries.map(([area, count]) => [area, String(count)]));
        }
    });
    md.blank();
}
465
/**
 * Emit the collapsible "Low-Scoring Judgments" block.
 *
 * Judgments are grouped by area — the part of `taskId` before the first
 * " - " separator — with groups in alphabetical order and each group's
 * judgments lowest score first. Each judgment shows its score, dimension,
 * task name, model, the grader's reason as a blockquote, and the expected
 * docs when present. A missing `taskId` or `reason` is treated as empty
 * instead of throwing.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} [judgments] - `summary.lowScoringJudgments`.
 */
function renderLowScoringJudgments(md, judgments) {
    if (!judgments || judgments.length === 0) {
        return;
    }
    // Split "area - task" ids; ids without a separator serve as both parts.
    const splitTaskId = (taskId) => {
        const id = String(taskId ?? "");
        const sep = id.indexOf(" - ");
        return sep > 0
            ? { area: id.substring(0, sep), task: id.substring(sep + 3) }
            : { area: id, task: id };
    };
    const byArea = new Map();
    for (const judgment of judgments) {
        const { area } = splitTaskId(judgment.taskId);
        if (!byArea.has(area)) {
            byArea.set(area, []);
        }
        byArea.get(area).push(judgment);
    }
    const groups = [...byArea.entries()]
        .sort(([a], [b]) => a.localeCompare(b))
        .map(([area, items]) => [area, items.sort((a, b) => (a.score ?? 0) - (b.score ?? 0))]);
    // NOTE(review): one byte of this icon was lost in the garbled source;
    // the magnifying glass is assumed — confirm against the upstream file.
    md.details(`🔍 Low-Scoring Judgments (${judgments.length} below 70)`, () => {
        md.blank();
        for (const [area, areaJudgments] of groups) {
            md.heading(4, `${area} (${areaJudgments.length})`);
            md.blank();
            for (const judgment of areaJudgments) {
                const { task } = splitTaskId(judgment.taskId);
                const dimLabel = dimensionLabel(judgment.dimension);
                md.line(`**${gradeEmoji(judgment.score)} ${judgment.score}** · ${dimLabel} · ${task} · \`${judgment.modelId}\``);
                md.blank();
                // Prefix every line so multi-line reasons stay in one blockquote.
                const quotedReason = String(judgment.reason ?? "")
                    .split("\n")
                    .map((text) => `> ${text}`)
                    .join("\n");
                md.line(quotedReason);
                md.blank();
                if (judgment.canonicalDocs && judgment.canonicalDocs.length > 0) {
                    const docList = judgment.canonicalDocs.map((doc) => `\`${doc.slug}\``).join(", ");
                    md.line(`*Expected docs: ${docList}*`);
                    md.blank();
                }
            }
        }
    });
    md.blank();
}
506
/**
 * Emit the "Recommendations" section: one bullet per feature area scoring
 * below 70, with extra hints when code correctness or doc coverage is very
 * low. Areas are listed highest total score first.
 *
 * The bullets are collected before anything is written, so the heading only
 * appears when at least one bullet exists. Areas without a total score are
 * skipped rather than reported as scoring 0.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 * @param {string[]} belowCritical - Areas below the critical threshold.
 *   Kept for interface compatibility; it no longer forces an empty section.
 */
function renderRecommendations(md, scores, belowCritical) {
    const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
    const bullets = [];
    for (const area of sorted) {
        const score = area.totalScore;
        if (score == null) {
            continue;
        }
        if (score < 50) {
            bullets.push(`- 🔴 **${area.feature}** (score: ${round(score)}) — needs significant doc improvements.`);
            if ((area.codeCorrectness ?? 0) < 10) {
                bullets.push(`  Code correctness is very low (${round(area.codeCorrectness)}) — add more complete code examples.`);
            }
            if ((area.docCoverage ?? 0) < 10) {
                bullets.push(`  Doc coverage is very low (${round(area.docCoverage)}) — key APIs/patterns may be missing from docs.`);
            }
        }
        else if (score < 70) {
            bullets.push(`- 🟠 **${area.feature}** (score: ${round(score)}) — has room for improvement.`);
            if ((area.codeCorrectness ?? 0) < 15) {
                bullets.push(`  Code correctness (${round(area.codeCorrectness)}) could improve with better code examples.`);
            }
        }
    }
    if (bullets.length === 0) {
        return;
    }
    md.heading(3, "💡 Recommendations");
    md.blank();
    for (const bullet of bullets) {
        md.line(bullet);
    }
    md.blank();
}
533
/**
 * Emit the footer: a horizontal rule followed by an italic "Generated by"
 * line containing the framework link, the completion date (when known) and
 * a link to the detailed results (when a promptfoo URL is available).
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} report - Report envelope; `completedAt` is read from it.
 * @param {object} [provenance] - Provenance record; the link target is
 *   `promptfooUrls[0].url`.
 */
function renderFooter(md, report, provenance) {
    md.line("---");
    const parts = [
        `[AI Literacy Framework](https://github.com/sanity-labs/ai-literacy-framework)`,
    ];
    if (report.completedAt) {
        const date = fmtDate(report.completedAt);
        if (date) {
            parts.push(date);
        }
    }
    const promptfooUrl = provenance?.promptfooUrls?.[0]?.url;
    if (typeof promptfooUrl === "string" && promptfooUrl.length > 0) {
        parts.push(`[view detailed results](${promptfooUrl})`);
    }
    md.line(`*Generated by ${parts.join(" · ")}*`);
}
547
/**
 * Build the Studio URL for a report.
 *
 * The origin comes from the `SANITY_STUDIO_ORIGIN` environment variable and
 * falls back to `https://admin.sanity.io` when the variable is unset, blank,
 * or `process` is unavailable. Trailing slashes on the origin are dropped
 * and the report id is percent-encoded so the result is a well-formed URL.
 *
 * @param {string} reportId - Report identifier.
 * @returns {string} Absolute URL of the report in the Studio.
 */
function buildStudioUrl(reportId) {
    const configured = typeof process !== "undefined"
        ? process.env?.SANITY_STUDIO_ORIGIN?.trim()
        : undefined;
    // `||` on purpose: an empty string must fall back to the default too.
    const origin = (configured || "https://admin.sanity.io").replace(/\/+$/, "");
    return `${origin}/ailf/report/${encodeURIComponent(reportId)}`;
}
551
+ // ---------------------------------------------------------------------------
552
+ // MarkdownBuilder
553
+ // ---------------------------------------------------------------------------
554
/**
 * Minimal line-oriented markdown writer.
 *
 * Each method appends one or more lines to `lines`; `toString()` joins them
 * with newlines and adds a trailing newline.
 */
class MarkdownBuilder {
    lines = [];
    /**
     * Make a value safe to place in a table cell: unescaped pipes are
     * escaped and line breaks become `<br>`, since either would otherwise
     * split the row. Null and undefined become an empty cell.
     */
    static #cell(value) {
        return String(value ?? "")
            .replace(/\r?\n/g, "<br>")
            .replace(/(?<!\\)\|/g, "\\|");
    }
    /** Append an empty line. */
    blank() {
        this.lines.push("");
    }
    /**
     * Append a collapsible `<details>` block. `body` is called synchronously
     * and writes the block's content through this builder.
     */
    details(summary, body) {
        this.lines.push("<details>");
        this.lines.push(`<summary>${summary}</summary>`);
        // Blank line so the markdown inside the HTML block is parsed.
        this.lines.push("");
        body();
        this.lines.push("</details>");
    }
    /** Append an ATX heading of the given level. */
    heading(level, text) {
        this.lines.push(`${"#".repeat(level)} ${text}`);
    }
    /** Append a line of raw markdown. */
    line(text) {
        this.lines.push(text);
    }
    /** Append a pipe table with one header row and the given body rows. */
    table(headers, rows) {
        const formatRow = (cells) => `| ${cells.map((cell) => MarkdownBuilder.#cell(cell)).join(" | ")} |`;
        this.lines.push(formatRow(headers));
        this.lines.push(`|${headers.map(() => "---").join("|")}|`);
        for (const row of rows) {
            this.lines.push(formatRow(row));
        }
    }
    /** Return the accumulated document, terminated by a newline. */
    toString() {
        return this.lines.join("\n") + "\n";
    }
}
583
+ // ---------------------------------------------------------------------------
584
+ // Formatting helpers
585
+ // ---------------------------------------------------------------------------
586
/**
 * Map a comparison change label to its icon.
 *
 * @param {string} change - "improved", "regressed" or "not-evaluated".
 * @returns {string} The matching icon; any other value gets the "flat" arrow.
 */
function changeEmoji(change) {
    switch (change) {
        case "improved":
            return "📈";
        case "regressed":
            return "📉";
        case "not-evaluated":
            return "⏭️";
        default:
            return "➡️";
    }
}
595
/**
 * Total evaluation cost: the sum of every area's `totalCost` plus the
 * grader total from `overall.cost.graderTotal`. Missing values count as 0.
 *
 * @param {object[]} scores - Per-area score entries.
 * @param {object} [overall] - `summary.overall`.
 * @returns {number} Combined provider and grader cost.
 */
function computeTotalCost(scores, overall) {
    let providerTotal = 0;
    for (const area of scores) {
        providerTotal += area.totalCost ?? 0;
    }
    const graderTotal = overall?.cost?.graderTotal ?? 0;
    return providerTotal + graderTotal;
}
600
/**
 * Turn a hyphenated dimension key into a title-cased label, e.g.
 * "task-completion" → "Task Completion".
 *
 * @param {string} dim - Dimension key.
 * @returns {string} Human-readable label; a dash when `dim` is not a string.
 */
function dimensionLabel(dim) {
    // A judgment without a dimension would otherwise throw on `.split`.
    if (typeof dim !== "string") {
        return "—";
    }
    return dim
        .split("-")
        .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
        .join(" ");
}
606
/**
 * Format a dollar amount. Amounts under one cent keep four decimals so tiny
 * per-test costs remain visible; everything else uses two.
 *
 * @param {number} cost - Amount in dollars.
 * @returns {string} e.g. "$0.00", "$0.0050", "$1.23". Zero, missing and
 *   non-finite input all yield "$0.00"; negatives are shown as "-$…".
 */
function fmtCost(cost) {
    if (!Number.isFinite(cost) || cost === 0) {
        return "$0.00";
    }
    const sign = cost < 0 ? "-" : "";
    const magnitude = Math.abs(cost);
    const digits = magnitude < 0.01 ? 4 : 2;
    return `${sign}$${magnitude.toFixed(digits)}`;
}
613
/**
 * Format an ISO timestamp as a short en-US date-time in UTC.
 *
 * @param {string} iso - Timestamp accepted by the `Date` constructor.
 * @returns {string} The formatted date, or `iso` unchanged when it cannot
 *   be parsed or formatted.
 */
function fmtDate(iso) {
    const date = new Date(iso);
    // An unparseable input does not throw: it yields an invalid Date whose
    // toLocaleString() is the literal "Invalid Date". Check explicitly so
    // the fallback to the raw input actually happens.
    if (Number.isNaN(date.getTime())) {
        return iso;
    }
    try {
        return date.toLocaleString("en-US", {
            day: "numeric",
            hour: "numeric",
            minute: "2-digit",
            month: "short",
            timeZone: "UTC",
            timeZoneName: "short",
            year: "numeric",
        });
    }
    catch {
        return iso;
    }
}
629
/**
 * Format a duration in milliseconds as "Nms", "N.Ns" or "Nm Ns".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function fmtDuration(ms) {
    if (ms < 1000) {
        return `${ms}ms`;
    }
    const secs = ms / 1000;
    if (secs < 60) {
        const text = secs.toFixed(1);
        // Values just under a minute round up to "60.0"; report those as
        // one minute instead.
        if (text !== "60.0") {
            return `${text}s`;
        }
    }
    // Round to whole seconds before splitting, so the seconds part can
    // never come out as 60.
    const totalSecs = Math.round(secs);
    const mins = Math.floor(totalSecs / 60);
    const remSecs = totalSecs % 60;
    return `${mins}m ${remSecs}s`;
}
639
/**
 * Icon for a 0–100 score: ≥80 check mark, ≥70 yellow, ≥50 orange, otherwise
 * red.
 *
 * @param {number} [score] - Score to grade.
 * @returns {string} The icon, or a dash when the score is missing, null or
 *   not a number.
 */
function gradeEmoji(score) {
    // `== null` also catches null, which would otherwise compare as 0.
    if (score == null || Number.isNaN(score)) {
        return "—";
    }
    if (score >= 80) {
        return "✅";
    }
    if (score >= 70) {
        return "🟡";
    }
    if (score >= 50) {
        return "🟠";
    }
    return "🔴";
}
650
/**
 * Letter grade for a 0–100 score: ≥80 A, ≥70 B, ≥50 C, otherwise D.
 *
 * @param {number} [score] - Score to grade.
 * @returns {string} The letter, or a dash when the score is missing, null
 *   or not a number.
 */
function gradeLetter(score) {
    // `== null` also catches null, which would otherwise be graded "D".
    if (score == null || Number.isNaN(score)) {
        return "—";
    }
    if (score >= 80) {
        return "A";
    }
    if (score >= 70) {
        return "B";
    }
    if (score >= 50) {
        return "C";
    }
    return "D";
}
661
/**
 * Format a doc-lift value as a trend icon plus the rounded, signed number.
 *
 * @param {number} [lift] - Doc lift in points.
 * @returns {string} e.g. "📈 +12", "📉 -3", "➡️ 0"; a dash when the value is
 *   missing, null or not a number.
 */
function liftArrow(lift) {
    // Math.round(null) is 0, so null must be rejected before rounding or it
    // would be reported as a real "no change".
    if (lift == null || Number.isNaN(lift)) {
        return "—";
    }
    const rounded = Math.round(lift);
    if (rounded > 0) {
        return `📈 +${rounded}`;
    }
    if (rounded < 0) {
        return `📉 ${rounded}`;
    }
    return "➡️ 0";
}
671
/**
 * Coerce the raw `summary.scores` value into a list of score records.
 *
 * @param {unknown} raw - Value read from the report.
 * @returns {object[]} The plain-object entries of `raw`; an empty list when
 *   `raw` is not an array. Null, primitive and nested-array entries are
 *   dropped.
 */
function normalizeScores(raw) {
    if (!Array.isArray(raw)) {
        return [];
    }
    // `typeof [] === "object"`, so arrays need their own exclusion.
    return raw.filter((entry) => entry != null && typeof entry === "object" && !Array.isArray(entry));
}
676
/**
 * Round to the nearest integer, treating a missing value as 0.
 *
 * @param {number} [n] - Value to round.
 * @returns {number} `Math.round(n)`, or 0 when `n` is undefined, null or NaN.
 */
function round(n) {
    if (n == null || Number.isNaN(n)) {
        return 0;
    }
    return Math.round(n);
}
681
/**
 * Header icon for the overall average score: ≥75 green, ≥60 yellow, ≥45
 * orange, otherwise red.
 *
 * @param {number} [avg] - Overall average score (0–100).
 * @returns {string} The icon; a neutral chart icon when the score is
 *   missing, null or not a number.
 */
function scoreEmoji(avg) {
    // `== null` also catches null, which would otherwise get the red icon.
    if (avg == null || Number.isNaN(avg)) {
        return "📊";
    }
    if (avg >= 75) {
        return "🟢";
    }
    if (avg >= 60) {
        return "🟡";
    }
    if (avg >= 45) {
        return "🟠";
    }
    return "🔴";
}
692
/**
 * Format a number with an explicit "+" for positive values.
 *
 * @param {number} [n] - Value to format.
 * @returns {string} e.g. "+3", "-2", "0"; a dash when `n` is undefined,
 *   null or NaN.
 */
function signedNum(n) {
    // `== null` so that null renders as a dash instead of the text "null".
    if (n == null || Number.isNaN(n)) {
        return "—";
    }
    return n > 0 ? `+${n}` : String(n);
}