@sanity/ailf 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ailf.js +6 -1
- package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
- package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
- package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
- package/dist/_vendor/ailf-core/schemas/report.js +235 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
- package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
- package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
- package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
- package/dist/_vendor/ailf-shared/index.d.ts +7 -5
- package/dist/_vendor/ailf-shared/index.js +7 -5
- package/dist/adapters/api-client/types.d.ts +2 -5
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
- package/dist/adapters/task-sources/index.d.ts +1 -1
- package/dist/adapters/task-sources/index.js +1 -1
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -2
- package/dist/adapters/task-sources/repo-schemas.js +3 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
- package/dist/adapters/task-sources/repo-validation.js +1 -1
- package/dist/agent-observer/agentic-provider.d.ts +1 -0
- package/dist/agent-observer/agentic-provider.js +43 -36
- package/dist/agent-observer/config-schemas.d.ts +61 -0
- package/dist/agent-observer/config-schemas.js +65 -0
- package/dist/agent-observer/provider.d.ts +1 -0
- package/dist/agent-observer/provider.js +19 -17
- package/dist/cli.js +4 -4
- package/dist/commands/validate-tasks.js +2 -2
- package/dist/composition-root.js +4 -2
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/job-store.js +2 -2
- package/dist/lib/dotenv-resolution.d.ts +21 -0
- package/dist/lib/dotenv-resolution.js +30 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +14 -3
- package/dist/orchestration/steps/run-eval-step.js +21 -3
- package/dist/pipeline/agent-behavior-report.d.ts +2 -8
- package/dist/pipeline/cache.d.ts +2 -2
- package/dist/pipeline/checks.d.ts +10 -2
- package/dist/pipeline/checks.js +14 -4
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
- package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
- package/dist/pipeline/compiler/provider-assembler.js +33 -3
- package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
- package/dist/pipeline/mirror-repo-tasks.js +16 -8
- package/dist/pipeline/pr-comment.d.ts +22 -9
- package/dist/pipeline/pr-comment.js +52 -472
- package/dist/pipeline/resolve-mappings.d.ts +8 -3
- package/dist/promptfoo-providers/mock-path.d.ts +12 -0
- package/dist/promptfoo-providers/mock-path.js +15 -0
- package/dist/report-store.d.ts +63 -1
- package/dist/report-store.js +111 -31
- package/dist/sanity/client.d.ts +58 -0
- package/dist/sanity/client.js +106 -0
- package/package.json +8 -7
|
@@ -0,0 +1,696 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* report-to-markdown.ts — Canonical PR-comment markdown renderer (W0150).
|
|
3
|
+
*
|
|
4
|
+
* Single source of truth for rendering an AILF report as PR-comment
|
|
5
|
+
* markdown. Used by:
|
|
6
|
+
* - the API gateway (`/v1/reports/:id/markdown`)
|
|
7
|
+
* - the eval pipeline (`ailf pr-comment` CLI / `pipeline/pr-comment.ts`)
|
|
8
|
+
*
|
|
9
|
+
* Operates on a structurally lenient `RenderableReport` shape so callers
|
|
10
|
+
* can pass either the persisted slim Report (Sanity doc) or an in-memory
|
|
11
|
+
* envelope built from `score-summary.json` + `comparison-report.json`.
|
|
12
|
+
*
|
|
13
|
+
* Canonical formatting decisions (W0150):
|
|
14
|
+
* - Header: level-1 (`# {emoji} AI Literacy Score Report`).
|
|
15
|
+
* - Footer: markdown link form `[view detailed results](url)`. The link
|
|
16
|
+
* URL comes from `provenance.promptfooUrls[0].url`.
|
|
17
|
+
* - Source verification block (sourceVerification + sourceIsolation) is
|
|
18
|
+
* rendered when present on the summary — preserves info from agentic
|
|
19
|
+
* / sandboxed local-mode runs without breaking remote-mode reports
|
|
20
|
+
* that don't carry those fields.
|
|
21
|
+
*/
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Public API
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
/**
 * Render an AILF report as PR-comment markdown.
 *
 * Reads `report.summary`, `report.comparison` and `report.provenance`, then
 * emits the header, the subtitle line, the metadata table and every optional
 * section in a fixed order.
 *
 * Optional overall metrics that are `null` are treated exactly like missing
 * ones: they are left out of the subtitle instead of being rendered as `0`.
 *
 * @param {object} report - Report-like object; nested fields are optional.
 * @returns {string} The assembled markdown, terminated by a newline.
 */
export function reportToMarkdown(report) {
    const md = new MarkdownBuilder();
    const summary = report.summary;
    const comparison = report.comparison;
    const provenance = report.provenance;
    const overall = summary?.overall;
    // Header. Collapse null to undefined so the emoji helper sees "no score".
    const avgScore = overall?.avgScore ?? undefined;
    md.heading(1, `${scoreEmoji(avgScore)} AI Literacy Score Report`);
    md.blank();
    // Subtitle line
    const parts = [];
    if (avgScore !== undefined) {
        parts.push(`**Overall: ${round(avgScore)}/100**`);
    }
    if (overall?.avgDocLift != null) {
        parts.push(`Doc Lift: ${signedNum(round(overall.avgDocLift))}`);
    }
    if (overall?.avgActualScore != null) {
        parts.push(`Actual: ${round(overall.avgActualScore)}/100`);
    }
    if (overall?.avgRetrievalGap != null) {
        parts.push(`Ret. Gap: ${round(overall.avgRetrievalGap)}`);
    }
    const scores = normalizeScores(summary?.scores);
    const testCount = scores.reduce((s, sc) => s + (sc.testCount ?? 0), 0);
    if (scores.length > 0) {
        parts.push(`${testCount} tests across ${scores.length} areas`);
    }
    const totalCost = computeTotalCost(scores, overall);
    if (totalCost > 0) {
        parts.push(`Cost: ${fmtCost(totalCost)}`);
    }
    if (parts.length > 0) {
        md.line(parts.join(" · "));
        md.blank();
    }
    renderMetadata(md, report);
    const belowCritical = summary?.belowCritical ?? [];
    if (belowCritical.length > 0) {
        md.line(`> ⚠️ **Below critical threshold:** ${belowCritical.map((a) => `\`${a}\``).join(", ")}`);
        md.blank();
    }
    renderSource(md, summary?.source);
    renderSourceVerification(md, summary?.sourceVerification, summary?.sourceIsolation);
    if (scores.length > 0) {
        renderScoreTable(md, scores);
    }
    // The three-layer view is only meaningful when some area has a real
    // "actual" score; a null placeholder does not count.
    const hasActual = scores.some((s) => s.actualScore != null);
    if (hasActual) {
        renderThreeLayerDecomposition(md, scores);
    }
    else if (scores.length > 0) {
        renderCeilingDecomposition(md, scores);
    }
    renderPerModel(md, summary?.perModel);
    if (comparison) {
        renderComparison(md, comparison);
    }
    renderCostBreakdown(md, scores, overall);
    renderGapRecommendations(md, summary?.recommendations);
    renderLowScoringJudgments(md, summary?.lowScoringJudgments);
    renderRecommendations(md, scores, belowCritical);
    renderFooter(md, report, provenance);
    return md.toString();
}
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// Section renderers
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
/**
 * Emit the "Field / Value" metadata table for a report.
 *
 * A row is added for each of report id (linked to the studio URL), completion
 * time, duration, tag and evaluation mode that is present. Nothing is written
 * when no row applies.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} report - Report whose top-level metadata is rendered.
 */
function renderMetadata(md, report) {
    const rows = [];
    const addRow = (label, value) => {
        rows.push([label, value]);
    };
    if (report.id) {
        addRow("Report ID", `[\`${report.id}\`](${buildStudioUrl(report.id)})`);
    }
    if (report.completedAt) {
        addRow("Completed", fmtDate(report.completedAt));
    }
    // Duration is shown for any number, including 0.
    if (typeof report.durationMs === "number") {
        addRow("Duration", fmtDuration(report.durationMs));
    }
    if (report.tag) {
        addRow("Tag", report.tag);
    }
    const mode = report.summary?.evaluationMode;
    if (mode) {
        addRow("Mode", mode);
    }
    if (rows.length === 0) {
        return;
    }
    md.table(["Field", "Value"], rows);
    md.blank();
}
|
|
117
|
+
/**
 * Emit the collapsible "Environment" table describing the documentation source.
 *
 * Only truthy fields produce a row; when no row applies (or `source` is
 * missing) nothing is written.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [source] - Source descriptor from the report summary.
 */
function renderSource(md, source) {
    if (!source) {
        return;
    }
    // Label / property pairs, in display order.
    const fields = [
        ["Source", "name"],
        ["Docs URL", "baseUrl"],
        ["Dataset", "dataset"],
        ["Project", "projectId"],
        ["Perspective", "perspective"],
    ];
    const rows = [];
    for (const [label, key] of fields) {
        const value = source[key];
        if (value) {
            rows.push([label, value]);
        }
    }
    if (rows.length === 0) {
        return;
    }
    md.details("🔧 Environment", () => {
        md.table(["Setting", "Value"], rows);
    });
    md.blank();
}
|
|
138
|
+
/**
 * Emit the collapsible "Source verification" table.
 *
 * Combines the verification settings, the agent-isolation score and the
 * URL-fetch outcome into one "Setting / Value" table. Nothing is written when
 * both inputs are missing.
 *
 * The URL lists (`offOriginUrls`, `fetchedUrls`, `failures`) are treated as
 * empty when absent, so a partially populated report still renders instead of
 * throwing.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [verification] - Source verification block of the summary.
 * @param {object} [isolation] - Source isolation block of the summary.
 */
function renderSourceVerification(md, verification, isolation) {
    if (!verification && !isolation)
        return;
    md.details("🔍 Source verification", () => {
        const rows = [];
        if (verification) {
            rows.push(["Source", verification.source]);
            rows.push(["Mode", verification.mode]);
            if (verification.allowedOrigins) {
                rows.push(["Sandbox", verification.allowedOrigins.join(", ")]);
            }
            if (verification.searchMode) {
                rows.push(["Search", verification.searchMode]);
            }
        }
        if (isolation) {
            const pct = Math.round(isolation.isolationScore * 100);
            const icon = isolation.offOrigin === 0 ? "✅" : "⚠️";
            rows.push([
                "Agent isolation",
                `${icon} ${pct}% (${isolation.onOrigin}/${isolation.total} on-origin)`,
            ]);
            if (isolation.offOrigin > 0) {
                // Show at most five offenders to keep the cell readable.
                const offenders = isolation.offOriginUrls ?? [];
                rows.push([
                    "Off-origin fetches",
                    offenders.slice(0, 5).join(", "),
                ]);
            }
        }
        if (verification?.urlFetch) {
            const uf = verification.urlFetch;
            rows.push([
                "URL fetch",
                `${uf.totalFetched} fetched, ${uf.totalFailed} failed`,
            ]);
            for (const f of uf.fetchedUrls ?? []) {
                rows.push(["", `✅ ${f.url} (via ${f.method})`]);
            }
            for (const f of uf.failures ?? []) {
                rows.push([
                    "",
                    `⚠️ ${f.url}: ${f.error && f.error.length > 0 ? f.error : "unknown"}`,
                ]);
            }
        }
        md.table(["Setting", "Value"], rows);
    });
    md.blank();
}
|
|
187
|
+
/**
 * Emit the "Scores by Feature Area" table, highest total score first,
 * followed by one warning line per area flagged with a negative doc lift.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderScoreTable(md, scores) {
    // Sort a copy so the caller's array order is left untouched.
    const ranked = [...scores];
    ranked.sort((x, y) => (y.totalScore ?? 0) - (x.totalScore ?? 0));
    md.heading(3, "Scores by Feature Area");
    md.blank();
    const tableRows = [];
    for (const entry of ranked) {
        const total = entry.totalScore;
        tableRows.push([
            entry.feature ?? "—",
            `**${round(total)}**`,
            `${gradeEmoji(total)} ${gradeLetter(total)}`,
            String(round(entry.taskCompletion)),
            String(round(entry.codeCorrectness)),
            String(round(entry.docCoverage)),
            liftArrow(entry.docLift),
            String(entry.testCount ?? 0),
        ]);
    }
    md.table(["Feature", "Score", "Grade", "Task", "Code", "Docs", "Doc Lift", "Tests"], tableRows);
    md.blank();
    let flagged = 0;
    for (const entry of ranked) {
        if (!entry.negativeDocLift) {
            continue;
        }
        md.line(`> 🚨 **Negative Doc Lift:** \`${entry.feature}\` (${entry.docLift}) — docs hurt performance. Floor: ${entry.floorScore}, Ceiling: ${entry.ceilingScore}`);
        flagged += 1;
    }
    // Separate the warnings from whatever follows, but only if any were written.
    if (flagged > 0) {
        md.blank();
    }
}
|
|
220
|
+
/**
 * Emit the collapsible floor / ceiling decomposition table, highest total
 * score first. Missing numeric fields are shown as 0.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderCeilingDecomposition(md, scores) {
    const ranked = [...scores];
    ranked.sort((x, y) => (y.totalScore ?? 0) - (x.totalScore ?? 0));
    const toRow = (entry) => [
        entry.feature ?? "—",
        String(entry.floorScore ?? 0),
        String(entry.ceilingScore ?? 0),
        liftArrow(entry.docLift),
        String(entry.docQualityGap ?? 0),
    ];
    md.details("📊 Ceiling decomposition", () => {
        const header = ["Feature", "Floor", "Ceiling", "Doc Lift", "Quality Gap"];
        md.table(header, ranked.map(toRow));
    });
    md.blank();
}
|
|
233
|
+
/**
 * Emit the floor / ceiling / actual decomposition table, highest total score
 * first, followed by a collapsible legend explaining each column.
 *
 * Optional per-area metrics (`actualScore`, `retrievalGap`,
 * `infrastructureEfficiency`) render as "—" when they are either `undefined`
 * or `null`, so a null placeholder never prints the text "null".
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 */
function renderThreeLayerDecomposition(md, scores) {
    const sorted = [...scores].sort((a, b) => (b.totalScore ?? 0) - (a.totalScore ?? 0));
    md.heading(3, "🔬 Three-Layer Decomposition");
    md.blank();
    md.table([
        "Feature",
        "Floor",
        "Ceiling",
        "Actual",
        "Doc Lift",
        "Retr. Gap",
        "Infra %",
    ], sorted.map((s) => {
        // All three optional metrics use the same "missing" test.
        const actualStr = s.actualScore != null ? String(s.actualScore) : "—";
        const gapStr = s.retrievalGap != null ? signedNum(s.retrievalGap) : "—";
        const infraStr = s.infrastructureEfficiency != null
            ? `${Math.round(s.infrastructureEfficiency * 100)}%`
            : "—";
        const flag = s.invertedRetrievalGap ? " 🔄" : "";
        return [
            s.feature ?? "—",
            String(s.floorScore ?? 0),
            String(s.ceilingScore ?? 0),
            actualStr,
            liftArrow(s.docLift),
            `${gapStr}${flag}`,
            infraStr,
        ];
    }));
    md.blank();
    md.details("📖 What do these numbers mean?", () => {
        md.line("- **Floor:** Model performance without any documentation (training data only)");
        md.line("- **Ceiling:** Model performance with perfect documentation (hand-picked, injected)");
        md.line("- **Actual:** Model performance when finding docs on its own (like real users)");
        md.line("- **Doc Lift:** Documentation quality contribution (Ceiling − Floor)");
        md.line("- **Retr. Gap:** Quality lost in discovery (Ceiling − Actual)");
        md.line("- **Infra %:** Fraction of doc quality reaching agents (Actual ÷ Ceiling)");
    });
    md.blank();
}
|
|
273
|
+
/**
 * Emit the collapsible per-model section: one overview table ranked by average
 * score, then a per-feature table for each model.
 *
 * Nothing is written unless at least two models are present.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} [perModel] - Per-model entries from the report summary.
 */
function renderPerModel(md, perModel) {
    const hasMultipleModels = Boolean(perModel) && perModel.length > 1;
    if (!hasMultipleModels) {
        return;
    }
    const ranked = [...perModel];
    ranked.sort((x, y) => y.overall.avgScore - x.overall.avgScore);
    const displayName = (model) => model.label || model.modelId;
    md.details("🤖 Per-model scores", () => {
        const overviewRows = ranked.map((model) => {
            const totals = model.overall;
            return [
                displayName(model),
                `**${round(totals.avgScore)}**`,
                signedNum(round(totals.avgDocLift)),
                String(totals.testCount),
                totals.cost ? fmtCost(totals.cost) : "—",
            ];
        });
        md.table(["Model", "Score", "Doc Lift", "Tests", "Cost"], overviewRows);
        md.blank();
        for (const model of ranked) {
            md.line(`**${displayName(model)}** (\`${model.modelId}\`):`);
            md.blank();
            const featureRows = model.scores.map((area) => [
                area.feature ?? "—",
                `**${area.totalScore ?? 0}**`,
                String(area.taskCompletion ?? 0),
                String(area.codeCorrectness ?? 0),
                String(area.docCoverage ?? 0),
                signedNum(area.docLift),
            ]);
            md.table(["Feature", "Score", "Task", "Code", "Docs", "Lift"], featureRows);
            md.blank();
        }
    });
    md.blank();
}
|
|
303
|
+
/**
 * Emit the "Score Comparison" section: the overall delta line, the per-area
 * table, the improved / regressed / unchanged tally and the collapsible
 * per-dimension averages.
 *
 * Two per-area layouts exist. When any area carries an `actualDelta` the table
 * shows actual / retrieval-gap / infrastructure deltas; otherwise it shows the
 * task / code / docs dimension deltas.
 *
 * Robustness notes:
 * - optional deltas that are `null` or `undefined` render as "—";
 * - the infrastructure delta is signed after rounding, so a tiny positive
 *   change prints "0pp" rather than "+0pp";
 * - a baseline or experiment without `overall`, or a comparison without
 *   `areas`, no longer throws.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} cmp - Comparison report (baseline vs. experiment).
 */
function renderComparison(md, cmp) {
    const overallDelta = cmp.deltas.overall;
    const icon = overallDelta > cmp.noiseThreshold
        ? "📈"
        : overallDelta < -cmp.noiseThreshold
            ? "📉"
            : "➡️";
    md.heading(3, "📊 Score Comparison");
    md.blank();
    const baselineScore = cmp.baseline?.overall?.avgScore;
    const experimentScore = cmp.experiment?.overall?.avgScore;
    if (baselineScore != null && experimentScore != null) {
        md.line(`**Overall: ${round(baselineScore)} → ${round(experimentScore)}** (${icon} ${signedNum(round(overallDelta))})`);
    }
    else {
        md.line(`**Overall delta: ${signedNum(round(overallDelta))}** (${icon}, threshold ±${cmp.noiseThreshold})`);
    }
    md.blank();
    const areas = cmp.areas ?? [];
    // Shared formatter for optional point deltas.
    const optionalDelta = (value) => (value != null ? signedNum(round(value)) : "—");
    const hasActualDeltas = areas.some((a) => a.actualDelta != null);
    if (hasActualDeltas) {
        md.table([
            "Feature",
            "Baseline",
            "Current",
            "Delta",
            "Actual Δ",
            "Ret. Gap Δ",
            "Infra Δ",
        ], areas.map((a) => {
            const changeIcon = changeEmoji(a.change);
            // Efficiency is a fraction; show it as signed percentage points.
            const infraStr = a.infrastructureEfficiencyDelta != null
                ? `${signedNum(Math.round(a.infrastructureEfficiencyDelta * 100))}pp`
                : "—";
            return [
                a.area,
                String(a.baseline),
                String(a.experiment),
                `${changeIcon} ${signedNum(round(a.delta))}`,
                optionalDelta(a.actualDelta),
                optionalDelta(a.retrievalGapDelta),
                infraStr,
            ];
        }));
    }
    else {
        md.table(["Feature", "Baseline", "Current", "Delta", "Task", "Code", "Docs"], areas.map((a) => {
            const changeIcon = changeEmoji(a.change);
            // TODO(multi-mode): Literacy-specific dimension keys.
            const taskDelta = a.dimensions?.taskCompletion?.delta;
            const codeDelta = a.dimensions?.codeCorrectness?.delta;
            const docDelta = a.dimensions?.docCoverage?.delta;
            return [
                a.area,
                String(a.baseline),
                String(a.experiment),
                `${changeIcon} ${signedNum(round(a.delta))}`,
                optionalDelta(taskDelta),
                optionalDelta(codeDelta),
                optionalDelta(docDelta),
            ];
        }));
    }
    md.blank();
    const summaryParts = [];
    if (cmp.improved && cmp.improved.length > 0) {
        summaryParts.push(`📈 ${cmp.improved.length} improved`);
    }
    if (cmp.regressed && cmp.regressed.length > 0) {
        summaryParts.push(`📉 ${cmp.regressed.length} regressed`);
    }
    if (cmp.unchanged && cmp.unchanged.length > 0) {
        summaryParts.push(`➡️ ${cmp.unchanged.length} unchanged`);
    }
    if (cmp.notEvaluated && cmp.notEvaluated.length > 0) {
        summaryParts.push(`⏭️ ${cmp.notEvaluated.length} not evaluated`);
    }
    if (summaryParts.length > 0) {
        const thresholdNote = cmp.noiseThresholdEmpirical
            ? ` (empirical threshold: ±${cmp.noiseThreshold.toFixed(1)})`
            : ` (threshold: ±${cmp.noiseThreshold})`;
        md.line(summaryParts.join(" · ") + thresholdNote);
        md.blank();
    }
    md.details("Dimension averages", () => {
        const dim = cmp.deltas.perDimension ?? {};
        const rows = Object.entries(dim).map(([k, v]) => [
            dimensionLabel(k),
            signedNum(round(v)),
        ]);
        rows.push(["Doc Lift", signedNum(round(cmp.deltas.docLift))]);
        md.table(["Dimension", "Delta"], rows);
    });
    md.blank();
}
|
|
401
|
+
/**
 * Emit the collapsible cost breakdown: provider vs. grader totals, then the
 * provider cost per feature area with an average per test.
 *
 * Nothing is written when the combined cost is zero or negative.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries (read for `totalCost`).
 * @param {object} [overall] - Overall summary block (read for `cost`).
 */
function renderCostBreakdown(md, scores, overall) {
    let providerCost = 0;
    for (const entry of scores) {
        providerCost += entry.totalCost ?? 0;
    }
    const graderCost = overall?.cost?.graderTotal ?? 0;
    const combinedCost = providerCost + graderCost;
    if (combinedCost <= 0) {
        return;
    }
    md.details("💰 Eval cost breakdown", () => {
        const categoryRows = [["Provider (model inference)", fmtCost(providerCost)]];
        if (graderCost > 0) {
            const graderName = overall?.cost?.graderModel ?? "unknown";
            categoryRows.push([`Grader (${graderName})`, fmtCost(graderCost)]);
        }
        categoryRows.push(["**Total**", `**${fmtCost(combinedCost)}**`]);
        md.table(["Category", "Cost"], categoryRows);
        md.blank();
        const ranked = [...scores];
        ranked.sort((x, y) => (y.totalScore ?? 0) - (x.totalScore ?? 0));
        md.line("**Provider cost by feature area:**");
        md.blank();
        const areaRows = ranked.map((entry) => {
            const tests = entry.testCount ?? 0;
            const cost = entry.totalCost ?? 0;
            // Avoid dividing by zero for areas without tests.
            const perTest = tests > 0 ? cost / (entry.testCount ?? 1) : 0;
            return [
                entry.feature ?? "—",
                String(tests),
                fmtCost(cost),
                fmtCost(perTest),
            ];
        });
        md.table(["Feature", "Tests", "Cost", "Avg/Test"], areaRows);
    });
    md.blank();
}
|
|
433
|
+
/**
 * Emit the collapsible gap-recommendations section: the top-priority gaps and
 * the gap counts per area (largest count first).
 *
 * Nothing is written when `recommendations` is missing or carries neither
 * top gaps nor counts.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} [recommendations] - Recommendations block of the summary.
 */
function renderGapRecommendations(md, recommendations) {
    if (!recommendations) {
        return;
    }
    const topGaps = recommendations.top3 ?? [];
    const perAreaCounts = recommendations.counts ?? {};
    const nothingToShow = topGaps.length === 0 && Object.keys(perAreaCounts).length === 0;
    if (nothingToShow) {
        return;
    }
    const potentialLift = recommendations.totalPotentialLift ?? 0;
    const gapTotal = recommendations.totalGaps ?? topGaps.length;
    let liftNote = "";
    if (potentialLift > 0) {
        liftNote = ` (+${potentialLift.toFixed(1)} pts potential lift)`;
    }
    md.details(`📋 Recommendations — ${gapTotal} gaps${liftNote}`, () => {
        md.blank();
        if (topGaps.length > 0) {
            md.line(`**Top ${topGaps.length} by priority:**`);
            md.blank();
            const gapRows = topGaps.map((gap, index) => [
                String(index + 1),
                gap.area,
                gap.title,
                String(gap.priority),
            ]);
            md.table(["#", "Area", "Failure Mode", "Priority"], gapRows);
            md.blank();
        }
        const rankedCounts = Object.entries(perAreaCounts);
        rankedCounts.sort((x, y) => y[1] - x[1]);
        if (rankedCounts.length > 0) {
            md.line("**Gap counts by area:**");
            md.blank();
            md.table(["Area", "Gaps"], rankedCounts.map(([area, count]) => [area, String(count)]));
        }
    });
    md.blank();
}
|
|
465
|
+
/**
 * Emit the collapsible list of low-scoring grader judgments.
 *
 * Judgments are grouped by area, where the area is the part of `taskId` before
 * the first " - " separator (or the whole id when there is none). Areas are
 * listed alphabetically and, within an area, the lowest scores come first.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} [judgments] - Low-scoring judgments from the summary.
 */
function renderLowScoringJudgments(md, judgments) {
    if (!judgments || judgments.length === 0) {
        return;
    }
    const SEPARATOR = " - ";
    const grouped = new Map();
    for (const judgment of judgments) {
        const cut = judgment.taskId.indexOf(SEPARATOR);
        const areaName = cut > 0 ? judgment.taskId.substring(0, cut) : judgment.taskId;
        const bucket = grouped.get(areaName);
        if (bucket === undefined) {
            grouped.set(areaName, [judgment]);
        }
        else {
            bucket.push(judgment);
        }
    }
    const orderedGroups = [...grouped.entries()];
    orderedGroups.sort(([left], [right]) => left.localeCompare(right));
    for (const [, bucket] of orderedGroups) {
        bucket.sort((x, y) => x.score - y.score);
    }
    md.details(`🔍 Low-Scoring Judgments (${judgments.length} below 70)`, () => {
        md.blank();
        for (const [areaName, bucket] of orderedGroups) {
            md.heading(4, `${areaName} (${bucket.length})`);
            md.blank();
            for (const judgment of bucket) {
                const cut = judgment.taskId.indexOf(SEPARATOR);
                const taskName = cut > 0 ? judgment.taskId.substring(cut + 3) : judgment.taskId;
                const label = dimensionLabel(judgment.dimension);
                md.line(`**${gradeEmoji(judgment.score)} ${judgment.score}** · ${label} · ${taskName} · \`${judgment.modelId}\``);
                md.blank();
                // Quote every line of the grader's reason as a blockquote.
                const quoted = `> ${judgment.reason.split("\n").join("\n> ")}`;
                md.line(quoted);
                md.blank();
                const docs = judgment.canonicalDocs;
                if (docs && docs.length > 0) {
                    const slugs = docs.map((doc) => `\`${doc.slug}\``).join(", ");
                    md.line(`*Expected docs: ${slugs}*`);
                    md.blank();
                }
            }
        }
    });
    md.blank();
}
|
|
506
|
+
/**
 * Emit the "Recommendations" bullet list for weak feature areas.
 *
 * The section appears when any area is below the critical threshold or scores
 * under 70. Areas under 50 get a red bullet, areas under 70 an orange one, and
 * each may get a follow-up line about code correctness or doc coverage.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object[]} scores - Per-area score entries.
 * @param {string[]} belowCritical - Areas below the critical threshold.
 */
function renderRecommendations(md, scores, belowCritical) {
    const ranked = [...scores];
    ranked.sort((x, y) => (y.totalScore ?? 0) - (x.totalScore ?? 0));
    // An area without a score does not, by itself, trigger the section.
    const anyWeakArea = ranked.some((entry) => (entry.totalScore ?? 100) < 70);
    if (belowCritical.length === 0 && !anyWeakArea) {
        return;
    }
    md.heading(3, "💡 Recommendations");
    md.blank();
    for (const entry of ranked) {
        const total = entry.totalScore ?? 0;
        if (total >= 70) {
            continue;
        }
        const code = entry.codeCorrectness ?? 0;
        if (total < 50) {
            md.line(`- 🔴 **${entry.feature}** (score: ${round(total)}) — needs significant doc improvements.`);
            if (code < 10) {
                md.line(` Code correctness is very low (${round(entry.codeCorrectness)}) — add more complete code examples.`);
            }
            if ((entry.docCoverage ?? 0) < 10) {
                md.line(` Doc coverage is very low (${round(entry.docCoverage)}) — key APIs/patterns may be missing from docs.`);
            }
            continue;
        }
        md.line(`- 🟠 **${entry.feature}** (score: ${round(total)}) — has room for improvement.`);
        if (code < 15) {
            md.line(` Code correctness (${round(entry.codeCorrectness)}) could improve with better code examples.`);
        }
    }
    md.blank();
}
|
|
533
|
+
/**
 * Emit the horizontal rule and the italic "Generated by …" footer line.
 *
 * The footer always links the framework, then adds the completion date and a
 * "view detailed results" link when the first promptfoo URL is a non-empty
 * string.
 *
 * @param {object} md - Markdown builder receiving the output.
 * @param {object} report - Report (read for `completedAt`).
 * @param {object} [provenance] - Provenance block (read for `promptfooUrls`).
 */
function renderFooter(md, report, provenance) {
    md.line("---");
    const segments = [
        `[AI Literacy Framework](https://github.com/sanity-labs/ai-literacy-framework)`,
    ];
    const completed = report.completedAt ? fmtDate(report.completedAt) : undefined;
    if (completed) {
        segments.push(completed);
    }
    const resultsUrl = provenance?.promptfooUrls?.[0]?.url;
    const hasResultsUrl = typeof resultsUrl === "string" && resultsUrl.length > 0;
    if (hasResultsUrl) {
        segments.push(`[view detailed results](${resultsUrl})`);
    }
    md.line(`*Generated by ${segments.join(" · ")}*`);
}
|
|
547
|
+
/**
 * Build the studio URL for a report.
 *
 * The origin comes from the `SANITY_STUDIO_ORIGIN` environment variable and
 * falls back to `https://admin.sanity.io` when the variable is unset.
 *
 * @param {string} reportId - Report identifier appended to the path.
 * @returns {string} `<origin>/ailf/report/<reportId>`.
 */
function buildStudioUrl(reportId) {
    const { SANITY_STUDIO_ORIGIN: configuredOrigin } = process.env;
    const base = configuredOrigin ?? "https://admin.sanity.io";
    return `${base}/ailf/report/${reportId}`;
}
|
|
551
|
+
// ---------------------------------------------------------------------------
|
|
552
|
+
// MarkdownBuilder
|
|
553
|
+
// ---------------------------------------------------------------------------
|
|
554
|
+
/**
 * Minimal line-oriented markdown writer.
 *
 * Each method appends one or more lines to `lines`; `toString()` joins them
 * with newlines and adds a trailing newline.
 */
class MarkdownBuilder {
    lines = [];
    /**
     * Make a value safe to place inside a table cell.
     *
     * `null` / `undefined` become an empty cell, pipes are backslash-escaped
     * (an already escaped `\|` is left as is) and line breaks become `<br>`,
     * because a raw pipe or newline would split the row.
     */
    static #cell(value) {
        if (value == null) {
            return "";
        }
        return String(value)
            .replace(/\\?\|/g, "\\|")
            .replace(/\r?\n/g, "<br>");
    }
    /** Append an empty line. */
    blank() {
        this.lines.push("");
    }
    /**
     * Append a collapsible `<details>` block.
     *
     * @param {string} summary - Text of the `<summary>` element.
     * @param {Function} body - Callback that writes the block content.
     */
    details(summary, body) {
        this.lines.push("<details>");
        this.lines.push(`<summary>${summary}</summary>`);
        // Blank line so the content is parsed as markdown, not raw HTML.
        this.lines.push("");
        body();
        this.lines.push("</details>");
    }
    /** Append an ATX heading of the given level. */
    heading(level, text) {
        this.lines.push(`${"#".repeat(level)} ${text}`);
    }
    /** Append a line of text verbatim. */
    line(text) {
        this.lines.push(text);
    }
    /**
     * Append a pipe table.
     *
     * @param {Array} headers - Column headers.
     * @param {Array[]} rows - One array of cell values per row.
     */
    table(headers, rows) {
        const render = (cells) => `| ${cells.map((c) => MarkdownBuilder.#cell(c)).join(" | ")} |`;
        this.lines.push(render(headers));
        this.lines.push(`|${headers.map(() => "---").join("|")}|`);
        for (const row of rows) {
            this.lines.push(render(row));
        }
    }
    /** Return everything written so far as one newline-terminated string. */
    toString() {
        return this.lines.join("\n") + "\n";
    }
}
|
|
583
|
+
// ---------------------------------------------------------------------------
|
|
584
|
+
// Formatting helpers
|
|
585
|
+
// ---------------------------------------------------------------------------
|
|
586
|
+
/**
 * Map a change classification to its trend emoji.
 *
 * @param {string} change - "improved", "regressed", "not-evaluated" or other.
 * @returns {string} Emoji for the change; any other value gets the flat arrow.
 */
function changeEmoji(change) {
    switch (change) {
        case "improved":
            return "📈";
        case "regressed":
            return "📉";
        case "not-evaluated":
            return "⏭️";
        default:
            return "➡️";
    }
}
|
|
595
|
+
/**
 * Sum the provider cost of every area and add the grader total.
 *
 * @param {object[]} scores - Per-area entries; a missing `totalCost` counts as 0.
 * @param {object} [overall] - Overall block; a missing `cost.graderTotal` counts as 0.
 * @returns {number} Provider cost plus grader cost.
 */
function computeTotalCost(scores, overall) {
    let providerTotal = 0;
    for (const entry of scores) {
        providerTotal += entry.totalCost ?? 0;
    }
    const graderTotal = overall?.cost?.graderTotal ?? 0;
    return providerTotal + graderTotal;
}
|
|
600
|
+
/**
 * Turn a hyphen-separated dimension key into a display label by capitalising
 * the first character of each segment and joining the segments with spaces.
 *
 * @param {string} dim - Dimension key, e.g. "task-completion".
 * @returns {string} Label, e.g. "Task Completion".
 */
function dimensionLabel(dim) {
    const words = [];
    for (const segment of dim.split("-")) {
        const head = segment.charAt(0).toUpperCase();
        words.push(head + segment.slice(1));
    }
    return words.join(" ");
}
|
|
606
|
+
/**
 * Format a dollar amount.
 *
 * Exactly zero prints as "$0.00"; amounts below one cent use four decimals so
 * they do not collapse to zero; everything else uses two decimals.
 *
 * @param {number} cost - Amount in dollars.
 * @returns {string} Formatted amount with a leading "$".
 */
function fmtCost(cost) {
    if (cost === 0) {
        return "$0.00";
    }
    const decimals = cost < 0.01 ? 4 : 2;
    return `$${cost.toFixed(decimals)}`;
}
|
|
613
|
+
/**
 * Format a timestamp as a short en-US date and time in UTC.
 *
 * When the input cannot be parsed as a date it is returned unchanged. An
 * unparseable input does not make `toLocaleString` throw (it yields the text
 * "Invalid Date"), so validity is checked explicitly before formatting.
 *
 * @param {string} iso - Timestamp, expected to be an ISO-8601 string.
 * @returns {string} Formatted date, or `iso` itself when it is not a valid date.
 */
function fmtDate(iso) {
    try {
        const date = new Date(iso);
        if (Number.isNaN(date.getTime())) {
            return iso;
        }
        return date.toLocaleString("en-US", {
            day: "numeric",
            hour: "numeric",
            minute: "2-digit",
            month: "short",
            timeZone: "UTC",
            timeZoneName: "short",
            year: "numeric",
        });
    }
    catch {
        // Best effort: fall back to the raw value if formatting is unavailable.
        return iso;
    }
}
|
|
629
|
+
/**
 * Format a duration given in milliseconds.
 *
 * - under one second: "<ms>ms"
 * - under one minute: seconds with one decimal, e.g. "12.3s"
 * - otherwise: whole minutes and seconds, e.g. "2m 5s"
 *
 * Rounding happens before the value is split into units, so the result never
 * shows 60 seconds ("60.0s" or "1m 60s"); those carry over into the minutes.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function fmtDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    const secs = ms / 1000;
    const oneDecimal = secs.toFixed(1);
    // Use the seconds form only if the rounded text is still under a minute.
    if (Number(oneDecimal) < 60)
        return `${oneDecimal}s`;
    const totalSecs = Math.round(secs);
    const mins = Math.floor(totalSecs / 60);
    const remSecs = totalSecs % 60;
    return `${mins}m ${remSecs}s`;
}
|
|
639
|
+
/**
 * Map a score to its grade emoji.
 *
 * Thresholds: 80 and above, 70 and above, 50 and above, then everything below.
 * A missing score (`undefined` or `null`) yields "—" rather than being graded
 * as a failure.
 *
 * @param {number} [score] - Score to grade.
 * @returns {string} Grade emoji, or "—" when there is no score.
 */
function gradeEmoji(score) {
    if (score == null)
        return "—";
    if (score >= 80)
        return "✅";
    if (score >= 70)
        return "🟡";
    if (score >= 50)
        return "🟠";
    return "🔴";
}
|
|
650
|
+
/**
 * Map a score to its letter grade.
 *
 * Thresholds: A at 80 and above, B at 70 and above, C at 50 and above, D below.
 * A missing score (`undefined` or `null`) yields "—" rather than a "D".
 *
 * @param {number} [score] - Score to grade.
 * @returns {string} "A", "B", "C", "D", or "—" when there is no score.
 */
function gradeLetter(score) {
    if (score == null)
        return "—";
    if (score >= 80)
        return "A";
    if (score >= 70)
        return "B";
    if (score >= 50)
        return "C";
    return "D";
}
|
|
661
|
+
/**
 * Format a doc-lift value as a trend emoji followed by the rounded number.
 *
 * The sign is decided after rounding, so a value that rounds to zero is shown
 * as flat. A missing lift (`undefined` or `null`) yields "—" rather than being
 * reported as a zero lift.
 *
 * @param {number} [lift] - Doc lift in points.
 * @returns {string} E.g. "📈 +3", "📉 -2", "➡️ 0", or "—" when missing.
 */
function liftArrow(lift) {
    if (lift == null)
        return "—";
    const rounded = Math.round(lift);
    if (rounded > 0)
        return `📈 +${rounded}`;
    if (rounded < 0)
        return `📉 ${rounded}`;
    return "➡️ 0";
}
|
|
671
|
+
/**
 * Coerce the raw `scores` value into a list of score objects.
 *
 * @param {*} raw - Value found under `summary.scores`.
 * @returns {object[]} The non-null object entries of `raw`, or an empty list
 *   when `raw` is not an array.
 */
function normalizeScores(raw) {
    const kept = [];
    if (!Array.isArray(raw)) {
        return kept;
    }
    for (const entry of raw) {
        // typeof null is "object", so null is excluded explicitly.
        if (entry != null && typeof entry === "object") {
            kept.push(entry);
        }
    }
    return kept;
}
|
|
676
|
+
/**
 * Round to the nearest integer, treating a missing value as 0.
 *
 * @param {number} [n] - Value to round.
 * @returns {number} `Math.round(n)`, or 0 when `n` is undefined.
 */
function round(n) {
    return n === undefined ? 0 : Math.round(n);
}
|
|
681
|
+
/**
 * Pick the header emoji for an overall average score.
 *
 * Thresholds: 75 and above, 60 and above, 45 and above, then everything below.
 * A missing average (`undefined` or `null`) gets the neutral chart emoji
 * rather than the red one.
 *
 * @param {number} [avg] - Overall average score.
 * @returns {string} Status emoji for the report header.
 */
function scoreEmoji(avg) {
    if (avg == null)
        return "📊";
    if (avg >= 75)
        return "🟢";
    if (avg >= 60)
        return "🟡";
    if (avg >= 45)
        return "🟠";
    return "🔴";
}
|
|
692
|
+
/**
 * Format a number with an explicit "+" for positive values.
 *
 * Zero and negative numbers are printed as they are. A missing value
 * (`undefined` or `null`) yields "—" rather than the text "null".
 *
 * @param {number} [n] - Value to format.
 * @returns {string} Signed number, or "—" when missing.
 */
function signedNum(n) {
    if (n == null)
        return "—";
    return n > 0 ? `+${n}` : String(n);
}
|