@huydao/karrot 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GUIDE.md +484 -0
- package/README.md +253 -0
- package/dist/assertions/assertion.d.ts +18 -0
- package/dist/assertions/assertion.js +198 -0
- package/dist/assertions/turn-eval.d.ts +22 -0
- package/dist/assertions/turn-eval.js +178 -0
- package/dist/executors/adapters/ag-ui-post.d.ts +55 -0
- package/dist/executors/adapters/ag-ui-post.js +703 -0
- package/dist/executors/adapters/ag-ui.d.ts +15 -0
- package/dist/executors/adapters/ag-ui.js +275 -0
- package/dist/executors/execute.d.ts +16 -0
- package/dist/executors/execute.js +145 -0
- package/dist/executors/executor.d.ts +37 -0
- package/dist/executors/executor.js +203 -0
- package/dist/executors/run-result.d.ts +33 -0
- package/dist/executors/run-result.js +22 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.js +28 -0
- package/dist/prompts/turn-eval-system-prompt.md +68 -0
- package/dist/prompts/turn-message-gen-system-prompt.md +16 -0
- package/dist/reports/report.d.ts +68 -0
- package/dist/reports/report.js +366 -0
- package/dist/scenarios/generated-message.d.ts +15 -0
- package/dist/scenarios/generated-message.js +116 -0
- package/dist/scenarios/scenario-loader.d.ts +12 -0
- package/dist/scenarios/scenario-loader.js +103 -0
- package/dist/scenarios/scenario.d.ts +62 -0
- package/dist/scenarios/scenario.js +35 -0
- package/dist/utils/artifact-files.d.ts +3 -0
- package/dist/utils/artifact-files.js +22 -0
- package/dist/utils/config.d.ts +101 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/openai-eval.d.ts +5 -0
- package/dist/utils/openai-eval.js +54 -0
- package/package.json +146 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.ScenarioExecutionError = void 0;
|
|
7
|
+
exports.finalizeScenarioResult = finalizeScenarioResult;
|
|
8
|
+
exports.getScenarioRunStatus = getScenarioRunStatus;
|
|
9
|
+
exports.buildScenarioSlackRows = buildScenarioSlackRows;
|
|
10
|
+
exports.writeScenarioRunReport = writeScenarioRunReport;
|
|
11
|
+
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
12
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
13
|
+
/**
 * Error thrown for a failed scenario; it keeps the scenario result that
 * triggered it so callers can inspect it after catching.
 */
class ScenarioExecutionError extends Error {
    /** The scenario result handed to the constructor. */
    result;

    /**
     * @param {object} result - Scenario result. Its `note` becomes the error
     *   message; when `note` is null/undefined the message is `"<id> failed."`.
     */
    constructor(result) {
        const { note } = result;
        const message = note == null ? `${result.id} failed.` : note;
        super(message);
        this.result = result;
        this.name = 'ScenarioExecutionError';
    }
}
|
|
21
|
+
exports.ScenarioExecutionError = ScenarioExecutionError;
|
|
22
|
+
/**
 * Sums the usable numbers in `values`, rounded to one decimal place.
 *
 * Entries that are not numbers (for example `undefined` for a metric that was
 * never recorded) are skipped. Non-finite numbers (`NaN`, `Infinity`) are
 * skipped too, so one bad measurement cannot turn the whole total into `NaN`.
 *
 * @param {Array<unknown>} values - Candidate values to add up.
 * @returns {number | undefined} The rounded sum, or `undefined` when `values`
 *   contains no finite number.
 */
function sumNumbers(values) {
    // Number.isFinite is false for every non-number, so it also replaces the typeof check.
    const finiteValues = values.filter((value) => Number.isFinite(value));
    if (finiteValues.length === 0) {
        return undefined;
    }
    const total = finiteValues.reduce((runningTotal, value) => runningTotal + value, 0);
    // toFixed(1) + Number() rounds away floating-point noise such as 0.30000000000000004.
    return Number(total.toFixed(1));
}
|
|
29
|
+
/**
 * Rolls the per-turn metrics of a scenario up into scenario-level totals.
 *
 * @param {Array<{metrics: object}>} turns - Turns whose `metrics` fields are summed.
 * @returns {object} Summed `ttfToolSeconds`, `ttfTextSeconds`, `totalSeconds`,
 *   `protocolUsedKb` and `protocolTotalKb`, plus `efficiencyPercent`: used/total
 *   protocol KB as a rounded percentage, or `undefined` when either total is
 *   missing or the total is not positive.
 */
function summarizeScenarioMetrics(turns) {
    const totalOf = (metricName) => sumNumbers(turns.map((turn) => turn.metrics[metricName]));
    const protocolUsedKb = totalOf('protocolUsedKb');
    const protocolTotalKb = totalOf('protocolTotalKb');
    let efficiencyPercent;
    if (typeof protocolUsedKb === 'number' && typeof protocolTotalKb === 'number' && protocolTotalKb > 0) {
        efficiencyPercent = Math.round((protocolUsedKb / protocolTotalKb) * 100);
    }
    return {
        ttfToolSeconds: totalOf('ttfToolSeconds'),
        ttfTextSeconds: totalOf('ttfTextSeconds'),
        totalSeconds: totalOf('totalSeconds'),
        protocolUsedKb,
        protocolTotalKb,
        efficiencyPercent,
    };
}
|
|
44
|
+
/**
 * Picks the short note shown next to a scenario.
 *
 * An existing non-empty `result.note` wins. Otherwise the note is derived:
 * scenario id `S3` gets a fixed note, then the total tool-call count and the
 * number of turns decide between the remaining labels.
 *
 * @param {{id: string, note?: string, turns: Array<{toolCallCount: number}>}} result
 * @returns {string} The note text.
 */
function buildScenarioNote(result) {
    if (result.note) {
        return result.note;
    }
    if (result.id === 'S3') {
        return 'Correctly declined';
    }
    let totalToolCalls = 0;
    result.turns.forEach((turn) => {
        totalToolCalls += turn.toolCallCount;
    });
    if (totalToolCalls === 0) {
        return 'No tools';
    }
    return result.turns.length > 1 ? 'Multi-turn tool flow' : 'Tool-assisted response';
}
|
|
60
|
+
/**
 * Fills in the derived fields of a scenario result, mutating it in place:
 * `metrics` is recomputed from the turns and `note` is (re)built.
 *
 * @param {object} result - Scenario result with a `turns` array.
 * @returns {object} The same `result` object, for chaining.
 */
function finalizeScenarioResult(result) {
    const aggregatedMetrics = summarizeScenarioMetrics(result.turns);
    result.metrics = aggregatedMetrics;
    const note = buildScenarioNote(result);
    result.note = note;
    return result;
}
|
|
65
|
+
/**
 * Formats a duration in seconds with one decimal and an `s` suffix.
 *
 * @param {unknown} value - Duration in seconds.
 * @returns {string} e.g. `"2.0s"`, or `"-"` when `value` is not a number.
 */
function formatSeconds(value) {
    if (typeof value !== 'number') {
        return '-';
    }
    const rounded = value.toFixed(1);
    return `${rounded}s`;
}
|
|
68
|
+
/**
 * Formats a percentage value with a `%` suffix.
 *
 * @param {unknown} value - Percentage to display.
 * @returns {string} e.g. `"85%"`, or `"-"` when `value` is not a number.
 */
function formatPercent(value) {
    if (typeof value !== 'number') {
        return '-';
    }
    return `${value}%`;
}
|
|
71
|
+
/**
 * Tells whether any scenario in the run failed.
 *
 * @param {Array<{status: string}>} results - Scenario results.
 * @returns {boolean} `true` when at least one result has status `'FAIL'`.
 */
function hasScenarioFailures(results) {
    const isFailure = (result) => result.status === 'FAIL';
    return results.some(isFailure);
}
|
|
74
|
+
/**
 * Computes the overall status of a run.
 *
 * @param {Array<{status: string}>} results - Scenario results.
 * @returns {'PASS' | 'FAIL'} `'FAIL'` when any scenario failed, otherwise
 *   `'PASS'` (skipped scenarios do not fail the run).
 */
function getScenarioRunStatus(results) {
    if (hasScenarioFailures(results)) {
        return 'FAIL';
    }
    return 'PASS';
}
|
|
77
|
+
/**
 * Converts scenario results into flat, display-ready rows: identifiers and
 * status are copied through and the metrics are pre-formatted as strings.
 *
 * @param {Array<object>} results - Finalized scenario results (with `metrics`).
 * @returns {Array<object>} One row per result with `id`, `name`, `status`,
 *   `ttfTool`, `ttfText`, `total`, `efficiency` and `note`.
 */
function buildScenarioSlackRows(results) {
    const toRow = (result) => {
        const { metrics } = result;
        return {
            id: result.id,
            name: result.name,
            status: result.status,
            ttfTool: formatSeconds(metrics.ttfToolSeconds),
            ttfText: formatSeconds(metrics.ttfTextSeconds),
            total: formatSeconds(metrics.totalSeconds),
            efficiency: formatPercent(metrics.efficiencyPercent),
            note: result.note,
        };
    };
    return results.map((result) => toRow(result));
}
|
|
89
|
+
/**
 * Builds the run-level summary shown at the top of the report.
 *
 * @param {Array<object>} results - Scenario results; each has `status` and `turns`.
 *   Turns may carry `assertionResults`, `evaluationResults` and `evalDimensions`.
 * @returns {object} Scenario/turn/assertion/tool-call/evaluation counts, the
 *   overall `status`, the sorted unique `requestedEvalDimensions`, and
 *   `averageScoresByDimension` (mean score per dimension, one decimal).
 */
function buildScenarioRunSummary(results) {
    const turns = results.flatMap((result) => result.turns);
    const assertions = turns.flatMap((turn) => turn.assertionResults ?? []);
    const evaluations = turns.flatMap((turn) => turn.evaluationResults ?? []);
    const requestedEvalDimensions = [...new Set(turns.flatMap((turn) => turn.evalDimensions ?? []))].sort();
    // Group scores in a Map rather than a plain object: dimension names are data,
    // and a name such as "constructor" or "__proto__" would otherwise resolve to an
    // inherited Object.prototype member and make `.push` throw.
    const scoresByDimension = new Map();
    for (const evaluation of evaluations) {
        // String() mirrors the key coercion a plain object would have applied.
        const dimension = String(evaluation.dimension);
        const scores = scoresByDimension.get(dimension);
        if (scores) {
            scores.push(evaluation.score);
        }
        else {
            scoresByDimension.set(dimension, [evaluation.score]);
        }
    }
    // Object.fromEntries defines own data properties, so unusual keys are stored safely.
    const averageScoresByDimension = Object.fromEntries([...scoresByDimension].map(([dimension, scores]) => [
        dimension,
        Number((scores.reduce((total, score) => total + score, 0) / scores.length).toFixed(1)),
    ]));
    return {
        status: getScenarioRunStatus(results),
        totalScenarios: results.length,
        passedScenarios: results.filter((result) => result.status === 'PASS').length,
        failedScenarios: results.filter((result) => result.status === 'FAIL').length,
        skippedScenarios: results.filter((result) => result.status === 'SKIP').length,
        totalTurns: turns.length,
        totalAssertions: assertions.length,
        passedAssertions: assertions.filter((assertion) => assertion.passed).length,
        failedAssertions: assertions.filter((assertion) => !assertion.passed).length,
        totalToolCalls: turns.reduce((total, turn) => total + turn.toolCallCount, 0),
        totalEvaluations: evaluations.length,
        averageScoresByDimension,
        requestedEvalDimensions,
    };
}
|
|
119
|
+
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their character references.
 * Non-string input is converted with `String()`, and `null`/`undefined`
 * become an empty string, so a missing field cannot crash report rendering.
 *
 * @param {unknown} value - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeHtml(value) {
    return String(value ?? '')
        // '&' must go first, otherwise the ampersands of the references added below would be escaped again.
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .replaceAll('"', '&quot;')
        // Numeric reference: '&apos;' is not defined in HTML 4.
        .replaceAll("'", '&#39;');
}
|
|
127
|
+
/**
 * Renders an optional piece of text as HTML.
 *
 * @param {string | undefined | null} value - Text that may be missing.
 * @returns {string} The escaped text, or a muted `-` placeholder when the
 *   value is missing, empty or whitespace-only.
 */
function renderOptionalText(value) {
    const hasText = Boolean(value?.trim());
    return hasText ? escapeHtml(value) : '<span class="muted">-</span>';
}
|
|
133
|
+
/**
 * Renders the expected value of an assertion as HTML.
 *
 * @param {string | string[]} expected - A single expectation or a list of them.
 * @returns {string} Escaped text; lists are joined with `", "` and an empty
 *   list is shown as a muted `[]`.
 */
function renderAssertionExpected(expected) {
    if (!Array.isArray(expected)) {
        return escapeHtml(expected);
    }
    if (expected.length === 0) {
        return '<span class="muted">[]</span>';
    }
    return escapeHtml(expected.join(', '));
}
|
|
139
|
+
/**
 * Maps a status label to the CSS class used for it in the report.
 *
 * @param {string} status - Status label such as `'PASS'`.
 * @returns {string} The label in lower case.
 */
function statusClass(status) {
    const cssClass = status.toLowerCase();
    return cssClass;
}
|
|
142
|
+
/**
 * Renders a metrics object as one ` | `-separated line of plain text.
 * The result is not HTML-escaped; callers escape it before embedding.
 *
 * @param {object} metrics - Object with `ttfToolSeconds`, `ttfTextSeconds`,
 *   `totalSeconds` and `efficiencyPercent` (each may be missing).
 * @returns {string} e.g. `"TTF Tool: 1.0s | TTF Text: - | Total: 2.5s | Efficiency: 80%"`.
 */
function renderMetrics(metrics) {
    const ttfTool = formatSeconds(metrics.ttfToolSeconds);
    const ttfText = formatSeconds(metrics.ttfTextSeconds);
    const total = formatSeconds(metrics.totalSeconds);
    const efficiency = formatPercent(metrics.efficiencyPercent);
    return `TTF Tool: ${ttfTool} | TTF Text: ${ttfText} | Total: ${total} | Efficiency: ${efficiency}`;
}
|
|
150
|
+
/**
 * Renders evaluation dimension names as a row of chips.
 *
 * @param {string[] | undefined} dimensions - Dimension names.
 * @returns {string} One escaped `dimension-chip` span per name, separated by
 *   spaces, or a muted `-` when the list is missing or empty.
 */
function renderDimensionList(dimensions) {
    if (!dimensions?.length) {
        return '<span class="muted">-</span>';
    }
    const toChip = (dimension) => `<span class="dimension-chip">${escapeHtml(dimension)}</span>`;
    const chips = dimensions.map((dimension) => toChip(dimension));
    return chips.join(' ');
}
|
|
156
|
+
/**
 * Renders the assertion results of one turn as an HTML table.
 *
 * @param {Array<object> | undefined} assertions - Assertion results with
 *   `passed`, `kind`, `matcher`, `expected`, `description` and `reason`.
 * @returns {string} The table markup, or a muted paragraph when there are no
 *   assertions.
 */
function renderAssertionRows(assertions) {
    if (!assertions?.length) {
        return '<p class="muted">No assertions for this turn.</p>';
    }
    const renderRow = (assertion) => {
        const badgeClass = assertion.passed ? 'pass' : 'fail';
        const badgeLabel = assertion.passed ? 'PASS' : 'FAIL';
        const cells = [
            `<span class="badge ${badgeClass}">${badgeLabel}</span>`,
            escapeHtml(`${assertion.kind}.${assertion.matcher}`),
            renderAssertionExpected(assertion.expected),
            renderOptionalText(assertion.description),
            escapeHtml(assertion.reason),
        ];
        return `<tr>${cells.map((cell) => `<td>${cell}</td>`).join('')}</tr>`;
    };
    const header = '<thead><tr><th>Status</th><th>Type</th><th>Expectation</th><th>Description</th><th>Reason</th></tr></thead>';
    const rows = assertions.map((assertion) => renderRow(assertion)).join('');
    return `<table class="assertions">${header}<tbody>${rows}</tbody></table>`;
}
|
|
177
|
+
/**
 * Renders the evaluation results of one turn as an HTML table.
 *
 * @param {Array<object> | undefined} evaluations - Evaluation results with
 *   `dimension`, `score`, `explanation` and `suggestion`.
 * @returns {string} The table markup (scores shown as `<score>/10`), or a
 *   muted paragraph when there are no evaluations.
 */
function renderEvaluationRows(evaluations) {
    if (!evaluations?.length) {
        return '<p class="muted">No evaluations for this turn.</p>';
    }
    const renderRow = (evaluation) => {
        const cells = [
            escapeHtml(evaluation.dimension),
            `<span class="score-pill">${escapeHtml(`${evaluation.score}/10`)}</span>`,
            escapeHtml(evaluation.explanation),
            escapeHtml(evaluation.suggestion),
        ];
        return `<tr>${cells.map((cell) => `<td>${cell}</td>`).join('')}</tr>`;
    };
    const header = '<thead><tr><th>Dimension</th><th>Score</th><th>Why</th><th>Improve</th></tr></thead>';
    const rows = evaluations.map((evaluation) => renderRow(evaluation)).join('');
    return `<table class="evaluations">${header}<tbody>${rows}</tbody></table>`;
}
|
|
197
|
+
/**
 * Renders the per-dimension average scores as a grid of score cards.
 *
 * @param {Record<string, number>} averageScoresByDimension - Average score keyed by dimension.
 * @returns {string} The grid markup (scores shown as `<score>/10`), or a muted
 *   paragraph when the object has no entries.
 */
function renderEvaluationSummary(averageScoresByDimension) {
    const entries = Object.entries(averageScoresByDimension);
    if (entries.length === 0) {
        return '<p class="muted">No turn evaluations were requested in this run.</p>';
    }
    const cards = entries.map(([dimension, score]) => {
        const label = `<span class="label">${escapeHtml(dimension)}</span>`;
        const value = `<span class="value">${escapeHtml(`${score}/10`)}</span>`;
        return `<div class="score-card">${label}${value}</div>`;
    });
    return `<div class="score-grid">${cards.join('')}</div>`;
}
|
|
208
|
+
/**
 * Renders one turn of a scenario as a card: a heading, a grid of metadata
 * (thread, tool-call count, eval dimensions, metrics, artifact path) and
 * content blocks for the prompt, response, assertions, evaluations and notes.
 *
 * @param {object} turn - Turn result to render.
 * @param {number} index - Zero-based position of the turn; shown one-based.
 * @returns {string} The card markup.
 */
function renderTurnCard(turn, index) {
    const metaCell = (label, innerHtml) => `<div><span class="meta-label">${label}</span>${innerHtml}</div>`;
    const codeCell = (label, innerHtml) => metaCell(label, `<code>${innerHtml}</code>`);
    const contentBlock = (heading, innerHtml) => `<div class="content-block"><h5>${heading}</h5>${innerHtml}</div>`;
    const title = escapeHtml(`${turn.label} (${index + 1})`);
    const metaGrid = [
        codeCell('Thread', renderOptionalText(turn.threadId)),
        codeCell('Tools', String(turn.toolCallCount)),
        metaCell('Eval Dimensions', `<div>${renderDimensionList(turn.evalDimensions)}</div>`),
        codeCell('Metrics', escapeHtml(renderMetrics(turn.metrics))),
        codeCell('Artifact', renderOptionalText(turn.outputPath)),
    ].join('');
    const contentBlocks = [
        contentBlock('Prompt', `<pre>${escapeHtml(turn.message)}</pre>`),
        contentBlock('Response', `<pre>${renderOptionalText(turn.output)}</pre>`),
        contentBlock('Assertions', renderAssertionRows(turn.assertionResults)),
        contentBlock('Evaluations', renderEvaluationRows(turn.evaluationResults)),
        contentBlock('Notes', `<p>${renderOptionalText(turn.note)}</p>`),
    ].join('');
    return `<section class="turn-card"><h4>${title}</h4><div class="meta-grid">${metaGrid}</div>${contentBlocks}</section>`;
}
|
|
242
|
+
/**
 * Builds the complete, self-contained HTML report for a scenario run.
 *
 * The page has a hero header (project, environment, timestamp, overall
 * status), summary cards, panels (evaluation averages, requested dimensions,
 * scenario context, runtime) and one collapsible card per scenario; failed
 * scenarios are expanded by default.
 *
 * @param {object} payload - Report payload with `projectName`, `environment`,
 *   `generatedAt`, `summary`, `scenarioContext`, `runtime` and `results`.
 * @returns {string} The HTML document.
 */
function buildScenarioRunHtml(payload) {
    // Status values are escaped like every other dynamic value, both as badge
    // text and as part of a class attribute.
    const statusCssClass = (status) => escapeHtml(statusClass(status));
    const statusBadge = (status) => `<span class="badge ${statusCssClass(status)}">${escapeHtml(status)}</span>`;
    // JSON.stringify returns undefined for an undefined value; fall back to an
    // empty string so a missing scenarioContext/runtime renders an empty panel.
    const prettyJson = (value) => escapeHtml(JSON.stringify(value, null, 2) ?? '');
    const scenarioCards = payload.results
        .map((result) => [
        `<details class="scenario-card ${statusCssClass(result.status)}" ${result.status === 'FAIL' ? 'open' : ''}>`,
        '<summary>',
        `<span class="scenario-title">${escapeHtml(`${result.id} - ${result.name}`)}</span>`,
        statusBadge(result.status),
        `<span class="summary-note">${renderOptionalText(result.note)}</span>`,
        '</summary>',
        '<div class="scenario-body">',
        '<div class="meta-grid">',
        `<div><span class="meta-label">Thread</span><code>${renderOptionalText(result.threadId)}</code></div>`,
        `<div><span class="meta-label">Turns</span><code>${String(result.turns.length)}</code></div>`,
        `<div><span class="meta-label">Metrics</span><code>${escapeHtml(renderMetrics(result.metrics))}</code></div>`,
        '</div>',
        ...result.turns.map((turn, index) => renderTurnCard(turn, index)),
        '</div>',
        '</details>',
    ].join(''))
        .join('\n');
    return [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="utf-8" />',
        '<meta name="viewport" content="width=device-width, initial-scale=1" />',
        `<title>${escapeHtml(`${payload.projectName} AI Scenario Report`)}</title>`,
        '<style>',
        'body{margin:0;font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;background:#f4f7fb;color:#142033;}',
        '.page{max-width:1280px;margin:0 auto;padding:32px 24px 64px;}',
        '.hero{background:linear-gradient(135deg,#0f172a,#1d4ed8);color:#fff;padding:28px;border-radius:20px;box-shadow:0 20px 50px rgba(15,23,42,.18);}',
        '.hero h1{margin:0 0 8px;font-size:32px;}',
        '.hero p{margin:4px 0;color:rgba(255,255,255,.88);}',
        '.summary-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(170px,1fr));gap:14px;margin:22px 0 28px;}',
        '.summary-card,.panel,.turn-card,.scenario-card{background:#fff;border:1px solid #dbe4f0;border-radius:18px;box-shadow:0 10px 30px rgba(15,23,42,.06);}',
        '.summary-card{padding:18px;}',
        '.summary-card .label{display:block;font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#5b6b84;margin-bottom:8px;}',
        '.summary-card .value{font-size:28px;font-weight:700;}',
        '.summary-card .sub{font-size:13px;color:#61728d;}',
        '.panels{display:grid;grid-template-columns:repeat(auto-fit,minmax(280px,1fr));gap:16px;margin-bottom:28px;}',
        '.panel{padding:18px;}',
        '.panel h2{margin:0 0 12px;font-size:18px;}',
        '.panel pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;font-size:13px;}',
        '.scenario-list{display:grid;gap:18px;}',
        '.scenario-card summary{list-style:none;display:flex;gap:12px;align-items:center;justify-content:space-between;padding:18px 20px;cursor:pointer;}',
        '.scenario-card summary::-webkit-details-marker{display:none;}',
        '.scenario-title{font-weight:700;font-size:18px;flex:1;}',
        '.summary-note{color:#61728d;font-size:14px;max-width:40%;text-align:right;}',
        '.scenario-body{padding:0 20px 20px;}',
        '.meta-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:12px;margin:8px 0 18px;}',
        '.meta-grid>div{background:#f8fbff;border-radius:12px;padding:12px;}',
        '.meta-label{display:block;font-size:12px;color:#61728d;text-transform:uppercase;letter-spacing:.08em;margin-bottom:6px;}',
        '.turn-card{padding:18px;margin-top:16px;}',
        '.turn-card h4,.content-block h5{margin:0 0 10px;}',
        '.content-block{margin-top:14px;}',
        '.content-block pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;max-height:420px;overflow:auto;}',
        '.content-block p{margin:0;background:#f8fbff;border-radius:12px;padding:14px;}',
        '.assertions,.evaluations{width:100%;border-collapse:collapse;font-size:14px;}',
        '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;border-bottom:1px solid #e5edf7;vertical-align:top;text-align:left;}',
        '.assertions th,.evaluations th{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#61728d;}',
        '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 10px;font-size:12px;font-weight:700;min-width:56px;}',
        '.badge.pass{background:#dcfce7;color:#166534;}',
        '.badge.fail{background:#fee2e2;color:#991b1b;}',
        '.badge.skip{background:#e2e8f0;color:#334155;}',
        '.dimension-chip{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:4px 10px;margin:0 6px 6px 0;background:#eef2ff;color:#3730a3;font-size:12px;font-weight:600;}',
        '.score-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;}',
        '.score-card{background:#f8fbff;border-radius:12px;padding:14px;}',
        '.score-card .label{display:block;font-size:12px;color:#61728d;text-transform:uppercase;letter-spacing:.08em;margin-bottom:8px;}',
        '.score-card .value{font-size:24px;font-weight:700;}',
        '.score-pill{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:4px 10px;background:#dbeafe;color:#1d4ed8;font-weight:700;min-width:58px;}',
        '.muted{color:#7c8ba1;}',
        'code{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono",monospace;font-size:12px;word-break:break-all;}',
        '.footer{margin-top:28px;color:#61728d;font-size:13px;}',
        '@media (max-width:900px){.scenario-card summary{flex-direction:column;align-items:flex-start;}.summary-note{max-width:none;text-align:left;}}',
        '</style>',
        '</head>',
        '<body>',
        '<div class="page">',
        '<section class="hero">',
        `<h1>${escapeHtml(payload.projectName)} AI Scenario Report</h1>`,
        `<p>Environment: <strong>${escapeHtml(payload.environment)}</strong></p>`,
        `<p>Generated: <strong>${escapeHtml(payload.generatedAt)}</strong></p>`,
        `<p>Overall status: ${statusBadge(payload.summary.status)}</p>`,
        '</section>',
        '<section class="summary-grid">',
        `<article class="summary-card"><span class="label">Scenarios</span><div class="value">${payload.summary.totalScenarios}</div><div class="sub">${payload.summary.passedScenarios} pass / ${payload.summary.failedScenarios} fail / ${payload.summary.skippedScenarios} skip</div></article>`,
        `<article class="summary-card"><span class="label">Turns</span><div class="value">${payload.summary.totalTurns}</div><div class="sub">${payload.summary.totalToolCalls} tool calls total</div></article>`,
        `<article class="summary-card"><span class="label">Assertions</span><div class="value">${payload.summary.totalAssertions}</div><div class="sub">${payload.summary.passedAssertions} pass / ${payload.summary.failedAssertions} fail</div></article>`,
        `<article class="summary-card"><span class="label">Evaluations</span><div class="value">${payload.summary.totalEvaluations}</div><div class="sub">LLM-scored dimensions</div></article>`,
        '</section>',
        '<section class="panels">',
        `<article class="panel"><h2>Evaluation Averages</h2>${renderEvaluationSummary(payload.summary.averageScoresByDimension)}</article>`,
        `<article class="panel"><h2>Requested Eval Dimensions</h2><div>${renderDimensionList(payload.summary.requestedEvalDimensions)}</div></article>`,
        `<article class="panel"><h2>Scenario Context</h2><pre>${prettyJson(payload.scenarioContext)}</pre></article>`,
        `<article class="panel"><h2>Runtime</h2><pre>${prettyJson(payload.runtime)}</pre></article>`,
        '</section>',
        `<section class="scenario-list">${scenarioCards}</section>`,
        '<p class="footer">This report includes each tested scenario, each turn prompt, the captured assistant response, assertion outcomes, and the raw artifact path for follow-up debugging.</p>',
        '</div>',
        '</body>',
        '</html>',
    ].join('');
}
|
|
345
|
+
/**
 * Writes the JSON and HTML reports for a run.
 *
 * The output directory is created if needed (recursively). Both files share a
 * prefix derived from the current ISO timestamp with `:` and `.` replaced by
 * `-`, and are named `<prefix>-run.json` and `<prefix>-run.html`.
 *
 * @param {object} options - `outputDirectory`, `results`, `runtime`,
 *   `environment`, `projectName` and `scenarioContext`.
 * @returns {Promise<{jsonPath: string, htmlPath: string}>} Paths of the written files.
 */
async function writeScenarioRunReport(options) {
    const fs = promises_1.default;
    await fs.mkdir(options.outputDirectory, { recursive: true });
    const generatedAt = new Date().toISOString();
    const filePrefix = generatedAt.replace(/[:.]/g, '-');
    const reportPath = (extension) => node_path_1.default.join(options.outputDirectory, `${filePrefix}-run.${extension}`);
    const jsonPath = reportPath('json');
    const htmlPath = reportPath('html');
    const payload = {
        generatedAt,
        runtime: options.runtime,
        environment: options.environment,
        projectName: options.projectName,
        scenarioContext: options.scenarioContext,
        summary: buildScenarioRunSummary(options.results),
        results: options.results,
    };
    const json = JSON.stringify(payload, null, 2);
    await fs.writeFile(jsonPath, json, 'utf8');
    const html = buildScenarioRunHtml(payload);
    await fs.writeFile(htmlPath, html, 'utf8');
    return { jsonPath, htmlPath };
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { type AiTurn, type BaseAiScenarioContext } from './scenario';
|
|
2
|
+
/**
 * A previous conversation turn as passed in the `history` option of
 * `resolveTurnMessage`: its label, its message and, optionally, its output.
 */
interface ConversationHistoryEntry {
    label: string;
    message: string;
    output?: string;
}
|
|
7
|
+
/** Options accepted by `resolveTurnMessage`. */
type ResolveTurnMessageOptions<TContext extends BaseAiScenarioContext> = {
    turn: AiTurn<TContext>;
    context: TContext;
    env: NodeJS.ProcessEnv;
    scenarioId: string;
    scenarioName: string;
    history: ConversationHistoryEntry[];
};
/**
 * Resolves the message text to send for a turn.
 *
 * @param options - The turn, its scenario context, the environment variables,
 *   the scenario id/name and the conversation history so far.
 * @returns A promise for the message text.
 */
export declare function resolveTurnMessage<TContext extends BaseAiScenarioContext>(options: ResolveTurnMessageOptions<TContext>): Promise<string>;
|
|
15
|
+
export {};
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.resolveTurnMessage = resolveTurnMessage;
|
|
7
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
8
|
+
const openai_eval_1 = require("../utils/openai-eval");
|
|
9
|
+
let cachedSystemPrompt;
|
|
10
|
+
const DEFAULT_SYSTEM_PROMPT_PATH = node_path_1.default.resolve(__dirname, '..', 'prompts', 'turn-message-gen-system-prompt.md');
|
|
11
|
+
/**
 * Returns the system prompt used for message generation.
 *
 * The prompt is read as UTF-8 from `DEFAULT_SYSTEM_PROMPT_PATH` on first use
 * and kept in the module-level `cachedSystemPrompt` for later calls.
 *
 * @returns {Promise<string>} The prompt text.
 */
async function getMessageGenSystemPrompt() {
    if (!cachedSystemPrompt) {
        const fsPromises = await import('node:fs/promises');
        cachedSystemPrompt = await fsPromises.readFile(DEFAULT_SYSTEM_PROMPT_PATH, 'utf8');
    }
    return cachedSystemPrompt;
}
|
|
19
|
+
/**
 * Checks whether a turn's message is an AI-generation definition.
 *
 * @param {unknown} value - The `message` of a turn.
 * @returns {boolean} `true` when `value` is a non-null object whose `type` is
 *   `'ai_gen_previous_context'`, `'ai_gen_guidance'` or `'ai_gen_content'`.
 */
function isAiGeneratedMessageDefinition(value) {
    if (typeof value !== 'object' || value === null || !('type' in value)) {
        return false;
    }
    switch (value.type) {
        case 'ai_gen_previous_context':
        case 'ai_gen_guidance':
        case 'ai_gen_content':
            return true;
        default:
            return false;
    }
}
|
|
27
|
+
/**
 * Normalizes raw model output into a plain message.
 *
 * Trims whitespace, removes a leading code-fence opener (with optional
 * language tag) and a trailing code fence, and strips one pair of matching
 * single or double quotes that wraps the whole text.
 *
 * @param {string} value - Raw text returned by the model.
 * @returns {string} The cleaned message; empty when the input is blank.
 */
function cleanGeneratedMessage(value) {
    const trimmed = value.trim();
    if (trimmed === '') {
        return '';
    }
    const unfenced = trimmed.replace(/^```[\w-]*\s*|\s*```$/g, '').trim();
    const isWrappedIn = (quote) => unfenced.startsWith(quote) && unfenced.endsWith(quote);
    return isWrappedIn('"') || isWrappedIn("'") ? unfenced.slice(1, -1).trim() : unfenced;
}
|
|
39
|
+
/**
 * Asks the OpenAI Responses endpoint to write the user message for a turn.
 *
 * Sends a POST to `<baseUrl>/responses` with the message-generation system
 * prompt and a JSON description of the scenario, turn, generation mode
 * (`fromPreviousContext`, `fromGuidance` or `fromContent`) and history.
 *
 * @param {object} options - `definition` (the AI-generation definition),
 *   `history`, `env`, `scenarioId`, `scenarioName` and `turnLabel`.
 * @returns {Promise<string>} The cleaned generated message.
 * @throws {Error} When the definition needs previous context but `history` is
 *   empty, when the HTTP response is not OK, or when no content comes back.
 */
async function generateMessageFromAi(options) {
    const { definition } = options;
    if (definition.type === 'ai_gen_previous_context' && options.history.length === 0) {
        throw new Error(`Turn ${options.turnLabel} uses aiGen.fromPreviousContext() but there is no previous conversation context.`);
    }
    const { getRequiredOpenAiApiKey, getOpenAiBaseUrl, getOpenAiMessageGenModel, extractOutputText } = openai_eval_1;
    const apiKey = getRequiredOpenAiApiKey(options.env);
    const baseUrl = getOpenAiBaseUrl(options.env);
    const model = getOpenAiMessageGenModel(options.env);
    const systemPrompt = await getMessageGenSystemPrompt();
    let mode;
    switch (definition.type) {
        case 'ai_gen_previous_context':
            mode = 'fromPreviousContext';
            break;
        case 'ai_gen_guidance':
            mode = 'fromGuidance';
            break;
        default:
            mode = 'fromContent';
    }
    const generationRequest = {
        scenarioId: options.scenarioId,
        scenarioName: options.scenarioName,
        turnLabel: options.turnLabel,
        mode,
        // Only the field matching the definition type is sent; JSON.stringify drops the undefined one.
        guidance: definition.type === 'ai_gen_guidance' ? definition.guidance : undefined,
        sourceContent: definition.type === 'ai_gen_content' ? definition.content : undefined,
        history: options.history,
    };
    const textPart = (text) => ({ type: 'input_text', text });
    const requestBody = {
        model,
        input: [
            { role: 'system', content: [textPart(systemPrompt)] },
            { role: 'user', content: [textPart(JSON.stringify(generationRequest, null, 2))] },
        ],
    };
    const response = await fetch(`${baseUrl}/responses`, {
        method: 'POST',
        headers: {
            Authorization: `Bearer ${apiKey}`,
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
        throw new Error(`OpenAI message generation failed with HTTP ${response.status}: ${await response.text()}`);
    }
    const payload = await response.json();
    const generatedMessage = cleanGeneratedMessage(extractOutputText(payload));
    if (!generatedMessage) {
        throw new Error(`OpenAI message generation returned no content for ${options.scenarioId} ${options.turnLabel}.`);
    }
    return generatedMessage;
}
|
|
101
|
+
/**
 * Resolves the message text to send for a turn.
 *
 * A function message is called with the scenario context; an AI-generation
 * definition is turned into text via the OpenAI message generator.
 *
 * @param {object} options - `turn`, `context`, `env`, `scenarioId`,
 *   `scenarioName` and `history`.
 * @returns {Promise<string>} The message text.
 * @throws {Error} When the turn's message is neither a function nor an
 *   AI-generation definition.
 */
async function resolveTurnMessage(options) {
    const { turn } = options;
    if (typeof turn.message === 'function') {
        // Called as a method so `this` inside the message function is still the turn.
        return turn.message(options.context);
    }
    if (!isAiGeneratedMessageDefinition(turn.message)) {
        throw new Error(`Unsupported message definition for ${options.scenarioId} ${options.turn.label}.`);
    }
    return await generateMessageFromAi({
        definition: turn.message,
        history: options.history,
        env: options.env,
        scenarioId: options.scenarioId,
        scenarioName: options.scenarioName,
        turnLabel: turn.label,
    });
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { AiScenarioSet, type BaseAiScenarioContext } from './scenario';
|
|
2
|
+
/** Function that builds the scenario context object for a given project id. */
type ScenarioContextBuilder<TScenarioContext extends BaseAiScenarioContext> = (projectId: string) => TScenarioContext;
|
|
3
|
+
/**
 * Result of loading a scenario file: the scenario set it exports, the context
 * builder to use with it, and the resolved path of the file.
 */
interface LoadedScenarioModule<TContext extends BaseAiScenarioContext> {
    scenarioSet: AiScenarioSet<TContext>;
    buildScenarioContext: ScenarioContextBuilder<TContext>;
    scenarioFilePath: string;
}
|
|
8
|
+
/** Options accepted by `loadScenarioModule`. */
type LoadScenarioModuleOptions = {
    scenarioFile?: string;
    defaultRelativePath: string;
};
/**
 * Loads a scenario module.
 *
 * @param options - `scenarioFile` is the optional file to load;
 *   `defaultRelativePath` is used when it is not provided.
 * @returns A promise for the loaded scenario set, context builder and file path.
 */
export declare function loadScenarioModule<TContext extends BaseAiScenarioContext>(options: LoadScenarioModuleOptions): Promise<LoadedScenarioModule<TContext>>;
|
|
12
|
+
export {};
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.loadScenarioModule = loadScenarioModule;
|
|
7
|
+
const node_fs_1 = __importDefault(require("node:fs"));
|
|
8
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
9
|
+
const node_url_1 = require("node:url");
|
|
10
|
+
/**
 * Appends `.ts` to a path that has no file extension.
 *
 * @param {string} value - File path or name.
 * @returns {string} `value` unchanged when it already has an extension,
 *   otherwise `value` with `.ts` appended.
 */
function ensureTsExtension(value) {
    const hasExtension = node_path_1.default.extname(value) !== '';
    if (hasExtension) {
        return value;
    }
    return `${value}.ts`;
}
|
|
13
|
+
/**
 * Resolves a scenario file reference to an absolute path.
 *
 * A blank `input` falls back to `defaultRelativePath`; `.ts` is appended when
 * no extension is present. Absolute paths are returned as-is and paths
 * starting with `.` are resolved against the working directory. Any other
 * value resolves against the working directory if that file exists, and
 * otherwise against `<cwd>/data/ai-scenarios`.
 *
 * @param {string | undefined} input - User-supplied scenario file, if any.
 * @param {string} defaultRelativePath - Fallback when `input` is blank.
 * @returns {string} The resolved path (not guaranteed to exist).
 */
function resolveScenarioFilePath(input, defaultRelativePath) {
    const path = node_path_1.default;
    const requested = (input?.trim() || defaultRelativePath).trim();
    const fileName = ensureTsExtension(requested);
    if (path.isAbsolute(fileName)) {
        return fileName;
    }
    if (fileName.startsWith('.')) {
        return path.resolve(process.cwd(), fileName);
    }
    const directPath = path.resolve(process.cwd(), fileName);
    const scenariosPath = path.resolve(process.cwd(), 'data', 'ai-scenarios', fileName);
    return node_fs_1.default.existsSync(directPath) ? directPath : scenariosPath;
}
|
|
29
|
+
/**
 * Finds the first existing regular file among the possible locations of a
 * scenario file.
 *
 * Candidates are tried in order: the value itself when absolute, the value
 * resolved against the working directory, and the value under
 * `<cwd>/data/ai-scenarios`. When none is an existing file the result of
 * `resolveScenarioFilePath` is returned instead.
 *
 * @param {string | undefined} input - User-supplied scenario file, if any.
 * @param {string} defaultRelativePath - Fallback when `input` is blank.
 * @returns {string} Path of the scenario file to load.
 */
function resolveExistingScenarioCandidate(input, defaultRelativePath) {
    const path = node_path_1.default;
    const requested = (input?.trim() || defaultRelativePath).trim();
    const fileName = ensureTsExtension(requested);
    // A Set keeps insertion order and drops the duplicates the rules below can produce.
    const candidates = new Set();
    if (path.isAbsolute(fileName)) {
        candidates.add(fileName);
    }
    if (fileName.startsWith('.')) {
        candidates.add(path.resolve(process.cwd(), fileName));
    }
    candidates.add(path.resolve(process.cwd(), fileName));
    candidates.add(path.resolve(process.cwd(), 'data', 'ai-scenarios', fileName));
    const isExistingFile = (candidate) => {
        try {
            return node_fs_1.default.statSync(candidate).isFile();
        }
        catch {
            // A path that cannot be stat'ed is simply not a match.
            return false;
        }
    };
    const existing = [...candidates].find((candidate) => isExistingFile(candidate));
    return existing ?? resolveScenarioFilePath(input, defaultRelativePath);
}
|
|
52
|
+
/**
 * Finds the scenario set exported by a loaded scenario module.
 *
 * For each export candidate the `scenarioSet` export is preferred; otherwise
 * the first exported value that looks like a scenario set is used.
 *
 * @param {object} exportsObject - Namespace object of the imported module.
 * @param {string} scenarioFilePath - Path of the file, used in the error message.
 * @returns {object} The scenario set.
 * @throws {Error} When no export looks like a scenario set.
 */
function getScenarioSetExport(exportsObject, scenarioFilePath) {
    for (const candidate of getExportCandidates(exportsObject)) {
        const namedExport = candidate.scenarioSet;
        if (isScenarioSetLike(namedExport)) {
            return namedExport;
        }
        const anyExport = Object.values(candidate).find((value) => isScenarioSetLike(value));
        if (anyExport !== undefined) {
            return anyExport;
        }
    }
    throw new Error(`Scenario file ${scenarioFilePath} does not export an AiScenarioSet. Add 'export const scenarioSet = new AiScenarioSet(...)' or an equivalent named export.`);
}
|
|
65
|
+
/**
 * Duck-type check for a scenario set.
 *
 * @param {unknown} value - Any exported value.
 * @returns {boolean} `true` when `value` is an object with an `items` array
 *   and a `select` function.
 */
function isScenarioSetLike(value) {
    const isObject = Boolean(value) && typeof value === 'object';
    return isObject && Array.isArray(value.items) && typeof value.select === 'function';
}
|
|
72
|
+
/**
 * Picks the context builder exported by a scenario module.
 *
 * @param {object} exportsObject - Namespace object of the imported module.
 * @returns {Function} The first `buildScenarioContext` function found among
 *   the export candidates, or a default builder returning `{ projectId }`.
 */
function getScenarioContextBuilder(exportsObject) {
    const owner = getExportCandidates(exportsObject)
        .find((candidate) => typeof candidate.buildScenarioContext === 'function');
    if (owner) {
        return owner.buildScenarioContext;
    }
    return (projectId) => ({ projectId });
}
|
|
80
|
+
/**
 * Lists the objects that may hold a scenario module's exports: the namespace
 * itself, its `default` export and its `'module.exports'` export (the latter
 * two only when they are objects).
 *
 * @param {object} exportsObject - Namespace object of the imported module.
 * @returns {object[]} The distinct candidates, in that order.
 */
function getExportCandidates(exportsObject) {
    const possibleHolders = [
        exportsObject,
        unwrapModuleLikeExport(exportsObject.default),
        unwrapModuleLikeExport(exportsObject['module.exports']),
    ];
    const distinct = new Set();
    for (const holder of possibleHolders) {
        if (holder) {
            distinct.add(holder);
        }
    }
    return [...distinct];
}
|
|
88
|
+
/**
 * Keeps a value only if it can hold exports.
 *
 * @param {unknown} value - An exported value.
 * @returns {object | undefined} `value` when it is a non-null object,
 *   otherwise `undefined`.
 */
function unwrapModuleLikeExport(value) {
    return value && typeof value === 'object' ? value : undefined;
}
|
|
94
|
+
/**
 * Loads a scenario file and extracts what the runner needs from it.
 *
 * The file path is resolved from `options.scenarioFile` (falling back to
 * `options.defaultRelativePath`), converted to a `file:` URL and imported
 * dynamically.
 *
 * @param {{scenarioFile?: string, defaultRelativePath: string}} options
 * @returns {Promise<object>} `scenarioSet`, `buildScenarioContext` and the
 *   resolved `scenarioFilePath`.
 * @throws {Error} When the module does not export a scenario set.
 */
async function loadScenarioModule(options) {
    const { pathToFileURL } = node_url_1;
    const scenarioFilePath = resolveExistingScenarioCandidate(options.scenarioFile, options.defaultRelativePath);
    const { href } = pathToFileURL(scenarioFilePath);
    const exportsObject = await import(href);
    const scenarioSet = getScenarioSetExport(exportsObject, scenarioFilePath);
    const buildScenarioContext = getScenarioContextBuilder(exportsObject);
    return { scenarioSet, buildScenarioContext, scenarioFilePath };
}
|