@inspectr/mcplab 0.8.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/app/assets/index-6UW7HJmq.js +249 -0
- package/dist/app/assets/index-ppN3J-0o.css +1 -0
- package/dist/app/index.html +2 -2
- package/dist/app-server/app-context.d.ts +4 -0
- package/dist/app-server/app-context.d.ts.map +1 -1
- package/dist/app-server/assistant-common.d.ts +4 -0
- package/dist/app-server/assistant-common.d.ts.map +1 -1
- package/dist/app-server/assistant-common.js +8 -6
- package/dist/app-server/assistant-common.js.map +1 -1
- package/dist/app-server/router.d.ts.map +1 -1
- package/dist/app-server/router.js +2 -7
- package/dist/app-server/router.js.map +1 -1
- package/dist/app-server/runs-routes.d.ts +12 -3
- package/dist/app-server/runs-routes.d.ts.map +1 -1
- package/dist/app-server/runs-routes.js +411 -262
- package/dist/app-server/runs-routes.js.map +1 -1
- package/dist/app-server/scenario-assistant-domain.d.ts +2 -0
- package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant-domain.js +49 -26
- package/dist/app-server/scenario-assistant-domain.js.map +1 -1
- package/dist/app-server/scenario-assistant.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant.js +87 -2
- package/dist/app-server/scenario-assistant.js.map +1 -1
- package/package.json +4 -4
- package/dist/app/assets/index-D5Ew6sJk.css +0 -1
- package/dist/app/assets/index-SbN9VCfg.js +0 -249
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
1
2
|
import { existsSync, rmSync, writeFileSync } from 'node:fs';
|
|
2
3
|
import { isAbsolute, join } from 'node:path';
|
|
3
4
|
import { McpClientManager, loadConfig, runAll } from '@inspectr/mcplab-core';
|
|
4
5
|
import { renderReport } from '@inspectr/mcplab-reporting';
|
|
5
6
|
export async function handleRunsRoutes(params) {
|
|
6
|
-
const { req, res, pathname, method, settings, jobs,
|
|
7
|
+
const { req, res, pathname, method, settings, jobs, runQueueState, deps } = params;
|
|
7
8
|
const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pickDefaultAssistantAgentName, resolveAssistantAgentFromLibraries, chatWithAgent, pkgVersion } = deps;
|
|
8
9
|
if (pathname === '/api/runs' && method === 'GET') {
|
|
9
10
|
asJson(res, 200, listRuns(settings.runsDir));
|
|
@@ -30,7 +31,7 @@ export async function handleRunsRoutes(params) {
|
|
|
30
31
|
}
|
|
31
32
|
for (const event of job.events)
|
|
32
33
|
sendSseEvent(res, event);
|
|
33
|
-
if (job.status !== 'running') {
|
|
34
|
+
if (job.status !== 'running' && job.status !== 'queued') {
|
|
34
35
|
res.end();
|
|
35
36
|
return true;
|
|
36
37
|
}
|
|
@@ -47,21 +48,96 @@ export async function handleRunsRoutes(params) {
|
|
|
47
48
|
asJson(res, 404, { error: 'Job not found' });
|
|
48
49
|
return true;
|
|
49
50
|
}
|
|
51
|
+
if (job.status === 'queued') {
|
|
52
|
+
const idx = runQueueState.queue.indexOf(jobId);
|
|
53
|
+
if (idx !== -1)
|
|
54
|
+
runQueueState.queue.splice(idx, 1);
|
|
55
|
+
job.status = 'stopped';
|
|
56
|
+
addJobEvent(job, {
|
|
57
|
+
type: 'error',
|
|
58
|
+
ts: new Date().toISOString(),
|
|
59
|
+
payload: { message: 'Run stopped before it started' }
|
|
60
|
+
});
|
|
61
|
+
for (const client of job.clients)
|
|
62
|
+
client.end();
|
|
63
|
+
job.clients.clear();
|
|
64
|
+
asJson(res, 200, { ok: true, status: 'stopped' });
|
|
65
|
+
return true;
|
|
66
|
+
}
|
|
50
67
|
if (job.status !== 'running') {
|
|
51
68
|
asJson(res, 200, { ok: true, status: job.status });
|
|
52
69
|
return true;
|
|
53
70
|
}
|
|
54
71
|
job.abortController.abort();
|
|
55
72
|
job.status = 'stopped';
|
|
56
|
-
activeJobState.set(null);
|
|
57
73
|
asJson(res, 200, { ok: true, status: 'stopped' });
|
|
58
74
|
return true;
|
|
59
75
|
}
|
|
60
|
-
if (pathname === '/api/runs' && method === '
|
|
61
|
-
|
|
62
|
-
|
|
76
|
+
if (pathname === '/api/runs/queue' && method === 'GET') {
|
|
77
|
+
const activeJob = runQueueState.activeJobId ? jobs.get(runQueueState.activeJobId) : null;
|
|
78
|
+
const queuedEntries = runQueueState.queue
|
|
79
|
+
.map((id) => jobs.get(id))
|
|
80
|
+
.filter((j) => !!j && j.status === 'queued')
|
|
81
|
+
.map((j) => ({
|
|
82
|
+
jobId: j.id,
|
|
83
|
+
status: j.status,
|
|
84
|
+
runParams: {
|
|
85
|
+
configPath: j.runParams.configPath,
|
|
86
|
+
runsPerScenario: j.runParams.runsPerScenario,
|
|
87
|
+
scenarioIds: j.runParams.scenarioIds ?? null,
|
|
88
|
+
agents: j.runParams.requestedAgents ?? null
|
|
89
|
+
}
|
|
90
|
+
}));
|
|
91
|
+
asJson(res, 200, {
|
|
92
|
+
active: activeJob
|
|
93
|
+
? {
|
|
94
|
+
jobId: activeJob.id,
|
|
95
|
+
status: activeJob.status,
|
|
96
|
+
runParams: {
|
|
97
|
+
configPath: activeJob.runParams.configPath,
|
|
98
|
+
runsPerScenario: activeJob.runParams.runsPerScenario,
|
|
99
|
+
scenarioIds: activeJob.runParams.scenarioIds ?? null,
|
|
100
|
+
agents: activeJob.runParams.requestedAgents ?? null
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
: null,
|
|
104
|
+
queued: queuedEntries
|
|
105
|
+
});
|
|
106
|
+
return true;
|
|
107
|
+
}
|
|
108
|
+
if (pathname.startsWith('/api/runs/queue/') &&
|
|
109
|
+
method === 'DELETE' &&
|
|
110
|
+
pathname.split('/').length === 5) {
|
|
111
|
+
const jobId = pathname.split('/')[4];
|
|
112
|
+
const job = jobs.get(jobId);
|
|
113
|
+
if (!job) {
|
|
114
|
+
asJson(res, 404, { error: 'Job not found' });
|
|
63
115
|
return true;
|
|
64
116
|
}
|
|
117
|
+
if (job.status === 'running') {
|
|
118
|
+
asJson(res, 400, { error: 'Cannot remove a running job. Use the /stop endpoint instead.' });
|
|
119
|
+
return true;
|
|
120
|
+
}
|
|
121
|
+
if (job.status !== 'queued') {
|
|
122
|
+
asJson(res, 404, { error: 'Job is not queued' });
|
|
123
|
+
return true;
|
|
124
|
+
}
|
|
125
|
+
const idx = runQueueState.queue.indexOf(jobId);
|
|
126
|
+
if (idx !== -1)
|
|
127
|
+
runQueueState.queue.splice(idx, 1);
|
|
128
|
+
job.status = 'stopped';
|
|
129
|
+
addJobEvent(job, {
|
|
130
|
+
type: 'error',
|
|
131
|
+
ts: new Date().toISOString(),
|
|
132
|
+
payload: { message: 'Removed from queue by user' }
|
|
133
|
+
});
|
|
134
|
+
for (const client of job.clients)
|
|
135
|
+
client.end();
|
|
136
|
+
job.clients.clear();
|
|
137
|
+
asJson(res, 200, { ok: true, jobId, status: 'stopped' });
|
|
138
|
+
return true;
|
|
139
|
+
}
|
|
140
|
+
if (pathname === '/api/runs' && method === 'POST') {
|
|
65
141
|
const body = (await parseBody(req));
|
|
66
142
|
const configPathRaw = String(body.configPath ?? '');
|
|
67
143
|
const runsPerScenario = Number(body.runsPerScenario ?? 1);
|
|
@@ -88,270 +164,59 @@ export async function handleRunsRoutes(params) {
|
|
|
88
164
|
asJson(res, 404, { error: `Config not found: ${configPath}` });
|
|
89
165
|
return true;
|
|
90
166
|
}
|
|
91
|
-
const jobId =
|
|
167
|
+
const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
|
|
168
|
+
const runParamsObj = {
|
|
169
|
+
configPath,
|
|
170
|
+
runsPerScenario,
|
|
171
|
+
scenarioId,
|
|
172
|
+
scenarioIds,
|
|
173
|
+
requestedAgents,
|
|
174
|
+
applySnapshotEval
|
|
175
|
+
};
|
|
92
176
|
const job = {
|
|
93
177
|
id: jobId,
|
|
94
|
-
status: '
|
|
178
|
+
status: 'queued',
|
|
95
179
|
events: [],
|
|
96
180
|
clients: new Set(),
|
|
97
|
-
abortController: new AbortController()
|
|
181
|
+
abortController: new AbortController(),
|
|
182
|
+
runParams: runParamsObj
|
|
98
183
|
};
|
|
99
184
|
jobs.set(jobId, job);
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
try {
|
|
114
|
-
addJobEvent(job, {
|
|
115
|
-
type: 'log',
|
|
116
|
-
ts: new Date().toISOString(),
|
|
117
|
-
payload: { message: `Loading MCP Evaluation config: ${configPath}` }
|
|
118
|
-
});
|
|
119
|
-
const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
|
|
120
|
-
addJobEvent(job, {
|
|
121
|
-
type: 'log',
|
|
122
|
-
ts: new Date().toISOString(),
|
|
123
|
-
payload: {
|
|
124
|
-
message: `Loaded config (${loaded.config.scenarios.length} scenario(s), ${Object.keys(loaded.config.agents ?? {}).length} agent(s), ${Object.keys(loaded.config.servers ?? {}).length} server(s))`
|
|
125
|
-
}
|
|
126
|
-
});
|
|
127
|
-
for (const warning of loaded.warnings ?? []) {
|
|
128
|
-
addJobEvent(job, {
|
|
129
|
-
type: 'log',
|
|
130
|
-
ts: new Date().toISOString(),
|
|
131
|
-
payload: { message: warning }
|
|
132
|
-
});
|
|
133
|
-
}
|
|
134
|
-
addJobEvent(job, {
|
|
135
|
-
type: 'log',
|
|
136
|
-
ts: new Date().toISOString(),
|
|
137
|
-
payload: {
|
|
138
|
-
message: scenarioIds && scenarioIds.length > 0
|
|
139
|
-
? `Selecting requested scenarios: ${scenarioIds.join(', ')}`
|
|
140
|
-
: scenarioId
|
|
141
|
-
? `Selecting requested scenario: ${scenarioId}`
|
|
142
|
-
: 'Using all scenarios from config'
|
|
143
|
-
}
|
|
144
|
-
});
|
|
145
|
-
const selectedBaseScenarios = selectScenarioIds(loaded.config, scenarioIds && scenarioIds.length > 0
|
|
146
|
-
? scenarioIds
|
|
147
|
-
: scenarioId
|
|
148
|
-
? [scenarioId]
|
|
149
|
-
: undefined);
|
|
150
|
-
addJobEvent(job, {
|
|
151
|
-
type: 'log',
|
|
152
|
-
ts: new Date().toISOString(),
|
|
153
|
-
payload: {
|
|
154
|
-
message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
|
|
155
|
-
}
|
|
156
|
-
});
|
|
157
|
-
const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
|
|
158
|
-
const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
|
|
159
|
-
addJobEvent(job, {
|
|
160
|
-
type: 'log',
|
|
161
|
-
ts: new Date().toISOString(),
|
|
162
|
-
payload: {
|
|
163
|
-
message: requestedAgents && requestedAgents.length > 0
|
|
164
|
-
? `Using requested agents: ${resolvedAgentList.join(', ')}`
|
|
165
|
-
: `Using resolved default agents: ${resolvedAgentList.join(', ')}`
|
|
166
|
-
}
|
|
167
|
-
});
|
|
168
|
-
const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
|
|
169
|
-
addJobEvent(job, {
|
|
170
|
-
type: 'log',
|
|
171
|
-
ts: new Date().toISOString(),
|
|
172
|
-
payload: {
|
|
173
|
-
message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
|
|
174
|
-
}
|
|
175
|
-
});
|
|
176
|
-
const cwdBefore = process.cwd();
|
|
177
|
-
process.chdir(settings.workspaceRoot);
|
|
178
|
-
try {
|
|
179
|
-
addJobEvent(job, {
|
|
180
|
-
type: 'log',
|
|
181
|
-
ts: new Date().toISOString(),
|
|
182
|
-
payload: {
|
|
183
|
-
message: `Running evaluation (${runsPerScenario} run(s) per scenario) ...`
|
|
184
|
-
}
|
|
185
|
-
});
|
|
186
|
-
const { runDir, results } = await runAll(expandedConfig, {
|
|
187
|
-
runsPerScenario,
|
|
188
|
-
scenarioId,
|
|
189
|
-
configHash: loaded.hash,
|
|
190
|
-
cliVersion: pkgVersion,
|
|
191
|
-
runsDir: settings.runsDir,
|
|
192
|
-
signal: job.abortController.signal,
|
|
193
|
-
onProgress: async (event) => {
|
|
194
|
-
const message = formatRunProgressMessage(event);
|
|
195
|
-
if (!message)
|
|
196
|
-
return;
|
|
197
|
-
addJobEvent(job, {
|
|
198
|
-
type: 'log',
|
|
199
|
-
ts: new Date().toISOString(),
|
|
200
|
-
payload: { message }
|
|
201
|
-
});
|
|
202
|
-
}
|
|
203
|
-
});
|
|
204
|
-
addJobEvent(job, {
|
|
205
|
-
type: 'log',
|
|
206
|
-
ts: new Date().toISOString(),
|
|
207
|
-
payload: {
|
|
208
|
-
message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
|
|
209
|
-
}
|
|
210
|
-
});
|
|
211
|
-
if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
|
|
212
|
-
addJobEvent(job, {
|
|
213
|
-
type: 'log',
|
|
214
|
-
ts: new Date().toISOString(),
|
|
215
|
-
payload: { message: 'Applying snapshot evaluation policy ...' }
|
|
216
|
-
});
|
|
217
|
-
const policy = expandedConfig.snapshot_eval;
|
|
218
|
-
const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
|
|
219
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
220
|
-
.map((scenario) => scenario.id));
|
|
221
|
-
const scenarioBaselineMap = new Map();
|
|
222
|
-
for (const scenario of selectedBaseScenarios.scenarios) {
|
|
223
|
-
if (scenario.snapshot_eval?.enabled === false)
|
|
224
|
-
continue;
|
|
225
|
-
const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
|
|
226
|
-
if (baselineId)
|
|
227
|
-
scenarioBaselineMap.set(scenario.id, baselineId);
|
|
228
|
-
}
|
|
229
|
-
const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
|
|
230
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
231
|
-
.filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
|
|
232
|
-
.map((scenario) => scenario.id);
|
|
233
|
-
if (scenariosWithoutBaseline.length > 0) {
|
|
234
|
-
addJobEvent(job, {
|
|
235
|
-
type: 'log',
|
|
236
|
-
ts: new Date().toISOString(),
|
|
237
|
-
payload: {
|
|
238
|
-
message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
|
|
239
|
-
}
|
|
240
|
-
});
|
|
241
|
-
}
|
|
242
|
-
const comparisons = [];
|
|
243
|
-
const scenarioIdsByBaseline = new Map();
|
|
244
|
-
for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
|
|
245
|
-
const list = scenarioIdsByBaseline.get(baselineId) ?? [];
|
|
246
|
-
list.push(scenarioIdItem);
|
|
247
|
-
scenarioIdsByBaseline.set(baselineId, list);
|
|
248
|
-
}
|
|
249
|
-
for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
|
|
250
|
-
addJobEvent(job, {
|
|
251
|
-
type: 'log',
|
|
252
|
-
ts: new Date().toISOString(),
|
|
253
|
-
payload: {
|
|
254
|
-
message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
|
|
255
|
-
}
|
|
256
|
-
});
|
|
257
|
-
const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
|
|
258
|
-
const fullComparison = compareRunToSnapshot(results, snapshot);
|
|
259
|
-
comparisons.push({
|
|
260
|
-
...fullComparison,
|
|
261
|
-
scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
|
|
262
|
-
});
|
|
263
|
-
}
|
|
264
|
-
if (comparisons.length > 0) {
|
|
265
|
-
applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
|
|
266
|
-
addJobEvent(job, {
|
|
267
|
-
type: 'log',
|
|
268
|
-
ts: new Date().toISOString(),
|
|
269
|
-
payload: {
|
|
270
|
-
message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
|
|
271
|
-
}
|
|
272
|
-
});
|
|
273
|
-
}
|
|
274
|
-
else {
|
|
275
|
-
addJobEvent(job, {
|
|
276
|
-
type: 'log',
|
|
277
|
-
ts: new Date().toISOString(),
|
|
278
|
-
payload: {
|
|
279
|
-
message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
|
|
280
|
-
}
|
|
281
|
-
});
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
else if (applySnapshotEval) {
|
|
285
|
-
addJobEvent(job, {
|
|
286
|
-
type: 'log',
|
|
287
|
-
ts: new Date().toISOString(),
|
|
288
|
-
payload: {
|
|
289
|
-
message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
|
|
290
|
-
}
|
|
291
|
-
});
|
|
292
|
-
}
|
|
293
|
-
else {
|
|
294
|
-
addJobEvent(job, {
|
|
295
|
-
type: 'log',
|
|
296
|
-
ts: new Date().toISOString(),
|
|
297
|
-
payload: {
|
|
298
|
-
message: 'Snapshot evaluation skipped for this run (disabled in run request)'
|
|
299
|
-
}
|
|
300
|
-
});
|
|
301
|
-
}
|
|
302
|
-
addJobEvent(job, {
|
|
303
|
-
type: 'log',
|
|
304
|
-
ts: new Date().toISOString(),
|
|
305
|
-
payload: { message: `Writing results to ${runDir}` }
|
|
306
|
-
});
|
|
307
|
-
writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
|
|
308
|
-
writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
|
|
309
|
-
addJobEvent(job, {
|
|
310
|
-
type: 'log',
|
|
311
|
-
ts: new Date().toISOString(),
|
|
312
|
-
payload: {
|
|
313
|
-
message: `Run finished: ${results.summary.total_runs} run(s), pass rate ${Math.round(results.summary.pass_rate * 100)}%`
|
|
314
|
-
}
|
|
315
|
-
});
|
|
316
|
-
addJobEvent(job, {
|
|
317
|
-
type: 'completed',
|
|
318
|
-
ts: new Date().toISOString(),
|
|
319
|
-
payload: {
|
|
320
|
-
runId: results.metadata.run_id,
|
|
321
|
-
runDir,
|
|
322
|
-
summary: results.summary,
|
|
323
|
-
snapshotEval: results.metadata.snapshot_eval ?? null
|
|
324
|
-
}
|
|
325
|
-
});
|
|
326
|
-
job.status = 'completed';
|
|
185
|
+
if (!runQueueState.activeJobId) {
|
|
186
|
+
// No active job — start immediately
|
|
187
|
+
job.status = 'running';
|
|
188
|
+
runQueueState.activeJobId = jobId;
|
|
189
|
+
addJobEvent(job, {
|
|
190
|
+
type: 'started',
|
|
191
|
+
ts: new Date().toISOString(),
|
|
192
|
+
payload: {
|
|
193
|
+
configPath,
|
|
194
|
+
runsPerScenario,
|
|
195
|
+
scenarioId: scenarioId ?? null,
|
|
196
|
+
scenarioIds: scenarioIds ?? null,
|
|
197
|
+
agents: requestedAgents ?? null
|
|
327
198
|
}
|
|
328
|
-
|
|
329
|
-
|
|
199
|
+
});
|
|
200
|
+
void executeRunJob(job, settings, jobs, runQueueState, deps);
|
|
201
|
+
asJson(res, 202, { jobId });
|
|
202
|
+
}
|
|
203
|
+
else {
|
|
204
|
+
// Queue this job
|
|
205
|
+
runQueueState.queue.push(jobId);
|
|
206
|
+
addJobEvent(job, {
|
|
207
|
+
type: 'queued',
|
|
208
|
+
ts: new Date().toISOString(),
|
|
209
|
+
payload: {
|
|
210
|
+
configPath,
|
|
211
|
+
runsPerScenario,
|
|
212
|
+
scenarioId: scenarioId ?? null,
|
|
213
|
+
scenarioIds: scenarioIds ?? null,
|
|
214
|
+
agents: requestedAgents ?? null,
|
|
215
|
+
position: runQueueState.queue.length
|
|
330
216
|
}
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
addJobEvent(job, {
|
|
335
|
-
type: 'error',
|
|
336
|
-
ts: new Date().toISOString(),
|
|
337
|
-
payload: {
|
|
338
|
-
message: aborted
|
|
339
|
-
? 'Run aborted by user'
|
|
340
|
-
: error instanceof Error
|
|
341
|
-
? error.message
|
|
342
|
-
: String(error)
|
|
343
|
-
}
|
|
344
|
-
});
|
|
345
|
-
job.status = aborted ? 'stopped' : 'error';
|
|
346
|
-
}
|
|
347
|
-
finally {
|
|
348
|
-
activeJobState.set(null);
|
|
349
|
-
for (const client of job.clients)
|
|
350
|
-
client.end();
|
|
351
|
-
job.clients.clear();
|
|
352
|
-
}
|
|
353
|
-
})();
|
|
354
|
-
asJson(res, 202, { jobId });
|
|
217
|
+
});
|
|
218
|
+
asJson(res, 202, { jobId, queued: true, position: runQueueState.queue.length });
|
|
219
|
+
}
|
|
355
220
|
return true;
|
|
356
221
|
}
|
|
357
222
|
if (pathname.startsWith('/api/runs/') && pathname.endsWith('/assistant') && method === 'POST') {
|
|
@@ -507,6 +372,290 @@ export async function handleRunsRoutes(params) {
|
|
|
507
372
|
}
|
|
508
373
|
return false;
|
|
509
374
|
}
|
|
375
|
+
/**
 * Starts the next runnable job from the queue when no job is currently active.
 *
 * Queue entries are consumed front-to-back. Entries whose job is missing from
 * `jobs`, or whose status is no longer `'queued'` (for example a job stopped
 * while it was waiting), are dropped without being started.
 *
 * @param {Map<string, object>} jobs - Job registry keyed by job id.
 * @param {{ activeJobId: (string|null), queue: string[] }} runQueueState - Shared queue state; mutated in place.
 * @param {object} settings - Passed through unchanged to `executeRunJob`.
 * @param {object} deps - Injected helpers; `deps.addJobEvent` is used here, the rest is passed through.
 * @returns {void}
 */
function advanceQueue(jobs, runQueueState, settings, deps) {
    // Only one job may run at a time.
    if (runQueueState.activeJobId)
        return;
    while (runQueueState.queue.length > 0) {
        const nextId = runQueueState.queue.shift();
        const nextJob = jobs.get(nextId);
        if (!nextJob || nextJob.status !== 'queued')
            continue;
        // Claim the active slot before emitting anything so that a re-entrant
        // call observes the queue as busy.
        nextJob.status = 'running';
        runQueueState.activeJobId = nextId;
        const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents } = nextJob.runParams;
        deps.addJobEvent(nextJob, {
            type: 'started',
            ts: new Date().toISOString(),
            payload: {
                configPath,
                runsPerScenario,
                scenarioId: scenarioId ?? null,
                scenarioIds: scenarioIds ?? null,
                agents: requestedAgents ?? null
            }
        });
        // Fire-and-forget: the job reports its outcome through its own events.
        // A rejection handler is still attached because an unhandled rejection
        // terminates the Node.js process by default.
        executeRunJob(nextJob, settings, jobs, runQueueState, deps).catch((error) => {
            console.error(`Run job ${nextId} failed outside of its own error handling:`, error);
        });
        return;
    }
}
|
|
400
|
+
/**
 * Appends a `log` event carrying `message` to the job's event stream.
 *
 * @param {object} job - Job record the event is attached to.
 * @param {Function} addJobEvent - Injected event sink, called as `addJobEvent(job, event)`.
 * @param {string} message - Text placed in `payload.message`.
 * @returns {void}
 */
function emitJobLog(job, addJobEvent, message) {
    addJobEvent(job, {
        type: 'log',
        ts: new Date().toISOString(),
        payload: { message }
    });
}

/**
 * Compares run results against the configured snapshot baselines and applies
 * the snapshot policy to `results` through `deps.applySnapshotPolicyToRunResult`.
 *
 * Scenarios whose own `snapshot_eval.enabled` is `false` are excluded. Each
 * remaining scenario uses its own `baseline_snapshot_id` and falls back to the
 * policy-level one; scenarios with neither are reported in a log event.
 *
 * @param {object} args
 * @param {object} args.job - Job record receiving the log events.
 * @param {object} args.results - Run results handed to the comparison and policy helpers.
 * @param {object} args.policy - The config's `snapshot_eval` section.
 * @param {object[]} args.baseScenarios - Selected (pre-expansion) scenarios.
 * @param {string} args.snapshotsDir - Directory passed to `deps.loadSnapshot`.
 * @param {object} args.deps - Injected helpers (`addJobEvent`, `loadSnapshot`, `compareRunToSnapshot`, `applySnapshotPolicyToRunResult`).
 * @returns {void}
 */
function applySnapshotEvalToResults({ job, results, policy, baseScenarios, snapshotsDir, deps }) {
    const { addJobEvent, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult } = deps;
    const log = (message) => emitJobLog(job, addJobEvent, message);
    log('Applying snapshot evaluation policy ...');
    const eligibleScenarios = baseScenarios.filter((scenario) => scenario.snapshot_eval?.enabled !== false);
    const enabledScenarioIds = new Set(eligibleScenarios.map((scenario) => scenario.id));
    const scenarioBaselineMap = new Map();
    const scenariosWithoutBaseline = [];
    for (const scenario of eligibleScenarios) {
        const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
        if (baselineId)
            scenarioBaselineMap.set(scenario.id, baselineId);
        else
            scenariosWithoutBaseline.push(scenario.id);
    }
    if (scenariosWithoutBaseline.length > 0) {
        log(`Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`);
    }
    // Group scenario ids by baseline so each snapshot is loaded and compared once.
    const scenarioIdsByBaseline = new Map();
    for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
        const list = scenarioIdsByBaseline.get(baselineId) ?? [];
        list.push(scenarioIdItem);
        scenarioIdsByBaseline.set(baselineId, list);
    }
    const comparisons = [];
    for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
        log(`Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`);
        const snapshot = loadSnapshot(baselineId, snapshotsDir);
        const fullComparison = compareRunToSnapshot(results, snapshot);
        // Keep only the rows for scenarios that are actually bound to this baseline.
        comparisons.push({
            ...fullComparison,
            scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
        });
    }
    if (comparisons.length > 0) {
        applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
        log(`Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`);
    }
    else {
        log('Snapshot evaluation enabled, but no baseline comparisons were applied');
    }
}

/**
 * Executes one run job end to end: loads the config, selects scenarios and
 * agents, runs the evaluation, optionally applies snapshot evaluation, writes
 * `results.json` and `report.html` into the run directory, and records
 * progress as job events.
 *
 * Failures are reported as an `error` event and the job ends with status
 * `'error'`, or `'stopped'` when the job's abort signal fired or the status was
 * already `'stopped'`. Whatever the outcome, the active slot is released, the
 * job's clients are ended, the queue is advanced and old jobs are pruned.
 *
 * @param {object} job - Job record (`runParams`, `events`, `clients`, `abortController`, `status`).
 * @param {object} settings - Reads `librariesDir`, `workspaceRoot`, `runsDir`, `snapshotsDir`.
 * @param {Map<string, object>} jobs - Job registry, forwarded to queue maintenance.
 * @param {{ activeJobId: (string|null), queue: string[] }} runQueueState - Shared queue state.
 * @param {object} deps - Injected helpers.
 * @returns {Promise<void>}
 */
async function executeRunJob(job, settings, jobs, runQueueState, deps) {
    const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, pkgVersion } = deps;
    const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, applySnapshotEval } = job.runParams;
    const log = (message) => emitJobLog(job, addJobEvent, message);
    try {
        log(`Loading MCP Evaluation config: ${configPath}`);
        const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
        log(`Loaded config (${loaded.config.scenarios.length} scenario(s), ${Object.keys(loaded.config.agents ?? {}).length} agent(s), ${Object.keys(loaded.config.servers ?? {}).length} server(s))`);
        for (const warning of loaded.warnings ?? []) {
            log(warning);
        }
        // An explicit id list takes precedence over the single-scenario parameter.
        const hasScenarioIdList = scenarioIds && scenarioIds.length > 0;
        log(hasScenarioIdList
            ? `Selecting requested scenarios: ${scenarioIds.join(', ')}`
            : scenarioId
                ? `Selecting requested scenario: ${scenarioId}`
                : 'Using all scenarios from config');
        const selectedBaseScenarios = selectScenarioIds(loaded.config, hasScenarioIdList ? scenarioIds : scenarioId ? [scenarioId] : undefined);
        log(`Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`);
        const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
        const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
        log(requestedAgents && requestedAgents.length > 0
            ? `Using requested agents: ${resolvedAgentList.join(', ')}`
            : `Using resolved default agents: ${resolvedAgentList.join(', ')}`);
        const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
        log(`Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`);
        // The working directory is process-wide state; it is switched for the
        // duration of the run and restored in the `finally` below.
        const cwdBefore = process.cwd();
        process.chdir(settings.workspaceRoot);
        try {
            log(`Running evaluation (${runsPerScenario} run(s) per scenario) ...`);
            const { runDir, results } = await runAll(expandedConfig, {
                runsPerScenario,
                scenarioId,
                configHash: loaded.hash,
                cliVersion: pkgVersion,
                runsDir: settings.runsDir,
                signal: job.abortController.signal,
                onProgress: async (event) => {
                    const message = formatRunProgressMessage(event);
                    if (!message)
                        return;
                    log(message);
                }
            });
            log(`Evaluation execution finished (run id: ${results.metadata.run_id})`);
            if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
                applySnapshotEvalToResults({
                    job,
                    results,
                    policy: expandedConfig.snapshot_eval,
                    baseScenarios: selectedBaseScenarios.scenarios,
                    snapshotsDir: settings.snapshotsDir,
                    deps
                });
            }
            else if (applySnapshotEval) {
                log('Snapshot evaluation requested, but config snapshot evaluation is disabled');
            }
            else {
                log('Snapshot evaluation skipped for this run (disabled in run request)');
            }
            log(`Writing results to ${runDir}`);
            writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
            writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
            log(`Run finished: ${results.summary.total_runs} run(s), pass rate ${Math.round(results.summary.pass_rate * 100)}%`);
            addJobEvent(job, {
                type: 'completed',
                ts: new Date().toISOString(),
                payload: {
                    runId: results.metadata.run_id,
                    runDir,
                    summary: results.summary,
                    snapshotEval: results.metadata.snapshot_eval ?? null
                }
            });
            job.status = 'completed';
        }
        finally {
            process.chdir(cwdBefore);
        }
    }
    catch (error) {
        // A stop request either aborts the signal or has already flipped the status.
        const aborted = job.abortController.signal.aborted || job.status === 'stopped';
        addJobEvent(job, {
            type: 'error',
            ts: new Date().toISOString(),
            payload: {
                message: aborted
                    ? 'Run aborted by user'
                    : error instanceof Error
                        ? error.message
                        : String(error)
            }
        });
        job.status = aborted ? 'stopped' : 'error';
    }
    finally {
        runQueueState.activeJobId = null;
        try {
            for (const client of job.clients)
                client.end();
            job.clients.clear();
        }
        finally {
            // Must run even when ending a client throws: the active slot is
            // already released, so skipping this would leave queued jobs waiting
            // until some later request happens to restart the queue.
            advanceQueue(jobs, runQueueState, settings, deps);
            pruneOldJobs(jobs, runQueueState);
        }
    }
}
|
|
642
|
+
/**
 * Removes finished jobs from the registry once their most recent event is
 * older than the retention window.
 *
 * A job is kept when it is the active job, is still referenced by the queue,
 * has a status other than `'completed'`, `'error'` or `'stopped'`, or has no
 * events at all (there is then no timestamp to age it by).
 *
 * @param {Map<string, object>} jobs - Job registry keyed by job id; entries are deleted in place.
 * @param {{ activeJobId: (string|null), queue: string[] }} runQueueState - Ids that must never be pruned.
 * @param {object} [options]
 * @param {number} [options.maxAgeMs=1800000] - Retention window in milliseconds (30 minutes by default).
 * @param {number} [options.now=Date.now()] - Reference time in epoch milliseconds.
 * @returns {void}
 */
function pruneOldJobs(jobs, runQueueState, { maxAgeMs = 30 * 60_000, now = Date.now() } = {}) {
    const protectedIds = new Set([runQueueState.activeJobId, ...runQueueState.queue].filter(Boolean));
    const finishedStatuses = new Set(['completed', 'error', 'stopped']);
    // Deleting the entry currently being visited is safe while iterating a Map.
    for (const [id, job] of jobs) {
        if (protectedIds.has(id) || !finishedStatuses.has(job.status))
            continue;
        const lastEvent = job.events[job.events.length - 1];
        if (!lastEvent)
            continue;
        // An unparseable timestamp yields NaN, the comparison is false, and the job is kept.
        if (now - new Date(lastEvent.ts).getTime() > maxAgeMs) {
            jobs.delete(id);
        }
    }
}
|
|
510
659
|
function formatRunProgressMessage(event) {
|
|
511
660
|
switch (event.type) {
|
|
512
661
|
case 'run_started':
|