@inspectr/mcplab 0.9.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
+ import { randomUUID } from 'node:crypto';
1
2
  import { existsSync, rmSync, writeFileSync } from 'node:fs';
2
3
  import { isAbsolute, join } from 'node:path';
3
4
  import { McpClientManager, loadConfig, runAll } from '@inspectr/mcplab-core';
4
5
  import { renderReport } from '@inspectr/mcplab-reporting';
5
6
  export async function handleRunsRoutes(params) {
6
- const { req, res, pathname, method, settings, jobs, activeJobState, deps } = params;
7
+ const { req, res, pathname, method, settings, jobs, runQueueState, deps } = params;
7
8
  const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pickDefaultAssistantAgentName, resolveAssistantAgentFromLibraries, chatWithAgent, pkgVersion } = deps;
8
9
  if (pathname === '/api/runs' && method === 'GET') {
9
10
  asJson(res, 200, listRuns(settings.runsDir));
@@ -30,7 +31,7 @@ export async function handleRunsRoutes(params) {
30
31
  }
31
32
  for (const event of job.events)
32
33
  sendSseEvent(res, event);
33
- if (job.status !== 'running') {
34
+ if (job.status !== 'running' && job.status !== 'queued') {
34
35
  res.end();
35
36
  return true;
36
37
  }
@@ -47,21 +48,96 @@ export async function handleRunsRoutes(params) {
47
48
  asJson(res, 404, { error: 'Job not found' });
48
49
  return true;
49
50
  }
51
+ if (job.status === 'queued') {
52
+ const idx = runQueueState.queue.indexOf(jobId);
53
+ if (idx !== -1)
54
+ runQueueState.queue.splice(idx, 1);
55
+ job.status = 'stopped';
56
+ addJobEvent(job, {
57
+ type: 'error',
58
+ ts: new Date().toISOString(),
59
+ payload: { message: 'Run stopped before it started' }
60
+ });
61
+ for (const client of job.clients)
62
+ client.end();
63
+ job.clients.clear();
64
+ asJson(res, 200, { ok: true, status: 'stopped' });
65
+ return true;
66
+ }
50
67
  if (job.status !== 'running') {
51
68
  asJson(res, 200, { ok: true, status: job.status });
52
69
  return true;
53
70
  }
54
71
  job.abortController.abort();
55
72
  job.status = 'stopped';
56
- activeJobState.set(null);
57
73
  asJson(res, 200, { ok: true, status: 'stopped' });
58
74
  return true;
59
75
  }
60
- if (pathname === '/api/runs' && method === 'POST') {
61
- if (activeJobState.get()) {
62
- asJson(res, 409, { error: 'Another run is already active', jobId: activeJobState.get() });
76
+ if (pathname === '/api/runs/queue' && method === 'GET') {
77
+ const activeJob = runQueueState.activeJobId ? jobs.get(runQueueState.activeJobId) : null;
78
+ const queuedEntries = runQueueState.queue
79
+ .map((id) => jobs.get(id))
80
+ .filter((j) => !!j && j.status === 'queued')
81
+ .map((j) => ({
82
+ jobId: j.id,
83
+ status: j.status,
84
+ runParams: {
85
+ configPath: j.runParams.configPath,
86
+ runsPerScenario: j.runParams.runsPerScenario,
87
+ scenarioIds: j.runParams.scenarioIds ?? null,
88
+ agents: j.runParams.requestedAgents ?? null
89
+ }
90
+ }));
91
+ asJson(res, 200, {
92
+ active: activeJob
93
+ ? {
94
+ jobId: activeJob.id,
95
+ status: activeJob.status,
96
+ runParams: {
97
+ configPath: activeJob.runParams.configPath,
98
+ runsPerScenario: activeJob.runParams.runsPerScenario,
99
+ scenarioIds: activeJob.runParams.scenarioIds ?? null,
100
+ agents: activeJob.runParams.requestedAgents ?? null
101
+ }
102
+ }
103
+ : null,
104
+ queued: queuedEntries
105
+ });
106
+ return true;
107
+ }
108
+ if (pathname.startsWith('/api/runs/queue/') &&
109
+ method === 'DELETE' &&
110
+ pathname.split('/').length === 5) {
111
+ const jobId = pathname.split('/')[4];
112
+ const job = jobs.get(jobId);
113
+ if (!job) {
114
+ asJson(res, 404, { error: 'Job not found' });
63
115
  return true;
64
116
  }
117
+ if (job.status === 'running') {
118
+ asJson(res, 400, { error: 'Cannot remove a running job. Use the /stop endpoint instead.' });
119
+ return true;
120
+ }
121
+ if (job.status !== 'queued') {
122
+ asJson(res, 404, { error: 'Job is not queued' });
123
+ return true;
124
+ }
125
+ const idx = runQueueState.queue.indexOf(jobId);
126
+ if (idx !== -1)
127
+ runQueueState.queue.splice(idx, 1);
128
+ job.status = 'stopped';
129
+ addJobEvent(job, {
130
+ type: 'error',
131
+ ts: new Date().toISOString(),
132
+ payload: { message: 'Removed from queue by user' }
133
+ });
134
+ for (const client of job.clients)
135
+ client.end();
136
+ job.clients.clear();
137
+ asJson(res, 200, { ok: true, jobId, status: 'stopped' });
138
+ return true;
139
+ }
140
+ if (pathname === '/api/runs' && method === 'POST') {
65
141
  const body = (await parseBody(req));
66
142
  const configPathRaw = String(body.configPath ?? '');
67
143
  const runsPerScenario = Number(body.runsPerScenario ?? 1);
@@ -88,270 +164,59 @@ export async function handleRunsRoutes(params) {
88
164
  asJson(res, 404, { error: `Config not found: ${configPath}` });
89
165
  return true;
90
166
  }
91
- const jobId = `${Date.now()}`;
167
+ const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
168
+ const runParamsObj = {
169
+ configPath,
170
+ runsPerScenario,
171
+ scenarioId,
172
+ scenarioIds,
173
+ requestedAgents,
174
+ applySnapshotEval
175
+ };
92
176
  const job = {
93
177
  id: jobId,
94
- status: 'running',
178
+ status: 'queued',
95
179
  events: [],
96
180
  clients: new Set(),
97
- abortController: new AbortController()
181
+ abortController: new AbortController(),
182
+ runParams: runParamsObj
98
183
  };
99
184
  jobs.set(jobId, job);
100
- activeJobState.set(jobId);
101
- addJobEvent(job, {
102
- type: 'started',
103
- ts: new Date().toISOString(),
104
- payload: {
105
- configPath,
106
- runsPerScenario,
107
- scenarioId: scenarioId ?? null,
108
- scenarioIds: scenarioIds ?? null,
109
- agents: requestedAgents ?? null
110
- }
111
- });
112
- void (async () => {
113
- try {
114
- addJobEvent(job, {
115
- type: 'log',
116
- ts: new Date().toISOString(),
117
- payload: { message: `Loading MCP Evaluation config: ${configPath}` }
118
- });
119
- const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
120
- addJobEvent(job, {
121
- type: 'log',
122
- ts: new Date().toISOString(),
123
- payload: {
124
- message: `Loaded config (${loaded.config.scenarios.length} scenario(s), ${Object.keys(loaded.config.agents ?? {}).length} agent(s), ${Object.keys(loaded.config.servers ?? {}).length} server(s))`
125
- }
126
- });
127
- for (const warning of loaded.warnings ?? []) {
128
- addJobEvent(job, {
129
- type: 'log',
130
- ts: new Date().toISOString(),
131
- payload: { message: warning }
132
- });
133
- }
134
- addJobEvent(job, {
135
- type: 'log',
136
- ts: new Date().toISOString(),
137
- payload: {
138
- message: scenarioIds && scenarioIds.length > 0
139
- ? `Selecting requested scenarios: ${scenarioIds.join(', ')}`
140
- : scenarioId
141
- ? `Selecting requested scenario: ${scenarioId}`
142
- : 'Using all scenarios from config'
143
- }
144
- });
145
- const selectedBaseScenarios = selectScenarioIds(loaded.config, scenarioIds && scenarioIds.length > 0
146
- ? scenarioIds
147
- : scenarioId
148
- ? [scenarioId]
149
- : undefined);
150
- addJobEvent(job, {
151
- type: 'log',
152
- ts: new Date().toISOString(),
153
- payload: {
154
- message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
155
- }
156
- });
157
- const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
158
- const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
159
- addJobEvent(job, {
160
- type: 'log',
161
- ts: new Date().toISOString(),
162
- payload: {
163
- message: requestedAgents && requestedAgents.length > 0
164
- ? `Using requested agents: ${resolvedAgentList.join(', ')}`
165
- : `Using resolved default agents: ${resolvedAgentList.join(', ')}`
166
- }
167
- });
168
- const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
169
- addJobEvent(job, {
170
- type: 'log',
171
- ts: new Date().toISOString(),
172
- payload: {
173
- message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
174
- }
175
- });
176
- const cwdBefore = process.cwd();
177
- process.chdir(settings.workspaceRoot);
178
- try {
179
- addJobEvent(job, {
180
- type: 'log',
181
- ts: new Date().toISOString(),
182
- payload: {
183
- message: `Running evaluation (${runsPerScenario} run(s) per scenario) ...`
184
- }
185
- });
186
- const { runDir, results } = await runAll(expandedConfig, {
187
- runsPerScenario,
188
- scenarioId,
189
- configHash: loaded.hash,
190
- cliVersion: pkgVersion,
191
- runsDir: settings.runsDir,
192
- signal: job.abortController.signal,
193
- onProgress: async (event) => {
194
- const message = formatRunProgressMessage(event);
195
- if (!message)
196
- return;
197
- addJobEvent(job, {
198
- type: 'log',
199
- ts: new Date().toISOString(),
200
- payload: { message }
201
- });
202
- }
203
- });
204
- addJobEvent(job, {
205
- type: 'log',
206
- ts: new Date().toISOString(),
207
- payload: {
208
- message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
209
- }
210
- });
211
- if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
212
- addJobEvent(job, {
213
- type: 'log',
214
- ts: new Date().toISOString(),
215
- payload: { message: 'Applying snapshot evaluation policy ...' }
216
- });
217
- const policy = expandedConfig.snapshot_eval;
218
- const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
219
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
220
- .map((scenario) => scenario.id));
221
- const scenarioBaselineMap = new Map();
222
- for (const scenario of selectedBaseScenarios.scenarios) {
223
- if (scenario.snapshot_eval?.enabled === false)
224
- continue;
225
- const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
226
- if (baselineId)
227
- scenarioBaselineMap.set(scenario.id, baselineId);
228
- }
229
- const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
230
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
231
- .filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
232
- .map((scenario) => scenario.id);
233
- if (scenariosWithoutBaseline.length > 0) {
234
- addJobEvent(job, {
235
- type: 'log',
236
- ts: new Date().toISOString(),
237
- payload: {
238
- message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
239
- }
240
- });
241
- }
242
- const comparisons = [];
243
- const scenarioIdsByBaseline = new Map();
244
- for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
245
- const list = scenarioIdsByBaseline.get(baselineId) ?? [];
246
- list.push(scenarioIdItem);
247
- scenarioIdsByBaseline.set(baselineId, list);
248
- }
249
- for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
250
- addJobEvent(job, {
251
- type: 'log',
252
- ts: new Date().toISOString(),
253
- payload: {
254
- message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
255
- }
256
- });
257
- const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
258
- const fullComparison = compareRunToSnapshot(results, snapshot);
259
- comparisons.push({
260
- ...fullComparison,
261
- scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
262
- });
263
- }
264
- if (comparisons.length > 0) {
265
- applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
266
- addJobEvent(job, {
267
- type: 'log',
268
- ts: new Date().toISOString(),
269
- payload: {
270
- message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
271
- }
272
- });
273
- }
274
- else {
275
- addJobEvent(job, {
276
- type: 'log',
277
- ts: new Date().toISOString(),
278
- payload: {
279
- message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
280
- }
281
- });
282
- }
283
- }
284
- else if (applySnapshotEval) {
285
- addJobEvent(job, {
286
- type: 'log',
287
- ts: new Date().toISOString(),
288
- payload: {
289
- message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
290
- }
291
- });
292
- }
293
- else {
294
- addJobEvent(job, {
295
- type: 'log',
296
- ts: new Date().toISOString(),
297
- payload: {
298
- message: 'Snapshot evaluation skipped for this run (disabled in run request)'
299
- }
300
- });
301
- }
302
- addJobEvent(job, {
303
- type: 'log',
304
- ts: new Date().toISOString(),
305
- payload: { message: `Writing results to ${runDir}` }
306
- });
307
- writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
308
- writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
309
- addJobEvent(job, {
310
- type: 'log',
311
- ts: new Date().toISOString(),
312
- payload: {
313
- message: `Run finished: ${results.summary.total_runs} run(s), pass rate ${Math.round(results.summary.pass_rate * 100)}%`
314
- }
315
- });
316
- addJobEvent(job, {
317
- type: 'completed',
318
- ts: new Date().toISOString(),
319
- payload: {
320
- runId: results.metadata.run_id,
321
- runDir,
322
- summary: results.summary,
323
- snapshotEval: results.metadata.snapshot_eval ?? null
324
- }
325
- });
326
- job.status = 'completed';
185
+ if (!runQueueState.activeJobId) {
186
+ // No active job — start immediately
187
+ job.status = 'running';
188
+ runQueueState.activeJobId = jobId;
189
+ addJobEvent(job, {
190
+ type: 'started',
191
+ ts: new Date().toISOString(),
192
+ payload: {
193
+ configPath,
194
+ runsPerScenario,
195
+ scenarioId: scenarioId ?? null,
196
+ scenarioIds: scenarioIds ?? null,
197
+ agents: requestedAgents ?? null
327
198
  }
328
- finally {
329
- process.chdir(cwdBefore);
199
+ });
200
+ void executeRunJob(job, settings, jobs, runQueueState, deps);
201
+ asJson(res, 202, { jobId });
202
+ }
203
+ else {
204
+ // Queue this job
205
+ runQueueState.queue.push(jobId);
206
+ addJobEvent(job, {
207
+ type: 'queued',
208
+ ts: new Date().toISOString(),
209
+ payload: {
210
+ configPath,
211
+ runsPerScenario,
212
+ scenarioId: scenarioId ?? null,
213
+ scenarioIds: scenarioIds ?? null,
214
+ agents: requestedAgents ?? null,
215
+ position: runQueueState.queue.length
330
216
  }
331
- }
332
- catch (error) {
333
- const aborted = job.abortController.signal.aborted || job.status === 'stopped';
334
- addJobEvent(job, {
335
- type: 'error',
336
- ts: new Date().toISOString(),
337
- payload: {
338
- message: aborted
339
- ? 'Run aborted by user'
340
- : error instanceof Error
341
- ? error.message
342
- : String(error)
343
- }
344
- });
345
- job.status = aborted ? 'stopped' : 'error';
346
- }
347
- finally {
348
- activeJobState.set(null);
349
- for (const client of job.clients)
350
- client.end();
351
- job.clients.clear();
352
- }
353
- })();
354
- asJson(res, 202, { jobId });
217
+ });
218
+ asJson(res, 202, { jobId, queued: true, position: runQueueState.queue.length });
219
+ }
355
220
  return true;
356
221
  }
357
222
  if (pathname.startsWith('/api/runs/') && pathname.endsWith('/assistant') && method === 'POST') {
@@ -507,6 +372,290 @@ export async function handleRunsRoutes(params) {
507
372
  }
508
373
  return false;
509
374
  }
375
/**
 * Starts the next runnable job from the queue, if no job is currently active.
 *
 * Queue entries are consumed from the front. Entries whose job no longer
 * exists in `jobs`, or whose status is no longer `'queued'` (for example a job
 * that was stopped while waiting), are discarded. The first job that is still
 * queued is marked `'running'`, recorded as the active job, announced with a
 * `'started'` event, and handed to `executeRunJob`.
 *
 * @param {Map<string, object>} jobs - Job registry keyed by job id.
 * @param {{activeJobId: (string|null), queue: string[]}} runQueueState - Shared
 *   queue state; `queue` and `activeJobId` are mutated in place.
 * @param {object} settings - Passed through unchanged to `executeRunJob`.
 * @param {object} deps - Injected helpers; `deps.addJobEvent` is called here,
 *   the rest is passed through to `executeRunJob`.
 * @returns {void}
 */
function advanceQueue(jobs, runQueueState, settings, deps) {
    // Only one job runs at a time; the running job calls back in here when done.
    if (runQueueState.activeJobId) {
        return;
    }
    while (runQueueState.queue.length > 0) {
        const nextId = runQueueState.queue.shift();
        const nextJob = jobs.get(nextId);
        // Drop stale entries: the job was removed or left the 'queued' state.
        if (!nextJob || nextJob.status !== 'queued') {
            continue;
        }
        nextJob.status = 'running';
        runQueueState.activeJobId = nextId;
        const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents } = nextJob.runParams;
        deps.addJobEvent(nextJob, {
            type: 'started',
            ts: new Date().toISOString(),
            payload: {
                configPath,
                runsPerScenario,
                scenarioId: scenarioId ?? null,
                scenarioIds: scenarioIds ?? null,
                agents: requestedAgents ?? null
            }
        });
        // Fire-and-forget: the run reports through job events. A rejection here
        // means the run's own error handling or cleanup threw, so log it instead
        // of leaving an unhandled rejection behind.
        executeRunJob(nextJob, settings, jobs, runQueueState, deps).catch((error) => {
            console.error(`Run job ${nextId} failed outside of its own error handling:`, error);
        });
        return;
    }
}
400
/**
 * Executes one run job from config loading to report writing, then releases
 * the active slot and starts the next queued job.
 *
 * Progress is reported as `'log'` events on the job; success ends with a
 * `'completed'` event and status `'completed'`. Any error thrown inside the
 * run is reported as an `'error'` event and the status becomes `'stopped'`
 * (when the job's abort signal fired or the job was already marked stopped)
 * or `'error'` otherwise.
 *
 * Side effects visible in this function: the process working directory is
 * switched to `settings.workspaceRoot` while the evaluation runs and restored
 * afterwards; `results.json` and `report.html` are written into the run
 * directory returned by `runAll`; all SSE clients attached to the job are
 * ended and cleared when the run finishes.
 *
 * @param {object} job - Job record; reads `runParams`, `abortController`,
 *   `clients`, and writes `status`.
 * @param {object} settings - Reads `librariesDir`, `workspaceRoot`, `runsDir`
 *   and `snapshotsDir`.
 * @param {Map<string, object>} jobs - Job registry, passed to the queue helpers.
 * @param {{activeJobId: (string|null), queue: string[]}} runQueueState - Shared
 *   queue state; `activeJobId` is reset to `null` when the run ends.
 * @param {object} deps - Injected helpers (event emission, scenario/agent
 *   selection, snapshot comparison, package version).
 * @returns {Promise<void>}
 */
async function executeRunJob(job, settings, jobs, runQueueState, deps) {
    const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, pkgVersion } = deps;
    const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, applySnapshotEval } = job.runParams;
    // Every progress line has the same envelope; only the message differs.
    const log = (message) => addJobEvent(job, {
        type: 'log',
        ts: new Date().toISOString(),
        payload: { message }
    });
    try {
        log(`Loading MCP Evaluation config: ${configPath}`);
        const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
        log(`Loaded config (${loaded.config.scenarios.length} scenario(s), ${Object.keys(loaded.config.agents ?? {}).length} agent(s), ${Object.keys(loaded.config.servers ?? {}).length} server(s))`);
        for (const warning of loaded.warnings ?? []) {
            log(warning);
        }
        // An explicit id list wins over the single-id field; neither means "all".
        const hasScenarioList = Boolean(scenarioIds && scenarioIds.length > 0);
        log(hasScenarioList
            ? `Selecting requested scenarios: ${scenarioIds.join(', ')}`
            : scenarioId
                ? `Selecting requested scenario: ${scenarioId}`
                : 'Using all scenarios from config');
        const selectedBaseScenarios = selectScenarioIds(loaded.config, hasScenarioList ? scenarioIds : scenarioId ? [scenarioId] : undefined);
        log(`Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`);
        const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
        const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
        log(requestedAgents && requestedAgents.length > 0
            ? `Using requested agents: ${resolvedAgentList.join(', ')}`
            : `Using resolved default agents: ${resolvedAgentList.join(', ')}`);
        const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
        log(`Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`);
        const cwdBefore = process.cwd();
        process.chdir(settings.workspaceRoot);
        try {
            log(`Running evaluation (${runsPerScenario} run(s) per scenario) ...`);
            const { runDir, results } = await runAll(expandedConfig, {
                runsPerScenario,
                scenarioId,
                configHash: loaded.hash,
                cliVersion: pkgVersion,
                runsDir: settings.runsDir,
                signal: job.abortController.signal,
                onProgress: async (event) => {
                    const message = formatRunProgressMessage(event);
                    if (message) {
                        log(message);
                    }
                }
            });
            log(`Evaluation execution finished (run id: ${results.metadata.run_id})`);
            if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
                applySnapshotEvaluationToRun({
                    results,
                    policy: expandedConfig.snapshot_eval,
                    scenarios: selectedBaseScenarios.scenarios,
                    snapshotsDir: settings.snapshotsDir,
                    deps,
                    log
                });
            }
            else if (applySnapshotEval) {
                log('Snapshot evaluation requested, but config snapshot evaluation is disabled');
            }
            else {
                log('Snapshot evaluation skipped for this run (disabled in run request)');
            }
            log(`Writing results to ${runDir}`);
            writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
            writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
            log(`Run finished: ${results.summary.total_runs} run(s), pass rate ${Math.round(results.summary.pass_rate * 100)}%`);
            addJobEvent(job, {
                type: 'completed',
                ts: new Date().toISOString(),
                payload: {
                    runId: results.metadata.run_id,
                    runDir,
                    summary: results.summary,
                    snapshotEval: results.metadata.snapshot_eval ?? null
                }
            });
            job.status = 'completed';
        }
        finally {
            // Always restore the working directory, even when the run throws.
            process.chdir(cwdBefore);
        }
    }
    catch (error) {
        const aborted = job.abortController.signal.aborted || job.status === 'stopped';
        addJobEvent(job, {
            type: 'error',
            ts: new Date().toISOString(),
            payload: {
                message: aborted
                    ? 'Run aborted by user'
                    : error instanceof Error
                        ? error.message
                        : String(error)
            }
        });
        job.status = aborted ? 'stopped' : 'error';
    }
    finally {
        // Release the active slot first so the queue helpers see it as free.
        runQueueState.activeJobId = null;
        try {
            for (const client of job.clients) {
                client.end();
            }
            job.clients.clear();
        }
        finally {
            // Keep the queue moving even if closing a client stream threw.
            advanceQueue(jobs, runQueueState, settings, deps);
            pruneOldJobs(jobs, runQueueState);
        }
    }
}

/**
 * Compares finished run results against the configured snapshot baselines and
 * applies the snapshot policy to `results` (via the injected
 * `applySnapshotPolicyToRunResult`) when at least one comparison was made.
 *
 * Scenarios with `snapshot_eval.enabled === false` are ignored. Each remaining
 * scenario uses its own `baseline_snapshot_id`, falling back to the policy's.
 * Scenarios are grouped by baseline so each snapshot is loaded and compared
 * once, and each comparison is narrowed to the scenarios of its group.
 *
 * @param {object} args
 * @param {object} args.results - Run results; handed to the injected helpers.
 * @param {object} args.policy - The config's `snapshot_eval` section.
 * @param {object[]} args.scenarios - Selected base scenarios.
 * @param {string} args.snapshotsDir - Directory passed to `loadSnapshot`.
 * @param {object} args.deps - Provides `loadSnapshot`, `compareRunToSnapshot`
 *   and `applySnapshotPolicyToRunResult`.
 * @param {(message: string) => void} args.log - Emits a progress line.
 * @returns {void}
 */
function applySnapshotEvaluationToRun({ results, policy, scenarios, snapshotsDir, deps, log }) {
    const { loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult } = deps;
    log('Applying snapshot evaluation policy ...');
    const baselineFor = (scenario) => scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
    const enabledScenarios = scenarios.filter((scenario) => scenario.snapshot_eval?.enabled !== false);
    const enabledScenarioIds = new Set(enabledScenarios.map((scenario) => scenario.id));
    // Keyed by scenario id, so a repeated id keeps a single (latest) baseline.
    const scenarioBaselineMap = new Map();
    for (const scenario of enabledScenarios) {
        const baselineId = baselineFor(scenario);
        if (baselineId) {
            scenarioBaselineMap.set(scenario.id, baselineId);
        }
    }
    const scenariosWithoutBaseline = enabledScenarios
        .filter((scenario) => !baselineFor(scenario))
        .map((scenario) => scenario.id);
    if (scenariosWithoutBaseline.length > 0) {
        log(`Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`);
    }
    const scenarioIdsByBaseline = new Map();
    for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
        const list = scenarioIdsByBaseline.get(baselineId) ?? [];
        list.push(scenarioIdItem);
        scenarioIdsByBaseline.set(baselineId, list);
    }
    const comparisons = [];
    for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
        log(`Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`);
        const snapshot = loadSnapshot(baselineId, snapshotsDir);
        const fullComparison = compareRunToSnapshot(results, snapshot);
        comparisons.push({
            ...fullComparison,
            scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
        });
    }
    if (comparisons.length > 0) {
        applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
        log(`Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`);
    }
    else {
        log('Snapshot evaluation enabled, but no baseline comparisons were applied');
    }
}
642
/**
 * Removes finished jobs from the in-memory registry once they are old enough.
 *
 * A job is deleted only when all of the following hold:
 *  - its id is neither the active job id nor present in the queue,
 *  - its status is `'completed'`, `'error'` or `'stopped'`,
 *  - it has at least one event, and the timestamp of its last event is more
 *    than `maxAgeMs` before `now`.
 *
 * Jobs without events, or whose last timestamp does not parse to a date, are
 * kept because their age cannot be determined.
 *
 * @param {Map<string, object>} jobs - Job registry keyed by job id; mutated in place.
 * @param {{activeJobId: (string|null), queue: string[]}} runQueueState - Ids
 *   listed here are never pruned.
 * @param {number} [maxAgeMs=1800000] - Retention window in milliseconds
 *   (30 minutes by default).
 * @param {number} [now=Date.now()] - Reference time in epoch milliseconds.
 * @returns {void}
 */
function pruneOldJobs(jobs, runQueueState, maxAgeMs = 30 * 60_000, now = Date.now()) {
    const terminalStatuses = new Set(['completed', 'error', 'stopped']);
    const protectedIds = new Set([runQueueState.activeJobId, ...runQueueState.queue].filter(Boolean));
    for (const [id, job] of jobs) {
        if (protectedIds.has(id) || !terminalStatuses.has(job.status)) {
            continue;
        }
        const lastEvent = job.events[job.events.length - 1];
        if (!lastEvent) {
            continue;
        }
        // An unparsable timestamp yields NaN, which never compares greater, so the job stays.
        const ageMs = now - new Date(lastEvent.ts).getTime();
        if (ageMs > maxAgeMs) {
            jobs.delete(id);
        }
    }
}
510
659
  function formatRunProgressMessage(event) {
511
660
  switch (event.type) {
512
661
  case 'run_started':