@inspectr/mcplab 1.15.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/README.md +0 -35
  2. package/dist/app/assets/index-17cleCWQ.js +254 -0
  3. package/dist/app/assets/index-Bekohuot.css +1 -0
  4. package/dist/app/index.html +2 -2
  5. package/dist/app-server/app-context.d.ts +0 -22
  6. package/dist/app-server/app-context.d.ts.map +1 -1
  7. package/dist/app-server/evals-routes.d.ts.map +1 -1
  8. package/dist/app-server/evals-routes.js +1 -38
  9. package/dist/app-server/evals-routes.js.map +1 -1
  10. package/dist/app-server/libraries-store.d.ts.map +1 -1
  11. package/dist/app-server/libraries-store.js +2 -3
  12. package/dist/app-server/libraries-store.js.map +1 -1
  13. package/dist/app-server/markdown-reports.d.ts.map +1 -1
  14. package/dist/app-server/markdown-reports.js +64 -4
  15. package/dist/app-server/markdown-reports.js.map +1 -1
  16. package/dist/app-server/result-assistant-domain.js +1 -2
  17. package/dist/app-server/result-assistant-domain.js.map +1 -1
  18. package/dist/app-server/result-assistant.d.ts.map +1 -1
  19. package/dist/app-server/result-assistant.js +7 -1
  20. package/dist/app-server/result-assistant.js.map +1 -1
  21. package/dist/app-server/router.d.ts.map +1 -1
  22. package/dist/app-server/router.js +0 -24
  23. package/dist/app-server/router.js.map +1 -1
  24. package/dist/app-server/runs-routes.d.ts +15 -4
  25. package/dist/app-server/runs-routes.d.ts.map +1 -1
  26. package/dist/app-server/runs-routes.js +324 -136
  27. package/dist/app-server/runs-routes.js.map +1 -1
  28. package/dist/app-server/runs-store.d.ts +10 -0
  29. package/dist/app-server/runs-store.d.ts.map +1 -1
  30. package/dist/app-server/runs-store.js +27 -0
  31. package/dist/app-server/runs-store.js.map +1 -1
  32. package/dist/app-server/scenario-assistant-domain.d.ts +0 -16
  33. package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
  34. package/dist/app-server/scenario-assistant-domain.js +5 -8
  35. package/dist/app-server/scenario-assistant-domain.js.map +1 -1
  36. package/dist/app-server/scenario-assistant.d.ts.map +1 -1
  37. package/dist/app-server/scenario-assistant.js +7 -1
  38. package/dist/app-server/scenario-assistant.js.map +1 -1
  39. package/dist/app-server/snapshots-routes.d.ts +1 -13
  40. package/dist/app-server/snapshots-routes.d.ts.map +1 -1
  41. package/dist/app-server/snapshots-routes.js +9 -79
  42. package/dist/app-server/snapshots-routes.js.map +1 -1
  43. package/dist/app-server/tool-analysis.d.ts.map +1 -1
  44. package/dist/app-server/tool-analysis.js +25 -1
  45. package/dist/app-server/tool-analysis.js.map +1 -1
  46. package/dist/app-server/types.d.ts +0 -2
  47. package/dist/app-server/types.d.ts.map +1 -1
  48. package/dist/cli.js +79 -288
  49. package/dist/cli.js.map +1 -1
  50. package/dist/interactive-helpers.d.ts +0 -1
  51. package/dist/interactive-helpers.d.ts.map +1 -1
  52. package/dist/interactive-helpers.js +0 -3
  53. package/dist/interactive-helpers.js.map +1 -1
  54. package/package.json +4 -4
  55. package/dist/app/assets/index-BH8cCzoo.css +0 -1
  56. package/dist/app/assets/index-C2W0NrXX.js +0 -250
@@ -2,33 +2,108 @@ import { randomUUID } from 'node:crypto';
2
2
  import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
3
3
  import { tmpdir } from 'node:os';
4
4
  import { isAbsolute, join, relative, resolve } from 'node:path';
5
- import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown } from '@inspectr/mcplab-core';
5
+ import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
6
6
  import { renderReport } from '@inspectr/mcplab-reporting';
7
7
  import { OAuthAuthorizationRequiredError } from './oauth-session-manager.js';
8
+ import { selectScenarioIds } from './runs-store.js';
9
+ import { readLibraries as readLibrariesFromStore } from './libraries-store.js';
10
+ export function mergeLibraryEntriesIntoConfig(config, libraryAgents, libraryServers) {
11
+ return {
12
+ ...config,
13
+ agents: { ...libraryAgents, ...config.agents },
14
+ servers: { ...libraryServers, ...config.servers }
15
+ };
16
+ }
17
+ export function applyLibraryEntries(loaded, libraryAgents, libraryServers) {
18
+ loaded.config = mergeLibraryEntriesIntoConfig(loaded.config, libraryAgents, libraryServers);
19
+ loaded.hash = hashConfig(loaded.config);
20
+ }
21
+ function filterScenarioOverridesToSelectedScenarios(selectedConfig, scenarioServerOverrides) {
22
+ if (!scenarioServerOverrides)
23
+ return undefined;
24
+ const selectedIds = new Set(selectedConfig.scenarios.map((scenario) => scenario.id));
25
+ const filtered = Object.fromEntries(Object.entries(scenarioServerOverrides).filter(([scenarioId]) => selectedIds.has(scenarioId)));
26
+ return Object.keys(filtered).length > 0 ? filtered : undefined;
27
+ }
28
+ // Backward-compatible exports used by existing tests/imports.
8
29
  export function mergeLibraryAgentsIntoConfig(config, libraryAgents) {
9
- return { ...config, agents: { ...libraryAgents, ...config.agents } };
30
+ return mergeLibraryEntriesIntoConfig(config, libraryAgents, {});
10
31
  }
11
32
  export function applyLibraryAgents(loaded, libraryAgents) {
12
- loaded.config = mergeLibraryAgentsIntoConfig(loaded.config, libraryAgents);
13
- loaded.hash = hashConfig(loaded.config);
33
+ applyLibraryEntries(loaded, libraryAgents, {});
14
34
  }
15
35
  export async function handleRunsRoutes(params) {
16
36
  const { req, res, pathname, method, settings, jobs, runQueueState, oauthSessionManager, deps } = params;
17
- const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
37
+ const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
18
38
  if (pathname === '/api/runs' && method === 'GET') {
19
39
  const requestUrl = new URL(req.url ?? '/api/runs', 'http://localhost');
20
40
  const since = requestUrl.searchParams.get('since') ?? undefined;
21
41
  const until = requestUrl.searchParams.get('until') ?? undefined;
42
+ const scenario = requestUrl.searchParams.get('scenario') ?? undefined;
22
43
  const lastDaysRaw = requestUrl.searchParams.get('last_days');
23
44
  const lastDaysParsed = lastDaysRaw === null ? NaN : Number(lastDaysRaw);
24
45
  const lastDays = Number.isFinite(lastDaysParsed) && lastDaysParsed > 0
25
46
  ? Math.floor(lastDaysParsed)
26
47
  : undefined;
27
- asJson(res, 200, listRuns(settings.runsDir, {
48
+ const limitRaw = Number(requestUrl.searchParams.get('limit'));
49
+ const offsetRaw = Number(requestUrl.searchParams.get('offset'));
50
+ const limit = Number.isFinite(limitRaw) ? Math.max(1, Math.min(100, Math.floor(limitRaw))) : 25;
51
+ const offset = Number.isFinite(offsetRaw) ? Math.max(0, Math.floor(offsetRaw)) : 0;
52
+ const all = listRuns(settings.runsDir, {
28
53
  since,
29
54
  until,
30
- lastDays
31
- }));
55
+ lastDays,
56
+ scenario
57
+ });
58
+ const data = all.slice(offset, offset + limit);
59
+ const totalCount = all.length;
60
+ const hasMore = offset + data.length < totalCount;
61
+ const nextOffset = hasMore ? offset + data.length : null;
62
+ const prevOffset = offset > 0 ? Math.max(0, offset - limit) : null;
63
+ asJson(res, 200, {
64
+ object: 'list',
65
+ url: `${pathname}${requestUrl.search}`,
66
+ data,
67
+ has_more: hasMore,
68
+ total_count: totalCount,
69
+ next_offset: nextOffset,
70
+ prev_offset: prevOffset
71
+ });
72
+ return true;
73
+ }
74
+ if (pathname === '/api/runs/latest-pass-rates' && method === 'POST') {
75
+ const body = (await parseBody(req));
76
+ const requestedConfigs = Array.isArray(body.configs) ? body.configs : [];
77
+ const normalizedConfigs = requestedConfigs
78
+ .map((entry) => ({
79
+ id: String(entry?.id ?? '').trim(),
80
+ sourcePath: String(entry?.sourcePath ?? '').trim(),
81
+ relativePath: String(entry?.relativePath ?? '').trim(),
82
+ configHash: String(entry?.configHash ?? '').trim()
83
+ }))
84
+ .filter((entry) => entry.id);
85
+ const lastDaysRaw = Number(body.lastDays);
86
+ const lastDays = Number.isFinite(lastDaysRaw) && lastDaysRaw > 0 ? Math.floor(lastDaysRaw) : undefined;
87
+ const summaries = listRuns(settings.runsDir, { lastDays });
88
+ const pending = new Set(normalizedConfigs.map((entry) => entry.id));
89
+ const byConfigId = {};
90
+ for (const summary of summaries) {
91
+ if (pending.size === 0)
92
+ break;
93
+ const summaryPath = String(summary.configPath ?? '').trim();
94
+ const summaryHash = String(summary.configHash ?? '').trim();
95
+ for (const cfg of normalizedConfigs) {
96
+ if (!pending.has(cfg.id))
97
+ continue;
98
+ if ((cfg.sourcePath && cfg.sourcePath === summaryPath) ||
99
+ (cfg.relativePath && cfg.relativePath === summaryPath) ||
100
+ (cfg.configHash && cfg.configHash === summaryHash)) {
101
+ byConfigId[cfg.id] = summary.passRate;
102
+ pending.delete(cfg.id);
103
+ }
104
+ }
105
+ }
106
+ asJson(res, 200, { byConfigId });
32
107
  return true;
33
108
  }
34
109
  if (pathname.startsWith('/api/runs/') && pathname.endsWith('/trace') && method === 'GET') {
@@ -109,7 +184,9 @@ export async function handleRunsRoutes(params) {
109
184
  runsPerScenario: j.runParams.runsPerScenario,
110
185
  scenarioIds: j.runParams.scenarioIds ?? null,
111
186
  agents: j.runParams.requestedAgents ?? null,
112
- runNote: j.runParams.runNote ?? null
187
+ runNote: j.runParams.runNote ?? null,
188
+ serverOverrideAll: j.runParams.serverOverrideAll ?? null,
189
+ scenarioServerOverrides: j.runParams.scenarioServerOverrides ?? null
113
190
  }
114
191
  }));
115
192
  asJson(res, 200, {
@@ -122,7 +199,9 @@ export async function handleRunsRoutes(params) {
122
199
  runsPerScenario: activeJob.runParams.runsPerScenario,
123
200
  scenarioIds: activeJob.runParams.scenarioIds ?? null,
124
201
  agents: activeJob.runParams.requestedAgents ?? null,
125
- runNote: activeJob.runParams.runNote ?? null
202
+ runNote: activeJob.runParams.runNote ?? null,
203
+ serverOverrideAll: activeJob.runParams.serverOverrideAll ?? null,
204
+ scenarioServerOverrides: activeJob.runParams.scenarioServerOverrides ?? null
126
205
  }
127
206
  }
128
207
  : null,
@@ -182,9 +261,45 @@ export async function handleRunsRoutes(params) {
182
261
  const requestedAgents = Array.isArray(body.agents)
183
262
  ? body.agents.map((agent) => String(agent).trim()).filter(Boolean)
184
263
  : undefined;
185
- const applySnapshotEval = body.applySnapshotEval !== false;
186
264
  const runNoteRaw = typeof body.runNote === 'string' ? body.runNote.trim() : '';
187
265
  const runNote = runNoteRaw ? runNoteRaw.slice(0, 500) : undefined;
266
+ const serverOverrideAll = Array.isArray(body.serverOverrideAll)
267
+ ? body.serverOverrideAll.map((id) => String(id).trim()).filter(Boolean)
268
+ : undefined;
269
+ if (Array.isArray(body.serverOverrideAll) &&
270
+ (!serverOverrideAll || serverOverrideAll.length === 0)) {
271
+ asJson(res, 400, { error: 'serverOverrideAll must include at least one server id' });
272
+ return true;
273
+ }
274
+ if (body.scenarioServerOverrides !== undefined &&
275
+ (typeof body.scenarioServerOverrides !== 'object' ||
276
+ body.scenarioServerOverrides === null ||
277
+ Array.isArray(body.scenarioServerOverrides))) {
278
+ asJson(res, 400, {
279
+ error: 'scenarioServerOverrides must be an object of scenarioId -> string[]'
280
+ });
281
+ return true;
282
+ }
283
+ let scenarioServerOverrides;
284
+ if (body.scenarioServerOverrides && typeof body.scenarioServerOverrides === 'object') {
285
+ const normalizedEntries = [];
286
+ for (const [rawScenarioId, rawServerIds] of Object.entries(body.scenarioServerOverrides)) {
287
+ const scenarioOverrideId = String(rawScenarioId).trim();
288
+ if (!scenarioOverrideId)
289
+ continue;
290
+ if (!Array.isArray(rawServerIds)) {
291
+ asJson(res, 400, {
292
+ error: `scenarioServerOverrides.${scenarioOverrideId} must be an array of server ids`
293
+ });
294
+ return true;
295
+ }
296
+ normalizedEntries.push([
297
+ scenarioOverrideId,
298
+ rawServerIds.map((id) => String(id).trim()).filter(Boolean)
299
+ ]);
300
+ }
301
+ scenarioServerOverrides = Object.fromEntries(normalizedEntries);
302
+ }
188
303
  if (!configPathRaw) {
189
304
  asJson(res, 400, { error: 'configPath is required' });
190
305
  return true;
@@ -200,17 +315,27 @@ export async function handleRunsRoutes(params) {
200
315
  asJson(res, 404, { error: `Config not found: ${configPath}` });
201
316
  return true;
202
317
  }
203
- // Eagerly cache OAuth server names to avoid re-parsing config in advanceQueue
204
- let oauthServerNames;
205
318
  try {
206
- const loaded = loadConfig(configPath);
207
- oauthServerNames = Object.entries(loaded.config.servers ?? {})
208
- .filter(([, v]) => v.auth?.type === 'oauth_authorization_code')
209
- .map(([name]) => name);
319
+ const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
320
+ const libraries = readLibraries(settings.librariesDir);
321
+ applyLibraryEntries(loaded, libraries.agents, libraries.servers);
322
+ const selected = scenarioIds?.length
323
+ ? deps.selectScenarioIds(loaded.config, scenarioIds)
324
+ : scenarioId
325
+ ? deps.selectScenarioIds(loaded.config, [scenarioId])
326
+ : loaded.config;
327
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, scenarioServerOverrides);
328
+ applyRuntimeServerOverrides(selected, {
329
+ serverOverrideAll,
330
+ scenarioServerOverrides: filteredScenarioOverrides
331
+ });
210
332
  }
211
- catch {
212
- // Will be resolved lazily in advanceQueue if needed
333
+ catch (error) {
334
+ asJson(res, 400, { error: error instanceof Error ? error.message : String(error) });
335
+ return true;
213
336
  }
337
+ // Resolve lazily in advanceQueue so runtime overrides are always reflected.
338
+ const oauthServerNames = undefined;
214
339
  const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
215
340
  const runParamsObj = {
216
341
  configPath,
@@ -218,9 +343,10 @@ export async function handleRunsRoutes(params) {
218
343
  scenarioId,
219
344
  scenarioIds,
220
345
  requestedAgents,
221
- applySnapshotEval,
222
346
  runNote,
223
- oauthServerNames
347
+ oauthServerNames,
348
+ serverOverrideAll,
349
+ scenarioServerOverrides
224
350
  };
225
351
  const job = {
226
352
  id: jobId,
@@ -244,6 +370,8 @@ export async function handleRunsRoutes(params) {
244
370
  scenarioIds: scenarioIds ?? null,
245
371
  agents: requestedAgents ?? null,
246
372
  runNote: runNote ?? null,
373
+ serverOverrideAll: serverOverrideAll ?? null,
374
+ scenarioServerOverrides: scenarioServerOverrides ?? null,
247
375
  position: runQueueState.queue.length
248
376
  }
249
377
  });
@@ -568,19 +696,39 @@ function toCoreExtractRules(extractRules) {
568
696
  }
569
697
  return rules;
570
698
  }
571
- function resolveOAuthServersForJob(job) {
572
- if (job.runParams.oauthServerNames !== undefined) {
699
+ function resolveOAuthServersForJob(job, librariesDir) {
700
+ if (job.runParams.oauthServerNames !== undefined)
573
701
  return job.runParams.oauthServerNames;
574
- }
575
702
  try {
576
- const loaded = loadConfig(job.runParams.configPath);
577
- const names = Object.entries(loaded.config.servers ?? {})
578
- .filter(([, v]) => v.auth?.type === 'oauth_authorization_code')
579
- .map(([name]) => name);
703
+ const loaded = loadConfig(job.runParams.configPath, { bundleRoot: librariesDir });
704
+ const libraries = readLibrariesFromStore(librariesDir);
705
+ applyLibraryEntries(loaded, libraries.agents, libraries.servers);
706
+ const selected = job.runParams.scenarioIds?.length
707
+ ? selectScenarioIds(loaded.config, job.runParams.scenarioIds)
708
+ : job.runParams.scenarioId
709
+ ? selectScenarioIds(loaded.config, [job.runParams.scenarioId])
710
+ : loaded.config;
711
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, job.runParams.scenarioServerOverrides);
712
+ const withOverrides = applyRuntimeServerOverrides(selected, {
713
+ serverOverrideAll: job.runParams.serverOverrideAll,
714
+ scenarioServerOverrides: filteredScenarioOverrides
715
+ });
716
+ const effectiveServers = new Set(withOverrides.scenarios.flatMap((scenario) => scenario.servers));
717
+ const names = Array.from(effectiveServers).filter((name) => {
718
+ const config = withOverrides.servers?.[name];
719
+ return config?.auth?.type === 'oauth_authorization_code';
720
+ });
580
721
  job.runParams.oauthServerNames = names;
581
722
  return names;
582
723
  }
583
- catch {
724
+ catch (error) {
725
+ const message = error instanceof Error ? error.message : String(error);
726
+ if (message.includes('Unknown server refs') ||
727
+ message.includes('Unknown scenarios in scenarioServerOverrides') ||
728
+ message.includes('serverOverrideAll must include at least one server id')) {
729
+ throw error;
730
+ }
731
+ console.warn(`[mcplab] Failed to resolve OAuth servers for queued job '${job.id}': ${message}`);
584
732
  return [];
585
733
  }
586
734
  }
@@ -599,7 +747,25 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
599
747
  continue;
600
748
  }
601
749
  // Pre-check OAuth before starting
602
- const oauthServers = resolveOAuthServersForJob(nextJob);
750
+ let oauthServers = [];
751
+ try {
752
+ oauthServers = resolveOAuthServersForJob(nextJob, settings.librariesDir);
753
+ }
754
+ catch (error) {
755
+ runQueueState.queue.shift();
756
+ nextJob.status = 'error';
757
+ deps.addJobEvent(nextJob, {
758
+ type: 'error',
759
+ ts: new Date().toISOString(),
760
+ payload: {
761
+ message: error instanceof Error ? error.message : String(error)
762
+ }
763
+ });
764
+ for (const client of nextJob.clients)
765
+ client.end();
766
+ nextJob.clients.clear();
767
+ continue;
768
+ }
603
769
  if (oauthServers.length > 0) {
604
770
  const authStatus = oauthSessionManager.checkServersAuthStatus(oauthServers);
605
771
  const needsAuth = authStatus.filter((s) => s.status === 'auth_required');
@@ -634,7 +800,9 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
634
800
  scenarioId: nextJob.runParams.scenarioId ?? null,
635
801
  scenarioIds: nextJob.runParams.scenarioIds ?? null,
636
802
  agents: nextJob.runParams.requestedAgents ?? null,
637
- runNote: nextJob.runParams.runNote ?? null
803
+ runNote: nextJob.runParams.runNote ?? null,
804
+ serverOverrideAll: nextJob.runParams.serverOverrideAll ?? null,
805
+ scenarioServerOverrides: nextJob.runParams.scenarioServerOverrides ?? null
638
806
  }
639
807
  });
640
808
  void executeRunJob(nextJob, settings, jobs, runQueueState, oauthSessionManager, deps);
@@ -646,8 +814,8 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
646
814
  }
647
815
  }
648
816
  async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionManager, deps) {
649
- const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pkgVersion } = deps;
650
- const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, applySnapshotEval, runNote } = job.runParams;
817
+ const { addJobEvent, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pkgVersion } = deps;
818
+ const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, runNote, serverOverrideAll, scenarioServerOverrides } = job.runParams;
651
819
  try {
652
820
  addJobEvent(job, {
653
821
  type: 'log',
@@ -655,8 +823,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
655
823
  payload: { message: `Loading MCP Evaluation config: ${configPath}` }
656
824
  });
657
825
  const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
658
- const { agents: libraryAgents } = readLibraries(settings.librariesDir);
659
- applyLibraryAgents(loaded, libraryAgents);
826
+ const { agents: libraryAgents, servers: libraryServers } = readLibraries(settings.librariesDir);
827
+ applyLibraryEntries(loaded, libraryAgents, libraryServers);
660
828
  addJobEvent(job, {
661
829
  type: 'log',
662
830
  ts: new Date().toISOString(),
@@ -690,7 +858,30 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
690
858
  message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
691
859
  }
692
860
  });
693
- const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
861
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selectedBaseScenarios, scenarioServerOverrides);
862
+ const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseScenarios, {
863
+ serverOverrideAll,
864
+ scenarioServerOverrides: filteredScenarioOverrides
865
+ });
866
+ const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
867
+ addJobEvent(job, {
868
+ type: 'log',
869
+ ts: new Date().toISOString(),
870
+ payload: {
871
+ message: `Applied runtime server overrides: global=${serverOverrideAll?.length ?? 0} scenario-specific=${Object.keys(filteredScenarioOverrides ?? {}).length}`
872
+ }
873
+ });
874
+ const effectiveScenarioServers = runtimeOverriddenConfig.scenarios
875
+ .map((scenario) => `${scenario.id}=[${scenario.servers.join(', ')}]`)
876
+ .join('; ');
877
+ addJobEvent(job, {
878
+ type: 'log',
879
+ ts: new Date().toISOString(),
880
+ payload: {
881
+ message: `Effective MCP servers per scenario: ${effectiveScenarioServers || '(none)'}`
882
+ }
883
+ });
884
+ const resolvedAgents = resolveRunSelectedAgents(runtimeOverriddenConfig, requestedAgents);
694
885
  const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
695
886
  addJobEvent(job, {
696
887
  type: 'log',
@@ -701,7 +892,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
701
892
  : `Using resolved default agents: ${resolvedAgentList.join(', ')}`
702
893
  }
703
894
  });
704
- const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
895
+ const expandedConfig = expandConfigForAgents(runtimeOverriddenConfig, resolvedAgents);
705
896
  addJobEvent(job, {
706
897
  type: 'log',
707
898
  ts: new Date().toISOString(),
@@ -709,9 +900,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
709
900
  message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
710
901
  }
711
902
  });
712
- const oauthServers = Object.entries(expandedConfig.servers)
713
- .filter(([, serverConfig]) => serverConfig.auth?.type === 'oauth_authorization_code')
714
- .map(([serverName]) => serverName);
903
+ const usedServerNames = new Set(expandedConfig.scenarios.flatMap((scenario) => scenario.servers));
904
+ const oauthServers = Array.from(usedServerNames).filter((serverName) => expandedConfig.servers[serverName]?.auth?.type === 'oauth_authorization_code');
715
905
  const mcpServerAuthHeaders = oauthServers.length > 0
716
906
  ? await oauthSessionManager.getAuthHeadersForServers(oauthServers)
717
907
  : undefined;
@@ -745,7 +935,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
745
935
  runsPerScenario,
746
936
  scenarioId,
747
937
  runNote,
748
- configHash: loaded.hash,
938
+ configHash: effectiveConfigHash,
749
939
  cliVersion: pkgVersion,
750
940
  runsDir: settings.runsDir,
751
941
  mcpServerAuthHeaders,
@@ -767,6 +957,23 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
767
957
  if (loaded.config.name && loaded.config.name.trim().length > 0) {
768
958
  results.metadata.config_name = loaded.config.name.trim();
769
959
  }
960
+ results.metadata.rerun_agents = [...resolvedAgentList];
961
+ results.metadata.rerun_scenario_ids = selectedBaseScenarios.scenarios.map((scenario) => scenario.id);
962
+ if (serverOverrideAll && serverOverrideAll.length > 0) {
963
+ results.metadata.rerun_server_override_all = [...serverOverrideAll];
964
+ }
965
+ else {
966
+ delete results.metadata.rerun_server_override_all;
967
+ }
968
+ if (filteredScenarioOverrides && Object.keys(filteredScenarioOverrides).length > 0) {
969
+ results.metadata.rerun_scenario_server_overrides = Object.fromEntries(Object.entries(filteredScenarioOverrides).map(([scenarioKey, serverIds]) => [
970
+ scenarioKey,
971
+ [...serverIds]
972
+ ]));
973
+ }
974
+ else {
975
+ delete results.metadata.rerun_scenario_server_overrides;
976
+ }
770
977
  addJobEvent(job, {
771
978
  type: 'log',
772
979
  ts: new Date().toISOString(),
@@ -774,102 +981,13 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
774
981
  message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
775
982
  }
776
983
  });
777
- if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
778
- addJobEvent(job, {
779
- type: 'log',
780
- ts: new Date().toISOString(),
781
- payload: { message: 'Applying snapshot evaluation policy ...' }
782
- });
783
- const policy = expandedConfig.snapshot_eval;
784
- const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
785
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
786
- .map((scenario) => scenario.id));
787
- const scenarioBaselineMap = new Map();
788
- for (const scenario of selectedBaseScenarios.scenarios) {
789
- if (scenario.snapshot_eval?.enabled === false)
790
- continue;
791
- const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
792
- if (baselineId)
793
- scenarioBaselineMap.set(scenario.id, baselineId);
794
- }
795
- const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
796
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
797
- .filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
798
- .map((scenario) => scenario.id);
799
- if (scenariosWithoutBaseline.length > 0) {
800
- addJobEvent(job, {
801
- type: 'log',
802
- ts: new Date().toISOString(),
803
- payload: {
804
- message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
805
- }
806
- });
807
- }
808
- const comparisons = [];
809
- const scenarioIdsByBaseline = new Map();
810
- for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
811
- const list = scenarioIdsByBaseline.get(baselineId) ?? [];
812
- list.push(scenarioIdItem);
813
- scenarioIdsByBaseline.set(baselineId, list);
814
- }
815
- for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
816
- addJobEvent(job, {
817
- type: 'log',
818
- ts: new Date().toISOString(),
819
- payload: {
820
- message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
821
- }
822
- });
823
- const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
824
- const fullComparison = compareRunToSnapshot(results, snapshot);
825
- comparisons.push({
826
- ...fullComparison,
827
- scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
828
- });
829
- }
830
- if (comparisons.length > 0) {
831
- applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
832
- addJobEvent(job, {
833
- type: 'log',
834
- ts: new Date().toISOString(),
835
- payload: {
836
- message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
837
- }
838
- });
839
- }
840
- else {
841
- addJobEvent(job, {
842
- type: 'log',
843
- ts: new Date().toISOString(),
844
- payload: {
845
- message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
846
- }
847
- });
848
- }
849
- }
850
- else if (applySnapshotEval) {
851
- addJobEvent(job, {
852
- type: 'log',
853
- ts: new Date().toISOString(),
854
- payload: {
855
- message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
856
- }
857
- });
858
- }
859
- else {
860
- addJobEvent(job, {
861
- type: 'log',
862
- ts: new Date().toISOString(),
863
- payload: {
864
- message: 'Snapshot evaluation skipped for this run (disabled in run request)'
865
- }
866
- });
867
- }
868
984
  addJobEvent(job, {
869
985
  type: 'log',
870
986
  ts: new Date().toISOString(),
871
987
  payload: { message: `Writing results to ${runDir}` }
872
988
  });
989
+ const traceRecords = getScenarioRunTraceRecords(results.metadata.run_id, settings.runsDir);
990
+ results.metadata.tool_tokens_total = estimateRunToolTokensTotal(traceRecords);
873
991
  writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
874
992
  writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
875
993
  writeFileSync(join(runDir, 'summary.md'), renderSummaryMarkdown(results), 'utf8');
@@ -886,8 +1004,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
886
1004
  payload: {
887
1005
  runId: results.metadata.run_id,
888
1006
  runDir,
889
- summary: results.summary,
890
- snapshotEval: results.metadata.snapshot_eval ?? null
1007
+ summary: results.summary
891
1008
  }
892
1009
  });
893
1010
  job.status = 'completed';
@@ -923,6 +1040,77 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
923
1040
  pruneOldJobs(jobs, runQueueState);
924
1041
  }
925
1042
  }
1043
+ function splitInteger(total, parts) {
1044
+ if (!Number.isFinite(total) || !parts || parts <= 0)
1045
+ return Array(parts).fill(0);
1046
+ const safeTotal = Math.max(0, Math.round(total ?? 0));
1047
+ const base = Math.floor(safeTotal / parts);
1048
+ let remainder = safeTotal % parts;
1049
+ return Array.from({ length: parts }, () => {
1050
+ const value = base + (remainder > 0 ? 1 : 0);
1051
+ if (remainder > 0)
1052
+ remainder -= 1;
1053
+ return value;
1054
+ });
1055
+ }
1056
+ function estimateRunToolTokensTotal(records) {
1057
+ let total = 0;
1058
+ let hasAny = false;
1059
+ for (const record of records) {
1060
+ const toolUsesById = new Map();
1061
+ for (const message of record.messages ?? []) {
1062
+ const toolUses = message.content.filter((block) => block.type === 'tool_use');
1063
+ if (toolUses.length > 0) {
1064
+ for (const toolUse of toolUses)
1065
+ toolUsesById.set(toolUse.id, toolUse.name);
1066
+ const allEstimated = toolUses.every((toolUse) => Boolean(toolUse.estimated_tokens));
1067
+ if (allEstimated) {
1068
+ for (const toolUse of toolUses)
1069
+ total += toolUse.estimated_tokens?.total ?? 0;
1070
+ hasAny = true;
1071
+ }
1072
+ else if (toolUses.length === 1 && typeof message.usage?.total_tokens === 'number') {
1073
+ total += message.usage.total_tokens;
1074
+ hasAny = true;
1075
+ }
1076
+ else {
1077
+ const shares = splitInteger(message.usage?.total_tokens, toolUses.length);
1078
+ total += shares.reduce((sum, value) => sum + value, 0);
1079
+ if (typeof message.usage?.total_tokens === 'number')
1080
+ hasAny = true;
1081
+ }
1082
+ }
1083
+ const toolResults = message.content.filter((block) => block.type === 'tool_result');
1084
+ if (toolResults.length === 0)
1085
+ continue;
1086
+ const allEstimated = toolResults.every((result) => Boolean(result.estimated_tokens));
1087
+ if (allEstimated) {
1088
+ for (const result of toolResults)
1089
+ total += result.estimated_tokens?.total ?? 0;
1090
+ hasAny = true;
1091
+ continue;
1092
+ }
1093
+ if (toolResults.length === 1) {
1094
+ const [result] = toolResults;
1095
+ if (result &&
1096
+ toolUsesById.has(result.tool_use_id) &&
1097
+ typeof message.usage?.total_tokens === 'number') {
1098
+ total += message.usage.total_tokens;
1099
+ hasAny = true;
1100
+ continue;
1101
+ }
1102
+ }
1103
+ const knownResults = toolResults.filter((result) => toolUsesById.has(result.tool_use_id));
1104
+ if (knownResults.length === 0)
1105
+ continue;
1106
+ const shares = splitInteger(message.usage?.total_tokens, knownResults.length);
1107
+ total += shares.reduce((sum, value) => sum + value, 0);
1108
+ if (typeof message.usage?.total_tokens === 'number')
1109
+ hasAny = true;
1110
+ }
1111
+ }
1112
+ return hasAny ? total : null;
1113
+ }
926
1114
  function pruneOldJobs(jobs, runQueueState) {
927
1115
  const maxAgeMs = 30 * 60_000;
928
1116
  const now = Date.now();
@@ -945,9 +1133,9 @@ function formatRunProgressMessage(event) {
945
1133
  case 'run_started':
946
1134
  return `Run initialized (id: ${event.runId}, ${event.totalScenarioRuns} scenario run(s))`;
947
1135
  case 'mcp_connect_started':
948
- return `Connecting to ${event.serverCount} MCP server(s) ...`;
1136
+ return `Connecting to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')} ...`;
949
1137
  case 'mcp_connect_finished':
950
- return `Connected to ${event.serverCount} MCP server(s)`;
1138
+ return `Connected to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')}`;
951
1139
  case 'scenario_run_started':
952
1140
  return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
953
1141
  case 'scenario_run_finished':