@inspectr/mcplab 1.14.3 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +0 -35
  2. package/dist/app/assets/index-BSGuUMv-.js +254 -0
  3. package/dist/app/assets/index-Bekohuot.css +1 -0
  4. package/dist/app/index.html +2 -2
  5. package/dist/app-server/app-context.d.ts +0 -22
  6. package/dist/app-server/app-context.d.ts.map +1 -1
  7. package/dist/app-server/assistant-common.d.ts +37 -24
  8. package/dist/app-server/evals-routes.d.ts.map +1 -1
  9. package/dist/app-server/evals-routes.js +5 -41
  10. package/dist/app-server/evals-routes.js.map +1 -1
  11. package/dist/app-server/libraries-store.d.ts.map +1 -1
  12. package/dist/app-server/libraries-store.js +2 -3
  13. package/dist/app-server/libraries-store.js.map +1 -1
  14. package/dist/app-server/result-assistant-domain.d.ts +81 -65
  15. package/dist/app-server/result-assistant-domain.js +1 -2
  16. package/dist/app-server/result-assistant-domain.js.map +1 -1
  17. package/dist/app-server/result-assistant.d.ts.map +1 -1
  18. package/dist/app-server/result-assistant.js +7 -1
  19. package/dist/app-server/result-assistant.js.map +1 -1
  20. package/dist/app-server/router.d.ts.map +1 -1
  21. package/dist/app-server/router.js +0 -24
  22. package/dist/app-server/router.js.map +1 -1
  23. package/dist/app-server/runs-routes.d.ts +15 -4
  24. package/dist/app-server/runs-routes.d.ts.map +1 -1
  25. package/dist/app-server/runs-routes.js +189 -134
  26. package/dist/app-server/runs-routes.js.map +1 -1
  27. package/dist/app-server/runs-store.d.ts +6 -1
  28. package/dist/app-server/runs-store.d.ts.map +1 -1
  29. package/dist/app-server/runs-store.js +15 -1
  30. package/dist/app-server/runs-store.js.map +1 -1
  31. package/dist/app-server/scenario-assistant-domain.d.ts +144 -134
  32. package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
  33. package/dist/app-server/scenario-assistant-domain.js +5 -8
  34. package/dist/app-server/scenario-assistant-domain.js.map +1 -1
  35. package/dist/app-server/scenario-assistant.d.ts.map +1 -1
  36. package/dist/app-server/scenario-assistant.js +7 -1
  37. package/dist/app-server/scenario-assistant.js.map +1 -1
  38. package/dist/app-server/snapshots-routes.d.ts +1 -13
  39. package/dist/app-server/snapshots-routes.d.ts.map +1 -1
  40. package/dist/app-server/snapshots-routes.js +9 -79
  41. package/dist/app-server/snapshots-routes.js.map +1 -1
  42. package/dist/app-server/types.d.ts +0 -2
  43. package/dist/app-server/types.d.ts.map +1 -1
  44. package/dist/cli.js +79 -288
  45. package/dist/cli.js.map +1 -1
  46. package/dist/interactive-helpers.d.ts +0 -1
  47. package/dist/interactive-helpers.d.ts.map +1 -1
  48. package/dist/interactive-helpers.js +0 -3
  49. package/dist/interactive-helpers.js.map +1 -1
  50. package/package.json +4 -4
  51. package/dist/app/assets/index-BBRB19an.js +0 -250
  52. package/dist/app/assets/index-DVQdbWhs.css +0 -1
@@ -2,21 +2,53 @@ import { randomUUID } from 'node:crypto';
2
2
  import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
3
3
  import { tmpdir } from 'node:os';
4
4
  import { isAbsolute, join, relative, resolve } from 'node:path';
5
- import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown } from '@inspectr/mcplab-core';
5
+ import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
6
6
  import { renderReport } from '@inspectr/mcplab-reporting';
7
7
  import { OAuthAuthorizationRequiredError } from './oauth-session-manager.js';
8
+ import { selectScenarioIds } from './runs-store.js';
9
+ import { readLibraries as readLibrariesFromStore } from './libraries-store.js';
10
+ export function mergeLibraryEntriesIntoConfig(config, libraryAgents, libraryServers) {
11
+ return {
12
+ ...config,
13
+ agents: { ...libraryAgents, ...config.agents },
14
+ servers: { ...libraryServers, ...config.servers }
15
+ };
16
+ }
17
+ export function applyLibraryEntries(loaded, libraryAgents, libraryServers) {
18
+ loaded.config = mergeLibraryEntriesIntoConfig(loaded.config, libraryAgents, libraryServers);
19
+ loaded.hash = hashConfig(loaded.config);
20
+ }
21
+ function filterScenarioOverridesToSelectedScenarios(selectedConfig, scenarioServerOverrides) {
22
+ if (!scenarioServerOverrides)
23
+ return undefined;
24
+ const selectedIds = new Set(selectedConfig.scenarios.map((scenario) => scenario.id));
25
+ const filtered = Object.fromEntries(Object.entries(scenarioServerOverrides).filter(([scenarioId]) => selectedIds.has(scenarioId)));
26
+ return Object.keys(filtered).length > 0 ? filtered : undefined;
27
+ }
28
+ // Backward-compatible exports used by existing tests/imports.
8
29
  export function mergeLibraryAgentsIntoConfig(config, libraryAgents) {
9
- return { ...config, agents: { ...libraryAgents, ...config.agents } };
30
+ return mergeLibraryEntriesIntoConfig(config, libraryAgents, {});
10
31
  }
11
32
  export function applyLibraryAgents(loaded, libraryAgents) {
12
- loaded.config = mergeLibraryAgentsIntoConfig(loaded.config, libraryAgents);
13
- loaded.hash = hashConfig(loaded.config);
33
+ applyLibraryEntries(loaded, libraryAgents, {});
14
34
  }
15
35
  export async function handleRunsRoutes(params) {
16
36
  const { req, res, pathname, method, settings, jobs, runQueueState, oauthSessionManager, deps } = params;
17
- const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
37
+ const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
18
38
  if (pathname === '/api/runs' && method === 'GET') {
19
- asJson(res, 200, listRuns(settings.runsDir));
39
+ const requestUrl = new URL(req.url ?? '/api/runs', 'http://localhost');
40
+ const since = requestUrl.searchParams.get('since') ?? undefined;
41
+ const until = requestUrl.searchParams.get('until') ?? undefined;
42
+ const lastDaysRaw = requestUrl.searchParams.get('last_days');
43
+ const lastDaysParsed = lastDaysRaw === null ? NaN : Number(lastDaysRaw);
44
+ const lastDays = Number.isFinite(lastDaysParsed) && lastDaysParsed > 0
45
+ ? Math.floor(lastDaysParsed)
46
+ : undefined;
47
+ asJson(res, 200, listRuns(settings.runsDir, {
48
+ since,
49
+ until,
50
+ lastDays
51
+ }));
20
52
  return true;
21
53
  }
22
54
  if (pathname.startsWith('/api/runs/') && pathname.endsWith('/trace') && method === 'GET') {
@@ -97,7 +129,9 @@ export async function handleRunsRoutes(params) {
97
129
  runsPerScenario: j.runParams.runsPerScenario,
98
130
  scenarioIds: j.runParams.scenarioIds ?? null,
99
131
  agents: j.runParams.requestedAgents ?? null,
100
- runNote: j.runParams.runNote ?? null
132
+ runNote: j.runParams.runNote ?? null,
133
+ serverOverrideAll: j.runParams.serverOverrideAll ?? null,
134
+ scenarioServerOverrides: j.runParams.scenarioServerOverrides ?? null
101
135
  }
102
136
  }));
103
137
  asJson(res, 200, {
@@ -110,7 +144,9 @@ export async function handleRunsRoutes(params) {
110
144
  runsPerScenario: activeJob.runParams.runsPerScenario,
111
145
  scenarioIds: activeJob.runParams.scenarioIds ?? null,
112
146
  agents: activeJob.runParams.requestedAgents ?? null,
113
- runNote: activeJob.runParams.runNote ?? null
147
+ runNote: activeJob.runParams.runNote ?? null,
148
+ serverOverrideAll: activeJob.runParams.serverOverrideAll ?? null,
149
+ scenarioServerOverrides: activeJob.runParams.scenarioServerOverrides ?? null
114
150
  }
115
151
  }
116
152
  : null,
@@ -170,9 +206,45 @@ export async function handleRunsRoutes(params) {
170
206
  const requestedAgents = Array.isArray(body.agents)
171
207
  ? body.agents.map((agent) => String(agent).trim()).filter(Boolean)
172
208
  : undefined;
173
- const applySnapshotEval = body.applySnapshotEval !== false;
174
209
  const runNoteRaw = typeof body.runNote === 'string' ? body.runNote.trim() : '';
175
210
  const runNote = runNoteRaw ? runNoteRaw.slice(0, 500) : undefined;
211
+ const serverOverrideAll = Array.isArray(body.serverOverrideAll)
212
+ ? body.serverOverrideAll.map((id) => String(id).trim()).filter(Boolean)
213
+ : undefined;
214
+ if (Array.isArray(body.serverOverrideAll) &&
215
+ (!serverOverrideAll || serverOverrideAll.length === 0)) {
216
+ asJson(res, 400, { error: 'serverOverrideAll must include at least one server id' });
217
+ return true;
218
+ }
219
+ if (body.scenarioServerOverrides !== undefined &&
220
+ (typeof body.scenarioServerOverrides !== 'object' ||
221
+ body.scenarioServerOverrides === null ||
222
+ Array.isArray(body.scenarioServerOverrides))) {
223
+ asJson(res, 400, {
224
+ error: 'scenarioServerOverrides must be an object of scenarioId -> string[]'
225
+ });
226
+ return true;
227
+ }
228
+ let scenarioServerOverrides;
229
+ if (body.scenarioServerOverrides && typeof body.scenarioServerOverrides === 'object') {
230
+ const normalizedEntries = [];
231
+ for (const [rawScenarioId, rawServerIds] of Object.entries(body.scenarioServerOverrides)) {
232
+ const scenarioOverrideId = String(rawScenarioId).trim();
233
+ if (!scenarioOverrideId)
234
+ continue;
235
+ if (!Array.isArray(rawServerIds)) {
236
+ asJson(res, 400, {
237
+ error: `scenarioServerOverrides.${scenarioOverrideId} must be an array of server ids`
238
+ });
239
+ return true;
240
+ }
241
+ normalizedEntries.push([
242
+ scenarioOverrideId,
243
+ rawServerIds.map((id) => String(id).trim()).filter(Boolean)
244
+ ]);
245
+ }
246
+ scenarioServerOverrides = Object.fromEntries(normalizedEntries);
247
+ }
176
248
  if (!configPathRaw) {
177
249
  asJson(res, 400, { error: 'configPath is required' });
178
250
  return true;
@@ -188,17 +260,27 @@ export async function handleRunsRoutes(params) {
188
260
  asJson(res, 404, { error: `Config not found: ${configPath}` });
189
261
  return true;
190
262
  }
191
- // Eagerly cache OAuth server names to avoid re-parsing config in advanceQueue
192
- let oauthServerNames;
193
263
  try {
194
- const loaded = loadConfig(configPath);
195
- oauthServerNames = Object.entries(loaded.config.servers ?? {})
196
- .filter(([, v]) => v.auth?.type === 'oauth_authorization_code')
197
- .map(([name]) => name);
264
+ const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
265
+ const libraries = readLibraries(settings.librariesDir);
266
+ applyLibraryEntries(loaded, libraries.agents, libraries.servers);
267
+ const selected = scenarioIds?.length
268
+ ? deps.selectScenarioIds(loaded.config, scenarioIds)
269
+ : scenarioId
270
+ ? deps.selectScenarioIds(loaded.config, [scenarioId])
271
+ : loaded.config;
272
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, scenarioServerOverrides);
273
+ applyRuntimeServerOverrides(selected, {
274
+ serverOverrideAll,
275
+ scenarioServerOverrides: filteredScenarioOverrides
276
+ });
198
277
  }
199
- catch {
200
- // Will be resolved lazily in advanceQueue if needed
278
+ catch (error) {
279
+ asJson(res, 400, { error: error instanceof Error ? error.message : String(error) });
280
+ return true;
201
281
  }
282
+ // Resolve lazily in advanceQueue so runtime overrides are always reflected.
283
+ const oauthServerNames = undefined;
202
284
  const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
203
285
  const runParamsObj = {
204
286
  configPath,
@@ -206,9 +288,10 @@ export async function handleRunsRoutes(params) {
206
288
  scenarioId,
207
289
  scenarioIds,
208
290
  requestedAgents,
209
- applySnapshotEval,
210
291
  runNote,
211
- oauthServerNames
292
+ oauthServerNames,
293
+ serverOverrideAll,
294
+ scenarioServerOverrides
212
295
  };
213
296
  const job = {
214
297
  id: jobId,
@@ -232,6 +315,8 @@ export async function handleRunsRoutes(params) {
232
315
  scenarioIds: scenarioIds ?? null,
233
316
  agents: requestedAgents ?? null,
234
317
  runNote: runNote ?? null,
318
+ serverOverrideAll: serverOverrideAll ?? null,
319
+ scenarioServerOverrides: scenarioServerOverrides ?? null,
235
320
  position: runQueueState.queue.length
236
321
  }
237
322
  });
@@ -556,19 +641,39 @@ function toCoreExtractRules(extractRules) {
556
641
  }
557
642
  return rules;
558
643
  }
559
- function resolveOAuthServersForJob(job) {
560
- if (job.runParams.oauthServerNames !== undefined) {
644
+ function resolveOAuthServersForJob(job, librariesDir) {
645
+ if (job.runParams.oauthServerNames !== undefined)
561
646
  return job.runParams.oauthServerNames;
562
- }
563
647
  try {
564
- const loaded = loadConfig(job.runParams.configPath);
565
- const names = Object.entries(loaded.config.servers ?? {})
566
- .filter(([, v]) => v.auth?.type === 'oauth_authorization_code')
567
- .map(([name]) => name);
648
+ const loaded = loadConfig(job.runParams.configPath, { bundleRoot: librariesDir });
649
+ const libraries = readLibrariesFromStore(librariesDir);
650
+ applyLibraryEntries(loaded, libraries.agents, libraries.servers);
651
+ const selected = job.runParams.scenarioIds?.length
652
+ ? selectScenarioIds(loaded.config, job.runParams.scenarioIds)
653
+ : job.runParams.scenarioId
654
+ ? selectScenarioIds(loaded.config, [job.runParams.scenarioId])
655
+ : loaded.config;
656
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, job.runParams.scenarioServerOverrides);
657
+ const withOverrides = applyRuntimeServerOverrides(selected, {
658
+ serverOverrideAll: job.runParams.serverOverrideAll,
659
+ scenarioServerOverrides: filteredScenarioOverrides
660
+ });
661
+ const effectiveServers = new Set(withOverrides.scenarios.flatMap((scenario) => scenario.servers));
662
+ const names = Array.from(effectiveServers).filter((name) => {
663
+ const config = withOverrides.servers?.[name];
664
+ return config?.auth?.type === 'oauth_authorization_code';
665
+ });
568
666
  job.runParams.oauthServerNames = names;
569
667
  return names;
570
668
  }
571
- catch {
669
+ catch (error) {
670
+ const message = error instanceof Error ? error.message : String(error);
671
+ if (message.includes('Unknown server refs') ||
672
+ message.includes('Unknown scenarios in scenarioServerOverrides') ||
673
+ message.includes('serverOverrideAll must include at least one server id')) {
674
+ throw error;
675
+ }
676
+ console.warn(`[mcplab] Failed to resolve OAuth servers for queued job '${job.id}': ${message}`);
572
677
  return [];
573
678
  }
574
679
  }
@@ -587,7 +692,25 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
587
692
  continue;
588
693
  }
589
694
  // Pre-check OAuth before starting
590
- const oauthServers = resolveOAuthServersForJob(nextJob);
695
+ let oauthServers = [];
696
+ try {
697
+ oauthServers = resolveOAuthServersForJob(nextJob, settings.librariesDir);
698
+ }
699
+ catch (error) {
700
+ runQueueState.queue.shift();
701
+ nextJob.status = 'error';
702
+ deps.addJobEvent(nextJob, {
703
+ type: 'error',
704
+ ts: new Date().toISOString(),
705
+ payload: {
706
+ message: error instanceof Error ? error.message : String(error)
707
+ }
708
+ });
709
+ for (const client of nextJob.clients)
710
+ client.end();
711
+ nextJob.clients.clear();
712
+ continue;
713
+ }
591
714
  if (oauthServers.length > 0) {
592
715
  const authStatus = oauthSessionManager.checkServersAuthStatus(oauthServers);
593
716
  const needsAuth = authStatus.filter((s) => s.status === 'auth_required');
@@ -622,7 +745,9 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
622
745
  scenarioId: nextJob.runParams.scenarioId ?? null,
623
746
  scenarioIds: nextJob.runParams.scenarioIds ?? null,
624
747
  agents: nextJob.runParams.requestedAgents ?? null,
625
- runNote: nextJob.runParams.runNote ?? null
748
+ runNote: nextJob.runParams.runNote ?? null,
749
+ serverOverrideAll: nextJob.runParams.serverOverrideAll ?? null,
750
+ scenarioServerOverrides: nextJob.runParams.scenarioServerOverrides ?? null
626
751
  }
627
752
  });
628
753
  void executeRunJob(nextJob, settings, jobs, runQueueState, oauthSessionManager, deps);
@@ -634,8 +759,8 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
634
759
  }
635
760
  }
636
761
  async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionManager, deps) {
637
- const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, loadSnapshot, compareRunToSnapshot, applySnapshotPolicyToRunResult, readLibraries, pkgVersion } = deps;
638
- const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, applySnapshotEval, runNote } = job.runParams;
762
+ const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pkgVersion } = deps;
763
+ const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, runNote, serverOverrideAll, scenarioServerOverrides } = job.runParams;
639
764
  try {
640
765
  addJobEvent(job, {
641
766
  type: 'log',
@@ -643,8 +768,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
643
768
  payload: { message: `Loading MCP Evaluation config: ${configPath}` }
644
769
  });
645
770
  const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
646
- const { agents: libraryAgents } = readLibraries(settings.librariesDir);
647
- applyLibraryAgents(loaded, libraryAgents);
771
+ const { agents: libraryAgents, servers: libraryServers } = readLibraries(settings.librariesDir);
772
+ applyLibraryEntries(loaded, libraryAgents, libraryServers);
648
773
  addJobEvent(job, {
649
774
  type: 'log',
650
775
  ts: new Date().toISOString(),
@@ -678,7 +803,30 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
678
803
  message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
679
804
  }
680
805
  });
681
- const resolvedAgents = resolveRunSelectedAgents(selectedBaseScenarios, requestedAgents);
806
+ const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selectedBaseScenarios, scenarioServerOverrides);
807
+ const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseScenarios, {
808
+ serverOverrideAll,
809
+ scenarioServerOverrides: filteredScenarioOverrides
810
+ });
811
+ const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
812
+ addJobEvent(job, {
813
+ type: 'log',
814
+ ts: new Date().toISOString(),
815
+ payload: {
816
+ message: `Applied runtime server overrides: global=${serverOverrideAll?.length ?? 0} scenario-specific=${Object.keys(filteredScenarioOverrides ?? {}).length}`
817
+ }
818
+ });
819
+ const effectiveScenarioServers = runtimeOverriddenConfig.scenarios
820
+ .map((scenario) => `${scenario.id}=[${scenario.servers.join(', ')}]`)
821
+ .join('; ');
822
+ addJobEvent(job, {
823
+ type: 'log',
824
+ ts: new Date().toISOString(),
825
+ payload: {
826
+ message: `Effective MCP servers per scenario: ${effectiveScenarioServers || '(none)'}`
827
+ }
828
+ });
829
+ const resolvedAgents = resolveRunSelectedAgents(runtimeOverriddenConfig, requestedAgents);
682
830
  const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
683
831
  addJobEvent(job, {
684
832
  type: 'log',
@@ -689,7 +837,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
689
837
  : `Using resolved default agents: ${resolvedAgentList.join(', ')}`
690
838
  }
691
839
  });
692
- const expandedConfig = expandConfigForAgents(selectedBaseScenarios, resolvedAgents);
840
+ const expandedConfig = expandConfigForAgents(runtimeOverriddenConfig, resolvedAgents);
693
841
  addJobEvent(job, {
694
842
  type: 'log',
695
843
  ts: new Date().toISOString(),
@@ -697,9 +845,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
697
845
  message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
698
846
  }
699
847
  });
700
- const oauthServers = Object.entries(expandedConfig.servers)
701
- .filter(([, serverConfig]) => serverConfig.auth?.type === 'oauth_authorization_code')
702
- .map(([serverName]) => serverName);
848
+ const usedServerNames = new Set(expandedConfig.scenarios.flatMap((scenario) => scenario.servers));
849
+ const oauthServers = Array.from(usedServerNames).filter((serverName) => expandedConfig.servers[serverName]?.auth?.type === 'oauth_authorization_code');
703
850
  const mcpServerAuthHeaders = oauthServers.length > 0
704
851
  ? await oauthSessionManager.getAuthHeadersForServers(oauthServers)
705
852
  : undefined;
@@ -733,7 +880,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
733
880
  runsPerScenario,
734
881
  scenarioId,
735
882
  runNote,
736
- configHash: loaded.hash,
883
+ configHash: effectiveConfigHash,
737
884
  cliVersion: pkgVersion,
738
885
  runsDir: settings.runsDir,
739
886
  mcpServerAuthHeaders,
@@ -762,97 +909,6 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
762
909
  message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
763
910
  }
764
911
  });
765
- if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
766
- addJobEvent(job, {
767
- type: 'log',
768
- ts: new Date().toISOString(),
769
- payload: { message: 'Applying snapshot evaluation policy ...' }
770
- });
771
- const policy = expandedConfig.snapshot_eval;
772
- const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
773
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
774
- .map((scenario) => scenario.id));
775
- const scenarioBaselineMap = new Map();
776
- for (const scenario of selectedBaseScenarios.scenarios) {
777
- if (scenario.snapshot_eval?.enabled === false)
778
- continue;
779
- const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
780
- if (baselineId)
781
- scenarioBaselineMap.set(scenario.id, baselineId);
782
- }
783
- const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
784
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
785
- .filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
786
- .map((scenario) => scenario.id);
787
- if (scenariosWithoutBaseline.length > 0) {
788
- addJobEvent(job, {
789
- type: 'log',
790
- ts: new Date().toISOString(),
791
- payload: {
792
- message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
793
- }
794
- });
795
- }
796
- const comparisons = [];
797
- const scenarioIdsByBaseline = new Map();
798
- for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
799
- const list = scenarioIdsByBaseline.get(baselineId) ?? [];
800
- list.push(scenarioIdItem);
801
- scenarioIdsByBaseline.set(baselineId, list);
802
- }
803
- for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
804
- addJobEvent(job, {
805
- type: 'log',
806
- ts: new Date().toISOString(),
807
- payload: {
808
- message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
809
- }
810
- });
811
- const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
812
- const fullComparison = compareRunToSnapshot(results, snapshot);
813
- comparisons.push({
814
- ...fullComparison,
815
- scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
816
- });
817
- }
818
- if (comparisons.length > 0) {
819
- applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
820
- addJobEvent(job, {
821
- type: 'log',
822
- ts: new Date().toISOString(),
823
- payload: {
824
- message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
825
- }
826
- });
827
- }
828
- else {
829
- addJobEvent(job, {
830
- type: 'log',
831
- ts: new Date().toISOString(),
832
- payload: {
833
- message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
834
- }
835
- });
836
- }
837
- }
838
- else if (applySnapshotEval) {
839
- addJobEvent(job, {
840
- type: 'log',
841
- ts: new Date().toISOString(),
842
- payload: {
843
- message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
844
- }
845
- });
846
- }
847
- else {
848
- addJobEvent(job, {
849
- type: 'log',
850
- ts: new Date().toISOString(),
851
- payload: {
852
- message: 'Snapshot evaluation skipped for this run (disabled in run request)'
853
- }
854
- });
855
- }
856
912
  addJobEvent(job, {
857
913
  type: 'log',
858
914
  ts: new Date().toISOString(),
@@ -874,8 +930,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
874
930
  payload: {
875
931
  runId: results.metadata.run_id,
876
932
  runDir,
877
- summary: results.summary,
878
- snapshotEval: results.metadata.snapshot_eval ?? null
933
+ summary: results.summary
879
934
  }
880
935
  });
881
936
  job.status = 'completed';
@@ -933,9 +988,9 @@ function formatRunProgressMessage(event) {
933
988
  case 'run_started':
934
989
  return `Run initialized (id: ${event.runId}, ${event.totalScenarioRuns} scenario run(s))`;
935
990
  case 'mcp_connect_started':
936
- return `Connecting to ${event.serverCount} MCP server(s) ...`;
991
+ return `Connecting to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')} ...`;
937
992
  case 'mcp_connect_finished':
938
- return `Connected to ${event.serverCount} MCP server(s)`;
993
+ return `Connected to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')}`;
939
994
  case 'scenario_run_started':
940
995
  return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
941
996
  case 'scenario_run_finished':