@inspectr/mcplab 1.14.3 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -35
- package/dist/app/assets/index-BSGuUMv-.js +254 -0
- package/dist/app/assets/index-Bekohuot.css +1 -0
- package/dist/app/index.html +2 -2
- package/dist/app-server/app-context.d.ts +0 -22
- package/dist/app-server/app-context.d.ts.map +1 -1
- package/dist/app-server/assistant-common.d.ts +37 -24
- package/dist/app-server/evals-routes.d.ts.map +1 -1
- package/dist/app-server/evals-routes.js +5 -41
- package/dist/app-server/evals-routes.js.map +1 -1
- package/dist/app-server/libraries-store.d.ts.map +1 -1
- package/dist/app-server/libraries-store.js +2 -3
- package/dist/app-server/libraries-store.js.map +1 -1
- package/dist/app-server/result-assistant-domain.d.ts +81 -65
- package/dist/app-server/result-assistant-domain.js +1 -2
- package/dist/app-server/result-assistant-domain.js.map +1 -1
- package/dist/app-server/result-assistant.d.ts.map +1 -1
- package/dist/app-server/result-assistant.js +7 -1
- package/dist/app-server/result-assistant.js.map +1 -1
- package/dist/app-server/router.d.ts.map +1 -1
- package/dist/app-server/router.js +0 -24
- package/dist/app-server/router.js.map +1 -1
- package/dist/app-server/runs-routes.d.ts +15 -4
- package/dist/app-server/runs-routes.d.ts.map +1 -1
- package/dist/app-server/runs-routes.js +189 -134
- package/dist/app-server/runs-routes.js.map +1 -1
- package/dist/app-server/runs-store.d.ts +6 -1
- package/dist/app-server/runs-store.d.ts.map +1 -1
- package/dist/app-server/runs-store.js +15 -1
- package/dist/app-server/runs-store.js.map +1 -1
- package/dist/app-server/scenario-assistant-domain.d.ts +144 -134
- package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant-domain.js +5 -8
- package/dist/app-server/scenario-assistant-domain.js.map +1 -1
- package/dist/app-server/scenario-assistant.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant.js +7 -1
- package/dist/app-server/scenario-assistant.js.map +1 -1
- package/dist/app-server/snapshots-routes.d.ts +1 -13
- package/dist/app-server/snapshots-routes.d.ts.map +1 -1
- package/dist/app-server/snapshots-routes.js +9 -79
- package/dist/app-server/snapshots-routes.js.map +1 -1
- package/dist/app-server/types.d.ts +0 -2
- package/dist/app-server/types.d.ts.map +1 -1
- package/dist/cli.js +79 -288
- package/dist/cli.js.map +1 -1
- package/dist/interactive-helpers.d.ts +0 -1
- package/dist/interactive-helpers.d.ts.map +1 -1
- package/dist/interactive-helpers.js +0 -3
- package/dist/interactive-helpers.js.map +1 -1
- package/package.json +4 -4
- package/dist/app/assets/index-BBRB19an.js +0 -250
- package/dist/app/assets/index-DVQdbWhs.css +0 -1
|
@@ -2,21 +2,53 @@ import { randomUUID } from 'node:crypto';
|
|
|
2
2
|
import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
|
|
3
3
|
import { tmpdir } from 'node:os';
|
|
4
4
|
import { isAbsolute, join, relative, resolve } from 'node:path';
|
|
5
|
-
import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown } from '@inspectr/mcplab-core';
|
|
5
|
+
import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
|
|
6
6
|
import { renderReport } from '@inspectr/mcplab-reporting';
|
|
7
7
|
import { OAuthAuthorizationRequiredError } from './oauth-session-manager.js';
|
|
8
|
+
import { selectScenarioIds } from './runs-store.js';
|
|
9
|
+
import { readLibraries as readLibrariesFromStore } from './libraries-store.js';
|
|
10
|
+
export function mergeLibraryEntriesIntoConfig(config, libraryAgents, libraryServers) {
|
|
11
|
+
return {
|
|
12
|
+
...config,
|
|
13
|
+
agents: { ...libraryAgents, ...config.agents },
|
|
14
|
+
servers: { ...libraryServers, ...config.servers }
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
export function applyLibraryEntries(loaded, libraryAgents, libraryServers) {
|
|
18
|
+
loaded.config = mergeLibraryEntriesIntoConfig(loaded.config, libraryAgents, libraryServers);
|
|
19
|
+
loaded.hash = hashConfig(loaded.config);
|
|
20
|
+
}
|
|
21
|
+
function filterScenarioOverridesToSelectedScenarios(selectedConfig, scenarioServerOverrides) {
|
|
22
|
+
if (!scenarioServerOverrides)
|
|
23
|
+
return undefined;
|
|
24
|
+
const selectedIds = new Set(selectedConfig.scenarios.map((scenario) => scenario.id));
|
|
25
|
+
const filtered = Object.fromEntries(Object.entries(scenarioServerOverrides).filter(([scenarioId]) => selectedIds.has(scenarioId)));
|
|
26
|
+
return Object.keys(filtered).length > 0 ? filtered : undefined;
|
|
27
|
+
}
|
|
28
|
+
// Backward-compatible exports used by existing tests/imports.
|
|
8
29
|
export function mergeLibraryAgentsIntoConfig(config, libraryAgents) {
|
|
9
|
-
return
|
|
30
|
+
return mergeLibraryEntriesIntoConfig(config, libraryAgents, {});
|
|
10
31
|
}
|
|
11
32
|
export function applyLibraryAgents(loaded, libraryAgents) {
|
|
12
|
-
|
|
13
|
-
loaded.hash = hashConfig(loaded.config);
|
|
33
|
+
applyLibraryEntries(loaded, libraryAgents, {});
|
|
14
34
|
}
|
|
15
35
|
export async function handleRunsRoutes(params) {
|
|
16
36
|
const { req, res, pathname, method, settings, jobs, runQueueState, oauthSessionManager, deps } = params;
|
|
17
|
-
const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents,
|
|
37
|
+
const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
|
|
18
38
|
if (pathname === '/api/runs' && method === 'GET') {
|
|
19
|
-
|
|
39
|
+
const requestUrl = new URL(req.url ?? '/api/runs', 'http://localhost');
|
|
40
|
+
const since = requestUrl.searchParams.get('since') ?? undefined;
|
|
41
|
+
const until = requestUrl.searchParams.get('until') ?? undefined;
|
|
42
|
+
const lastDaysRaw = requestUrl.searchParams.get('last_days');
|
|
43
|
+
const lastDaysParsed = lastDaysRaw === null ? NaN : Number(lastDaysRaw);
|
|
44
|
+
const lastDays = Number.isFinite(lastDaysParsed) && lastDaysParsed > 0
|
|
45
|
+
? Math.floor(lastDaysParsed)
|
|
46
|
+
: undefined;
|
|
47
|
+
asJson(res, 200, listRuns(settings.runsDir, {
|
|
48
|
+
since,
|
|
49
|
+
until,
|
|
50
|
+
lastDays
|
|
51
|
+
}));
|
|
20
52
|
return true;
|
|
21
53
|
}
|
|
22
54
|
if (pathname.startsWith('/api/runs/') && pathname.endsWith('/trace') && method === 'GET') {
|
|
@@ -97,7 +129,9 @@ export async function handleRunsRoutes(params) {
|
|
|
97
129
|
runsPerScenario: j.runParams.runsPerScenario,
|
|
98
130
|
scenarioIds: j.runParams.scenarioIds ?? null,
|
|
99
131
|
agents: j.runParams.requestedAgents ?? null,
|
|
100
|
-
runNote: j.runParams.runNote ?? null
|
|
132
|
+
runNote: j.runParams.runNote ?? null,
|
|
133
|
+
serverOverrideAll: j.runParams.serverOverrideAll ?? null,
|
|
134
|
+
scenarioServerOverrides: j.runParams.scenarioServerOverrides ?? null
|
|
101
135
|
}
|
|
102
136
|
}));
|
|
103
137
|
asJson(res, 200, {
|
|
@@ -110,7 +144,9 @@ export async function handleRunsRoutes(params) {
|
|
|
110
144
|
runsPerScenario: activeJob.runParams.runsPerScenario,
|
|
111
145
|
scenarioIds: activeJob.runParams.scenarioIds ?? null,
|
|
112
146
|
agents: activeJob.runParams.requestedAgents ?? null,
|
|
113
|
-
runNote: activeJob.runParams.runNote ?? null
|
|
147
|
+
runNote: activeJob.runParams.runNote ?? null,
|
|
148
|
+
serverOverrideAll: activeJob.runParams.serverOverrideAll ?? null,
|
|
149
|
+
scenarioServerOverrides: activeJob.runParams.scenarioServerOverrides ?? null
|
|
114
150
|
}
|
|
115
151
|
}
|
|
116
152
|
: null,
|
|
@@ -170,9 +206,45 @@ export async function handleRunsRoutes(params) {
|
|
|
170
206
|
const requestedAgents = Array.isArray(body.agents)
|
|
171
207
|
? body.agents.map((agent) => String(agent).trim()).filter(Boolean)
|
|
172
208
|
: undefined;
|
|
173
|
-
const applySnapshotEval = body.applySnapshotEval !== false;
|
|
174
209
|
const runNoteRaw = typeof body.runNote === 'string' ? body.runNote.trim() : '';
|
|
175
210
|
const runNote = runNoteRaw ? runNoteRaw.slice(0, 500) : undefined;
|
|
211
|
+
const serverOverrideAll = Array.isArray(body.serverOverrideAll)
|
|
212
|
+
? body.serverOverrideAll.map((id) => String(id).trim()).filter(Boolean)
|
|
213
|
+
: undefined;
|
|
214
|
+
if (Array.isArray(body.serverOverrideAll) &&
|
|
215
|
+
(!serverOverrideAll || serverOverrideAll.length === 0)) {
|
|
216
|
+
asJson(res, 400, { error: 'serverOverrideAll must include at least one server id' });
|
|
217
|
+
return true;
|
|
218
|
+
}
|
|
219
|
+
if (body.scenarioServerOverrides !== undefined &&
|
|
220
|
+
(typeof body.scenarioServerOverrides !== 'object' ||
|
|
221
|
+
body.scenarioServerOverrides === null ||
|
|
222
|
+
Array.isArray(body.scenarioServerOverrides))) {
|
|
223
|
+
asJson(res, 400, {
|
|
224
|
+
error: 'scenarioServerOverrides must be an object of scenarioId -> string[]'
|
|
225
|
+
});
|
|
226
|
+
return true;
|
|
227
|
+
}
|
|
228
|
+
let scenarioServerOverrides;
|
|
229
|
+
if (body.scenarioServerOverrides && typeof body.scenarioServerOverrides === 'object') {
|
|
230
|
+
const normalizedEntries = [];
|
|
231
|
+
for (const [rawScenarioId, rawServerIds] of Object.entries(body.scenarioServerOverrides)) {
|
|
232
|
+
const scenarioOverrideId = String(rawScenarioId).trim();
|
|
233
|
+
if (!scenarioOverrideId)
|
|
234
|
+
continue;
|
|
235
|
+
if (!Array.isArray(rawServerIds)) {
|
|
236
|
+
asJson(res, 400, {
|
|
237
|
+
error: `scenarioServerOverrides.${scenarioOverrideId} must be an array of server ids`
|
|
238
|
+
});
|
|
239
|
+
return true;
|
|
240
|
+
}
|
|
241
|
+
normalizedEntries.push([
|
|
242
|
+
scenarioOverrideId,
|
|
243
|
+
rawServerIds.map((id) => String(id).trim()).filter(Boolean)
|
|
244
|
+
]);
|
|
245
|
+
}
|
|
246
|
+
scenarioServerOverrides = Object.fromEntries(normalizedEntries);
|
|
247
|
+
}
|
|
176
248
|
if (!configPathRaw) {
|
|
177
249
|
asJson(res, 400, { error: 'configPath is required' });
|
|
178
250
|
return true;
|
|
@@ -188,17 +260,27 @@ export async function handleRunsRoutes(params) {
|
|
|
188
260
|
asJson(res, 404, { error: `Config not found: ${configPath}` });
|
|
189
261
|
return true;
|
|
190
262
|
}
|
|
191
|
-
// Eagerly cache OAuth server names to avoid re-parsing config in advanceQueue
|
|
192
|
-
let oauthServerNames;
|
|
193
263
|
try {
|
|
194
|
-
const loaded = loadConfig(configPath);
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
264
|
+
const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
|
|
265
|
+
const libraries = readLibraries(settings.librariesDir);
|
|
266
|
+
applyLibraryEntries(loaded, libraries.agents, libraries.servers);
|
|
267
|
+
const selected = scenarioIds?.length
|
|
268
|
+
? deps.selectScenarioIds(loaded.config, scenarioIds)
|
|
269
|
+
: scenarioId
|
|
270
|
+
? deps.selectScenarioIds(loaded.config, [scenarioId])
|
|
271
|
+
: loaded.config;
|
|
272
|
+
const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, scenarioServerOverrides);
|
|
273
|
+
applyRuntimeServerOverrides(selected, {
|
|
274
|
+
serverOverrideAll,
|
|
275
|
+
scenarioServerOverrides: filteredScenarioOverrides
|
|
276
|
+
});
|
|
198
277
|
}
|
|
199
|
-
catch {
|
|
200
|
-
|
|
278
|
+
catch (error) {
|
|
279
|
+
asJson(res, 400, { error: error instanceof Error ? error.message : String(error) });
|
|
280
|
+
return true;
|
|
201
281
|
}
|
|
282
|
+
// Resolve lazily in advanceQueue so runtime overrides are always reflected.
|
|
283
|
+
const oauthServerNames = undefined;
|
|
202
284
|
const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
|
|
203
285
|
const runParamsObj = {
|
|
204
286
|
configPath,
|
|
@@ -206,9 +288,10 @@ export async function handleRunsRoutes(params) {
|
|
|
206
288
|
scenarioId,
|
|
207
289
|
scenarioIds,
|
|
208
290
|
requestedAgents,
|
|
209
|
-
applySnapshotEval,
|
|
210
291
|
runNote,
|
|
211
|
-
oauthServerNames
|
|
292
|
+
oauthServerNames,
|
|
293
|
+
serverOverrideAll,
|
|
294
|
+
scenarioServerOverrides
|
|
212
295
|
};
|
|
213
296
|
const job = {
|
|
214
297
|
id: jobId,
|
|
@@ -232,6 +315,8 @@ export async function handleRunsRoutes(params) {
|
|
|
232
315
|
scenarioIds: scenarioIds ?? null,
|
|
233
316
|
agents: requestedAgents ?? null,
|
|
234
317
|
runNote: runNote ?? null,
|
|
318
|
+
serverOverrideAll: serverOverrideAll ?? null,
|
|
319
|
+
scenarioServerOverrides: scenarioServerOverrides ?? null,
|
|
235
320
|
position: runQueueState.queue.length
|
|
236
321
|
}
|
|
237
322
|
});
|
|
@@ -556,19 +641,39 @@ function toCoreExtractRules(extractRules) {
|
|
|
556
641
|
}
|
|
557
642
|
return rules;
|
|
558
643
|
}
|
|
559
|
-
function resolveOAuthServersForJob(job) {
|
|
560
|
-
if (job.runParams.oauthServerNames !== undefined)
|
|
644
|
+
function resolveOAuthServersForJob(job, librariesDir) {
|
|
645
|
+
if (job.runParams.oauthServerNames !== undefined)
|
|
561
646
|
return job.runParams.oauthServerNames;
|
|
562
|
-
}
|
|
563
647
|
try {
|
|
564
|
-
const loaded = loadConfig(job.runParams.configPath);
|
|
565
|
-
const
|
|
566
|
-
|
|
567
|
-
|
|
648
|
+
const loaded = loadConfig(job.runParams.configPath, { bundleRoot: librariesDir });
|
|
649
|
+
const libraries = readLibrariesFromStore(librariesDir);
|
|
650
|
+
applyLibraryEntries(loaded, libraries.agents, libraries.servers);
|
|
651
|
+
const selected = job.runParams.scenarioIds?.length
|
|
652
|
+
? selectScenarioIds(loaded.config, job.runParams.scenarioIds)
|
|
653
|
+
: job.runParams.scenarioId
|
|
654
|
+
? selectScenarioIds(loaded.config, [job.runParams.scenarioId])
|
|
655
|
+
: loaded.config;
|
|
656
|
+
const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, job.runParams.scenarioServerOverrides);
|
|
657
|
+
const withOverrides = applyRuntimeServerOverrides(selected, {
|
|
658
|
+
serverOverrideAll: job.runParams.serverOverrideAll,
|
|
659
|
+
scenarioServerOverrides: filteredScenarioOverrides
|
|
660
|
+
});
|
|
661
|
+
const effectiveServers = new Set(withOverrides.scenarios.flatMap((scenario) => scenario.servers));
|
|
662
|
+
const names = Array.from(effectiveServers).filter((name) => {
|
|
663
|
+
const config = withOverrides.servers?.[name];
|
|
664
|
+
return config?.auth?.type === 'oauth_authorization_code';
|
|
665
|
+
});
|
|
568
666
|
job.runParams.oauthServerNames = names;
|
|
569
667
|
return names;
|
|
570
668
|
}
|
|
571
|
-
catch {
|
|
669
|
+
catch (error) {
|
|
670
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
671
|
+
if (message.includes('Unknown server refs') ||
|
|
672
|
+
message.includes('Unknown scenarios in scenarioServerOverrides') ||
|
|
673
|
+
message.includes('serverOverrideAll must include at least one server id')) {
|
|
674
|
+
throw error;
|
|
675
|
+
}
|
|
676
|
+
console.warn(`[mcplab] Failed to resolve OAuth servers for queued job '${job.id}': ${message}`);
|
|
572
677
|
return [];
|
|
573
678
|
}
|
|
574
679
|
}
|
|
@@ -587,7 +692,25 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
587
692
|
continue;
|
|
588
693
|
}
|
|
589
694
|
// Pre-check OAuth before starting
|
|
590
|
-
|
|
695
|
+
let oauthServers = [];
|
|
696
|
+
try {
|
|
697
|
+
oauthServers = resolveOAuthServersForJob(nextJob, settings.librariesDir);
|
|
698
|
+
}
|
|
699
|
+
catch (error) {
|
|
700
|
+
runQueueState.queue.shift();
|
|
701
|
+
nextJob.status = 'error';
|
|
702
|
+
deps.addJobEvent(nextJob, {
|
|
703
|
+
type: 'error',
|
|
704
|
+
ts: new Date().toISOString(),
|
|
705
|
+
payload: {
|
|
706
|
+
message: error instanceof Error ? error.message : String(error)
|
|
707
|
+
}
|
|
708
|
+
});
|
|
709
|
+
for (const client of nextJob.clients)
|
|
710
|
+
client.end();
|
|
711
|
+
nextJob.clients.clear();
|
|
712
|
+
continue;
|
|
713
|
+
}
|
|
591
714
|
if (oauthServers.length > 0) {
|
|
592
715
|
const authStatus = oauthSessionManager.checkServersAuthStatus(oauthServers);
|
|
593
716
|
const needsAuth = authStatus.filter((s) => s.status === 'auth_required');
|
|
@@ -622,7 +745,9 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
622
745
|
scenarioId: nextJob.runParams.scenarioId ?? null,
|
|
623
746
|
scenarioIds: nextJob.runParams.scenarioIds ?? null,
|
|
624
747
|
agents: nextJob.runParams.requestedAgents ?? null,
|
|
625
|
-
runNote: nextJob.runParams.runNote ?? null
|
|
748
|
+
runNote: nextJob.runParams.runNote ?? null,
|
|
749
|
+
serverOverrideAll: nextJob.runParams.serverOverrideAll ?? null,
|
|
750
|
+
scenarioServerOverrides: nextJob.runParams.scenarioServerOverrides ?? null
|
|
626
751
|
}
|
|
627
752
|
});
|
|
628
753
|
void executeRunJob(nextJob, settings, jobs, runQueueState, oauthSessionManager, deps);
|
|
@@ -634,8 +759,8 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
634
759
|
}
|
|
635
760
|
}
|
|
636
761
|
async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionManager, deps) {
|
|
637
|
-
const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents,
|
|
638
|
-
const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents,
|
|
762
|
+
const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pkgVersion } = deps;
|
|
763
|
+
const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, runNote, serverOverrideAll, scenarioServerOverrides } = job.runParams;
|
|
639
764
|
try {
|
|
640
765
|
addJobEvent(job, {
|
|
641
766
|
type: 'log',
|
|
@@ -643,8 +768,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
643
768
|
payload: { message: `Loading MCP Evaluation config: ${configPath}` }
|
|
644
769
|
});
|
|
645
770
|
const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
|
|
646
|
-
const { agents: libraryAgents } = readLibraries(settings.librariesDir);
|
|
647
|
-
|
|
771
|
+
const { agents: libraryAgents, servers: libraryServers } = readLibraries(settings.librariesDir);
|
|
772
|
+
applyLibraryEntries(loaded, libraryAgents, libraryServers);
|
|
648
773
|
addJobEvent(job, {
|
|
649
774
|
type: 'log',
|
|
650
775
|
ts: new Date().toISOString(),
|
|
@@ -678,7 +803,30 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
678
803
|
message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
|
|
679
804
|
}
|
|
680
805
|
});
|
|
681
|
-
const
|
|
806
|
+
const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selectedBaseScenarios, scenarioServerOverrides);
|
|
807
|
+
const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseScenarios, {
|
|
808
|
+
serverOverrideAll,
|
|
809
|
+
scenarioServerOverrides: filteredScenarioOverrides
|
|
810
|
+
});
|
|
811
|
+
const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
|
|
812
|
+
addJobEvent(job, {
|
|
813
|
+
type: 'log',
|
|
814
|
+
ts: new Date().toISOString(),
|
|
815
|
+
payload: {
|
|
816
|
+
message: `Applied runtime server overrides: global=${serverOverrideAll?.length ?? 0} scenario-specific=${Object.keys(filteredScenarioOverrides ?? {}).length}`
|
|
817
|
+
}
|
|
818
|
+
});
|
|
819
|
+
const effectiveScenarioServers = runtimeOverriddenConfig.scenarios
|
|
820
|
+
.map((scenario) => `${scenario.id}=[${scenario.servers.join(', ')}]`)
|
|
821
|
+
.join('; ');
|
|
822
|
+
addJobEvent(job, {
|
|
823
|
+
type: 'log',
|
|
824
|
+
ts: new Date().toISOString(),
|
|
825
|
+
payload: {
|
|
826
|
+
message: `Effective MCP servers per scenario: ${effectiveScenarioServers || '(none)'}`
|
|
827
|
+
}
|
|
828
|
+
});
|
|
829
|
+
const resolvedAgents = resolveRunSelectedAgents(runtimeOverriddenConfig, requestedAgents);
|
|
682
830
|
const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
|
|
683
831
|
addJobEvent(job, {
|
|
684
832
|
type: 'log',
|
|
@@ -689,7 +837,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
689
837
|
: `Using resolved default agents: ${resolvedAgentList.join(', ')}`
|
|
690
838
|
}
|
|
691
839
|
});
|
|
692
|
-
const expandedConfig = expandConfigForAgents(
|
|
840
|
+
const expandedConfig = expandConfigForAgents(runtimeOverriddenConfig, resolvedAgents);
|
|
693
841
|
addJobEvent(job, {
|
|
694
842
|
type: 'log',
|
|
695
843
|
ts: new Date().toISOString(),
|
|
@@ -697,9 +845,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
697
845
|
message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
|
|
698
846
|
}
|
|
699
847
|
});
|
|
700
|
-
const
|
|
701
|
-
|
|
702
|
-
.map(([serverName]) => serverName);
|
|
848
|
+
const usedServerNames = new Set(expandedConfig.scenarios.flatMap((scenario) => scenario.servers));
|
|
849
|
+
const oauthServers = Array.from(usedServerNames).filter((serverName) => expandedConfig.servers[serverName]?.auth?.type === 'oauth_authorization_code');
|
|
703
850
|
const mcpServerAuthHeaders = oauthServers.length > 0
|
|
704
851
|
? await oauthSessionManager.getAuthHeadersForServers(oauthServers)
|
|
705
852
|
: undefined;
|
|
@@ -733,7 +880,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
733
880
|
runsPerScenario,
|
|
734
881
|
scenarioId,
|
|
735
882
|
runNote,
|
|
736
|
-
configHash:
|
|
883
|
+
configHash: effectiveConfigHash,
|
|
737
884
|
cliVersion: pkgVersion,
|
|
738
885
|
runsDir: settings.runsDir,
|
|
739
886
|
mcpServerAuthHeaders,
|
|
@@ -762,97 +909,6 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
762
909
|
message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
|
|
763
910
|
}
|
|
764
911
|
});
|
|
765
|
-
if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
|
|
766
|
-
addJobEvent(job, {
|
|
767
|
-
type: 'log',
|
|
768
|
-
ts: new Date().toISOString(),
|
|
769
|
-
payload: { message: 'Applying snapshot evaluation policy ...' }
|
|
770
|
-
});
|
|
771
|
-
const policy = expandedConfig.snapshot_eval;
|
|
772
|
-
const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
|
|
773
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
774
|
-
.map((scenario) => scenario.id));
|
|
775
|
-
const scenarioBaselineMap = new Map();
|
|
776
|
-
for (const scenario of selectedBaseScenarios.scenarios) {
|
|
777
|
-
if (scenario.snapshot_eval?.enabled === false)
|
|
778
|
-
continue;
|
|
779
|
-
const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
|
|
780
|
-
if (baselineId)
|
|
781
|
-
scenarioBaselineMap.set(scenario.id, baselineId);
|
|
782
|
-
}
|
|
783
|
-
const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
|
|
784
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
785
|
-
.filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
|
|
786
|
-
.map((scenario) => scenario.id);
|
|
787
|
-
if (scenariosWithoutBaseline.length > 0) {
|
|
788
|
-
addJobEvent(job, {
|
|
789
|
-
type: 'log',
|
|
790
|
-
ts: new Date().toISOString(),
|
|
791
|
-
payload: {
|
|
792
|
-
message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
|
|
793
|
-
}
|
|
794
|
-
});
|
|
795
|
-
}
|
|
796
|
-
const comparisons = [];
|
|
797
|
-
const scenarioIdsByBaseline = new Map();
|
|
798
|
-
for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
|
|
799
|
-
const list = scenarioIdsByBaseline.get(baselineId) ?? [];
|
|
800
|
-
list.push(scenarioIdItem);
|
|
801
|
-
scenarioIdsByBaseline.set(baselineId, list);
|
|
802
|
-
}
|
|
803
|
-
for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
|
|
804
|
-
addJobEvent(job, {
|
|
805
|
-
type: 'log',
|
|
806
|
-
ts: new Date().toISOString(),
|
|
807
|
-
payload: {
|
|
808
|
-
message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
|
|
809
|
-
}
|
|
810
|
-
});
|
|
811
|
-
const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
|
|
812
|
-
const fullComparison = compareRunToSnapshot(results, snapshot);
|
|
813
|
-
comparisons.push({
|
|
814
|
-
...fullComparison,
|
|
815
|
-
scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
|
|
816
|
-
});
|
|
817
|
-
}
|
|
818
|
-
if (comparisons.length > 0) {
|
|
819
|
-
applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
|
|
820
|
-
addJobEvent(job, {
|
|
821
|
-
type: 'log',
|
|
822
|
-
ts: new Date().toISOString(),
|
|
823
|
-
payload: {
|
|
824
|
-
message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
|
|
825
|
-
}
|
|
826
|
-
});
|
|
827
|
-
}
|
|
828
|
-
else {
|
|
829
|
-
addJobEvent(job, {
|
|
830
|
-
type: 'log',
|
|
831
|
-
ts: new Date().toISOString(),
|
|
832
|
-
payload: {
|
|
833
|
-
message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
|
|
834
|
-
}
|
|
835
|
-
});
|
|
836
|
-
}
|
|
837
|
-
}
|
|
838
|
-
else if (applySnapshotEval) {
|
|
839
|
-
addJobEvent(job, {
|
|
840
|
-
type: 'log',
|
|
841
|
-
ts: new Date().toISOString(),
|
|
842
|
-
payload: {
|
|
843
|
-
message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
|
|
844
|
-
}
|
|
845
|
-
});
|
|
846
|
-
}
|
|
847
|
-
else {
|
|
848
|
-
addJobEvent(job, {
|
|
849
|
-
type: 'log',
|
|
850
|
-
ts: new Date().toISOString(),
|
|
851
|
-
payload: {
|
|
852
|
-
message: 'Snapshot evaluation skipped for this run (disabled in run request)'
|
|
853
|
-
}
|
|
854
|
-
});
|
|
855
|
-
}
|
|
856
912
|
addJobEvent(job, {
|
|
857
913
|
type: 'log',
|
|
858
914
|
ts: new Date().toISOString(),
|
|
@@ -874,8 +930,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
874
930
|
payload: {
|
|
875
931
|
runId: results.metadata.run_id,
|
|
876
932
|
runDir,
|
|
877
|
-
summary: results.summary
|
|
878
|
-
snapshotEval: results.metadata.snapshot_eval ?? null
|
|
933
|
+
summary: results.summary
|
|
879
934
|
}
|
|
880
935
|
});
|
|
881
936
|
job.status = 'completed';
|
|
@@ -933,9 +988,9 @@ function formatRunProgressMessage(event) {
|
|
|
933
988
|
case 'run_started':
|
|
934
989
|
return `Run initialized (id: ${event.runId}, ${event.totalScenarioRuns} scenario run(s))`;
|
|
935
990
|
case 'mcp_connect_started':
|
|
936
|
-
return `Connecting to ${event.serverCount} MCP server(s) ...`;
|
|
991
|
+
return `Connecting to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')} ...`;
|
|
937
992
|
case 'mcp_connect_finished':
|
|
938
|
-
return `Connected to ${event.serverCount} MCP server(s)`;
|
|
993
|
+
return `Connected to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')}`;
|
|
939
994
|
case 'scenario_run_started':
|
|
940
995
|
return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
|
|
941
996
|
case 'scenario_run_finished':
|