@inspectr/mcplab 1.15.0 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -35
- package/dist/app/assets/index-17cleCWQ.js +254 -0
- package/dist/app/assets/index-Bekohuot.css +1 -0
- package/dist/app/index.html +2 -2
- package/dist/app-server/app-context.d.ts +0 -22
- package/dist/app-server/app-context.d.ts.map +1 -1
- package/dist/app-server/evals-routes.d.ts.map +1 -1
- package/dist/app-server/evals-routes.js +1 -38
- package/dist/app-server/evals-routes.js.map +1 -1
- package/dist/app-server/libraries-store.d.ts.map +1 -1
- package/dist/app-server/libraries-store.js +2 -3
- package/dist/app-server/libraries-store.js.map +1 -1
- package/dist/app-server/markdown-reports.d.ts.map +1 -1
- package/dist/app-server/markdown-reports.js +64 -4
- package/dist/app-server/markdown-reports.js.map +1 -1
- package/dist/app-server/result-assistant-domain.js +1 -2
- package/dist/app-server/result-assistant-domain.js.map +1 -1
- package/dist/app-server/result-assistant.d.ts.map +1 -1
- package/dist/app-server/result-assistant.js +7 -1
- package/dist/app-server/result-assistant.js.map +1 -1
- package/dist/app-server/router.d.ts.map +1 -1
- package/dist/app-server/router.js +0 -24
- package/dist/app-server/router.js.map +1 -1
- package/dist/app-server/runs-routes.d.ts +15 -4
- package/dist/app-server/runs-routes.d.ts.map +1 -1
- package/dist/app-server/runs-routes.js +324 -136
- package/dist/app-server/runs-routes.js.map +1 -1
- package/dist/app-server/runs-store.d.ts +10 -0
- package/dist/app-server/runs-store.d.ts.map +1 -1
- package/dist/app-server/runs-store.js +27 -0
- package/dist/app-server/runs-store.js.map +1 -1
- package/dist/app-server/scenario-assistant-domain.d.ts +0 -16
- package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant-domain.js +5 -8
- package/dist/app-server/scenario-assistant-domain.js.map +1 -1
- package/dist/app-server/scenario-assistant.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant.js +7 -1
- package/dist/app-server/scenario-assistant.js.map +1 -1
- package/dist/app-server/snapshots-routes.d.ts +1 -13
- package/dist/app-server/snapshots-routes.d.ts.map +1 -1
- package/dist/app-server/snapshots-routes.js +9 -79
- package/dist/app-server/snapshots-routes.js.map +1 -1
- package/dist/app-server/tool-analysis.d.ts.map +1 -1
- package/dist/app-server/tool-analysis.js +25 -1
- package/dist/app-server/tool-analysis.js.map +1 -1
- package/dist/app-server/types.d.ts +0 -2
- package/dist/app-server/types.d.ts.map +1 -1
- package/dist/cli.js +79 -288
- package/dist/cli.js.map +1 -1
- package/dist/interactive-helpers.d.ts +0 -1
- package/dist/interactive-helpers.d.ts.map +1 -1
- package/dist/interactive-helpers.js +0 -3
- package/dist/interactive-helpers.js.map +1 -1
- package/package.json +4 -4
- package/dist/app/assets/index-BH8cCzoo.css +0 -1
- package/dist/app/assets/index-C2W0NrXX.js +0 -250
|
@@ -2,33 +2,108 @@ import { randomUUID } from 'node:crypto';
|
|
|
2
2
|
import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
|
|
3
3
|
import { tmpdir } from 'node:os';
|
|
4
4
|
import { isAbsolute, join, relative, resolve } from 'node:path';
|
|
5
|
-
import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown } from '@inspectr/mcplab-core';
|
|
5
|
+
import { McpClientManager, loadConfig, hashConfig, runAll, renderSummaryMarkdown, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
|
|
6
6
|
import { renderReport } from '@inspectr/mcplab-reporting';
|
|
7
7
|
import { OAuthAuthorizationRequiredError } from './oauth-session-manager.js';
|
|
8
|
+
import { selectScenarioIds } from './runs-store.js';
|
|
9
|
+
import { readLibraries as readLibrariesFromStore } from './libraries-store.js';
|
|
10
|
+
/**
 * Build a new config whose `agents` and `servers` maps contain the library
 * entries as well as the config's own entries.
 *
 * Entries declared directly in the config take precedence over library
 * entries that share the same key. The input config is not mutated.
 *
 * @param {object} config - Loaded evaluation config.
 * @param {object} [libraryAgents] - Agent definitions keyed by agent name.
 * @param {object} [libraryServers] - Server definitions keyed by server name.
 * @returns {object} Shallow copy of `config` with merged `agents` and `servers`.
 */
export function mergeLibraryEntriesIntoConfig(config, libraryAgents, libraryServers) {
    // Config-level entries are spread last so they override library entries.
    const agents = { ...libraryAgents, ...config.agents };
    const servers = { ...libraryServers, ...config.servers };
    return { ...config, agents, servers };
}
|
|
17
|
+
/**
 * Merge library agents and servers into an already loaded config, in place,
 * and recompute the config hash so it reflects the merged content.
 *
 * @param {{config: object, hash: string}} loaded - Loaded config holder; mutated.
 * @param {object} [libraryAgents] - Agent definitions keyed by agent name.
 * @param {object} [libraryServers] - Server definitions keyed by server name.
 * @returns {void}
 */
export function applyLibraryEntries(loaded, libraryAgents, libraryServers) {
    const mergedConfig = mergeLibraryEntriesIntoConfig(loaded.config, libraryAgents, libraryServers);
    loaded.config = mergedConfig;
    // The hash is derived from the merged config, not the one originally loaded.
    loaded.hash = hashConfig(loaded.config);
}
|
|
21
|
+
/**
 * Keep only the per-scenario server overrides whose scenario id is present in
 * the selected config.
 *
 * @param {{scenarios: Array<{id: string}>}} selectedConfig - Config limited to the scenarios that will run.
 * @param {Object<string, string[]>} [scenarioServerOverrides] - Overrides keyed by scenario id.
 * @returns {Object<string, string[]>|undefined} The retained overrides, or
 *   `undefined` when no overrides were given or none match a selected scenario.
 */
function filterScenarioOverridesToSelectedScenarios(selectedConfig, scenarioServerOverrides) {
    if (!scenarioServerOverrides) {
        return undefined;
    }
    const allowedIds = new Set(selectedConfig.scenarios.map((entry) => entry.id));
    const keptPairs = [];
    for (const pair of Object.entries(scenarioServerOverrides)) {
        if (allowedIds.has(pair[0])) {
            keptPairs.push(pair);
        }
    }
    // An empty result is reported as "no overrides" rather than an empty object.
    if (keptPairs.length === 0) {
        return undefined;
    }
    return Object.fromEntries(keptPairs);
}
|
|
28
|
+
// Backward-compatible exports used by existing tests/imports.
|
|
8
29
|
/**
 * Backward-compatible wrapper: merge only library agents into a config.
 *
 * Delegates to `mergeLibraryEntriesIntoConfig` with an empty server library.
 *
 * @param {object} config - Loaded evaluation config.
 * @param {object} [libraryAgents] - Agent definitions keyed by agent name.
 * @returns {object} New config object with merged agents.
 */
export function mergeLibraryAgentsIntoConfig(config, libraryAgents) {
    const noLibraryServers = {};
    return mergeLibraryEntriesIntoConfig(config, libraryAgents, noLibraryServers);
}
|
|
11
32
|
/**
 * Backward-compatible wrapper: apply only library agents to a loaded config.
 *
 * Delegates to `applyLibraryEntries` with an empty server library.
 *
 * @param {{config: object, hash: string}} loaded - Loaded config holder; mutated.
 * @param {object} [libraryAgents] - Agent definitions keyed by agent name.
 * @returns {void}
 */
export function applyLibraryAgents(loaded, libraryAgents) {
    const noLibraryServers = {};
    applyLibraryEntries(loaded, libraryAgents, noLibraryServers);
}
|
|
15
35
|
export async function handleRunsRoutes(params) {
|
|
16
36
|
const { req, res, pathname, method, settings, jobs, runQueueState, oauthSessionManager, deps } = params;
|
|
17
|
-
const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents,
|
|
37
|
+
const { parseBody, asJson, addJobEvent, sendSseEvent, ensureInsideRoot, listRuns, getRunResults, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pickDefaultAssistantAgentName, pkgVersion } = deps;
|
|
18
38
|
if (pathname === '/api/runs' && method === 'GET') {
|
|
19
39
|
const requestUrl = new URL(req.url ?? '/api/runs', 'http://localhost');
|
|
20
40
|
const since = requestUrl.searchParams.get('since') ?? undefined;
|
|
21
41
|
const until = requestUrl.searchParams.get('until') ?? undefined;
|
|
42
|
+
const scenario = requestUrl.searchParams.get('scenario') ?? undefined;
|
|
22
43
|
const lastDaysRaw = requestUrl.searchParams.get('last_days');
|
|
23
44
|
const lastDaysParsed = lastDaysRaw === null ? NaN : Number(lastDaysRaw);
|
|
24
45
|
const lastDays = Number.isFinite(lastDaysParsed) && lastDaysParsed > 0
|
|
25
46
|
? Math.floor(lastDaysParsed)
|
|
26
47
|
: undefined;
|
|
27
|
-
|
|
48
|
+
const limitRaw = Number(requestUrl.searchParams.get('limit'));
|
|
49
|
+
const offsetRaw = Number(requestUrl.searchParams.get('offset'));
|
|
50
|
+
const limit = Number.isFinite(limitRaw) ? Math.max(1, Math.min(100, Math.floor(limitRaw))) : 25;
|
|
51
|
+
const offset = Number.isFinite(offsetRaw) ? Math.max(0, Math.floor(offsetRaw)) : 0;
|
|
52
|
+
const all = listRuns(settings.runsDir, {
|
|
28
53
|
since,
|
|
29
54
|
until,
|
|
30
|
-
lastDays
|
|
31
|
-
|
|
55
|
+
lastDays,
|
|
56
|
+
scenario
|
|
57
|
+
});
|
|
58
|
+
const data = all.slice(offset, offset + limit);
|
|
59
|
+
const totalCount = all.length;
|
|
60
|
+
const hasMore = offset + data.length < totalCount;
|
|
61
|
+
const nextOffset = hasMore ? offset + data.length : null;
|
|
62
|
+
const prevOffset = offset > 0 ? Math.max(0, offset - limit) : null;
|
|
63
|
+
asJson(res, 200, {
|
|
64
|
+
object: 'list',
|
|
65
|
+
url: `${pathname}${requestUrl.search}`,
|
|
66
|
+
data,
|
|
67
|
+
has_more: hasMore,
|
|
68
|
+
total_count: totalCount,
|
|
69
|
+
next_offset: nextOffset,
|
|
70
|
+
prev_offset: prevOffset
|
|
71
|
+
});
|
|
72
|
+
return true;
|
|
73
|
+
}
|
|
74
|
+
if (pathname === '/api/runs/latest-pass-rates' && method === 'POST') {
|
|
75
|
+
const body = (await parseBody(req));
|
|
76
|
+
const requestedConfigs = Array.isArray(body.configs) ? body.configs : [];
|
|
77
|
+
const normalizedConfigs = requestedConfigs
|
|
78
|
+
.map((entry) => ({
|
|
79
|
+
id: String(entry?.id ?? '').trim(),
|
|
80
|
+
sourcePath: String(entry?.sourcePath ?? '').trim(),
|
|
81
|
+
relativePath: String(entry?.relativePath ?? '').trim(),
|
|
82
|
+
configHash: String(entry?.configHash ?? '').trim()
|
|
83
|
+
}))
|
|
84
|
+
.filter((entry) => entry.id);
|
|
85
|
+
const lastDaysRaw = Number(body.lastDays);
|
|
86
|
+
const lastDays = Number.isFinite(lastDaysRaw) && lastDaysRaw > 0 ? Math.floor(lastDaysRaw) : undefined;
|
|
87
|
+
const summaries = listRuns(settings.runsDir, { lastDays });
|
|
88
|
+
const pending = new Set(normalizedConfigs.map((entry) => entry.id));
|
|
89
|
+
const byConfigId = {};
|
|
90
|
+
for (const summary of summaries) {
|
|
91
|
+
if (pending.size === 0)
|
|
92
|
+
break;
|
|
93
|
+
const summaryPath = String(summary.configPath ?? '').trim();
|
|
94
|
+
const summaryHash = String(summary.configHash ?? '').trim();
|
|
95
|
+
for (const cfg of normalizedConfigs) {
|
|
96
|
+
if (!pending.has(cfg.id))
|
|
97
|
+
continue;
|
|
98
|
+
if ((cfg.sourcePath && cfg.sourcePath === summaryPath) ||
|
|
99
|
+
(cfg.relativePath && cfg.relativePath === summaryPath) ||
|
|
100
|
+
(cfg.configHash && cfg.configHash === summaryHash)) {
|
|
101
|
+
byConfigId[cfg.id] = summary.passRate;
|
|
102
|
+
pending.delete(cfg.id);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
asJson(res, 200, { byConfigId });
|
|
32
107
|
return true;
|
|
33
108
|
}
|
|
34
109
|
if (pathname.startsWith('/api/runs/') && pathname.endsWith('/trace') && method === 'GET') {
|
|
@@ -109,7 +184,9 @@ export async function handleRunsRoutes(params) {
|
|
|
109
184
|
runsPerScenario: j.runParams.runsPerScenario,
|
|
110
185
|
scenarioIds: j.runParams.scenarioIds ?? null,
|
|
111
186
|
agents: j.runParams.requestedAgents ?? null,
|
|
112
|
-
runNote: j.runParams.runNote ?? null
|
|
187
|
+
runNote: j.runParams.runNote ?? null,
|
|
188
|
+
serverOverrideAll: j.runParams.serverOverrideAll ?? null,
|
|
189
|
+
scenarioServerOverrides: j.runParams.scenarioServerOverrides ?? null
|
|
113
190
|
}
|
|
114
191
|
}));
|
|
115
192
|
asJson(res, 200, {
|
|
@@ -122,7 +199,9 @@ export async function handleRunsRoutes(params) {
|
|
|
122
199
|
runsPerScenario: activeJob.runParams.runsPerScenario,
|
|
123
200
|
scenarioIds: activeJob.runParams.scenarioIds ?? null,
|
|
124
201
|
agents: activeJob.runParams.requestedAgents ?? null,
|
|
125
|
-
runNote: activeJob.runParams.runNote ?? null
|
|
202
|
+
runNote: activeJob.runParams.runNote ?? null,
|
|
203
|
+
serverOverrideAll: activeJob.runParams.serverOverrideAll ?? null,
|
|
204
|
+
scenarioServerOverrides: activeJob.runParams.scenarioServerOverrides ?? null
|
|
126
205
|
}
|
|
127
206
|
}
|
|
128
207
|
: null,
|
|
@@ -182,9 +261,45 @@ export async function handleRunsRoutes(params) {
|
|
|
182
261
|
const requestedAgents = Array.isArray(body.agents)
|
|
183
262
|
? body.agents.map((agent) => String(agent).trim()).filter(Boolean)
|
|
184
263
|
: undefined;
|
|
185
|
-
const applySnapshotEval = body.applySnapshotEval !== false;
|
|
186
264
|
const runNoteRaw = typeof body.runNote === 'string' ? body.runNote.trim() : '';
|
|
187
265
|
const runNote = runNoteRaw ? runNoteRaw.slice(0, 500) : undefined;
|
|
266
|
+
const serverOverrideAll = Array.isArray(body.serverOverrideAll)
|
|
267
|
+
? body.serverOverrideAll.map((id) => String(id).trim()).filter(Boolean)
|
|
268
|
+
: undefined;
|
|
269
|
+
if (Array.isArray(body.serverOverrideAll) &&
|
|
270
|
+
(!serverOverrideAll || serverOverrideAll.length === 0)) {
|
|
271
|
+
asJson(res, 400, { error: 'serverOverrideAll must include at least one server id' });
|
|
272
|
+
return true;
|
|
273
|
+
}
|
|
274
|
+
if (body.scenarioServerOverrides !== undefined &&
|
|
275
|
+
(typeof body.scenarioServerOverrides !== 'object' ||
|
|
276
|
+
body.scenarioServerOverrides === null ||
|
|
277
|
+
Array.isArray(body.scenarioServerOverrides))) {
|
|
278
|
+
asJson(res, 400, {
|
|
279
|
+
error: 'scenarioServerOverrides must be an object of scenarioId -> string[]'
|
|
280
|
+
});
|
|
281
|
+
return true;
|
|
282
|
+
}
|
|
283
|
+
let scenarioServerOverrides;
|
|
284
|
+
if (body.scenarioServerOverrides && typeof body.scenarioServerOverrides === 'object') {
|
|
285
|
+
const normalizedEntries = [];
|
|
286
|
+
for (const [rawScenarioId, rawServerIds] of Object.entries(body.scenarioServerOverrides)) {
|
|
287
|
+
const scenarioOverrideId = String(rawScenarioId).trim();
|
|
288
|
+
if (!scenarioOverrideId)
|
|
289
|
+
continue;
|
|
290
|
+
if (!Array.isArray(rawServerIds)) {
|
|
291
|
+
asJson(res, 400, {
|
|
292
|
+
error: `scenarioServerOverrides.${scenarioOverrideId} must be an array of server ids`
|
|
293
|
+
});
|
|
294
|
+
return true;
|
|
295
|
+
}
|
|
296
|
+
normalizedEntries.push([
|
|
297
|
+
scenarioOverrideId,
|
|
298
|
+
rawServerIds.map((id) => String(id).trim()).filter(Boolean)
|
|
299
|
+
]);
|
|
300
|
+
}
|
|
301
|
+
scenarioServerOverrides = Object.fromEntries(normalizedEntries);
|
|
302
|
+
}
|
|
188
303
|
if (!configPathRaw) {
|
|
189
304
|
asJson(res, 400, { error: 'configPath is required' });
|
|
190
305
|
return true;
|
|
@@ -200,17 +315,27 @@ export async function handleRunsRoutes(params) {
|
|
|
200
315
|
asJson(res, 404, { error: `Config not found: ${configPath}` });
|
|
201
316
|
return true;
|
|
202
317
|
}
|
|
203
|
-
// Eagerly cache OAuth server names to avoid re-parsing config in advanceQueue
|
|
204
|
-
let oauthServerNames;
|
|
205
318
|
try {
|
|
206
|
-
const loaded = loadConfig(configPath);
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
319
|
+
const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
|
|
320
|
+
const libraries = readLibraries(settings.librariesDir);
|
|
321
|
+
applyLibraryEntries(loaded, libraries.agents, libraries.servers);
|
|
322
|
+
const selected = scenarioIds?.length
|
|
323
|
+
? deps.selectScenarioIds(loaded.config, scenarioIds)
|
|
324
|
+
: scenarioId
|
|
325
|
+
? deps.selectScenarioIds(loaded.config, [scenarioId])
|
|
326
|
+
: loaded.config;
|
|
327
|
+
const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, scenarioServerOverrides);
|
|
328
|
+
applyRuntimeServerOverrides(selected, {
|
|
329
|
+
serverOverrideAll,
|
|
330
|
+
scenarioServerOverrides: filteredScenarioOverrides
|
|
331
|
+
});
|
|
210
332
|
}
|
|
211
|
-
catch {
|
|
212
|
-
|
|
333
|
+
catch (error) {
|
|
334
|
+
asJson(res, 400, { error: error instanceof Error ? error.message : String(error) });
|
|
335
|
+
return true;
|
|
213
336
|
}
|
|
337
|
+
// Resolve lazily in advanceQueue so runtime overrides are always reflected.
|
|
338
|
+
const oauthServerNames = undefined;
|
|
214
339
|
const jobId = `run-${Date.now()}-${randomUUID().slice(0, 8)}`;
|
|
215
340
|
const runParamsObj = {
|
|
216
341
|
configPath,
|
|
@@ -218,9 +343,10 @@ export async function handleRunsRoutes(params) {
|
|
|
218
343
|
scenarioId,
|
|
219
344
|
scenarioIds,
|
|
220
345
|
requestedAgents,
|
|
221
|
-
applySnapshotEval,
|
|
222
346
|
runNote,
|
|
223
|
-
oauthServerNames
|
|
347
|
+
oauthServerNames,
|
|
348
|
+
serverOverrideAll,
|
|
349
|
+
scenarioServerOverrides
|
|
224
350
|
};
|
|
225
351
|
const job = {
|
|
226
352
|
id: jobId,
|
|
@@ -244,6 +370,8 @@ export async function handleRunsRoutes(params) {
|
|
|
244
370
|
scenarioIds: scenarioIds ?? null,
|
|
245
371
|
agents: requestedAgents ?? null,
|
|
246
372
|
runNote: runNote ?? null,
|
|
373
|
+
serverOverrideAll: serverOverrideAll ?? null,
|
|
374
|
+
scenarioServerOverrides: scenarioServerOverrides ?? null,
|
|
247
375
|
position: runQueueState.queue.length
|
|
248
376
|
}
|
|
249
377
|
});
|
|
@@ -568,19 +696,39 @@ function toCoreExtractRules(extractRules) {
|
|
|
568
696
|
}
|
|
569
697
|
return rules;
|
|
570
698
|
}
|
|
571
|
-
/**
 * Work out which servers used by a queued job are configured with the
 * `oauth_authorization_code` auth type, taking library entries, scenario
 * selection and runtime server overrides into account.
 *
 * The result is cached on `job.runParams.oauthServerNames`; a cached value is
 * returned as-is on later calls.
 *
 * @param {object} job - Queued job; reads and updates `job.runParams`.
 * @param {string} librariesDir - Directory used as bundle root and library source.
 * @returns {string[]} Names of the OAuth servers the job will use. Returns an
 *   empty array (without caching) when resolution fails for a reason other
 *   than an invalid override.
 * @throws Rethrows errors whose message identifies an invalid server override,
 *   so the caller can fail the job instead of running it.
 */
function resolveOAuthServersForJob(job, librariesDir) {
    const { runParams } = job;
    if (runParams.oauthServerNames !== undefined) {
        return runParams.oauthServerNames;
    }
    try {
        const loaded = loadConfig(runParams.configPath, { bundleRoot: librariesDir });
        const libraries = readLibrariesFromStore(librariesDir);
        applyLibraryEntries(loaded, libraries.agents, libraries.servers);
        // An explicit scenarioIds list wins over a single scenarioId; otherwise use the full config.
        let selected = loaded.config;
        if (runParams.scenarioIds?.length) {
            selected = selectScenarioIds(loaded.config, runParams.scenarioIds);
        }
        else if (runParams.scenarioId) {
            selected = selectScenarioIds(loaded.config, [runParams.scenarioId]);
        }
        const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selected, runParams.scenarioServerOverrides);
        const withOverrides = applyRuntimeServerOverrides(selected, {
            serverOverrideAll: runParams.serverOverrideAll,
            scenarioServerOverrides: filteredScenarioOverrides
        });
        // Only servers actually referenced by the scenarios to run are considered.
        const effectiveServers = new Set(withOverrides.scenarios.flatMap((scenario) => scenario.servers));
        const names = [];
        for (const serverName of effectiveServers) {
            if (withOverrides.servers?.[serverName]?.auth?.type === 'oauth_authorization_code') {
                names.push(serverName);
            }
        }
        runParams.oauthServerNames = names;
        return names;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        // Override validation failures are recognised by message text and propagated.
        const overrideErrorMarkers = [
            'Unknown server refs',
            'Unknown scenarios in scenarioServerOverrides',
            'serverOverrideAll must include at least one server id'
        ];
        if (overrideErrorMarkers.some((marker) => message.includes(marker))) {
            throw error;
        }
        console.warn(`[mcplab] Failed to resolve OAuth servers for queued job '${job.id}': ${message}`);
        return [];
    }
}
|
|
@@ -599,7 +747,25 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
599
747
|
continue;
|
|
600
748
|
}
|
|
601
749
|
// Pre-check OAuth before starting
|
|
602
|
-
|
|
750
|
+
let oauthServers = [];
|
|
751
|
+
try {
|
|
752
|
+
oauthServers = resolveOAuthServersForJob(nextJob, settings.librariesDir);
|
|
753
|
+
}
|
|
754
|
+
catch (error) {
|
|
755
|
+
runQueueState.queue.shift();
|
|
756
|
+
nextJob.status = 'error';
|
|
757
|
+
deps.addJobEvent(nextJob, {
|
|
758
|
+
type: 'error',
|
|
759
|
+
ts: new Date().toISOString(),
|
|
760
|
+
payload: {
|
|
761
|
+
message: error instanceof Error ? error.message : String(error)
|
|
762
|
+
}
|
|
763
|
+
});
|
|
764
|
+
for (const client of nextJob.clients)
|
|
765
|
+
client.end();
|
|
766
|
+
nextJob.clients.clear();
|
|
767
|
+
continue;
|
|
768
|
+
}
|
|
603
769
|
if (oauthServers.length > 0) {
|
|
604
770
|
const authStatus = oauthSessionManager.checkServersAuthStatus(oauthServers);
|
|
605
771
|
const needsAuth = authStatus.filter((s) => s.status === 'auth_required');
|
|
@@ -634,7 +800,9 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
634
800
|
scenarioId: nextJob.runParams.scenarioId ?? null,
|
|
635
801
|
scenarioIds: nextJob.runParams.scenarioIds ?? null,
|
|
636
802
|
agents: nextJob.runParams.requestedAgents ?? null,
|
|
637
|
-
runNote: nextJob.runParams.runNote ?? null
|
|
803
|
+
runNote: nextJob.runParams.runNote ?? null,
|
|
804
|
+
serverOverrideAll: nextJob.runParams.serverOverrideAll ?? null,
|
|
805
|
+
scenarioServerOverrides: nextJob.runParams.scenarioServerOverrides ?? null
|
|
638
806
|
}
|
|
639
807
|
});
|
|
640
808
|
void executeRunJob(nextJob, settings, jobs, runQueueState, oauthSessionManager, deps);
|
|
@@ -646,8 +814,8 @@ async function advanceQueue(jobs, runQueueState, settings, oauthSessionManager,
|
|
|
646
814
|
}
|
|
647
815
|
}
|
|
648
816
|
async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionManager, deps) {
|
|
649
|
-
const { addJobEvent, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents,
|
|
650
|
-
const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents,
|
|
817
|
+
const { addJobEvent, getScenarioRunTraceRecords, selectScenarioIds, expandConfigForAgents, resolveRunSelectedAgents, readLibraries, pkgVersion } = deps;
|
|
818
|
+
const { configPath, runsPerScenario, scenarioId, scenarioIds, requestedAgents, runNote, serverOverrideAll, scenarioServerOverrides } = job.runParams;
|
|
651
819
|
try {
|
|
652
820
|
addJobEvent(job, {
|
|
653
821
|
type: 'log',
|
|
@@ -655,8 +823,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
655
823
|
payload: { message: `Loading MCP Evaluation config: ${configPath}` }
|
|
656
824
|
});
|
|
657
825
|
const loaded = loadConfig(configPath, { bundleRoot: settings.librariesDir });
|
|
658
|
-
const { agents: libraryAgents } = readLibraries(settings.librariesDir);
|
|
659
|
-
|
|
826
|
+
const { agents: libraryAgents, servers: libraryServers } = readLibraries(settings.librariesDir);
|
|
827
|
+
applyLibraryEntries(loaded, libraryAgents, libraryServers);
|
|
660
828
|
addJobEvent(job, {
|
|
661
829
|
type: 'log',
|
|
662
830
|
ts: new Date().toISOString(),
|
|
@@ -690,7 +858,30 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
690
858
|
message: `Selected ${selectedBaseScenarios.scenarios.length} base scenario(s)`
|
|
691
859
|
}
|
|
692
860
|
});
|
|
693
|
-
const
|
|
861
|
+
const filteredScenarioOverrides = filterScenarioOverridesToSelectedScenarios(selectedBaseScenarios, scenarioServerOverrides);
|
|
862
|
+
const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseScenarios, {
|
|
863
|
+
serverOverrideAll,
|
|
864
|
+
scenarioServerOverrides: filteredScenarioOverrides
|
|
865
|
+
});
|
|
866
|
+
const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
|
|
867
|
+
addJobEvent(job, {
|
|
868
|
+
type: 'log',
|
|
869
|
+
ts: new Date().toISOString(),
|
|
870
|
+
payload: {
|
|
871
|
+
message: `Applied runtime server overrides: global=${serverOverrideAll?.length ?? 0} scenario-specific=${Object.keys(filteredScenarioOverrides ?? {}).length}`
|
|
872
|
+
}
|
|
873
|
+
});
|
|
874
|
+
const effectiveScenarioServers = runtimeOverriddenConfig.scenarios
|
|
875
|
+
.map((scenario) => `${scenario.id}=[${scenario.servers.join(', ')}]`)
|
|
876
|
+
.join('; ');
|
|
877
|
+
addJobEvent(job, {
|
|
878
|
+
type: 'log',
|
|
879
|
+
ts: new Date().toISOString(),
|
|
880
|
+
payload: {
|
|
881
|
+
message: `Effective MCP servers per scenario: ${effectiveScenarioServers || '(none)'}`
|
|
882
|
+
}
|
|
883
|
+
});
|
|
884
|
+
const resolvedAgents = resolveRunSelectedAgents(runtimeOverriddenConfig, requestedAgents);
|
|
694
885
|
const resolvedAgentList = Array.isArray(resolvedAgents) ? resolvedAgents : [];
|
|
695
886
|
addJobEvent(job, {
|
|
696
887
|
type: 'log',
|
|
@@ -701,7 +892,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
701
892
|
: `Using resolved default agents: ${resolvedAgentList.join(', ')}`
|
|
702
893
|
}
|
|
703
894
|
});
|
|
704
|
-
const expandedConfig = expandConfigForAgents(
|
|
895
|
+
const expandedConfig = expandConfigForAgents(runtimeOverriddenConfig, resolvedAgents);
|
|
705
896
|
addJobEvent(job, {
|
|
706
897
|
type: 'log',
|
|
707
898
|
ts: new Date().toISOString(),
|
|
@@ -709,9 +900,8 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
709
900
|
message: `Expanded to ${expandedConfig.scenarios.length} executable scenario run(s) across selected agents`
|
|
710
901
|
}
|
|
711
902
|
});
|
|
712
|
-
const
|
|
713
|
-
|
|
714
|
-
.map(([serverName]) => serverName);
|
|
903
|
+
const usedServerNames = new Set(expandedConfig.scenarios.flatMap((scenario) => scenario.servers));
|
|
904
|
+
const oauthServers = Array.from(usedServerNames).filter((serverName) => expandedConfig.servers[serverName]?.auth?.type === 'oauth_authorization_code');
|
|
715
905
|
const mcpServerAuthHeaders = oauthServers.length > 0
|
|
716
906
|
? await oauthSessionManager.getAuthHeadersForServers(oauthServers)
|
|
717
907
|
: undefined;
|
|
@@ -745,7 +935,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
745
935
|
runsPerScenario,
|
|
746
936
|
scenarioId,
|
|
747
937
|
runNote,
|
|
748
|
-
configHash:
|
|
938
|
+
configHash: effectiveConfigHash,
|
|
749
939
|
cliVersion: pkgVersion,
|
|
750
940
|
runsDir: settings.runsDir,
|
|
751
941
|
mcpServerAuthHeaders,
|
|
@@ -767,6 +957,23 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
767
957
|
if (loaded.config.name && loaded.config.name.trim().length > 0) {
|
|
768
958
|
results.metadata.config_name = loaded.config.name.trim();
|
|
769
959
|
}
|
|
960
|
+
results.metadata.rerun_agents = [...resolvedAgentList];
|
|
961
|
+
results.metadata.rerun_scenario_ids = selectedBaseScenarios.scenarios.map((scenario) => scenario.id);
|
|
962
|
+
if (serverOverrideAll && serverOverrideAll.length > 0) {
|
|
963
|
+
results.metadata.rerun_server_override_all = [...serverOverrideAll];
|
|
964
|
+
}
|
|
965
|
+
else {
|
|
966
|
+
delete results.metadata.rerun_server_override_all;
|
|
967
|
+
}
|
|
968
|
+
if (filteredScenarioOverrides && Object.keys(filteredScenarioOverrides).length > 0) {
|
|
969
|
+
results.metadata.rerun_scenario_server_overrides = Object.fromEntries(Object.entries(filteredScenarioOverrides).map(([scenarioKey, serverIds]) => [
|
|
970
|
+
scenarioKey,
|
|
971
|
+
[...serverIds]
|
|
972
|
+
]));
|
|
973
|
+
}
|
|
974
|
+
else {
|
|
975
|
+
delete results.metadata.rerun_scenario_server_overrides;
|
|
976
|
+
}
|
|
770
977
|
addJobEvent(job, {
|
|
771
978
|
type: 'log',
|
|
772
979
|
ts: new Date().toISOString(),
|
|
@@ -774,102 +981,13 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
774
981
|
message: `Evaluation execution finished (run id: ${results.metadata.run_id})`
|
|
775
982
|
}
|
|
776
983
|
});
|
|
777
|
-
if (applySnapshotEval && expandedConfig.snapshot_eval?.enabled) {
|
|
778
|
-
addJobEvent(job, {
|
|
779
|
-
type: 'log',
|
|
780
|
-
ts: new Date().toISOString(),
|
|
781
|
-
payload: { message: 'Applying snapshot evaluation policy ...' }
|
|
782
|
-
});
|
|
783
|
-
const policy = expandedConfig.snapshot_eval;
|
|
784
|
-
const enabledScenarioIds = new Set(selectedBaseScenarios.scenarios
|
|
785
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
786
|
-
.map((scenario) => scenario.id));
|
|
787
|
-
const scenarioBaselineMap = new Map();
|
|
788
|
-
for (const scenario of selectedBaseScenarios.scenarios) {
|
|
789
|
-
if (scenario.snapshot_eval?.enabled === false)
|
|
790
|
-
continue;
|
|
791
|
-
const baselineId = scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id;
|
|
792
|
-
if (baselineId)
|
|
793
|
-
scenarioBaselineMap.set(scenario.id, baselineId);
|
|
794
|
-
}
|
|
795
|
-
const scenariosWithoutBaseline = selectedBaseScenarios.scenarios
|
|
796
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
797
|
-
.filter((scenario) => !(scenario.snapshot_eval?.baseline_snapshot_id ?? policy.baseline_snapshot_id))
|
|
798
|
-
.map((scenario) => scenario.id);
|
|
799
|
-
if (scenariosWithoutBaseline.length > 0) {
|
|
800
|
-
addJobEvent(job, {
|
|
801
|
-
type: 'log',
|
|
802
|
-
ts: new Date().toISOString(),
|
|
803
|
-
payload: {
|
|
804
|
-
message: `Snapshot eval enabled but no baseline configured for scenarios: ${scenariosWithoutBaseline.join(', ')}`
|
|
805
|
-
}
|
|
806
|
-
});
|
|
807
|
-
}
|
|
808
|
-
const comparisons = [];
|
|
809
|
-
const scenarioIdsByBaseline = new Map();
|
|
810
|
-
for (const [scenarioIdItem, baselineId] of scenarioBaselineMap) {
|
|
811
|
-
const list = scenarioIdsByBaseline.get(baselineId) ?? [];
|
|
812
|
-
list.push(scenarioIdItem);
|
|
813
|
-
scenarioIdsByBaseline.set(baselineId, list);
|
|
814
|
-
}
|
|
815
|
-
for (const [baselineId, scenarioIdsForBaseline] of scenarioIdsByBaseline) {
|
|
816
|
-
addJobEvent(job, {
|
|
817
|
-
type: 'log',
|
|
818
|
-
ts: new Date().toISOString(),
|
|
819
|
-
payload: {
|
|
820
|
-
message: `Comparing ${scenarioIdsForBaseline.length} scenario(s) to snapshot baseline '${baselineId}'`
|
|
821
|
-
}
|
|
822
|
-
});
|
|
823
|
-
const snapshot = loadSnapshot(baselineId, settings.snapshotsDir);
|
|
824
|
-
const fullComparison = compareRunToSnapshot(results, snapshot);
|
|
825
|
-
comparisons.push({
|
|
826
|
-
...fullComparison,
|
|
827
|
-
scenario_results: fullComparison.scenario_results.filter((row) => scenarioIdsForBaseline.includes(row.scenario_id))
|
|
828
|
-
});
|
|
829
|
-
}
|
|
830
|
-
if (comparisons.length > 0) {
|
|
831
|
-
applySnapshotPolicyToRunResult({ results, comparisons, policy, enabledScenarioIds });
|
|
832
|
-
addJobEvent(job, {
|
|
833
|
-
type: 'log',
|
|
834
|
-
ts: new Date().toISOString(),
|
|
835
|
-
payload: {
|
|
836
|
-
message: `Snapshot evaluation applied (${comparisons.length} baseline comparison group(s))`
|
|
837
|
-
}
|
|
838
|
-
});
|
|
839
|
-
}
|
|
840
|
-
else {
|
|
841
|
-
addJobEvent(job, {
|
|
842
|
-
type: 'log',
|
|
843
|
-
ts: new Date().toISOString(),
|
|
844
|
-
payload: {
|
|
845
|
-
message: 'Snapshot evaluation enabled, but no baseline comparisons were applied'
|
|
846
|
-
}
|
|
847
|
-
});
|
|
848
|
-
}
|
|
849
|
-
}
|
|
850
|
-
else if (applySnapshotEval) {
|
|
851
|
-
addJobEvent(job, {
|
|
852
|
-
type: 'log',
|
|
853
|
-
ts: new Date().toISOString(),
|
|
854
|
-
payload: {
|
|
855
|
-
message: 'Snapshot evaluation requested, but config snapshot evaluation is disabled'
|
|
856
|
-
}
|
|
857
|
-
});
|
|
858
|
-
}
|
|
859
|
-
else {
|
|
860
|
-
addJobEvent(job, {
|
|
861
|
-
type: 'log',
|
|
862
|
-
ts: new Date().toISOString(),
|
|
863
|
-
payload: {
|
|
864
|
-
message: 'Snapshot evaluation skipped for this run (disabled in run request)'
|
|
865
|
-
}
|
|
866
|
-
});
|
|
867
|
-
}
|
|
868
984
|
addJobEvent(job, {
|
|
869
985
|
type: 'log',
|
|
870
986
|
ts: new Date().toISOString(),
|
|
871
987
|
payload: { message: `Writing results to ${runDir}` }
|
|
872
988
|
});
|
|
989
|
+
const traceRecords = getScenarioRunTraceRecords(results.metadata.run_id, settings.runsDir);
|
|
990
|
+
results.metadata.tool_tokens_total = estimateRunToolTokensTotal(traceRecords);
|
|
873
991
|
writeFileSync(join(runDir, 'results.json'), `${JSON.stringify(results, null, 2)}\n`, 'utf8');
|
|
874
992
|
writeFileSync(join(runDir, 'report.html'), renderReport(results), 'utf8');
|
|
875
993
|
writeFileSync(join(runDir, 'summary.md'), renderSummaryMarkdown(results), 'utf8');
|
|
@@ -886,8 +1004,7 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
886
1004
|
payload: {
|
|
887
1005
|
runId: results.metadata.run_id,
|
|
888
1006
|
runDir,
|
|
889
|
-
summary: results.summary
|
|
890
|
-
snapshotEval: results.metadata.snapshot_eval ?? null
|
|
1007
|
+
summary: results.summary
|
|
891
1008
|
}
|
|
892
1009
|
});
|
|
893
1010
|
job.status = 'completed';
|
|
@@ -923,6 +1040,77 @@ async function executeRunJob(job, settings, jobs, runQueueState, oauthSessionMan
|
|
|
923
1040
|
pruneOldJobs(jobs, runQueueState);
|
|
924
1041
|
}
|
|
925
1042
|
}
|
|
1043
|
+
/**
 * Split a total into `parts` non-negative integer shares that differ by at
 * most one and sum to the rounded total. Larger shares come first.
 *
 * @param {number} [total] - Amount to distribute. It is rounded to the nearest
 *   integer and negative values are treated as 0. A non-finite or missing
 *   total yields all-zero shares.
 * @param {number} parts - Number of shares. Fractional values are floored.
 * @returns {number[]} Array of `parts` integers, or an empty array when
 *   `parts` is not a finite number of at least 1.
 */
function splitInteger(total, parts) {
    // Normalise the share count first: the previous guard passed the rejected
    // `parts` value straight to Array(), which throws RangeError for negative,
    // NaN or fractional counts.
    const count = Number.isFinite(parts) && parts > 0 ? Math.floor(parts) : 0;
    if (count === 0 || !Number.isFinite(total)) {
        return new Array(count).fill(0);
    }
    const safeTotal = Math.max(0, Math.round(total));
    const base = Math.floor(safeTotal / count);
    const remainder = safeTotal % count;
    // The first `remainder` shares absorb the leftover units, one each.
    return Array.from({ length: count }, (_, index) => base + (index < remainder ? 1 : 0));
}
|
|
1056
|
+
/**
 * Estimate the total number of tokens attributable to tool calls and tool
 * results across a run's trace records.
 *
 * For every message, `tool_use` and `tool_result` content blocks are counted
 * separately:
 *  - if every block of that kind carries `estimated_tokens`, their
 *    `estimated_tokens.total` values are summed;
 *  - otherwise the message's `usage.total_tokens` is used (directly for a
 *    single block, or via `splitInteger` across several blocks).
 * `tool_result` blocks without estimates are only counted when their
 * `tool_use_id` matches a `tool_use` seen earlier in the same record.
 *
 * @param {Array<{ messages?: Array<object> }> | null | undefined} records -
 *   Trace records; each may carry a `messages` array whose entries have
 *   `content` and optionally `usage.total_tokens`.
 * @returns {number | null} The estimated total, or `null` when no message
 *   provided any usable token information.
 */
function estimateRunToolTokensTotal(records) {
    let total = 0;
    let hasAny = false;
    for (const record of records ?? []) {
        // Ids of tool_use blocks seen so far in this record, used to match
        // later tool_result blocks back to a known call.
        const toolUsesById = new Map();
        for (const message of record?.messages ?? []) {
            // Content is not guaranteed to be a block array (it may be a plain
            // string or missing); such messages contain no tool blocks, so
            // treat them as empty instead of throwing on `.filter`.
            const blocks = Array.isArray(message?.content) ? message.content : [];
            const usageTotal = message?.usage?.total_tokens;
            const hasUsage = typeof usageTotal === 'number';
            const toolUses = blocks.filter((block) => block?.type === 'tool_use');
            if (toolUses.length > 0) {
                for (const toolUse of toolUses)
                    toolUsesById.set(toolUse.id, toolUse.name);
                const allEstimated = toolUses.every((toolUse) => Boolean(toolUse.estimated_tokens));
                if (allEstimated) {
                    for (const toolUse of toolUses)
                        total += toolUse.estimated_tokens?.total ?? 0;
                    hasAny = true;
                }
                else if (toolUses.length === 1 && hasUsage) {
                    total += usageTotal;
                    hasAny = true;
                }
                else {
                    // No per-block estimates: fall back to the message-level usage.
                    const shares = splitInteger(usageTotal, toolUses.length);
                    total += shares.reduce((sum, value) => sum + value, 0);
                    if (hasUsage)
                        hasAny = true;
                }
            }
            const toolResults = blocks.filter((block) => block?.type === 'tool_result');
            if (toolResults.length === 0)
                continue;
            const allEstimated = toolResults.every((result) => Boolean(result.estimated_tokens));
            if (allEstimated) {
                for (const result of toolResults)
                    total += result.estimated_tokens?.total ?? 0;
                hasAny = true;
                continue;
            }
            if (toolResults.length === 1) {
                const [result] = toolResults;
                if (result &&
                    toolUsesById.has(result.tool_use_id) &&
                    hasUsage) {
                    total += usageTotal;
                    hasAny = true;
                    continue;
                }
            }
            // Only results that answer a tool_use from this record are counted.
            const knownResults = toolResults.filter((result) => toolUsesById.has(result.tool_use_id));
            if (knownResults.length === 0)
                continue;
            const shares = splitInteger(usageTotal, knownResults.length);
            total += shares.reduce((sum, value) => sum + value, 0);
            if (hasUsage)
                hasAny = true;
        }
    }
    return hasAny ? total : null;
}
|
|
926
1114
|
function pruneOldJobs(jobs, runQueueState) {
|
|
927
1115
|
const maxAgeMs = 30 * 60_000;
|
|
928
1116
|
const now = Date.now();
|
|
@@ -945,9 +1133,9 @@ function formatRunProgressMessage(event) {
|
|
|
945
1133
|
case 'run_started':
|
|
946
1134
|
return `Run initialized (id: ${event.runId}, ${event.totalScenarioRuns} scenario run(s))`;
|
|
947
1135
|
case 'mcp_connect_started':
|
|
948
|
-
return `Connecting to ${event.serverCount} MCP server(s) ...`;
|
|
1136
|
+
return `Connecting to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')} ...`;
|
|
949
1137
|
case 'mcp_connect_finished':
|
|
950
|
-
return `Connected to ${event.serverCount} MCP server(s)`;
|
|
1138
|
+
return `Connected to ${event.serverCount} MCP server(s): ${event.serverNames.join(', ')}`;
|
|
951
1139
|
case 'scenario_run_started':
|
|
952
1140
|
return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
|
|
953
1141
|
case 'scenario_run_finished':
|