@inspectr/mcplab 1.15.0 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -35
- package/dist/app/assets/index-17cleCWQ.js +254 -0
- package/dist/app/assets/index-Bekohuot.css +1 -0
- package/dist/app/index.html +2 -2
- package/dist/app-server/app-context.d.ts +0 -22
- package/dist/app-server/app-context.d.ts.map +1 -1
- package/dist/app-server/evals-routes.d.ts.map +1 -1
- package/dist/app-server/evals-routes.js +1 -38
- package/dist/app-server/evals-routes.js.map +1 -1
- package/dist/app-server/libraries-store.d.ts.map +1 -1
- package/dist/app-server/libraries-store.js +2 -3
- package/dist/app-server/libraries-store.js.map +1 -1
- package/dist/app-server/markdown-reports.d.ts.map +1 -1
- package/dist/app-server/markdown-reports.js +64 -4
- package/dist/app-server/markdown-reports.js.map +1 -1
- package/dist/app-server/result-assistant-domain.js +1 -2
- package/dist/app-server/result-assistant-domain.js.map +1 -1
- package/dist/app-server/result-assistant.d.ts.map +1 -1
- package/dist/app-server/result-assistant.js +7 -1
- package/dist/app-server/result-assistant.js.map +1 -1
- package/dist/app-server/router.d.ts.map +1 -1
- package/dist/app-server/router.js +0 -24
- package/dist/app-server/router.js.map +1 -1
- package/dist/app-server/runs-routes.d.ts +15 -4
- package/dist/app-server/runs-routes.d.ts.map +1 -1
- package/dist/app-server/runs-routes.js +324 -136
- package/dist/app-server/runs-routes.js.map +1 -1
- package/dist/app-server/runs-store.d.ts +10 -0
- package/dist/app-server/runs-store.d.ts.map +1 -1
- package/dist/app-server/runs-store.js +27 -0
- package/dist/app-server/runs-store.js.map +1 -1
- package/dist/app-server/scenario-assistant-domain.d.ts +0 -16
- package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant-domain.js +5 -8
- package/dist/app-server/scenario-assistant-domain.js.map +1 -1
- package/dist/app-server/scenario-assistant.d.ts.map +1 -1
- package/dist/app-server/scenario-assistant.js +7 -1
- package/dist/app-server/scenario-assistant.js.map +1 -1
- package/dist/app-server/snapshots-routes.d.ts +1 -13
- package/dist/app-server/snapshots-routes.d.ts.map +1 -1
- package/dist/app-server/snapshots-routes.js +9 -79
- package/dist/app-server/snapshots-routes.js.map +1 -1
- package/dist/app-server/tool-analysis.d.ts.map +1 -1
- package/dist/app-server/tool-analysis.js +25 -1
- package/dist/app-server/tool-analysis.js.map +1 -1
- package/dist/app-server/types.d.ts +0 -2
- package/dist/app-server/types.d.ts.map +1 -1
- package/dist/cli.js +79 -288
- package/dist/cli.js.map +1 -1
- package/dist/interactive-helpers.d.ts +0 -1
- package/dist/interactive-helpers.d.ts.map +1 -1
- package/dist/interactive-helpers.js +0 -3
- package/dist/interactive-helpers.js.map +1 -1
- package/package.json +4 -4
- package/dist/app/assets/index-BH8cCzoo.css +0 -1
- package/dist/app/assets/index-C2W0NrXX.js +0 -250
package/dist/cli.js
CHANGED
|
@@ -5,7 +5,7 @@ import { Command } from 'commander';
|
|
|
5
5
|
import kleur from 'kleur';
|
|
6
6
|
import { readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
7
7
|
import { join, relative, resolve } from 'node:path';
|
|
8
|
-
import { loadConfig, hashConfig, selectScenarios, runAll, renderSummaryMarkdown, expandConfigForAgents } from '@inspectr/mcplab-core';
|
|
8
|
+
import { loadConfig, hashConfig, selectScenarios, runAll, renderSummaryMarkdown, expandConfigForAgents, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
|
|
9
9
|
import { renderReport } from '@inspectr/mcplab-reporting';
|
|
10
10
|
import { execSync, spawn } from 'node:child_process';
|
|
11
11
|
import { stringify as stringifyYaml, parse } from 'yaml';
|
|
@@ -15,7 +15,6 @@ import { migrateSourceConfig } from './migrate-utils.js';
|
|
|
15
15
|
import { resolveRunOptions, runInteractiveSelection } from './run-interactive.js';
|
|
16
16
|
import { promptAppOptionsInteractive, selectRunDirInteractive } from './interactive-helpers.js';
|
|
17
17
|
import { deriveConfigRelativePath, resolveRunConfigSelection } from './eval-config-files.js';
|
|
18
|
-
import { applySnapshotPolicyToRunResult, buildSnapshotFromRun, compareRunToSnapshot, formatSnapshotComparisonTable, listSnapshots, loadSnapshot, saveSnapshot } from './snapshot.js';
|
|
19
18
|
import { loadOrBuildSearchIndex } from './results/indexer.js';
|
|
20
19
|
import { searchDocs } from './results/search.js';
|
|
21
20
|
import { formatContext, formatRunList, formatSearchHits, listRuns, showRun } from './results/format.js';
|
|
@@ -36,14 +35,13 @@ program
|
|
|
36
35
|
.option('--agents <agents>', 'Comma-separated list of agents to test (runs each scenario with each agent)')
|
|
37
36
|
.option('--agents-all', 'Run all configured agents for the selected scenarios')
|
|
38
37
|
.option('--interactive', 'Prompt for required inputs')
|
|
39
|
-
.option('--snapshot-eval', 'Apply snapshot eval policy configured in the config')
|
|
40
|
-
.option('--compare-snapshot <snapshotId>', 'Compare completed run against snapshot id')
|
|
41
38
|
.option('--bail', 'Stop after first failed config when --config points to a folder')
|
|
42
39
|
.option('--run-note <text>', 'Optional note attached to the run metadata (max 500 chars)')
|
|
43
40
|
.option('--runs-dir <path>', 'Directory for run artifacts', 'mcplab/results/evaluation-runs')
|
|
44
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
45
41
|
.option('--oauth-token <server=token>', 'Pre-obtained OAuth Bearer token for a server (repeatable, format: server-name=token)', (val, acc) => [...acc, val], [])
|
|
46
42
|
.option('--open-browser', 'Open browser to mcplab serve UI when OAuth is required (default: print URL only)')
|
|
43
|
+
.option('--server-override-all <serverRef[,serverRef...]>', 'Override MCP server refs for all selected scenarios for this run only')
|
|
44
|
+
.option('--server-override <scenarioId=serverRef[,serverRef...]>', 'Override MCP server refs for one scenario (repeatable, higher priority than --server-override-all)', (val, acc) => [...acc, val], [])
|
|
47
45
|
.action(async (options) => {
|
|
48
46
|
try {
|
|
49
47
|
const hasAgentOverride = Boolean(options.agents) || Boolean(options.agentsAll);
|
|
@@ -57,10 +55,11 @@ program
|
|
|
57
55
|
promptAgentSelection: needsAgentPrompt,
|
|
58
56
|
loadConfigForValidation: (path) => {
|
|
59
57
|
const loaded = loadConfig(path);
|
|
60
|
-
const { agents: libraryAgents } = readLibraries(loaded.bundleRoot);
|
|
58
|
+
const { agents: libraryAgents, servers: libraryServers } = readLibraries(loaded.bundleRoot);
|
|
61
59
|
loaded.config = {
|
|
62
60
|
...loaded.config,
|
|
63
|
-
agents: { ...libraryAgents, ...loaded.config.agents }
|
|
61
|
+
agents: { ...libraryAgents, ...loaded.config.agents },
|
|
62
|
+
servers: { ...libraryServers, ...loaded.config.servers }
|
|
64
63
|
};
|
|
65
64
|
return loaded;
|
|
66
65
|
}
|
|
@@ -78,19 +77,12 @@ program
|
|
|
78
77
|
const requestedPath = selection.requestedPath;
|
|
79
78
|
const requestedPathIsDirectory = selection.requestedPathIsDirectory;
|
|
80
79
|
const isBatch = requestedPathIsDirectory;
|
|
81
|
-
if (isBatch && options.compareSnapshot) {
|
|
82
|
-
throw new Error('--compare-snapshot is not supported when running a config folder');
|
|
83
|
-
}
|
|
84
80
|
if (!isBatch) {
|
|
85
|
-
|
|
81
|
+
await executeSingleConfigRun({
|
|
86
82
|
configPath: configPaths[0],
|
|
87
83
|
options,
|
|
88
84
|
resolvedOptions
|
|
89
85
|
});
|
|
90
|
-
if (outcome.shouldFailOnDrift) {
|
|
91
|
-
console.error(kleur.red('Snapshot eval drift detected in fail_on_drift mode.'));
|
|
92
|
-
process.exit(2);
|
|
93
|
-
}
|
|
94
86
|
console.log(kleur.gray('Process exiting.'));
|
|
95
87
|
return;
|
|
96
88
|
}
|
|
@@ -160,224 +152,6 @@ program
|
|
|
160
152
|
process.exit(1);
|
|
161
153
|
}
|
|
162
154
|
});
|
|
163
|
-
program
|
|
164
|
-
.command('snapshot')
|
|
165
|
-
.description('Manage evaluation snapshots')
|
|
166
|
-
.addCommand(new Command('create')
|
|
167
|
-
.description('Create snapshot from a run (only fully passing runs)')
|
|
168
|
-
.requiredOption('--run <runId>', 'Run id from runs/<runId>')
|
|
169
|
-
.option('--name <name>', 'Snapshot name')
|
|
170
|
-
.option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
|
|
171
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
172
|
-
.action((options) => {
|
|
173
|
-
try {
|
|
174
|
-
const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
|
|
175
|
-
const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
|
|
176
|
-
const snapshot = buildSnapshotFromRun(results, options.name);
|
|
177
|
-
const path = saveSnapshot(snapshot, resolve(options.snapshotsDir));
|
|
178
|
-
console.log(kleur.green(`Snapshot created: ${snapshot.id}`));
|
|
179
|
-
console.log(kleur.gray(`Path: ${path}`));
|
|
180
|
-
}
|
|
181
|
-
catch (err) {
|
|
182
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
183
|
-
process.exit(1);
|
|
184
|
-
}
|
|
185
|
-
}))
|
|
186
|
-
.addCommand(new Command('eval-init')
|
|
187
|
-
.description('Create baseline snapshot from a run and link it to config snapshot_eval policy')
|
|
188
|
-
.requiredOption('--run <runId>', 'Run id from runs/<runId>')
|
|
189
|
-
.requiredOption('--config <path>', 'Path to eval.yaml')
|
|
190
|
-
.option('--name <name>', 'Snapshot name')
|
|
191
|
-
.option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
|
|
192
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
193
|
-
.action((options) => {
|
|
194
|
-
try {
|
|
195
|
-
const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
|
|
196
|
-
const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
|
|
197
|
-
const snapshot = buildSnapshotFromRun(results, options.name);
|
|
198
|
-
saveSnapshot(snapshot, resolve(options.snapshotsDir));
|
|
199
|
-
const configPath = resolve(String(options.config));
|
|
200
|
-
const { sourceConfig } = loadConfig(configPath);
|
|
201
|
-
const nextConfig = {
|
|
202
|
-
...sourceConfig,
|
|
203
|
-
snapshot_eval: {
|
|
204
|
-
enabled: true,
|
|
205
|
-
mode: sourceConfig.snapshot_eval?.mode ?? 'warn',
|
|
206
|
-
baseline_snapshot_id: snapshot.id,
|
|
207
|
-
baseline_source_run_id: results.metadata.run_id,
|
|
208
|
-
last_updated_at: new Date().toISOString()
|
|
209
|
-
}
|
|
210
|
-
};
|
|
211
|
-
writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
|
|
212
|
-
console.log(kleur.green(`Snapshot eval baseline linked: ${snapshot.id}`));
|
|
213
|
-
console.log(kleur.gray(`Config updated: ${configPath}`));
|
|
214
|
-
}
|
|
215
|
-
catch (err) {
|
|
216
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
217
|
-
process.exit(1);
|
|
218
|
-
}
|
|
219
|
-
}))
|
|
220
|
-
.addCommand(new Command('eval-policy')
|
|
221
|
-
.description('Update snapshot_eval policy in a config')
|
|
222
|
-
.requiredOption('--config <path>', 'Path to eval.yaml')
|
|
223
|
-
.requiredOption('--enabled <true|false>', 'Whether snapshot eval is enabled')
|
|
224
|
-
.requiredOption('--mode <warn|fail_on_drift>', 'Snapshot eval mode')
|
|
225
|
-
.option('--baseline-snapshot <snapshotId>', 'Baseline snapshot id')
|
|
226
|
-
.option('--baseline-source-run <runId>', 'Source run id used to create baseline')
|
|
227
|
-
.action((options) => {
|
|
228
|
-
try {
|
|
229
|
-
const enabled = String(options.enabled).toLowerCase() === 'true';
|
|
230
|
-
const mode = String(options.mode);
|
|
231
|
-
if (mode !== 'warn' && mode !== 'fail_on_drift') {
|
|
232
|
-
throw new Error('mode must be warn or fail_on_drift');
|
|
233
|
-
}
|
|
234
|
-
const configPath = resolve(String(options.config));
|
|
235
|
-
const { sourceConfig } = loadConfig(configPath);
|
|
236
|
-
const nextConfig = {
|
|
237
|
-
...sourceConfig,
|
|
238
|
-
snapshot_eval: {
|
|
239
|
-
enabled,
|
|
240
|
-
mode,
|
|
241
|
-
baseline_snapshot_id: options.baselineSnapshot ?? sourceConfig.snapshot_eval?.baseline_snapshot_id,
|
|
242
|
-
baseline_source_run_id: options.baselineSourceRun ?? sourceConfig.snapshot_eval?.baseline_source_run_id,
|
|
243
|
-
last_updated_at: new Date().toISOString()
|
|
244
|
-
}
|
|
245
|
-
};
|
|
246
|
-
writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
|
|
247
|
-
console.log(kleur.green(`Snapshot eval policy updated: ${configPath}`));
|
|
248
|
-
}
|
|
249
|
-
catch (err) {
|
|
250
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
251
|
-
process.exit(1);
|
|
252
|
-
}
|
|
253
|
-
}))
|
|
254
|
-
.addCommand(new Command('eval-set-scenario')
|
|
255
|
-
.description('Set or clear a scenario-level snapshot baseline override in a config')
|
|
256
|
-
.requiredOption('--config <path>', 'Path to eval.yaml')
|
|
257
|
-
.requiredOption('--scenario <id>', 'Scenario id')
|
|
258
|
-
.option('--snapshot <snapshotId>', 'Override baseline snapshot id (omit to clear override)')
|
|
259
|
-
.option('--source-run <runId>', 'Source run id used to create the scenario baseline')
|
|
260
|
-
.option('--enabled <true|false>', 'Scenario snapshot eval enabled override')
|
|
261
|
-
.action((options) => {
|
|
262
|
-
try {
|
|
263
|
-
const configPath = resolve(String(options.config));
|
|
264
|
-
const scenarioId = String(options.scenario).trim();
|
|
265
|
-
if (!scenarioId)
|
|
266
|
-
throw new Error('scenario is required');
|
|
267
|
-
const { sourceConfig } = loadConfig(configPath);
|
|
268
|
-
const scenarios = [...(sourceConfig.scenarios ?? [])];
|
|
269
|
-
const scenarioIndex = scenarios.findIndex((s) => typeof s === 'object' && s !== null && !('ref' in s) && s.id === scenarioId);
|
|
270
|
-
if (scenarioIndex < 0) {
|
|
271
|
-
throw new Error(`Scenario not found in config.scenarios (inline only): ${scenarioId}`);
|
|
272
|
-
}
|
|
273
|
-
const current = scenarios[scenarioIndex];
|
|
274
|
-
if (!current || typeof current !== 'object' || 'ref' in current) {
|
|
275
|
-
throw new Error(`Scenario not found in config.scenarios (inline only): ${scenarioId}`);
|
|
276
|
-
}
|
|
277
|
-
const nextScenarioSnapshotEval = {
|
|
278
|
-
...(current.snapshot_eval ?? {}),
|
|
279
|
-
...(options.snapshot !== undefined
|
|
280
|
-
? { baseline_snapshot_id: String(options.snapshot || '') || undefined }
|
|
281
|
-
: {}),
|
|
282
|
-
...(options.sourceRun !== undefined
|
|
283
|
-
? { baseline_source_run_id: String(options.sourceRun || '') || undefined }
|
|
284
|
-
: {}),
|
|
285
|
-
...(options.enabled !== undefined
|
|
286
|
-
? { enabled: String(options.enabled).toLowerCase() === 'true' }
|
|
287
|
-
: {}),
|
|
288
|
-
last_updated_at: new Date().toISOString()
|
|
289
|
-
};
|
|
290
|
-
if (!nextScenarioSnapshotEval.baseline_snapshot_id) {
|
|
291
|
-
delete nextScenarioSnapshotEval.baseline_snapshot_id;
|
|
292
|
-
}
|
|
293
|
-
if (!nextScenarioSnapshotEval.baseline_source_run_id) {
|
|
294
|
-
delete nextScenarioSnapshotEval.baseline_source_run_id;
|
|
295
|
-
}
|
|
296
|
-
if (nextScenarioSnapshotEval.enabled === undefined &&
|
|
297
|
-
!nextScenarioSnapshotEval.baseline_snapshot_id &&
|
|
298
|
-
!nextScenarioSnapshotEval.baseline_source_run_id) {
|
|
299
|
-
scenarios[scenarioIndex] = {
|
|
300
|
-
...current,
|
|
301
|
-
snapshot_eval: undefined
|
|
302
|
-
};
|
|
303
|
-
}
|
|
304
|
-
else {
|
|
305
|
-
scenarios[scenarioIndex] = {
|
|
306
|
-
...current,
|
|
307
|
-
snapshot_eval: nextScenarioSnapshotEval
|
|
308
|
-
};
|
|
309
|
-
}
|
|
310
|
-
const nextConfig = {
|
|
311
|
-
...sourceConfig,
|
|
312
|
-
scenarios
|
|
313
|
-
};
|
|
314
|
-
writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
|
|
315
|
-
console.log(kleur.green(`Scenario snapshot baseline ${options.snapshot ? 'set' : 'updated'}: ${scenarioId}`));
|
|
316
|
-
console.log(kleur.gray(`Config updated: ${configPath}`));
|
|
317
|
-
}
|
|
318
|
-
catch (err) {
|
|
319
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
320
|
-
process.exit(1);
|
|
321
|
-
}
|
|
322
|
-
}))
|
|
323
|
-
.addCommand(new Command('list')
|
|
324
|
-
.description('List snapshots')
|
|
325
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
326
|
-
.action((options) => {
|
|
327
|
-
try {
|
|
328
|
-
const snapshots = listSnapshots(resolve(options.snapshotsDir));
|
|
329
|
-
if (snapshots.length === 0) {
|
|
330
|
-
console.log('No snapshots found.');
|
|
331
|
-
return;
|
|
332
|
-
}
|
|
333
|
-
for (const snapshot of snapshots) {
|
|
334
|
-
console.log(`${snapshot.id} ${snapshot.name} (run=${snapshot.source_run_id}, created=${snapshot.created_at})`);
|
|
335
|
-
}
|
|
336
|
-
}
|
|
337
|
-
catch (err) {
|
|
338
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
339
|
-
process.exit(1);
|
|
340
|
-
}
|
|
341
|
-
}))
|
|
342
|
-
.addCommand(new Command('show')
|
|
343
|
-
.description('Show snapshot JSON')
|
|
344
|
-
.requiredOption('--id <snapshotId>', 'Snapshot id')
|
|
345
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
346
|
-
.action((options) => {
|
|
347
|
-
try {
|
|
348
|
-
const snapshot = loadSnapshot(String(options.id), resolve(options.snapshotsDir));
|
|
349
|
-
console.log(JSON.stringify(snapshot, null, 2));
|
|
350
|
-
}
|
|
351
|
-
catch (err) {
|
|
352
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
353
|
-
process.exit(1);
|
|
354
|
-
}
|
|
355
|
-
}))
|
|
356
|
-
.addCommand(new Command('compare')
|
|
357
|
-
.description('Compare run against snapshot')
|
|
358
|
-
.requiredOption('--id <snapshotId>', 'Snapshot id')
|
|
359
|
-
.requiredOption('--run <runId>', 'Run id from runs/<runId>')
|
|
360
|
-
.option('--format <format>', 'Output format: table|json', 'table')
|
|
361
|
-
.option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
|
|
362
|
-
.option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
|
|
363
|
-
.action((options) => {
|
|
364
|
-
try {
|
|
365
|
-
const snapshot = loadSnapshot(String(options.id), resolve(options.snapshotsDir));
|
|
366
|
-
const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
|
|
367
|
-
const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
|
|
368
|
-
const comparison = compareRunToSnapshot(results, snapshot);
|
|
369
|
-
if (String(options.format) === 'json') {
|
|
370
|
-
console.log(JSON.stringify(comparison, null, 2));
|
|
371
|
-
}
|
|
372
|
-
else {
|
|
373
|
-
console.log(formatSnapshotComparisonTable(comparison));
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
catch (err) {
|
|
377
|
-
console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
|
|
378
|
-
process.exit(1);
|
|
379
|
-
}
|
|
380
|
-
}));
|
|
381
155
|
program
|
|
382
156
|
.command('results')
|
|
383
157
|
.description('Query evaluation run artifacts for LLM-first workflows')
|
|
@@ -680,7 +454,6 @@ program
|
|
|
680
454
|
.description('Serve MCPLab app frontend and local API bridge')
|
|
681
455
|
.option('--evals-dir <path>', 'Directory for YAML evals', 'mcplab/evals')
|
|
682
456
|
.option('--runs-dir <path>', 'Directory for run artifacts', 'mcplab/results/evaluation-runs')
|
|
683
|
-
.option('--snapshots-dir <path>', 'Directory for snapshot artifacts', 'mcplab/snapshots')
|
|
684
457
|
.option('--tool-analysis-results-dir <path>', 'Directory for saved tool analysis reports', 'mcplab/results/tool-analysis')
|
|
685
458
|
.option('--libraries-dir <path>', 'Bundle root for reusable servers/agents/test-cases', 'mcplab')
|
|
686
459
|
.option('--port <number>', 'Port to bind', '8787')
|
|
@@ -696,7 +469,6 @@ program
|
|
|
696
469
|
port: String(options.port),
|
|
697
470
|
evalsDir: String(options.evalsDir),
|
|
698
471
|
runsDir: String(options.runsDir),
|
|
699
|
-
snapshotsDir: String(options.snapshotsDir),
|
|
700
472
|
toolAnalysisResultsDir: String(options.toolAnalysisResultsDir),
|
|
701
473
|
librariesDir: String(options.librariesDir)
|
|
702
474
|
})
|
|
@@ -705,7 +477,6 @@ program
|
|
|
705
477
|
port: String(options.port),
|
|
706
478
|
evalsDir: String(options.evalsDir),
|
|
707
479
|
runsDir: String(options.runsDir),
|
|
708
|
-
snapshotsDir: String(options.snapshotsDir),
|
|
709
480
|
toolAnalysisResultsDir: String(options.toolAnalysisResultsDir),
|
|
710
481
|
librariesDir: String(options.librariesDir)
|
|
711
482
|
};
|
|
@@ -718,7 +489,6 @@ program
|
|
|
718
489
|
port,
|
|
719
490
|
evalsDir: resolve(resolvedAppOptions.evalsDir),
|
|
720
491
|
runsDir: resolve(resolvedAppOptions.runsDir),
|
|
721
|
-
snapshotsDir: resolve(resolvedAppOptions.snapshotsDir),
|
|
722
492
|
toolAnalysisResultsDir: resolve(resolvedAppOptions.toolAnalysisResultsDir),
|
|
723
493
|
librariesDir: resolve(resolvedAppOptions.librariesDir),
|
|
724
494
|
dev: Boolean(options.dev),
|
|
@@ -743,13 +513,64 @@ function openBrowserUrl(url) {
|
|
|
743
513
|
spawn(cmd, [url], { stdio: 'ignore', detached: true }).unref();
|
|
744
514
|
}
|
|
745
515
|
}
|
|
516
|
+
function parseRuntimeServerOverrides(options) {
|
|
517
|
+
const serverOverrideAll = options.serverOverrideAll
|
|
518
|
+
? options.serverOverrideAll
|
|
519
|
+
.split(',')
|
|
520
|
+
.map((id) => id.trim())
|
|
521
|
+
.filter(Boolean)
|
|
522
|
+
: undefined;
|
|
523
|
+
if (options.serverOverrideAll !== undefined &&
|
|
524
|
+
(!serverOverrideAll || serverOverrideAll.length === 0)) {
|
|
525
|
+
throw new Error('serverOverrideAll must include at least one server id');
|
|
526
|
+
}
|
|
527
|
+
const scenarioServerOverrides = {};
|
|
528
|
+
for (const rawEntry of options.serverOverride) {
|
|
529
|
+
const entry = String(rawEntry ?? '').trim();
|
|
530
|
+
const eqIdx = entry.indexOf('=');
|
|
531
|
+
if (eqIdx < 1) {
|
|
532
|
+
throw new Error(`Invalid --server-override format '${entry}'. Expected: <scenarioId>=<serverRef[,serverRef...]>`);
|
|
533
|
+
}
|
|
534
|
+
const scenarioId = entry.slice(0, eqIdx).trim();
|
|
535
|
+
const csv = entry.slice(eqIdx + 1);
|
|
536
|
+
if (!scenarioId) {
|
|
537
|
+
throw new Error(`Invalid --server-override '${entry}': scenario id cannot be empty`);
|
|
538
|
+
}
|
|
539
|
+
const parsedServerIds = csv
|
|
540
|
+
.split(',')
|
|
541
|
+
.map((id) => id.trim())
|
|
542
|
+
.filter(Boolean);
|
|
543
|
+
if (parsedServerIds.length === 0) {
|
|
544
|
+
throw new Error(`Invalid --server-override '${entry}': must include at least one server id after '='`);
|
|
545
|
+
}
|
|
546
|
+
scenarioServerOverrides[scenarioId] = parsedServerIds;
|
|
547
|
+
}
|
|
548
|
+
return {
|
|
549
|
+
serverOverrideAll,
|
|
550
|
+
scenarioServerOverrides: Object.keys(scenarioServerOverrides).length > 0 ? scenarioServerOverrides : undefined
|
|
551
|
+
};
|
|
552
|
+
}
|
|
553
|
+
function filterRuntimeOverridesToSelectedScenarios(selectedConfig, overrides) {
|
|
554
|
+
if (!overrides.scenarioServerOverrides)
|
|
555
|
+
return overrides;
|
|
556
|
+
const selectedIds = new Set(selectedConfig.scenarios.map((scenario) => scenario.id));
|
|
557
|
+
const filtered = Object.fromEntries(Object.entries(overrides.scenarioServerOverrides).filter(([scenarioId]) => selectedIds.has(scenarioId)));
|
|
558
|
+
return {
|
|
559
|
+
...overrides,
|
|
560
|
+
scenarioServerOverrides: Object.keys(filtered).length > 0 ? filtered : undefined
|
|
561
|
+
};
|
|
562
|
+
}
|
|
746
563
|
async function executeSingleConfigRun(params) {
|
|
747
564
|
const { configPath, options, resolvedOptions } = params;
|
|
748
565
|
const loaded = loadConfig(resolve(configPath));
|
|
749
|
-
const { agents: libraryAgents } = readLibraries(loaded.bundleRoot);
|
|
750
|
-
loaded.config = {
|
|
566
|
+
const { agents: libraryAgents, servers: libraryServers } = readLibraries(loaded.bundleRoot);
|
|
567
|
+
loaded.config = {
|
|
568
|
+
...loaded.config,
|
|
569
|
+
agents: { ...libraryAgents, ...loaded.config.agents },
|
|
570
|
+
servers: { ...libraryServers, ...loaded.config.servers }
|
|
571
|
+
};
|
|
751
572
|
loaded.hash = hashConfig(loaded.config);
|
|
752
|
-
const { config,
|
|
573
|
+
const { config, warnings } = loaded;
|
|
753
574
|
for (const warning of warnings) {
|
|
754
575
|
console.log(kleur.yellow(`โ ${warning}`));
|
|
755
576
|
}
|
|
@@ -764,14 +585,18 @@ async function executeSingleConfigRun(params) {
|
|
|
764
585
|
: requestedAgentsFromCsv.length > 0
|
|
765
586
|
? requestedAgentsFromCsv
|
|
766
587
|
: undefined;
|
|
767
|
-
const
|
|
768
|
-
const
|
|
769
|
-
const
|
|
588
|
+
const runtimeOverrides = parseRuntimeServerOverrides(options);
|
|
589
|
+
const selectedBaseConfig = options.scenario ? selectScenarios(config, options.scenario) : config;
|
|
590
|
+
const selectedOverrides = filterRuntimeOverridesToSelectedScenarios(selectedBaseConfig, runtimeOverrides);
|
|
591
|
+
const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseConfig, selectedOverrides);
|
|
592
|
+
const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
|
|
593
|
+
const beforeExpandCount = runtimeOverriddenConfig.scenarios.length;
|
|
594
|
+
const effectiveAgents = requestedAgents ?? runtimeOverriddenConfig.run_defaults?.selected_agents;
|
|
595
|
+
const expanded = expandConfigForAgents(runtimeOverriddenConfig, effectiveAgents);
|
|
770
596
|
if (expanded.scenarios.length !== beforeExpandCount || effectiveAgents?.length) {
|
|
771
597
|
const agentCount = effectiveAgents?.length ?? Object.keys(config.agents).length;
|
|
772
598
|
console.log(kleur.cyan(`๐ Testing ${beforeExpandCount} scenarios ร ${agentCount} selected agents = ${expanded.scenarios.length} total tests`));
|
|
773
599
|
}
|
|
774
|
-
const selected = selectScenarios(expanded, options.scenario);
|
|
775
600
|
const runsPerScenario = Number(options.runs);
|
|
776
601
|
if (Number.isNaN(runsPerScenario) || runsPerScenario <= 0) {
|
|
777
602
|
throw new Error('Runs must be a positive number');
|
|
@@ -795,9 +620,11 @@ async function executeSingleConfigRun(params) {
|
|
|
795
620
|
oauthTokens[serverName] = token;
|
|
796
621
|
}
|
|
797
622
|
// Detect OAuth servers missing a token and fail early with a helpful message
|
|
798
|
-
const
|
|
799
|
-
|
|
800
|
-
.
|
|
623
|
+
const effectiveServerIds = new Set(expanded.scenarios.flatMap((scenario) => scenario.servers));
|
|
624
|
+
const oauthServers = Array.from(effectiveServerIds).filter((name) => {
|
|
625
|
+
const cfg = expanded.servers?.[name];
|
|
626
|
+
return cfg?.auth?.type === 'oauth_authorization_code';
|
|
627
|
+
});
|
|
801
628
|
const missingTokenServers = oauthServers.filter((name) => !oauthTokens[name]);
|
|
802
629
|
if (missingTokenServers.length > 0) {
|
|
803
630
|
for (const name of missingTokenServers) {
|
|
@@ -814,11 +641,11 @@ async function executeSingleConfigRun(params) {
|
|
|
814
641
|
}
|
|
815
642
|
throw new Error(`OAuth login required for server(s): ${missingTokenServers.join(', ')}. Provide tokens via --oauth-token.`);
|
|
816
643
|
}
|
|
817
|
-
const { runDir, results } = await runAll(
|
|
644
|
+
const { runDir, results } = await runAll(expanded, {
|
|
818
645
|
runsPerScenario,
|
|
819
646
|
scenarioId: options.scenario,
|
|
820
647
|
runNote,
|
|
821
|
-
configHash:
|
|
648
|
+
configHash: effectiveConfigHash,
|
|
822
649
|
gitCommit: getGitCommit(),
|
|
823
650
|
cliVersion: pkgVersion,
|
|
824
651
|
runsDir: String(options.runsDir),
|
|
@@ -830,35 +657,6 @@ async function executeSingleConfigRun(params) {
|
|
|
830
657
|
}
|
|
831
658
|
}
|
|
832
659
|
});
|
|
833
|
-
let shouldFailOnDrift = false;
|
|
834
|
-
const useSnapshotEval = Boolean(options.snapshotEval) || Boolean(config.snapshot_eval?.enabled);
|
|
835
|
-
if (useSnapshotEval) {
|
|
836
|
-
const policy = config.snapshot_eval;
|
|
837
|
-
if (!policy?.baseline_snapshot_id) {
|
|
838
|
-
console.log(kleur.yellow('โ Snapshot eval enabled but no baseline snapshot is configured.'));
|
|
839
|
-
}
|
|
840
|
-
else {
|
|
841
|
-
const snapshot = loadSnapshot(String(policy.baseline_snapshot_id), resolve(options.snapshotsDir));
|
|
842
|
-
const comparison = compareRunToSnapshot(results, snapshot);
|
|
843
|
-
const enabledScenarioIds = new Set(selected.scenarios
|
|
844
|
-
.filter((scenario) => scenario.snapshot_eval?.enabled !== false)
|
|
845
|
-
.map((scenario) => scenario.id));
|
|
846
|
-
const applied = applySnapshotPolicyToRunResult({
|
|
847
|
-
results,
|
|
848
|
-
comparisons: [comparison],
|
|
849
|
-
policy,
|
|
850
|
-
enabledScenarioIds
|
|
851
|
-
});
|
|
852
|
-
console.log('');
|
|
853
|
-
console.log(kleur.cyan('๐ธ Snapshot Eval Policy'));
|
|
854
|
-
console.log(`${applied.mode} ยท baseline=${applied.baseline_snapshot_id} ยท overall=${applied.overall_score} ยท status=${applied.status}`);
|
|
855
|
-
if (applied.impacted_scenarios.length > 0) {
|
|
856
|
-
console.log(kleur.yellow(`Impacted scenarios: ${applied.impacted_scenarios.join(', ')}`));
|
|
857
|
-
}
|
|
858
|
-
console.log(formatSnapshotComparisonTable(comparison));
|
|
859
|
-
shouldFailOnDrift = policy.mode === 'fail_on_drift' && applied.impacted_scenarios.length > 0;
|
|
860
|
-
}
|
|
861
|
-
}
|
|
862
660
|
const reportPath = join(runDir, 'report.html');
|
|
863
661
|
const resultsPath = join(runDir, 'results.json');
|
|
864
662
|
const summaryPath = join(runDir, 'summary.md');
|
|
@@ -866,19 +664,12 @@ async function executeSingleConfigRun(params) {
|
|
|
866
664
|
writeFileSync(reportPath, renderReport(results), 'utf8');
|
|
867
665
|
writeFileSync(summaryPath, renderSummaryMarkdown(results), 'utf8');
|
|
868
666
|
console.log(kleur.green(`โ
Run complete. Results: ${runDir}`));
|
|
869
|
-
if (options.compareSnapshot) {
|
|
870
|
-
const snapshot = loadSnapshot(String(options.compareSnapshot), resolve(options.snapshotsDir));
|
|
871
|
-
const comparison = compareRunToSnapshot(results, snapshot);
|
|
872
|
-
console.log('');
|
|
873
|
-
console.log(kleur.cyan('๐ธ Snapshot Comparison'));
|
|
874
|
-
console.log(formatSnapshotComparisonTable(comparison));
|
|
875
|
-
}
|
|
876
667
|
const failedRuns = results.scenarios.reduce((sum, scenario) => sum + scenario.runs.filter((run) => !run.pass).length, 0);
|
|
877
668
|
return {
|
|
878
669
|
runDir,
|
|
879
670
|
runId: results.metadata.run_id,
|
|
880
671
|
passed: failedRuns === 0,
|
|
881
|
-
shouldFailOnDrift
|
|
672
|
+
shouldFailOnDrift: false
|
|
882
673
|
};
|
|
883
674
|
}
|
|
884
675
|
function getGitCommit() {
|
|
@@ -902,9 +693,9 @@ function formatRunProgressEvent(event) {
|
|
|
902
693
|
case 'run_started':
|
|
903
694
|
return `Run started (${event.totalScenarioRuns} scenario run(s), ${event.runsPerScenario} run(s) each).`;
|
|
904
695
|
case 'mcp_connect_started':
|
|
905
|
-
return `Connecting MCP servers (${event.serverCount})...`;
|
|
696
|
+
return `Connecting MCP servers (${event.serverCount}): ${event.serverNames.join(', ')}...`;
|
|
906
697
|
case 'mcp_connect_finished':
|
|
907
|
-
return `Connected MCP servers (${event.serverCount}).`;
|
|
698
|
+
return `Connected MCP servers (${event.serverCount}): ${event.serverNames.join(', ')}.`;
|
|
908
699
|
case 'scenario_run_started':
|
|
909
700
|
return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
|
|
910
701
|
case 'scenario_run_finished':
|