@inspectr/mcplab 1.14.3 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +0 -35
  2. package/dist/app/assets/index-BSGuUMv-.js +254 -0
  3. package/dist/app/assets/index-Bekohuot.css +1 -0
  4. package/dist/app/index.html +2 -2
  5. package/dist/app-server/app-context.d.ts +0 -22
  6. package/dist/app-server/app-context.d.ts.map +1 -1
  7. package/dist/app-server/assistant-common.d.ts +37 -24
  8. package/dist/app-server/evals-routes.d.ts.map +1 -1
  9. package/dist/app-server/evals-routes.js +5 -41
  10. package/dist/app-server/evals-routes.js.map +1 -1
  11. package/dist/app-server/libraries-store.d.ts.map +1 -1
  12. package/dist/app-server/libraries-store.js +2 -3
  13. package/dist/app-server/libraries-store.js.map +1 -1
  14. package/dist/app-server/result-assistant-domain.d.ts +81 -65
  15. package/dist/app-server/result-assistant-domain.js +1 -2
  16. package/dist/app-server/result-assistant-domain.js.map +1 -1
  17. package/dist/app-server/result-assistant.d.ts.map +1 -1
  18. package/dist/app-server/result-assistant.js +7 -1
  19. package/dist/app-server/result-assistant.js.map +1 -1
  20. package/dist/app-server/router.d.ts.map +1 -1
  21. package/dist/app-server/router.js +0 -24
  22. package/dist/app-server/router.js.map +1 -1
  23. package/dist/app-server/runs-routes.d.ts +15 -4
  24. package/dist/app-server/runs-routes.d.ts.map +1 -1
  25. package/dist/app-server/runs-routes.js +189 -134
  26. package/dist/app-server/runs-routes.js.map +1 -1
  27. package/dist/app-server/runs-store.d.ts +6 -1
  28. package/dist/app-server/runs-store.d.ts.map +1 -1
  29. package/dist/app-server/runs-store.js +15 -1
  30. package/dist/app-server/runs-store.js.map +1 -1
  31. package/dist/app-server/scenario-assistant-domain.d.ts +144 -134
  32. package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -1
  33. package/dist/app-server/scenario-assistant-domain.js +5 -8
  34. package/dist/app-server/scenario-assistant-domain.js.map +1 -1
  35. package/dist/app-server/scenario-assistant.d.ts.map +1 -1
  36. package/dist/app-server/scenario-assistant.js +7 -1
  37. package/dist/app-server/scenario-assistant.js.map +1 -1
  38. package/dist/app-server/snapshots-routes.d.ts +1 -13
  39. package/dist/app-server/snapshots-routes.d.ts.map +1 -1
  40. package/dist/app-server/snapshots-routes.js +9 -79
  41. package/dist/app-server/snapshots-routes.js.map +1 -1
  42. package/dist/app-server/types.d.ts +0 -2
  43. package/dist/app-server/types.d.ts.map +1 -1
  44. package/dist/cli.js +79 -288
  45. package/dist/cli.js.map +1 -1
  46. package/dist/interactive-helpers.d.ts +0 -1
  47. package/dist/interactive-helpers.d.ts.map +1 -1
  48. package/dist/interactive-helpers.js +0 -3
  49. package/dist/interactive-helpers.js.map +1 -1
  50. package/package.json +4 -4
  51. package/dist/app/assets/index-BBRB19an.js +0 -250
  52. package/dist/app/assets/index-DVQdbWhs.css +0 -1
package/dist/cli.js CHANGED
@@ -5,7 +5,7 @@ import { Command } from 'commander';
5
5
  import kleur from 'kleur';
6
6
  import { readFileSync, readdirSync, writeFileSync } from 'node:fs';
7
7
  import { join, relative, resolve } from 'node:path';
8
- import { loadConfig, hashConfig, selectScenarios, runAll, renderSummaryMarkdown, expandConfigForAgents } from '@inspectr/mcplab-core';
8
+ import { loadConfig, hashConfig, selectScenarios, runAll, renderSummaryMarkdown, expandConfigForAgents, applyRuntimeServerOverrides } from '@inspectr/mcplab-core';
9
9
  import { renderReport } from '@inspectr/mcplab-reporting';
10
10
  import { execSync, spawn } from 'node:child_process';
11
11
  import { stringify as stringifyYaml, parse } from 'yaml';
@@ -15,7 +15,6 @@ import { migrateSourceConfig } from './migrate-utils.js';
15
15
  import { resolveRunOptions, runInteractiveSelection } from './run-interactive.js';
16
16
  import { promptAppOptionsInteractive, selectRunDirInteractive } from './interactive-helpers.js';
17
17
  import { deriveConfigRelativePath, resolveRunConfigSelection } from './eval-config-files.js';
18
- import { applySnapshotPolicyToRunResult, buildSnapshotFromRun, compareRunToSnapshot, formatSnapshotComparisonTable, listSnapshots, loadSnapshot, saveSnapshot } from './snapshot.js';
19
18
  import { loadOrBuildSearchIndex } from './results/indexer.js';
20
19
  import { searchDocs } from './results/search.js';
21
20
  import { formatContext, formatRunList, formatSearchHits, listRuns, showRun } from './results/format.js';
@@ -36,14 +35,13 @@ program
36
35
  .option('--agents <agents>', 'Comma-separated list of agents to test (runs each scenario with each agent)')
37
36
  .option('--agents-all', 'Run all configured agents for the selected scenarios')
38
37
  .option('--interactive', 'Prompt for required inputs')
39
- .option('--snapshot-eval', 'Apply snapshot eval policy configured in the config')
40
- .option('--compare-snapshot <snapshotId>', 'Compare completed run against snapshot id')
41
38
  .option('--bail', 'Stop after first failed config when --config points to a folder')
42
39
  .option('--run-note <text>', 'Optional note attached to the run metadata (max 500 chars)')
43
40
  .option('--runs-dir <path>', 'Directory for run artifacts', 'mcplab/results/evaluation-runs')
44
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
45
41
  .option('--oauth-token <server=token>', 'Pre-obtained OAuth Bearer token for a server (repeatable, format: server-name=token)', (val, acc) => [...acc, val], [])
46
42
  .option('--open-browser', 'Open browser to mcplab serve UI when OAuth is required (default: print URL only)')
43
+ .option('--server-override-all <serverRef[,serverRef...]>', 'Override MCP server refs for all selected scenarios for this run only')
44
+ .option('--server-override <scenarioId=serverRef[,serverRef...]>', 'Override MCP server refs for one scenario (repeatable, higher priority than --server-override-all)', (val, acc) => [...acc, val], [])
47
45
  .action(async (options) => {
48
46
  try {
49
47
  const hasAgentOverride = Boolean(options.agents) || Boolean(options.agentsAll);
@@ -57,10 +55,11 @@ program
57
55
  promptAgentSelection: needsAgentPrompt,
58
56
  loadConfigForValidation: (path) => {
59
57
  const loaded = loadConfig(path);
60
- const { agents: libraryAgents } = readLibraries(loaded.bundleRoot);
58
+ const { agents: libraryAgents, servers: libraryServers } = readLibraries(loaded.bundleRoot);
61
59
  loaded.config = {
62
60
  ...loaded.config,
63
- agents: { ...libraryAgents, ...loaded.config.agents }
61
+ agents: { ...libraryAgents, ...loaded.config.agents },
62
+ servers: { ...libraryServers, ...loaded.config.servers }
64
63
  };
65
64
  return loaded;
66
65
  }
@@ -78,19 +77,12 @@ program
78
77
  const requestedPath = selection.requestedPath;
79
78
  const requestedPathIsDirectory = selection.requestedPathIsDirectory;
80
79
  const isBatch = requestedPathIsDirectory;
81
- if (isBatch && options.compareSnapshot) {
82
- throw new Error('--compare-snapshot is not supported when running a config folder');
83
- }
84
80
  if (!isBatch) {
85
- const outcome = await executeSingleConfigRun({
81
+ await executeSingleConfigRun({
86
82
  configPath: configPaths[0],
87
83
  options,
88
84
  resolvedOptions
89
85
  });
90
- if (outcome.shouldFailOnDrift) {
91
- console.error(kleur.red('Snapshot eval drift detected in fail_on_drift mode.'));
92
- process.exit(2);
93
- }
94
86
  console.log(kleur.gray('Process exiting.'));
95
87
  return;
96
88
  }
@@ -160,224 +152,6 @@ program
160
152
  process.exit(1);
161
153
  }
162
154
  });
163
- program
164
- .command('snapshot')
165
- .description('Manage evaluation snapshots')
166
- .addCommand(new Command('create')
167
- .description('Create snapshot from a run (only fully passing runs)')
168
- .requiredOption('--run <runId>', 'Run id from runs/<runId>')
169
- .option('--name <name>', 'Snapshot name')
170
- .option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
171
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
172
- .action((options) => {
173
- try {
174
- const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
175
- const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
176
- const snapshot = buildSnapshotFromRun(results, options.name);
177
- const path = saveSnapshot(snapshot, resolve(options.snapshotsDir));
178
- console.log(kleur.green(`Snapshot created: ${snapshot.id}`));
179
- console.log(kleur.gray(`Path: ${path}`));
180
- }
181
- catch (err) {
182
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
183
- process.exit(1);
184
- }
185
- }))
186
- .addCommand(new Command('eval-init')
187
- .description('Create baseline snapshot from a run and link it to config snapshot_eval policy')
188
- .requiredOption('--run <runId>', 'Run id from runs/<runId>')
189
- .requiredOption('--config <path>', 'Path to eval.yaml')
190
- .option('--name <name>', 'Snapshot name')
191
- .option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
192
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
193
- .action((options) => {
194
- try {
195
- const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
196
- const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
197
- const snapshot = buildSnapshotFromRun(results, options.name);
198
- saveSnapshot(snapshot, resolve(options.snapshotsDir));
199
- const configPath = resolve(String(options.config));
200
- const { sourceConfig } = loadConfig(configPath);
201
- const nextConfig = {
202
- ...sourceConfig,
203
- snapshot_eval: {
204
- enabled: true,
205
- mode: sourceConfig.snapshot_eval?.mode ?? 'warn',
206
- baseline_snapshot_id: snapshot.id,
207
- baseline_source_run_id: results.metadata.run_id,
208
- last_updated_at: new Date().toISOString()
209
- }
210
- };
211
- writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
212
- console.log(kleur.green(`Snapshot eval baseline linked: ${snapshot.id}`));
213
- console.log(kleur.gray(`Config updated: ${configPath}`));
214
- }
215
- catch (err) {
216
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
217
- process.exit(1);
218
- }
219
- }))
220
- .addCommand(new Command('eval-policy')
221
- .description('Update snapshot_eval policy in a config')
222
- .requiredOption('--config <path>', 'Path to eval.yaml')
223
- .requiredOption('--enabled <true|false>', 'Whether snapshot eval is enabled')
224
- .requiredOption('--mode <warn|fail_on_drift>', 'Snapshot eval mode')
225
- .option('--baseline-snapshot <snapshotId>', 'Baseline snapshot id')
226
- .option('--baseline-source-run <runId>', 'Source run id used to create baseline')
227
- .action((options) => {
228
- try {
229
- const enabled = String(options.enabled).toLowerCase() === 'true';
230
- const mode = String(options.mode);
231
- if (mode !== 'warn' && mode !== 'fail_on_drift') {
232
- throw new Error('mode must be warn or fail_on_drift');
233
- }
234
- const configPath = resolve(String(options.config));
235
- const { sourceConfig } = loadConfig(configPath);
236
- const nextConfig = {
237
- ...sourceConfig,
238
- snapshot_eval: {
239
- enabled,
240
- mode,
241
- baseline_snapshot_id: options.baselineSnapshot ?? sourceConfig.snapshot_eval?.baseline_snapshot_id,
242
- baseline_source_run_id: options.baselineSourceRun ?? sourceConfig.snapshot_eval?.baseline_source_run_id,
243
- last_updated_at: new Date().toISOString()
244
- }
245
- };
246
- writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
247
- console.log(kleur.green(`Snapshot eval policy updated: ${configPath}`));
248
- }
249
- catch (err) {
250
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
251
- process.exit(1);
252
- }
253
- }))
254
- .addCommand(new Command('eval-set-scenario')
255
- .description('Set or clear a scenario-level snapshot baseline override in a config')
256
- .requiredOption('--config <path>', 'Path to eval.yaml')
257
- .requiredOption('--scenario <id>', 'Scenario id')
258
- .option('--snapshot <snapshotId>', 'Override baseline snapshot id (omit to clear override)')
259
- .option('--source-run <runId>', 'Source run id used to create the scenario baseline')
260
- .option('--enabled <true|false>', 'Scenario snapshot eval enabled override')
261
- .action((options) => {
262
- try {
263
- const configPath = resolve(String(options.config));
264
- const scenarioId = String(options.scenario).trim();
265
- if (!scenarioId)
266
- throw new Error('scenario is required');
267
- const { sourceConfig } = loadConfig(configPath);
268
- const scenarios = [...(sourceConfig.scenarios ?? [])];
269
- const scenarioIndex = scenarios.findIndex((s) => typeof s === 'object' && s !== null && !('ref' in s) && s.id === scenarioId);
270
- if (scenarioIndex < 0) {
271
- throw new Error(`Scenario not found in config.scenarios (inline only): ${scenarioId}`);
272
- }
273
- const current = scenarios[scenarioIndex];
274
- if (!current || typeof current !== 'object' || 'ref' in current) {
275
- throw new Error(`Scenario not found in config.scenarios (inline only): ${scenarioId}`);
276
- }
277
- const nextScenarioSnapshotEval = {
278
- ...(current.snapshot_eval ?? {}),
279
- ...(options.snapshot !== undefined
280
- ? { baseline_snapshot_id: String(options.snapshot || '') || undefined }
281
- : {}),
282
- ...(options.sourceRun !== undefined
283
- ? { baseline_source_run_id: String(options.sourceRun || '') || undefined }
284
- : {}),
285
- ...(options.enabled !== undefined
286
- ? { enabled: String(options.enabled).toLowerCase() === 'true' }
287
- : {}),
288
- last_updated_at: new Date().toISOString()
289
- };
290
- if (!nextScenarioSnapshotEval.baseline_snapshot_id) {
291
- delete nextScenarioSnapshotEval.baseline_snapshot_id;
292
- }
293
- if (!nextScenarioSnapshotEval.baseline_source_run_id) {
294
- delete nextScenarioSnapshotEval.baseline_source_run_id;
295
- }
296
- if (nextScenarioSnapshotEval.enabled === undefined &&
297
- !nextScenarioSnapshotEval.baseline_snapshot_id &&
298
- !nextScenarioSnapshotEval.baseline_source_run_id) {
299
- scenarios[scenarioIndex] = {
300
- ...current,
301
- snapshot_eval: undefined
302
- };
303
- }
304
- else {
305
- scenarios[scenarioIndex] = {
306
- ...current,
307
- snapshot_eval: nextScenarioSnapshotEval
308
- };
309
- }
310
- const nextConfig = {
311
- ...sourceConfig,
312
- scenarios
313
- };
314
- writeFileSync(configPath, `${stringifyYaml(nextConfig)}\n`, 'utf8');
315
- console.log(kleur.green(`Scenario snapshot baseline ${options.snapshot ? 'set' : 'updated'}: ${scenarioId}`));
316
- console.log(kleur.gray(`Config updated: ${configPath}`));
317
- }
318
- catch (err) {
319
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
320
- process.exit(1);
321
- }
322
- }))
323
- .addCommand(new Command('list')
324
- .description('List snapshots')
325
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
326
- .action((options) => {
327
- try {
328
- const snapshots = listSnapshots(resolve(options.snapshotsDir));
329
- if (snapshots.length === 0) {
330
- console.log('No snapshots found.');
331
- return;
332
- }
333
- for (const snapshot of snapshots) {
334
- console.log(`${snapshot.id} ${snapshot.name} (run=${snapshot.source_run_id}, created=${snapshot.created_at})`);
335
- }
336
- }
337
- catch (err) {
338
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
339
- process.exit(1);
340
- }
341
- }))
342
- .addCommand(new Command('show')
343
- .description('Show snapshot JSON')
344
- .requiredOption('--id <snapshotId>', 'Snapshot id')
345
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
346
- .action((options) => {
347
- try {
348
- const snapshot = loadSnapshot(String(options.id), resolve(options.snapshotsDir));
349
- console.log(JSON.stringify(snapshot, null, 2));
350
- }
351
- catch (err) {
352
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
353
- process.exit(1);
354
- }
355
- }))
356
- .addCommand(new Command('compare')
357
- .description('Compare run against snapshot')
358
- .requiredOption('--id <snapshotId>', 'Snapshot id')
359
- .requiredOption('--run <runId>', 'Run id from runs/<runId>')
360
- .option('--format <format>', 'Output format: table|json', 'table')
361
- .option('--runs-dir <path>', 'Directory with run artifacts', 'mcplab/results/evaluation-runs')
362
- .option('--snapshots-dir <path>', 'Directory for snapshots', 'mcplab/snapshots')
363
- .action((options) => {
364
- try {
365
- const snapshot = loadSnapshot(String(options.id), resolve(options.snapshotsDir));
366
- const resultsPath = resolve(options.runsDir, String(options.run), 'results.json');
367
- const results = JSON.parse(readFileSync(resultsPath, 'utf8'));
368
- const comparison = compareRunToSnapshot(results, snapshot);
369
- if (String(options.format) === 'json') {
370
- console.log(JSON.stringify(comparison, null, 2));
371
- }
372
- else {
373
- console.log(formatSnapshotComparisonTable(comparison));
374
- }
375
- }
376
- catch (err) {
377
- console.error(kleur.red(`Error: ${err?.message ?? String(err)}`));
378
- process.exit(1);
379
- }
380
- }));
381
155
  program
382
156
  .command('results')
383
157
  .description('Query evaluation run artifacts for LLM-first workflows')
@@ -680,7 +454,6 @@ program
680
454
  .description('Serve MCPLab app frontend and local API bridge')
681
455
  .option('--evals-dir <path>', 'Directory for YAML evals', 'mcplab/evals')
682
456
  .option('--runs-dir <path>', 'Directory for run artifacts', 'mcplab/results/evaluation-runs')
683
- .option('--snapshots-dir <path>', 'Directory for snapshot artifacts', 'mcplab/snapshots')
684
457
  .option('--tool-analysis-results-dir <path>', 'Directory for saved tool analysis reports', 'mcplab/results/tool-analysis')
685
458
  .option('--libraries-dir <path>', 'Bundle root for reusable servers/agents/test-cases', 'mcplab')
686
459
  .option('--port <number>', 'Port to bind', '8787')
@@ -696,7 +469,6 @@ program
696
469
  port: String(options.port),
697
470
  evalsDir: String(options.evalsDir),
698
471
  runsDir: String(options.runsDir),
699
- snapshotsDir: String(options.snapshotsDir),
700
472
  toolAnalysisResultsDir: String(options.toolAnalysisResultsDir),
701
473
  librariesDir: String(options.librariesDir)
702
474
  })
@@ -705,7 +477,6 @@ program
705
477
  port: String(options.port),
706
478
  evalsDir: String(options.evalsDir),
707
479
  runsDir: String(options.runsDir),
708
- snapshotsDir: String(options.snapshotsDir),
709
480
  toolAnalysisResultsDir: String(options.toolAnalysisResultsDir),
710
481
  librariesDir: String(options.librariesDir)
711
482
  };
@@ -718,7 +489,6 @@ program
718
489
  port,
719
490
  evalsDir: resolve(resolvedAppOptions.evalsDir),
720
491
  runsDir: resolve(resolvedAppOptions.runsDir),
721
- snapshotsDir: resolve(resolvedAppOptions.snapshotsDir),
722
492
  toolAnalysisResultsDir: resolve(resolvedAppOptions.toolAnalysisResultsDir),
723
493
  librariesDir: resolve(resolvedAppOptions.librariesDir),
724
494
  dev: Boolean(options.dev),
@@ -743,13 +513,64 @@ function openBrowserUrl(url) {
743
513
  spawn(cmd, [url], { stdio: 'ignore', detached: true }).unref();
744
514
  }
745
515
  }
516
+ function parseRuntimeServerOverrides(options) {
517
+ const serverOverrideAll = options.serverOverrideAll
518
+ ? options.serverOverrideAll
519
+ .split(',')
520
+ .map((id) => id.trim())
521
+ .filter(Boolean)
522
+ : undefined;
523
+ if (options.serverOverrideAll !== undefined &&
524
+ (!serverOverrideAll || serverOverrideAll.length === 0)) {
525
+ throw new Error('serverOverrideAll must include at least one server id');
526
+ }
527
+ const scenarioServerOverrides = {};
528
+ for (const rawEntry of options.serverOverride) {
529
+ const entry = String(rawEntry ?? '').trim();
530
+ const eqIdx = entry.indexOf('=');
531
+ if (eqIdx < 1) {
532
+ throw new Error(`Invalid --server-override format '${entry}'. Expected: <scenarioId>=<serverRef[,serverRef...]>`);
533
+ }
534
+ const scenarioId = entry.slice(0, eqIdx).trim();
535
+ const csv = entry.slice(eqIdx + 1);
536
+ if (!scenarioId) {
537
+ throw new Error(`Invalid --server-override '${entry}': scenario id cannot be empty`);
538
+ }
539
+ const parsedServerIds = csv
540
+ .split(',')
541
+ .map((id) => id.trim())
542
+ .filter(Boolean);
543
+ if (parsedServerIds.length === 0) {
544
+ throw new Error(`Invalid --server-override '${entry}': must include at least one server id after '='`);
545
+ }
546
+ scenarioServerOverrides[scenarioId] = parsedServerIds;
547
+ }
548
+ return {
549
+ serverOverrideAll,
550
+ scenarioServerOverrides: Object.keys(scenarioServerOverrides).length > 0 ? scenarioServerOverrides : undefined
551
+ };
552
+ }
553
+ function filterRuntimeOverridesToSelectedScenarios(selectedConfig, overrides) {
554
+ if (!overrides.scenarioServerOverrides)
555
+ return overrides;
556
+ const selectedIds = new Set(selectedConfig.scenarios.map((scenario) => scenario.id));
557
+ const filtered = Object.fromEntries(Object.entries(overrides.scenarioServerOverrides).filter(([scenarioId]) => selectedIds.has(scenarioId)));
558
+ return {
559
+ ...overrides,
560
+ scenarioServerOverrides: Object.keys(filtered).length > 0 ? filtered : undefined
561
+ };
562
+ }
746
563
  async function executeSingleConfigRun(params) {
747
564
  const { configPath, options, resolvedOptions } = params;
748
565
  const loaded = loadConfig(resolve(configPath));
749
- const { agents: libraryAgents } = readLibraries(loaded.bundleRoot);
750
- loaded.config = { ...loaded.config, agents: { ...libraryAgents, ...loaded.config.agents } };
566
+ const { agents: libraryAgents, servers: libraryServers } = readLibraries(loaded.bundleRoot);
567
+ loaded.config = {
568
+ ...loaded.config,
569
+ agents: { ...libraryAgents, ...loaded.config.agents },
570
+ servers: { ...libraryServers, ...loaded.config.servers }
571
+ };
751
572
  loaded.hash = hashConfig(loaded.config);
752
- const { config, hash, warnings } = loaded;
573
+ const { config, warnings } = loaded;
753
574
  for (const warning of warnings) {
754
575
  console.log(kleur.yellow(`⚠ ${warning}`));
755
576
  }
@@ -764,14 +585,18 @@ async function executeSingleConfigRun(params) {
764
585
  : requestedAgentsFromCsv.length > 0
765
586
  ? requestedAgentsFromCsv
766
587
  : undefined;
767
- const beforeExpandCount = config.scenarios.length;
768
- const effectiveAgents = requestedAgents ?? config.run_defaults?.selected_agents;
769
- const expanded = expandConfigForAgents(config, effectiveAgents);
588
+ const runtimeOverrides = parseRuntimeServerOverrides(options);
589
+ const selectedBaseConfig = options.scenario ? selectScenarios(config, options.scenario) : config;
590
+ const selectedOverrides = filterRuntimeOverridesToSelectedScenarios(selectedBaseConfig, runtimeOverrides);
591
+ const runtimeOverriddenConfig = applyRuntimeServerOverrides(selectedBaseConfig, selectedOverrides);
592
+ const effectiveConfigHash = hashConfig(runtimeOverriddenConfig);
593
+ const beforeExpandCount = runtimeOverriddenConfig.scenarios.length;
594
+ const effectiveAgents = requestedAgents ?? runtimeOverriddenConfig.run_defaults?.selected_agents;
595
+ const expanded = expandConfigForAgents(runtimeOverriddenConfig, effectiveAgents);
770
596
  if (expanded.scenarios.length !== beforeExpandCount || effectiveAgents?.length) {
771
597
  const agentCount = effectiveAgents?.length ?? Object.keys(config.agents).length;
772
598
  console.log(kleur.cyan(`📊 Testing ${beforeExpandCount} scenarios × ${agentCount} selected agents = ${expanded.scenarios.length} total tests`));
773
599
  }
774
- const selected = selectScenarios(expanded, options.scenario);
775
600
  const runsPerScenario = Number(options.runs);
776
601
  if (Number.isNaN(runsPerScenario) || runsPerScenario <= 0) {
777
602
  throw new Error('Runs must be a positive number');
@@ -795,9 +620,11 @@ async function executeSingleConfigRun(params) {
795
620
  oauthTokens[serverName] = token;
796
621
  }
797
622
  // Detect OAuth servers missing a token and fail early with a helpful message
798
- const oauthServers = Object.entries(config.servers ?? {})
799
- .filter(([, v]) => v.auth?.type === 'oauth_authorization_code')
800
- .map(([name]) => name);
623
+ const effectiveServerIds = new Set(expanded.scenarios.flatMap((scenario) => scenario.servers));
624
+ const oauthServers = Array.from(effectiveServerIds).filter((name) => {
625
+ const cfg = expanded.servers?.[name];
626
+ return cfg?.auth?.type === 'oauth_authorization_code';
627
+ });
801
628
  const missingTokenServers = oauthServers.filter((name) => !oauthTokens[name]);
802
629
  if (missingTokenServers.length > 0) {
803
630
  for (const name of missingTokenServers) {
@@ -814,11 +641,11 @@ async function executeSingleConfigRun(params) {
814
641
  }
815
642
  throw new Error(`OAuth login required for server(s): ${missingTokenServers.join(', ')}. Provide tokens via --oauth-token.`);
816
643
  }
817
- const { runDir, results } = await runAll(selected, {
644
+ const { runDir, results } = await runAll(expanded, {
818
645
  runsPerScenario,
819
646
  scenarioId: options.scenario,
820
647
  runNote,
821
- configHash: hash,
648
+ configHash: effectiveConfigHash,
822
649
  gitCommit: getGitCommit(),
823
650
  cliVersion: pkgVersion,
824
651
  runsDir: String(options.runsDir),
@@ -830,35 +657,6 @@ async function executeSingleConfigRun(params) {
830
657
  }
831
658
  }
832
659
  });
833
- let shouldFailOnDrift = false;
834
- const useSnapshotEval = Boolean(options.snapshotEval) || Boolean(config.snapshot_eval?.enabled);
835
- if (useSnapshotEval) {
836
- const policy = config.snapshot_eval;
837
- if (!policy?.baseline_snapshot_id) {
838
- console.log(kleur.yellow('⚠ Snapshot eval enabled but no baseline snapshot is configured.'));
839
- }
840
- else {
841
- const snapshot = loadSnapshot(String(policy.baseline_snapshot_id), resolve(options.snapshotsDir));
842
- const comparison = compareRunToSnapshot(results, snapshot);
843
- const enabledScenarioIds = new Set(selected.scenarios
844
- .filter((scenario) => scenario.snapshot_eval?.enabled !== false)
845
- .map((scenario) => scenario.id));
846
- const applied = applySnapshotPolicyToRunResult({
847
- results,
848
- comparisons: [comparison],
849
- policy,
850
- enabledScenarioIds
851
- });
852
- console.log('');
853
- console.log(kleur.cyan('📸 Snapshot Eval Policy'));
854
- console.log(`${applied.mode} · baseline=${applied.baseline_snapshot_id} · overall=${applied.overall_score} · status=${applied.status}`);
855
- if (applied.impacted_scenarios.length > 0) {
856
- console.log(kleur.yellow(`Impacted scenarios: ${applied.impacted_scenarios.join(', ')}`));
857
- }
858
- console.log(formatSnapshotComparisonTable(comparison));
859
- shouldFailOnDrift = policy.mode === 'fail_on_drift' && applied.impacted_scenarios.length > 0;
860
- }
861
- }
862
660
  const reportPath = join(runDir, 'report.html');
863
661
  const resultsPath = join(runDir, 'results.json');
864
662
  const summaryPath = join(runDir, 'summary.md');
@@ -866,19 +664,12 @@ async function executeSingleConfigRun(params) {
866
664
  writeFileSync(reportPath, renderReport(results), 'utf8');
867
665
  writeFileSync(summaryPath, renderSummaryMarkdown(results), 'utf8');
868
666
  console.log(kleur.green(`✅ Run complete. Results: ${runDir}`));
869
- if (options.compareSnapshot) {
870
- const snapshot = loadSnapshot(String(options.compareSnapshot), resolve(options.snapshotsDir));
871
- const comparison = compareRunToSnapshot(results, snapshot);
872
- console.log('');
873
- console.log(kleur.cyan('📸 Snapshot Comparison'));
874
- console.log(formatSnapshotComparisonTable(comparison));
875
- }
876
667
  const failedRuns = results.scenarios.reduce((sum, scenario) => sum + scenario.runs.filter((run) => !run.pass).length, 0);
877
668
  return {
878
669
  runDir,
879
670
  runId: results.metadata.run_id,
880
671
  passed: failedRuns === 0,
881
- shouldFailOnDrift
672
+ shouldFailOnDrift: false
882
673
  };
883
674
  }
884
675
  function getGitCommit() {
@@ -902,9 +693,9 @@ function formatRunProgressEvent(event) {
902
693
  case 'run_started':
903
694
  return `Run started (${event.totalScenarioRuns} scenario run(s), ${event.runsPerScenario} run(s) each).`;
904
695
  case 'mcp_connect_started':
905
- return `Connecting MCP servers (${event.serverCount})...`;
696
+ return `Connecting MCP servers (${event.serverCount}): ${event.serverNames.join(', ')}...`;
906
697
  case 'mcp_connect_finished':
907
- return `Connected MCP servers (${event.serverCount}).`;
698
+ return `Connected MCP servers (${event.serverCount}): ${event.serverNames.join(', ')}.`;
908
699
  case 'scenario_run_started':
909
700
  return `Scenario ${event.scenarioRunIndex}/${event.totalScenarioRuns} started: ${event.scenarioId} [agent=${event.agentName}, run=${event.runIndex + 1}/${event.runsPerScenario}]`;
910
701
  case 'scenario_run_finished':