mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/config_wizard.py ADDED
@@ -0,0 +1,647 @@
1
+ """Interactive configuration wizard for creating mcpbr config files.
2
+
3
+ Provides a step-by-step CLI wizard that guides users through creating
4
+ a valid YAML configuration file with helpful inline comments.
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import click
11
+ import yaml
12
+
13
+ from .config import VALID_BENCHMARKS
14
+ from .models import DEFAULT_MODEL, SUPPORTED_MODELS, list_supported_models
15
+
16
+ # Preset configurations for common MCP server use cases
17
# Preset MCP server definitions offered in the first wizard step.
#
# Each key is the preset name shown to the user. Every entry carries a
# human-readable "description" and an "mcp_server" mapping with the keys
# "name", "command", "args" and "env". The "custom" entry stores ``None``
# for "mcp_server", which ConfigWizard._select_preset uses as the signal to
# prompt for the server details manually.
PRESETS: dict[str, dict[str, Any]] = {
    # Filesystem server rooted at the task workspace ({workdir} placeholder).
    "filesystem": {
        "description": "Local filesystem access (read/write files in the workspace)",
        "mcp_server": {
            "name": "filesystem",
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
            "env": {},
        },
    },
    # Brave Search server; the API key is passed through as an env placeholder.
    "web-search": {
        "description": "Web search capabilities via Brave Search API",
        "mcp_server": {
            "name": "brave-search",
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-brave-search"],
            "env": {"BRAVE_API_KEY": "${BRAVE_API_KEY}"},
        },
    },
    # PostgreSQL server; the connection URL placeholder is the last argument.
    "database": {
        "description": "PostgreSQL database access via MCP",
        "mcp_server": {
            "name": "postgres",
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-postgres", "${DATABASE_URL}"],
            "env": {},
        },
    },
    # No canned server: the wizard asks for command, args and env.
    "custom": {
        "description": "Custom MCP server (you provide command, args, and env)",
        "mcp_server": None,
    },
}
54
+
55
+
56
class ConfigWizard:
    """Interactive wizard for creating mcpbr configuration files.

    Guides users through configuring an MCP server, selecting a model
    and benchmark, and setting advanced options. The answers are
    accumulated in ``self.config``; ``_generate_config`` renders that
    dictionary to a commented YAML file.

    Example usage::

        wizard = ConfigWizard()
        config = wizard.run()
        # config is a dict ready for YAML serialization
    """

    def __init__(self) -> None:
        """Initialize the wizard with an empty configuration."""
        self.config: dict[str, Any] = {}

    def run(self) -> dict[str, Any]:
        """Run the full interactive configuration wizard.

        Walks the user through each configuration section in order:
        preset selection, model, benchmark, MCP server settings, and
        advanced settings.

        Returns:
            The assembled configuration dictionary (the same object as
            ``self.config``).
        """
        click.echo("\n=== mcpbr Configuration Wizard ===\n")
        click.echo("This wizard will help you create a configuration file")
        click.echo("for running MCP server benchmarks.\n")

        self._select_preset()
        self._configure_model()
        self._configure_benchmark()
        self._configure_mcp_servers()
        self._configure_advanced()

        return self.config

    def _select_preset(self) -> None:
        """Prompt the user to select a preset MCP server configuration.

        Lists the entries of ``PRESETS`` with their descriptions and
        stores the chosen preset's server definition in
        ``self.config["mcp_server"]``. The preset whose ``mcp_server``
        is ``None`` (the 'custom' one) delegates to
        ``_prompt_custom_mcp_server`` instead.
        """
        import copy  # local import: only this method needs it

        click.echo("--- Step 1: Select a Preset ---\n")
        click.echo("Choose a starting point for your MCP server configuration:\n")

        preset_names = list(PRESETS.keys())
        for i, name in enumerate(preset_names, 1):
            desc = PRESETS[name]["description"]
            click.echo(f" [{i}] {name} - {desc}")

        click.echo()

        choice = click.prompt(
            "Select a preset",
            type=click.IntRange(1, len(preset_names)),
            default=1,
        )

        selected_name = preset_names[choice - 1]
        preset = PRESETS[selected_name]

        click.echo(f"\nSelected: {selected_name}\n")

        if preset["mcp_server"] is not None:
            # Deep copy so the nested "args" list and "env" dict are not
            # shared with the module-level PRESETS table; a shallow copy
            # would let later edits of the returned config leak into it.
            self.config["mcp_server"] = copy.deepcopy(preset["mcp_server"])
        else:
            # Custom preset: gather MCP server details manually
            self.config["mcp_server"] = self._prompt_custom_mcp_server()

    def _prompt_custom_mcp_server(self) -> dict[str, Any]:
        """Prompt the user to configure a custom MCP server.

        Asks for server name, command, arguments, and optional
        environment variables. Arguments are split on whitespace, so a
        single argument cannot contain spaces.

        Returns:
            MCP server configuration dictionary with the keys ``name``,
            ``command``, ``args`` and ``env``.
        """
        click.echo("--- Custom MCP Server Configuration ---\n")

        name = click.prompt("Server name", type=str, default="my-server")
        command = click.prompt(
            "Command to start the server (e.g., npx, uvx, python, node)",
            type=str,
        )

        args_str = click.prompt(
            "Arguments (space-separated, use {workdir} for workspace path)",
            type=str,
            default="",
        )
        args = args_str.split() if args_str.strip() else []

        env: dict[str, str] = {}
        if click.confirm("Add environment variables?", default=False):
            # Keep asking until the user submits an empty variable name.
            while True:
                key = click.prompt(" Variable name (empty to finish)", type=str, default="")
                if not key:
                    break
                value = click.prompt(f" Value for {key}", type=str)
                env[key] = value

        return {
            "name": name,
            "command": command,
            "args": args,
            "env": env,
        }

    def _configure_model(self) -> None:
        """Prompt the user to select an LLM model for evaluation.

        Displays the models returned by ``list_supported_models()``
        (one line per distinct display name) and stores the chosen ID
        in ``self.config["model"]``. An ID that is not in
        ``SUPPORTED_MODELS`` is allowed after a confirmation; declining
        falls back to ``DEFAULT_MODEL``.
        """
        click.echo("--- Step 2: Select a Model ---\n")

        models = list_supported_models()
        # Show unique models (skip duplicates from aliases pointing to same display name)
        seen_display: set[str] = set()
        unique_models: list[tuple[str, str]] = []
        for m in models:
            if m.display_name not in seen_display:
                seen_display.add(m.display_name)
                unique_models.append((m.id, m.display_name))

        click.echo("Available models:\n")
        for model_id, display_name in unique_models:
            marker = " (default)" if model_id == DEFAULT_MODEL else ""
            click.echo(f" - {model_id}: {display_name}{marker}")

        click.echo()

        model_id = click.prompt(
            "Model ID",
            type=str,
            default=DEFAULT_MODEL,
        )

        # Warn if model is not in the supported list but allow it
        if model_id not in SUPPORTED_MODELS:
            click.echo(
                f"\nWarning: '{model_id}' is not in the known model list. "
                "It may still work if your provider supports it."
            )
            if not click.confirm("Use this model anyway?", default=True):
                model_id = DEFAULT_MODEL
                click.echo(f"Using default model: {model_id}")

        self.config["model"] = model_id

    def _configure_benchmark(self) -> None:
        """Prompt the user to select a benchmark for evaluation.

        Shows every entry of ``VALID_BENCHMARKS`` and stores the choice
        in ``self.config["benchmark"]``. Defaults to
        swe-bench-verified.
        """
        click.echo("\n--- Step 3: Select a Benchmark ---\n")

        click.echo("Available benchmarks:\n")
        for benchmark in VALID_BENCHMARKS:
            marker = " (default)" if benchmark == "swe-bench-verified" else ""
            click.echo(f" - {benchmark}{marker}")

        click.echo()

        benchmark = click.prompt(
            "Benchmark",
            type=click.Choice(list(VALID_BENCHMARKS), case_sensitive=False),
            default="swe-bench-verified",
            # The list was already printed above, so keep the prompt short.
            show_choices=False,
        )

        self.config["benchmark"] = benchmark

    def _configure_mcp_servers(self) -> None:
        """Prompt the user to refine MCP server connection settings.

        Asks about the connection type (stdio vs SSE), the server name,
        the startup and tool timeouts, and an optional setup command.
        Timeouts are only written to the config when they differ from
        the defaults offered in the prompt.
        """
        click.echo("\n--- Step 4: MCP Server Settings ---\n")

        # Tolerate being called before a preset was chosen instead of
        # failing with KeyError on the first server-setting assignment.
        server = self.config.setdefault("mcp_server", {})

        # Connection type
        click.echo("MCP server connection type:\n")
        click.echo(" [1] stdio - Standard I/O (local process, most common)")
        click.echo(" [2] sse - Server-Sent Events (remote HTTP server)")
        click.echo()

        conn_choice = click.prompt(
            "Connection type",
            type=click.IntRange(1, 2),
            default=1,
        )

        if conn_choice == 2:
            # SSE mode: override command/args with SSE URL
            sse_url = click.prompt("SSE server URL", type=str)
            server["command"] = "npx"
            server["args"] = [
                "-y",
                "@modelcontextprotocol/client-sse",
                sse_url,
            ]
            click.echo(f"\nConfigured SSE connection to: {sse_url}")

        # Server name
        current_name = server.get("name", "mcpbr")
        name = click.prompt("Server name", type=str, default=current_name)
        server["name"] = name

        # Startup timeout
        startup_timeout = click.prompt(
            "Startup timeout (ms)",
            type=int,
            default=60000,
        )
        if startup_timeout != 60000:
            server["startup_timeout_ms"] = startup_timeout

        # Tool timeout
        tool_timeout = click.prompt(
            "Tool execution timeout (ms)",
            type=int,
            default=900000,
        )
        if tool_timeout != 900000:
            server["tool_timeout_ms"] = tool_timeout

        # Setup command
        if click.confirm("Add a setup command (runs before agent starts)?", default=False):
            setup_cmd = click.prompt(
                "Setup command (use {workdir} for workspace path)",
                type=str,
            )
            server["setup_command"] = setup_cmd

    def _configure_advanced(self) -> None:
        """Prompt the user for advanced evaluation settings.

        Covers sample size, timeout, concurrency, iteration limits,
        thinking budget, and budget cap. When the user declines, only
        defaults are filled in (without overwriting existing keys).
        Out-of-range answers are clamped or replaced with a message
        rather than re-prompted.
        """
        click.echo("\n--- Step 5: Advanced Settings ---\n")

        if not click.confirm("Configure advanced settings?", default=False):
            # Apply sensible defaults
            self.config.setdefault("provider", "anthropic")
            self.config.setdefault("agent_harness", "claude-code")
            self.config.setdefault("sample_size", 10)
            self.config.setdefault("timeout_seconds", 300)
            self.config.setdefault("max_concurrent", 4)
            self.config.setdefault("max_iterations", 10)
            return

        # Provider (currently only anthropic)
        self.config["provider"] = "anthropic"
        self.config["agent_harness"] = "claude-code"

        # Sample size: read as text because 'all' is an accepted answer.
        sample_input = click.prompt(
            "Sample size (number of tasks, or 'all' for full dataset)",
            type=str,
            default="10",
        )
        if sample_input.lower() == "all":
            self.config["sample_size"] = None
        else:
            try:
                sample_val = int(sample_input)
                if sample_val < 1:
                    click.echo("Sample size must be at least 1. Using 10.")
                    sample_val = 10
                self.config["sample_size"] = sample_val
            except ValueError:
                click.echo("Invalid number. Using default of 10.")
                self.config["sample_size"] = 10

        # Timeout
        timeout = click.prompt(
            "Timeout per task (seconds, minimum 30)",
            type=int,
            default=300,
        )
        if timeout < 30:
            click.echo("Timeout must be at least 30 seconds. Using 30.")
            timeout = 30
        self.config["timeout_seconds"] = timeout

        # Max concurrent
        max_concurrent = click.prompt(
            "Maximum concurrent tasks",
            type=int,
            default=4,
        )
        if max_concurrent < 1:
            click.echo("Must be at least 1. Using 1.")
            max_concurrent = 1
        self.config["max_concurrent"] = max_concurrent

        # Max iterations
        max_iterations = click.prompt(
            "Maximum agent iterations per task",
            type=int,
            default=10,
        )
        if max_iterations < 1:
            click.echo("Must be at least 1. Using 1.")
            max_iterations = 1
        self.config["max_iterations"] = max_iterations

        # Thinking budget
        if click.confirm("Enable extended thinking?", default=False):
            thinking = click.prompt(
                "Thinking budget (tokens, 1024-31999)",
                type=int,
                default=10000,
            )
            if thinking < 1024:
                click.echo("Minimum is 1024. Using 1024.")
                thinking = 1024
            elif thinking > 31999:
                click.echo("Maximum is 31999. Using 31999.")
                thinking = 31999
            self.config["thinking_budget"] = thinking

        # Budget cap
        if click.confirm("Set a budget cap (USD)?", default=False):
            budget = click.prompt("Maximum budget (USD)", type=float)
            if budget <= 0:
                click.echo("Budget must be positive. Skipping budget cap.")
            else:
                self.config["budget"] = budget

    def _generate_config(self, output_path: Path) -> str:
        """Generate a YAML config file with inline comments.

        Renders ``self.config`` with ``generate_commented_yaml`` and
        writes it to ``output_path``, creating missing parent
        directories first.

        Args:
            output_path: File path to write the YAML configuration to.

        Returns:
            The generated YAML string.
        """
        yaml_str = generate_commented_yaml(self.config)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Pin the encoding: without it the file is written in the platform
        # locale encoding, which can fail or mis-encode non-ASCII values.
        output_path.write_text(yaml_str, encoding="utf-8")

        return yaml_str
413
+
414
+
415
def generate_commented_yaml(config: dict[str, Any]) -> str:
    """Generate a YAML configuration string with helpful inline comments.

    Produces a human-readable YAML document with a header block and a
    comment above each configuration field. Missing fields fall back to
    the same defaults the wizard uses.

    String values are emitted as escaped double-quoted scalars, so values
    containing quotes or backslashes (for example shell commands or
    Windows paths) round-trip unchanged. An empty argument list is
    written as ``[]`` rather than a bare key, which would load as null.

    Args:
        config: Configuration dictionary to serialize. ``mcp_server`` may
            be missing or ``None``; defaults are used in that case.

    Returns:
        YAML string with inline comments, terminated by a newline.
    """
    import json  # local import keeps this block self-contained

    # Plain scalars that YAML loaders may resolve to booleans or null.
    reserved_words = {"y", "n", "yes", "no", "true", "false", "on", "off", "null"}

    def quoted(value: Any) -> str:
        # A JSON string literal is also a valid YAML double-quoted scalar,
        # so json.dumps provides the escaping for '"', '\\' and control chars.
        return json.dumps(str(value), ensure_ascii=False)

    def env_key(name: Any) -> str:
        # Ordinary variable names stay unquoted for readability; anything a
        # YAML loader could misread (spaces, ':', reserved words) is quoted.
        text = str(name)
        if text.isidentifier() and text.lower() not in reserved_words:
            return text
        return quoted(text)

    # ``or`` also covers an explicit ``mcp_server: None`` entry.
    mcp = config.get("mcp_server") or {}
    args = mcp.get("args") or []
    env = mcp.get("env") or {}
    # Only fall back to the module default when no model was chosen.
    model = config["model"] if "model" in config else DEFAULT_MODEL

    # Header
    lines: list[str] = [
        "# mcpbr - Model Context Protocol Benchmark Runner",
        "#",
        "# Generated by the interactive configuration wizard.",
        "# Edit this file to customize your evaluation settings.",
        "#",
        "# Requires ANTHROPIC_API_KEY environment variable.",
        "# Docs: https://github.com/greynewell/mcpbr",
        "",
    ]

    # MCP server section
    lines += [
        "# MCP server configuration",
        "# The MCP server provides tools for the agent to use during evaluation.",
        "mcp_server:",
        "  # Name to register the server as (appears in tool names like mcp__<name>__*)",
        f"  name: {quoted(mcp.get('name', 'mcpbr'))}",
        "",
        "  # Command to start the MCP server",
        f"  command: {quoted(mcp.get('command', 'npx'))}",
        "",
        "  # Arguments to pass to the command",
        "  # Use {workdir} as a placeholder for the task working directory",
    ]
    if args:
        lines.append("  args:")
        lines.extend(f"    - {quoted(arg)}" for arg in args)
    else:
        # A bare "args:" would be parsed as null instead of an empty list.
        lines.append("  args: []")
    lines.append("")

    # Environment variables
    lines.append("  # Environment variables for the MCP server")
    if env:
        lines.append("  env:")
        # Entries must be indented deeper than "env:" to be its children.
        lines.extend(f"    {env_key(key)}: {quoted(value)}" for key, value in env.items())
    else:
        lines.append("  env: {}")
    lines.append("")

    # Optional MCP server fields
    if "startup_timeout_ms" in mcp:
        lines += [
            "  # Timeout for MCP server startup (ms)",
            f"  startup_timeout_ms: {mcp['startup_timeout_ms']}",
            "",
        ]

    if "tool_timeout_ms" in mcp:
        lines += [
            "  # Timeout for MCP tool execution (ms)",
            f"  tool_timeout_ms: {mcp['tool_timeout_ms']}",
            "",
        ]

    if "setup_command" in mcp:
        lines += [
            "  # Shell command to run before the agent starts (outside task timer)",
            f"  setup_command: {quoted(mcp['setup_command'])}",
            "",
        ]

    # Provider, agent harness, model and benchmark
    lines += [
        "# Model provider (currently only anthropic is supported)",
        f"provider: {quoted(config.get('provider', 'anthropic'))}",
        "",
        "# Agent harness (currently only claude-code is supported)",
        f"agent_harness: {quoted(config.get('agent_harness', 'claude-code'))}",
        "",
        "# Model ID for the selected provider",
        "# Run 'mcpbr models' to see available options",
        f"model: {quoted(model)}",
        "",
        "# Benchmark to run",
        "# Run 'mcpbr benchmarks' to see all available benchmarks",
        f"benchmark: {quoted(config.get('benchmark', 'swe-bench-verified'))}",
        "",
    ]

    # Sample size: a missing key and an explicit None both mean "full dataset".
    sample_size = config.get("sample_size")
    lines.append("# Number of tasks to evaluate (null for full dataset)")
    lines.append("sample_size: null" if sample_size is None else f"sample_size: {sample_size}")
    lines.append("")

    # Timeout, concurrency and iteration limits
    lines += [
        "# Timeout for each task in seconds (minimum 30)",
        f"timeout_seconds: {config.get('timeout_seconds', 300)}",
        "",
        "# Maximum concurrent task evaluations",
        f"max_concurrent: {config.get('max_concurrent', 4)}",
        "",
        "# Maximum agent iterations per task",
        f"max_iterations: {config.get('max_iterations', 10)}",
        "",
    ]

    # Thinking budget
    if "thinking_budget" in config:
        lines += [
            "# Extended thinking token budget (1024-31999)",
            f"thinking_budget: {config['thinking_budget']}",
            "",
        ]

    # Budget
    if "budget" in config:
        lines += [
            "# Maximum budget in USD (halts evaluation when reached)",
            f"budget: {config['budget']}",
            "",
        ]

    return "\n".join(lines) + "\n"
544
+
545
+
546
def validate_config_dict(config: dict[str, Any]) -> list[str]:
    """Collect human-readable problems found in a configuration dictionary.

    Runs a handful of lightweight checks on required fields and numeric
    ranges; no Pydantic model is built. Range checks only apply to values
    of the expected numeric type, other types are passed over silently.

    Args:
        config: Configuration dictionary to validate.

    Returns:
        One message per problem found; an empty list means every check
        passed.
    """
    problems: list[str] = []

    # mcp_server: must be a non-empty mapping that names a command.
    server = config.get("mcp_server")
    if isinstance(server, dict) and server:
        if not server.get("command"):
            problems.append("mcp_server.command is required")
    elif server:
        # Present and truthy, but not a mapping.
        problems.append("mcp_server must be a dictionary")
    else:
        # Missing, None, or otherwise empty.
        problems.append("mcp_server is required")

    # benchmark: must be one of the registered names (default is assumed valid).
    chosen_benchmark = config.get("benchmark", "swe-bench-verified")
    if chosen_benchmark not in VALID_BENCHMARKS:
        known = ", ".join(VALID_BENCHMARKS)
        problems.append(f"Invalid benchmark: '{chosen_benchmark}'. Valid benchmarks: {known}")

    # timeout_seconds: lower bound of 30.
    task_timeout = config.get("timeout_seconds", 300)
    if isinstance(task_timeout, int) and task_timeout < 30:
        problems.append("timeout_seconds must be at least 30")

    # max_concurrent: at least one worker.
    parallelism = config.get("max_concurrent", 4)
    if isinstance(parallelism, int) and parallelism < 1:
        problems.append("max_concurrent must be at least 1")

    # thinking_budget: optional; None is not an int, so it is skipped here.
    thinking_tokens = config.get("thinking_budget")
    if isinstance(thinking_tokens, int):
        if thinking_tokens < 1024:
            problems.append("thinking_budget must be at least 1024")
        elif thinking_tokens > 31999:
            problems.append("thinking_budget cannot exceed 31999")

    # budget: optional; must be strictly positive when numeric.
    spend_cap = config.get("budget")
    if isinstance(spend_cap, (int, float)) and spend_cap <= 0:
        problems.append("budget must be positive")

    return problems
602
+
603
+
604
def run_wizard(output_path: Path) -> None:
    """Drive the interactive wizard and save its result as YAML.

    CLI entry point: builds a ConfigWizard, runs it, validates the
    collected answers, writes the commented YAML to ``output_path`` and
    prints follow-up commands. After writing, the generated text is parsed
    once and a warning is printed if it is not loadable YAML.

    A ``click.Abort`` or ``EOFError`` raised while the wizard runs is
    treated as a cancellation: a message is printed and nothing is
    written. When validation reports problems the user is asked whether
    to write the file anyway.

    Args:
        output_path: Path to write the generated YAML configuration.
    """
    session = ConfigWizard()

    try:
        answers = session.run()
    except (click.Abort, EOFError):
        click.echo("\nWizard cancelled.")
        return

    # Validate before writing
    problems = validate_config_dict(answers)
    if problems:
        click.echo("\nConfiguration has issues:")
        for problem in problems:
            click.echo(f" - {problem}")
        write_anyway = click.confirm("\nWrite config anyway?", default=False)
        if not write_anyway:
            click.echo("Aborted.")
            return

    rendered = session._generate_config(output_path)

    # Confirmation plus the two follow-up commands.
    for message in (
        f"\nConfiguration saved to: {output_path}",
        "\nTo validate your config:",
        f" mcpbr config validate {output_path}",
        "\nTo run an evaluation:",
        f" mcpbr run --config {output_path}",
    ):
        click.echo(message)

    # Verify the generated YAML is parseable
    try:
        yaml.safe_load(rendered)
    except yaml.YAMLError as exc:
        click.echo(f"\nWarning: Generated YAML may have syntax issues: {exc}")