mcpbr 0.4.16__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/formatting.py +444 -0
- mcpbr/harness.py +38 -4
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.5.0.dist-info}/METADATA +1 -1
- {mcpbr-0.4.16.dist-info → mcpbr-0.5.0.dist-info}/RECORD +25 -13
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.16.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.16.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/config_wizard.py
ADDED
@@ -0,0 +1,647 @@
"""Interactive configuration wizard for creating mcpbr config files.

Provides a step-by-step CLI wizard that guides users through creating
a valid YAML configuration file with helpful inline comments.
"""

from pathlib import Path
from typing import Any

import click
import yaml

from .config import VALID_BENCHMARKS
from .models import DEFAULT_MODEL, SUPPORTED_MODELS, list_supported_models

# Preset configurations for common MCP server use cases
PRESETS: dict[str, dict[str, Any]] = {
    "filesystem": {
        "description": "Local filesystem access (read/write files in the workspace)",
        "mcp_server": {
            "name": "filesystem",
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
            "env": {},
        },
    },
    "web-search": {
        "description": "Web search capabilities via Brave Search API",
        "mcp_server": {
            "name": "brave-search",
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-brave-search"],
            "env": {"BRAVE_API_KEY": "${BRAVE_API_KEY}"},
        },
    },
    "database": {
        "description": "PostgreSQL database access via MCP",
        "mcp_server": {
            "name": "postgres",
            "command": "npx",
            "args": [
                "-y",
                "@modelcontextprotocol/server-postgres",
                "${DATABASE_URL}",
            ],
            "env": {},
        },
    },
    "custom": {
        "description": "Custom MCP server (you provide command, args, and env)",
        "mcp_server": None,
    },
}


class ConfigWizard:
    """Interactive wizard for creating mcpbr configuration files.

    Guides users through configuring an MCP server, selecting a model
    and benchmark, and setting advanced options. Generates a valid YAML
    config file with inline comments.

    Example usage::

        wizard = ConfigWizard()
        config = wizard.run()
        # config is a dict ready for YAML serialization
    """

    def __init__(self) -> None:
        """Initialize the configuration wizard."""
        self.config: dict[str, Any] = {}

    def run(self) -> dict[str, Any]:
        """Run the full interactive configuration wizard.

        Walks the user through each configuration section in order:
        preset selection, model, benchmark, MCP servers, and advanced
        settings. Returns the assembled configuration dictionary.

        Returns:
            Complete configuration dictionary suitable for YAML output.
        """
        click.echo("\n=== mcpbr Configuration Wizard ===\n")
        click.echo("This wizard will help you create a configuration file")
        click.echo("for running MCP server benchmarks.\n")

        self._select_preset()
        self._configure_model()
        self._configure_benchmark()
        self._configure_mcp_servers()
        self._configure_advanced()

        return self.config

    def _select_preset(self) -> None:
        """Prompt the user to select a preset MCP server configuration.

        Displays available presets with descriptions and applies the
        selected preset's MCP server config to self.config. For the
        'custom' preset, delegates to manual MCP server configuration.
        """
        click.echo("--- Step 1: Select a Preset ---\n")
        click.echo("Choose a starting point for your MCP server configuration:\n")

        preset_names = list(PRESETS.keys())
        for i, name in enumerate(preset_names, 1):
            desc = PRESETS[name]["description"]
            click.echo(f" [{i}] {name} - {desc}")

        click.echo()

        choice = click.prompt(
            "Select a preset",
            type=click.IntRange(1, len(preset_names)),
            default=1,
        )

        selected_name = preset_names[choice - 1]
        preset = PRESETS[selected_name]

        click.echo(f"\nSelected: {selected_name}\n")

        if preset["mcp_server"] is not None:
            self.config["mcp_server"] = dict(preset["mcp_server"])
        else:
            # Custom preset: gather MCP server details manually
            self.config["mcp_server"] = self._prompt_custom_mcp_server()

    def _prompt_custom_mcp_server(self) -> dict[str, Any]:
        """Prompt the user to configure a custom MCP server.

        Asks for server name, command, arguments, and optional
        environment variables.

        Returns:
            MCP server configuration dictionary.
        """
        click.echo("--- Custom MCP Server Configuration ---\n")

        name = click.prompt("Server name", type=str, default="my-server")
        command = click.prompt(
            "Command to start the server (e.g., npx, uvx, python, node)",
            type=str,
        )

        args_str = click.prompt(
            "Arguments (space-separated, use {workdir} for workspace path)",
            type=str,
            default="",
        )
        args = args_str.split() if args_str.strip() else []

        env: dict[str, str] = {}
        if click.confirm("Add environment variables?", default=False):
            while True:
                key = click.prompt(" Variable name (empty to finish)", type=str, default="")
                if not key:
                    break
                value = click.prompt(f" Value for {key}", type=str)
                env[key] = value

        return {
            "name": name,
            "command": command,
            "args": args,
            "env": env,
        }

    def _configure_model(self) -> None:
        """Prompt the user to select an LLM model for evaluation.

        Displays available models from the model registry and lets the
        user pick one, defaulting to the project default model.
        """
        click.echo("--- Step 2: Select a Model ---\n")

        models = list_supported_models()
        # Show unique models (skip duplicates from aliases pointing to same display name)
        seen_display: set[str] = set()
        unique_models: list[tuple[str, str]] = []
        for m in models:
            if m.display_name not in seen_display:
                seen_display.add(m.display_name)
                unique_models.append((m.id, m.display_name))

        click.echo("Available models:\n")
        for model_id, display_name in unique_models:
            marker = " (default)" if model_id == DEFAULT_MODEL else ""
            click.echo(f" - {model_id}: {display_name}{marker}")

        click.echo()

        model_id = click.prompt(
            "Model ID",
            type=str,
            default=DEFAULT_MODEL,
        )

        # Warn if model is not in the supported list but allow it
        if model_id not in SUPPORTED_MODELS:
            click.echo(
                f"\nWarning: '{model_id}' is not in the known model list. "
                "It may still work if your provider supports it."
            )
            if not click.confirm("Use this model anyway?", default=True):
                model_id = DEFAULT_MODEL
                click.echo(f"Using default model: {model_id}")

        self.config["model"] = model_id

    def _configure_benchmark(self) -> None:
        """Prompt the user to select a benchmark for evaluation.

        Shows all valid benchmarks and lets the user pick one. Defaults
        to swe-bench-verified.
        """
        click.echo("\n--- Step 3: Select a Benchmark ---\n")

        click.echo("Available benchmarks:\n")
        for benchmark in VALID_BENCHMARKS:
            marker = " (default)" if benchmark == "swe-bench-verified" else ""
            click.echo(f" - {benchmark}{marker}")

        click.echo()

        benchmark = click.prompt(
            "Benchmark",
            type=click.Choice(list(VALID_BENCHMARKS), case_sensitive=False),
            default="swe-bench-verified",
            show_choices=False,
        )

        self.config["benchmark"] = benchmark

    def _configure_mcp_servers(self) -> None:
        """Prompt the user to refine MCP server connection settings.

        Asks about the connection type (stdio vs SSE) and allows
        customization of timeouts and optional setup commands.
        """
        click.echo("\n--- Step 4: MCP Server Settings ---\n")

        # Connection type
        click.echo("MCP server connection type:\n")
        click.echo(" [1] stdio - Standard I/O (local process, most common)")
        click.echo(" [2] sse - Server-Sent Events (remote HTTP server)")
        click.echo()

        conn_choice = click.prompt(
            "Connection type",
            type=click.IntRange(1, 2),
            default=1,
        )

        if conn_choice == 2:
            # SSE mode: override command/args with SSE URL
            sse_url = click.prompt("SSE server URL", type=str)
            self.config["mcp_server"]["command"] = "npx"
            self.config["mcp_server"]["args"] = [
                "-y",
                "@modelcontextprotocol/client-sse",
                sse_url,
            ]
            click.echo(f"\nConfigured SSE connection to: {sse_url}")

        # Server name
        current_name = self.config["mcp_server"].get("name", "mcpbr")
        name = click.prompt("Server name", type=str, default=current_name)
        self.config["mcp_server"]["name"] = name

        # Startup timeout
        startup_timeout = click.prompt(
            "Startup timeout (ms)",
            type=int,
            default=60000,
        )
        if startup_timeout != 60000:
            self.config["mcp_server"]["startup_timeout_ms"] = startup_timeout

        # Tool timeout
        tool_timeout = click.prompt(
            "Tool execution timeout (ms)",
            type=int,
            default=900000,
        )
        if tool_timeout != 900000:
            self.config["mcp_server"]["tool_timeout_ms"] = tool_timeout

        # Setup command
        if click.confirm("Add a setup command (runs before agent starts)?", default=False):
            setup_cmd = click.prompt(
                "Setup command (use {workdir} for workspace path)",
                type=str,
            )
            self.config["mcp_server"]["setup_command"] = setup_cmd

    def _configure_advanced(self) -> None:
        """Prompt the user for advanced evaluation settings.

        Covers sample size, timeout, concurrency, iteration limits,
        thinking budget, and budget cap.
        """
        click.echo("\n--- Step 5: Advanced Settings ---\n")

        if not click.confirm("Configure advanced settings?", default=False):
            # Apply sensible defaults
            self.config.setdefault("provider", "anthropic")
            self.config.setdefault("agent_harness", "claude-code")
            self.config.setdefault("sample_size", 10)
            self.config.setdefault("timeout_seconds", 300)
            self.config.setdefault("max_concurrent", 4)
            self.config.setdefault("max_iterations", 10)
            return

        # Provider (currently only anthropic)
        self.config["provider"] = "anthropic"
        self.config["agent_harness"] = "claude-code"

        # Sample size
        sample_input = click.prompt(
            "Sample size (number of tasks, or 'all' for full dataset)",
            type=str,
            default="10",
        )
        if sample_input.lower() == "all":
            self.config["sample_size"] = None
        else:
            try:
                sample_val = int(sample_input)
                if sample_val < 1:
                    click.echo("Sample size must be at least 1. Using 10.")
                    sample_val = 10
                self.config["sample_size"] = sample_val
            except ValueError:
                click.echo("Invalid number. Using default of 10.")
                self.config["sample_size"] = 10

        # Timeout
        timeout = click.prompt(
            "Timeout per task (seconds, minimum 30)",
            type=int,
            default=300,
        )
        if timeout < 30:
            click.echo("Timeout must be at least 30 seconds. Using 30.")
            timeout = 30
        self.config["timeout_seconds"] = timeout

        # Max concurrent
        max_concurrent = click.prompt(
            "Maximum concurrent tasks",
            type=int,
            default=4,
        )
        if max_concurrent < 1:
            click.echo("Must be at least 1. Using 1.")
            max_concurrent = 1
        self.config["max_concurrent"] = max_concurrent

        # Max iterations
        max_iterations = click.prompt(
            "Maximum agent iterations per task",
            type=int,
            default=10,
        )
        if max_iterations < 1:
            click.echo("Must be at least 1. Using 1.")
            max_iterations = 1
        self.config["max_iterations"] = max_iterations

        # Thinking budget
        if click.confirm("Enable extended thinking?", default=False):
            thinking = click.prompt(
                "Thinking budget (tokens, 1024-31999)",
                type=int,
                default=10000,
            )
            if thinking < 1024:
                click.echo("Minimum is 1024. Using 1024.")
                thinking = 1024
            elif thinking > 31999:
                click.echo("Maximum is 31999. Using 31999.")
                thinking = 31999
            self.config["thinking_budget"] = thinking

        # Budget cap
        if click.confirm("Set a budget cap (USD)?", default=False):
            budget = click.prompt("Maximum budget (USD)", type=float)
            if budget <= 0:
                click.echo("Budget must be positive. Skipping budget cap.")
            else:
                self.config["budget"] = budget

    def _generate_config(self, output_path: Path) -> str:
        """Generate a YAML config file with inline comments.

        Writes the current configuration to the specified path as YAML,
        with a header and inline comments explaining each field.

        Args:
            output_path: File path to write the YAML configuration to.

        Returns:
            The generated YAML string.
        """
        yaml_str = generate_commented_yaml(self.config)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(yaml_str)

        return yaml_str


def generate_commented_yaml(config: dict[str, Any]) -> str:
    """Generate a YAML configuration string with helpful inline comments.

    Produces a human-readable YAML file with a header block and comments
    explaining each configuration field. The output is compatible with
    ``mcpbr.config.load_config()``.

    Args:
        config: Configuration dictionary to serialize.

    Returns:
        YAML string with inline comments.
    """
    lines: list[str] = []

    # Header
    lines.append("# mcpbr - Model Context Protocol Benchmark Runner")
    lines.append("#")
    lines.append("# Generated by the interactive configuration wizard.")
    lines.append("# Edit this file to customize your evaluation settings.")
    lines.append("#")
    lines.append("# Requires ANTHROPIC_API_KEY environment variable.")
    lines.append("# Docs: https://github.com/greynewell/mcpbr")
    lines.append("")

    # MCP server section
    mcp = config.get("mcp_server", {})
    lines.append("# MCP server configuration")
    lines.append("# The MCP server provides tools for the agent to use during evaluation.")
    lines.append("mcp_server:")
    lines.append("  # Name to register the server as (appears in tool names like mcp__<name>__*)")
    lines.append(f'  name: "{mcp.get("name", "mcpbr")}"')
    lines.append("")
    lines.append("  # Command to start the MCP server")
    lines.append(f'  command: "{mcp.get("command", "npx")}"')
    lines.append("")
    lines.append("  # Arguments to pass to the command")
    lines.append("  # Use {workdir} as a placeholder for the task working directory")
    lines.append("  args:")
    for arg in mcp.get("args", []):
        lines.append(f'    - "{arg}"')
    lines.append("")

    # Environment variables
    env = mcp.get("env", {})
    lines.append("  # Environment variables for the MCP server")
    if env:
        lines.append("  env:")
        for key, value in env.items():
            lines.append(f'    {key}: "{value}"')
    else:
        lines.append("  env: {}")
    lines.append("")

    # Optional MCP server fields
    if "startup_timeout_ms" in mcp:
        lines.append("  # Timeout for MCP server startup (ms)")
        lines.append(f"  startup_timeout_ms: {mcp['startup_timeout_ms']}")
        lines.append("")

    if "tool_timeout_ms" in mcp:
        lines.append("  # Timeout for MCP tool execution (ms)")
        lines.append(f"  tool_timeout_ms: {mcp['tool_timeout_ms']}")
        lines.append("")

    if "setup_command" in mcp:
        lines.append("  # Shell command to run before the agent starts (outside task timer)")
        lines.append(f'  setup_command: "{mcp["setup_command"]}"')
        lines.append("")

    # Provider
    lines.append("# Model provider (currently only anthropic is supported)")
    lines.append(f'provider: "{config.get("provider", "anthropic")}"')
    lines.append("")

    # Agent harness
    lines.append("# Agent harness (currently only claude-code is supported)")
    lines.append(f'agent_harness: "{config.get("agent_harness", "claude-code")}"')
    lines.append("")

    # Model
    lines.append("# Model ID for the selected provider")
    lines.append("# Run 'mcpbr models' to see available options")
    lines.append(f'model: "{config.get("model", DEFAULT_MODEL)}"')
    lines.append("")

    # Benchmark
    lines.append("# Benchmark to run")
    lines.append("# Run 'mcpbr benchmarks' to see all available benchmarks")
    lines.append(f'benchmark: "{config.get("benchmark", "swe-bench-verified")}"')
    lines.append("")

    # Sample size
    sample_size = config.get("sample_size")
    lines.append("# Number of tasks to evaluate (null for full dataset)")
    if sample_size is None:
        lines.append("sample_size: null")
    else:
        lines.append(f"sample_size: {sample_size}")
    lines.append("")

    # Timeout
    lines.append("# Timeout for each task in seconds (minimum 30)")
    lines.append(f"timeout_seconds: {config.get('timeout_seconds', 300)}")
    lines.append("")

    # Max concurrent
    lines.append("# Maximum concurrent task evaluations")
    lines.append(f"max_concurrent: {config.get('max_concurrent', 4)}")
    lines.append("")

    # Max iterations
    lines.append("# Maximum agent iterations per task")
    lines.append(f"max_iterations: {config.get('max_iterations', 10)}")
    lines.append("")

    # Thinking budget
    if "thinking_budget" in config:
        lines.append("# Extended thinking token budget (1024-31999)")
        lines.append(f"thinking_budget: {config['thinking_budget']}")
        lines.append("")

    # Budget
    if "budget" in config:
        lines.append("# Maximum budget in USD (halts evaluation when reached)")
        lines.append(f"budget: {config['budget']}")
        lines.append("")

    return "\n".join(lines) + "\n"


def validate_config_dict(config: dict[str, Any]) -> list[str]:
    """Validate a configuration dictionary and return a list of errors.

    Performs basic validation of required fields and value ranges without
    constructing a full Pydantic model.

    Args:
        config: Configuration dictionary to validate.

    Returns:
        List of error messages. Empty list means the config is valid.
    """
    errors: list[str] = []

    # Check mcp_server
    mcp = config.get("mcp_server")
    if not mcp:
        errors.append("mcp_server is required")
    elif not isinstance(mcp, dict):
        errors.append("mcp_server must be a dictionary")
    else:
        if not mcp.get("command"):
            errors.append("mcp_server.command is required")

    # Check benchmark
    benchmark = config.get("benchmark", "swe-bench-verified")
    if benchmark not in VALID_BENCHMARKS:
        errors.append(
            f"Invalid benchmark: '{benchmark}'. Valid benchmarks: {', '.join(VALID_BENCHMARKS)}"
        )

    # Check timeout
    timeout = config.get("timeout_seconds", 300)
    if isinstance(timeout, int) and timeout < 30:
        errors.append("timeout_seconds must be at least 30")

    # Check max_concurrent
    max_concurrent = config.get("max_concurrent", 4)
    if isinstance(max_concurrent, int) and max_concurrent < 1:
        errors.append("max_concurrent must be at least 1")

    # Check thinking_budget
    thinking = config.get("thinking_budget")
    if thinking is not None:
        if isinstance(thinking, int):
            if thinking < 1024:
                errors.append("thinking_budget must be at least 1024")
            elif thinking > 31999:
                errors.append("thinking_budget cannot exceed 31999")

    # Check budget
    budget = config.get("budget")
    if budget is not None and isinstance(budget, (int, float)) and budget <= 0:
        errors.append("budget must be positive")

    return errors


def run_wizard(output_path: Path) -> None:
    """Run the configuration wizard and write the result to a file.

    Entry point for invoking the wizard from the CLI. Creates a
    ConfigWizard instance, runs it, validates the result, and writes
    the YAML config to the specified output path.

    Args:
        output_path: Path to write the generated YAML configuration.

    Raises:
        click.Abort: If the user cancels the wizard.
    """
    wizard = ConfigWizard()

    try:
        config = wizard.run()
    except (click.Abort, EOFError):
        click.echo("\nWizard cancelled.")
        return

    # Validate before writing
    errors = validate_config_dict(config)
    if errors:
        click.echo("\nConfiguration has issues:")
        for error in errors:
            click.echo(f" - {error}")
        if not click.confirm("\nWrite config anyway?", default=False):
            click.echo("Aborted.")
            return

    yaml_str = wizard._generate_config(output_path)

    click.echo(f"\nConfiguration saved to: {output_path}")
    click.echo("\nTo validate your config:")
    click.echo(f" mcpbr config validate {output_path}")
    click.echo("\nTo run an evaluation:")
    click.echo(f" mcpbr run --config {output_path}")

    # Verify the generated YAML is parseable
    try:
        yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        click.echo(f"\nWarning: Generated YAML may have syntax issues: {e}")
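
For context, here is a minimal sketch (not part of the diff above) of how the non-interactive helpers in this new module could be driven directly. It assumes mcpbr 0.5.0 is installed and that the file shown here is importable as `mcpbr.config_wizard`, mirroring its path in the wheel; the config dict simply reuses the "filesystem" preset from `PRESETS`.

```python
# Illustrative sketch only -- not part of the 0.5.0 wheel contents shown above.
from pathlib import Path

from mcpbr.config_wizard import generate_commented_yaml, validate_config_dict

# A config dict shaped like the one ConfigWizard.run() assembles,
# based on the "filesystem" preset defined in the module.
config = {
    "mcp_server": {
        "name": "filesystem",
        "command": "npx",
        "args": ["-y", "@modelcontextprotocol/server-filesystem", "{workdir}"],
        "env": {},
    },
    "benchmark": "swe-bench-verified",
    "sample_size": 10,
    "timeout_seconds": 300,
    "max_concurrent": 4,
    "max_iterations": 10,
}

errors = validate_config_dict(config)  # empty list means the basic checks pass
if errors:
    for err in errors:
        print(f"- {err}")
else:
    # Write the same commented YAML the interactive wizard would produce.
    Path("mcpbr.yaml").write_text(generate_commented_yaml(config))
```

The interactive path does the same thing end to end: `run_wizard(Path("mcpbr.yaml"))` walks through the five prompt steps, validates the result, writes the commented YAML, and warns if the output fails `yaml.safe_load`.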