mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/dry_run.py ADDED
@@ -0,0 +1,533 @@
+ """Dry-run mode for previewing evaluations without executing them.
+
+ This module provides functionality to preview what an evaluation would do,
+ validate configurations, estimate costs and time, and check infrastructure
+ readiness -- all without making actual API calls or running tasks.
+
+ Useful for debugging configuration issues and estimating costs before
+ committing to a full evaluation run.
+ """
+
+ import logging
+ import os
+ import shutil
+ from dataclasses import dataclass, field
+
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+
+ import docker
+
+ from .benchmarks import create_benchmark
+ from .config import HarnessConfig
+ from .config_validator import ConfigValidator, ValidationResult
+ from .pricing import format_cost, get_model_pricing
+
+ logger = logging.getLogger(__name__)
+
+ # Historical estimates for average tokens per task by benchmark type.
+ # These are rough estimates based on typical evaluation runs and are used
+ # when no better data is available. Values represent (input_tokens, output_tokens).
+ _ESTIMATED_TOKENS_PER_TASK: dict[str, tuple[int, int]] = {
+     "swe-bench-lite": (50_000, 10_000),
+     "swe-bench-verified": (50_000, 10_000),
+     "swe-bench-full": (50_000, 10_000),
+     "humaneval": (5_000, 2_000),
+     "mbpp": (5_000, 2_000),
+     "gsm8k": (3_000, 1_000),
+     "math": (5_000, 2_000),
+     "truthfulqa": (2_000, 500),
+     "bigbench-hard": (5_000, 2_000),
+     "hellaswag": (2_000, 500),
+     "arc": (3_000, 1_000),
+     "apps": (10_000, 5_000),
+     "codecontests": (15_000, 8_000),
+     "bigcodebench": (10_000, 5_000),
+     "leetcode": (10_000, 5_000),
+     "codereval": (20_000, 8_000),
+     "repoqa": (30_000, 5_000),
+     "toolbench": (10_000, 5_000),
+     "aider-polyglot": (30_000, 10_000),
+     "terminalbench": (20_000, 8_000),
+     "gaia": (15_000, 5_000),
+     "agentbench": (20_000, 8_000),
+     "webarena": (20_000, 8_000),
+     "mlagentbench": (25_000, 10_000),
+     "intercode": (15_000, 5_000),
+     "cybergym": (30_000, 10_000),
+     "mcptoolbench": (10_000, 5_000),
+     "custom": (10_000, 5_000),
+     "mmmu": (5_000, 2_000),
+     "longbench": (30_000, 5_000),
+     "adversarial": (10_000, 5_000),
+ }
+
+ # Historical average minutes per task (wall-clock time).
+ # Accounts for Docker setup, agent execution, and evaluation.
+ _ESTIMATED_MINUTES_PER_TASK: dict[str, float] = {
+     "swe-bench-lite": 8.0,
+     "swe-bench-verified": 8.0,
+     "swe-bench-full": 8.0,
+     "humaneval": 2.0,
+     "mbpp": 2.0,
+     "gsm8k": 1.0,
+     "math": 2.0,
+     "truthfulqa": 0.5,
+     "bigbench-hard": 1.5,
+     "hellaswag": 0.5,
+     "arc": 1.0,
+     "apps": 4.0,
+     "codecontests": 6.0,
+     "bigcodebench": 4.0,
+     "leetcode": 4.0,
+     "codereval": 6.0,
+     "repoqa": 5.0,
+     "toolbench": 3.0,
+     "aider-polyglot": 7.0,
+     "terminalbench": 5.0,
+     "gaia": 5.0,
+     "agentbench": 6.0,
+     "webarena": 6.0,
+     "mlagentbench": 7.0,
+     "intercode": 4.0,
+     "cybergym": 8.0,
+     "mcptoolbench": 3.0,
+     "custom": 3.0,
+     "mmmu": 2.0,
+     "longbench": 5.0,
+     "adversarial": 3.0,
+ }
+
+
+ @dataclass
+ class DryRunResult:
+     """Result of a dry-run evaluation preview.
+
+     Contains all information about what an evaluation would do, including
+     task details, cost estimates, configuration validation, and infrastructure
+     readiness checks.
+
+     Attributes:
+         benchmark_name: Name of the benchmark to run.
+         total_tasks: Total number of tasks that would be executed.
+         task_ids: List of task IDs that would be executed.
+         estimated_cost_usd: Estimated total cost in USD based on model pricing.
+         estimated_time_minutes: Estimated total wall-clock time in minutes.
+         config_valid: Whether the configuration passed validation.
+         config_errors: List of configuration validation error messages.
+         docker_available: Whether Docker is available and running.
+         mcp_servers_reachable: Mapping of MCP server names to reachability status.
+         warnings: List of warning messages about the evaluation.
+     """
+
+     benchmark_name: str
+     total_tasks: int
+     task_ids: list[str]
+     estimated_cost_usd: float | None
+     estimated_time_minutes: float | None
+     config_valid: bool
+     config_errors: list[str]
+     docker_available: bool
+     mcp_servers_reachable: dict[str, bool]
+     warnings: list[str] = field(default_factory=list)
+
+
+ def _check_docker_available() -> bool:
+     """Check whether Docker is available and running.
+
+     Returns:
+         True if Docker daemon is reachable, False otherwise.
+     """
+     try:
+         client = docker.from_env()
+         client.ping()
+         return True
+     except Exception:
+         return False
+
+
+ def _check_mcp_server_reachable(command: str) -> bool:
+     """Check whether an MCP server command is available in PATH.
+
+     This checks that the command is installed and executable, which is a
+     necessary prerequisite for the MCP server to be reachable at runtime.
+
+     Args:
+         command: The MCP server command to check (e.g., 'npx', 'uvx').
+
+     Returns:
+         True if the command is found in PATH, False otherwise.
+     """
+     return shutil.which(command) is not None
+
+
+ def _validate_config_from_object(config: HarnessConfig) -> ValidationResult:
+     """Validate a HarnessConfig object using the config validator.
+
+     Since ConfigValidator works on files, we perform structural validation
+     directly on the config object instead.
+
+     Args:
+         config: The harness configuration to validate.
+
+     Returns:
+         ValidationResult with errors and warnings.
+     """
+     validator = ConfigValidator()
+
+     # Validate API key (Anthropic provider)
+     if config.provider == "anthropic":
+         validator._validate_api_key()
+
+     # If we have no errors from the validator and the config object was
+     # successfully created (Pydantic validated), it is valid.
+     return ValidationResult(
+         valid=not validator.has_errors,
+         errors=validator.errors,
+         warnings=validator.warnings,
+     )
+
+
+ def _estimate_cost(
+     model_id: str,
+     benchmark_name: str,
+     num_tasks: int,
+ ) -> float | None:
+     """Estimate the cost of running an evaluation based on model pricing.
+
+     Uses historical token usage estimates per benchmark type and the model's
+     pricing to compute an approximate total cost.
+
+     Args:
+         model_id: The model identifier for pricing lookup.
+         benchmark_name: The benchmark name for token estimation.
+         num_tasks: The number of tasks to estimate for.
+
+     Returns:
+         Estimated cost in USD, or None if pricing is unavailable.
+     """
+     pricing = get_model_pricing(model_id)
+     if pricing is None:
+         return None
+
+     input_tokens, output_tokens = _ESTIMATED_TOKENS_PER_TASK.get(benchmark_name, (10_000, 5_000))
+
+     # Calculate per-task cost
+     input_cost = (input_tokens / 1_000_000) * pricing.input_price_per_mtok
+     output_cost = (output_tokens / 1_000_000) * pricing.output_price_per_mtok
+     per_task_cost = input_cost + output_cost
+
+     return per_task_cost * num_tasks
+
+
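To make the arithmetic concrete, here is a rough illustrative calculation for swe-bench-lite, assuming hypothetical pricing of $3.00 per million input tokens and $15.00 per million output tokens (real figures come from get_model_pricing and will differ):

    # Illustrative only -- pricing values are assumed, not taken from mcpbr/pricing.py.
    input_cost = (50_000 / 1_000_000) * 3.00    # $0.15 per task
    output_cost = (10_000 / 1_000_000) * 15.00  # $0.15 per task
    per_task_cost = input_cost + output_cost    # $0.30 per task
    estimated_total = per_task_cost * 300       # ~$90.00 for a 300-task run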
+ def _estimate_time(
+     benchmark_name: str,
+     num_tasks: int,
+     max_concurrent: int,
+     timeout_seconds: int,
+ ) -> float:
+     """Estimate the wall-clock time for running an evaluation.
+
+     Uses historical per-task time estimates and accounts for concurrency.
+     The estimate is capped by the configured timeout per task.
+
+     Args:
+         benchmark_name: The benchmark name for time estimation.
+         num_tasks: The number of tasks to run.
+         max_concurrent: Maximum concurrent tasks.
+         timeout_seconds: Configured timeout per task in seconds.
+
+     Returns:
+         Estimated wall-clock time in minutes.
+     """
+     per_task_minutes = _ESTIMATED_MINUTES_PER_TASK.get(benchmark_name, 3.0)
+
+     # Cap per-task time at the configured timeout
+     timeout_minutes = timeout_seconds / 60.0
+     per_task_minutes = min(per_task_minutes, timeout_minutes)
+
+     # Account for concurrency: tasks run in batches of max_concurrent
+     effective_concurrency = max(1, min(max_concurrent, num_tasks)) if num_tasks > 0 else 1
+     total_minutes = (num_tasks / effective_concurrency) * per_task_minutes
+
+     return total_minutes
+
+
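Similarly, a worked example of the time estimate, with illustrative numbers only (300 swe-bench-lite tasks at 8.0 minutes each, max_concurrent=10, a 1800-second timeout):

    # Illustrative only -- the timeout does not cap the estimate here (30 min > 8 min).
    per_task_minutes = min(8.0, 1800 / 60.0)      # 8.0
    effective_concurrency = max(1, min(10, 300))  # 10
    total_minutes = (300 / 10) * 8.0              # 240 minutes, roughly 4 hours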
+ async def dry_run(config: HarnessConfig, verbosity: int = 0) -> DryRunResult:
+     """Preview what an evaluation would do without executing it.
+
+     Loads the benchmark tasks, validates the configuration, checks Docker
+     availability, checks MCP server reachability, and estimates cost and
+     time. Does NOT make any API calls or run any tasks.
+
+     Args:
+         config: The harness configuration to preview.
+         verbosity: Verbosity level (0=minimal, 1=summary, 2=detailed).
+
+     Returns:
+         DryRunResult containing all preview information.
+     """
+     warnings: list[str] = []
+     config_errors: list[str] = []
+     task_ids: list[str] = []
+     total_tasks = 0
+
+     # 1. Validate configuration
+     validation_result = _validate_config_from_object(config)
+     config_valid = validation_result.valid
+     for error in validation_result.errors:
+         config_errors.append(f"{error.field}: {error.error}")
+     for warning in validation_result.warnings:
+         warnings.append(f"Config warning ({warning.field}): {warning.error}")
+
+     # 2. Load benchmark tasks
+     benchmark_name = config.benchmark
+     try:
+         benchmark_kwargs = {}
+         if config.benchmark == "cybergym":
+             benchmark_kwargs["level"] = config.cybergym_level
+
+         benchmark = create_benchmark(config.benchmark, **benchmark_kwargs)
+         tasks = benchmark.load_tasks(
+             sample_size=config.sample_size,
+             filter_difficulty=config.filter_difficulty,
+             filter_category=config.filter_category,
+             filter_tags=config.filter_tags,
+         )
+         total_tasks = len(tasks)
+         task_ids = [t.get("instance_id", f"task_{i}") for i, t in enumerate(tasks)]
+     except Exception as e:
+         warnings.append(f"Failed to load benchmark tasks: {e}")
+         total_tasks = config.sample_size if config.sample_size else 0
+
+     # 3. Check Docker availability
+     docker_available = _check_docker_available()
+     if not docker_available:
+         warnings.append(
+             "Docker is not available. Evaluation requires Docker to create "
+             "isolated task environments."
+         )
+
+     # 4. Check MCP server reachability
+     mcp_servers_reachable: dict[str, bool] = {}
+     if config.comparison_mode:
+         if config.mcp_server_a and config.mcp_server_a.command:
+             name_a = config.mcp_server_a.name or "mcp_server_a"
+             reachable = _check_mcp_server_reachable(config.mcp_server_a.command)
+             mcp_servers_reachable[name_a] = reachable
+             if not reachable:
+                 warnings.append(
+                     f"MCP server A command '{config.mcp_server_a.command}' not found in PATH."
+                 )
+         if config.mcp_server_b and config.mcp_server_b.command:
+             name_b = config.mcp_server_b.name or "mcp_server_b"
+             reachable = _check_mcp_server_reachable(config.mcp_server_b.command)
+             mcp_servers_reachable[name_b] = reachable
+             if not reachable:
+                 warnings.append(
+                     f"MCP server B command '{config.mcp_server_b.command}' not found in PATH."
+                 )
+     elif config.mcp_server and config.mcp_server.command:
+         name = config.mcp_server.name or "mcp_server"
+         reachable = _check_mcp_server_reachable(config.mcp_server.command)
+         mcp_servers_reachable[name] = reachable
+         if not reachable:
+             warnings.append(f"MCP server command '{config.mcp_server.command}' not found in PATH.")
+
+     # 5. Check API key
+     api_key = os.environ.get("ANTHROPIC_API_KEY")
+     if not api_key:
+         warnings.append(
+             "ANTHROPIC_API_KEY environment variable is not set. "
+             "Evaluation requires a valid API key."
+         )
+
+     # 6. Estimate cost
+     estimated_cost = _estimate_cost(config.model, benchmark_name, total_tasks)
+     if estimated_cost is None:
+         warnings.append(
+             f"Could not estimate cost: pricing unavailable for model '{config.model}'. "
+             f"Cost estimation uses historical token usage averages and may vary."
+         )
+     else:
+         # Add a note about estimation accuracy
+         if verbosity >= 1:
+             warnings.append(
+                 "Cost estimate is based on historical averages and actual costs may vary "
+                 "significantly depending on task complexity and agent behavior."
+             )
+
+     # 7. Estimate time
+     estimated_time = _estimate_time(
+         benchmark_name,
+         total_tasks,
+         config.max_concurrent,
+         config.timeout_seconds,
+     )
+
+     # 8. Budget warning
+     if config.budget is not None and estimated_cost is not None:
+         if estimated_cost > config.budget:
+             warnings.append(
+                 f"Estimated cost ({format_cost(estimated_cost)}) exceeds budget "
+                 f"({format_cost(config.budget)}). Evaluation may be halted early."
+             )
+
+     return DryRunResult(
+         benchmark_name=benchmark_name,
+         total_tasks=total_tasks,
+         task_ids=task_ids,
+         estimated_cost_usd=estimated_cost,
+         estimated_time_minutes=estimated_time,
+         config_valid=config_valid,
+         config_errors=config_errors,
+         docker_available=docker_available,
+         mcp_servers_reachable=mcp_servers_reachable,
+         warnings=warnings,
+     )
+
+
+ def format_dry_run_report(result: DryRunResult) -> None:
+     """Print a rich-formatted dry-run report to the console.
+
+     Displays a comprehensive overview of what the evaluation would do,
+     including task details, cost estimates, infrastructure readiness,
+     and any warnings or errors.
+
+     Args:
+         result: The DryRunResult to format and display.
+     """
+     console = Console()
+
+     # Header
+     console.print()
+     console.print(
+         Panel(
+             "[bold]Dry Run Report[/bold]\n[dim]Preview of evaluation without executing tasks[/dim]",
+             border_style="cyan",
+         )
+     )
+
+     # Benchmark & Tasks table
+     task_table = Table(
+         title="Evaluation Overview",
+         show_header=True,
+         header_style="bold cyan",
+     )
+     task_table.add_column("Property", style="bold")
+     task_table.add_column("Value")
+
+     task_table.add_row("Benchmark", result.benchmark_name)
+     task_table.add_row("Total Tasks", str(result.total_tasks))
+     task_table.add_row(
+         "Estimated Cost",
+         format_cost(result.estimated_cost_usd) if result.estimated_cost_usd is not None else "N/A",
+     )
+
+     if result.estimated_time_minutes is not None:
+         hours = int(result.estimated_time_minutes // 60)
+         minutes = int(result.estimated_time_minutes % 60)
+         if hours > 0:
+             time_str = f"{hours}h {minutes}m"
+         else:
+             time_str = f"{minutes}m"
+         task_table.add_row("Estimated Time", time_str)
+     else:
+         task_table.add_row("Estimated Time", "N/A")
+
+     console.print()
+     console.print(task_table)
+
+     # Task IDs (show first 10, then summarize)
+     if result.task_ids:
+         console.print()
+         console.print("[bold]Task IDs:[/bold]")
+         display_count = min(10, len(result.task_ids))
+         for task_id in result.task_ids[:display_count]:
+             console.print(f" [dim]-[/dim] {task_id}")
+         if len(result.task_ids) > display_count:
+             console.print(f" [dim]... and {len(result.task_ids) - display_count} more[/dim]")
+
+     # Infrastructure Readiness table
+     infra_table = Table(
+         title="Infrastructure Readiness",
+         show_header=True,
+         header_style="bold cyan",
+     )
+     infra_table.add_column("Check", style="bold")
+     infra_table.add_column("Status", justify="center")
+     infra_table.add_column("Details")
+
+     # Config validation
+     if result.config_valid:
+         infra_table.add_row("Configuration", "[green]PASS[/green]", "Valid")
+     else:
+         error_summary = "; ".join(result.config_errors[:3])
+         if len(result.config_errors) > 3:
+             error_summary += f" (+{len(result.config_errors) - 3} more)"
+         infra_table.add_row("Configuration", "[red]FAIL[/red]", error_summary)
+
+     # Docker
+     if result.docker_available:
+         infra_table.add_row("Docker", "[green]PASS[/green]", "Running")
+     else:
+         infra_table.add_row("Docker", "[red]FAIL[/red]", "Not available")
+
+     # MCP servers
+     for server_name, reachable in result.mcp_servers_reachable.items():
+         if reachable:
+             infra_table.add_row(f"MCP: {server_name}", "[green]PASS[/green]", "Command found")
+         else:
+             infra_table.add_row(f"MCP: {server_name}", "[red]FAIL[/red]", "Command not found")
+
+     # API key
+     api_key = os.environ.get("ANTHROPIC_API_KEY")
+     if api_key:
+         masked = f"{api_key[:8]}...{api_key[-4:]}" if len(api_key) > 12 else "***"
+         infra_table.add_row("API Key", "[green]PASS[/green]", f"Set ({masked})")
+     else:
+         infra_table.add_row("API Key", "[red]FAIL[/red]", "Not set")
+
+     console.print()
+     console.print(infra_table)
+
+     # Warnings
+     if result.warnings:
+         console.print()
+         console.print("[yellow bold]Warnings:[/yellow bold]")
+         for warning in result.warnings:
+             console.print(f" [yellow]-[/yellow] {warning}")
+
+     # Config errors
+     if result.config_errors:
+         console.print()
+         console.print("[red bold]Configuration Errors:[/red bold]")
+         for error in result.config_errors:
+             console.print(f" [red]-[/red] {error}")
+
+     # Summary
+     console.print()
+     all_clear = (
+         result.config_valid
+         and result.docker_available
+         and all(result.mcp_servers_reachable.values())
+         and api_key is not None
+     )
+     if all_clear:
+         console.print(
+             Panel(
+                 "[green bold]All checks passed.[/green bold]\nThe evaluation is ready to run.",
+                 border_style="green",
+             )
+         )
+     else:
+         console.print(
+             Panel(
+                 "[red bold]Some checks failed.[/red bold]\n"
+                 "Please resolve the issues above before running the evaluation.",
+                 border_style="red",
+             )
+         )
+
+     console.print()
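Taken together, the new module can be driven roughly as sketched below. This is an illustrative sketch only: the diff does not show how a HarnessConfig is constructed or loaded, so the placeholder must be replaced with whatever the rest of mcpbr provides.

    import asyncio

    from mcpbr.config import HarnessConfig
    from mcpbr.dry_run import dry_run, format_dry_run_report

    async def main() -> None:
        # Build or load a HarnessConfig elsewhere; dry_run() only reads fields
        # such as benchmark, model, sample_size, max_concurrent, timeout_seconds,
        # mcp_server, and budget -- it never calls the model API or runs tasks.
        config = HarnessConfig(...)  # placeholder: real fields live in mcpbr/config.py, not this diff

        result = await dry_run(config, verbosity=1)
        format_dry_run_report(result)  # rich-formatted report on stdout

        if not (result.config_valid and result.docker_available):
            raise SystemExit(1)

    asyncio.run(main())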