mcpbr 0.4.16__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/dry_run.py ADDED
@@ -0,0 +1,532 @@
+"""Dry-run mode for previewing evaluations without executing them.
+
+This module provides functionality to preview what an evaluation would do,
+validate configurations, estimate costs and time, and check infrastructure
+readiness -- all without making actual API calls or running tasks.
+
+Useful for debugging configuration issues and estimating costs before
+committing to a full evaluation run.
+"""
+
+import logging
+import os
+import shutil
+from dataclasses import dataclass, field
+
+import docker
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+from .benchmarks import create_benchmark
+from .config import HarnessConfig
+from .config_validator import ConfigValidator, ValidationResult
+from .pricing import format_cost, get_model_pricing
+
+logger = logging.getLogger(__name__)
+
+# Historical estimates for average tokens per task by benchmark type.
+# These are rough estimates based on typical evaluation runs and are used
+# when no better data is available. Values represent (input_tokens, output_tokens).
+_ESTIMATED_TOKENS_PER_TASK: dict[str, tuple[int, int]] = {
+    "swe-bench-lite": (50_000, 10_000),
+    "swe-bench-verified": (50_000, 10_000),
+    "swe-bench-full": (50_000, 10_000),
+    "humaneval": (5_000, 2_000),
+    "mbpp": (5_000, 2_000),
+    "gsm8k": (3_000, 1_000),
+    "math": (5_000, 2_000),
+    "truthfulqa": (2_000, 500),
+    "bigbench-hard": (5_000, 2_000),
+    "hellaswag": (2_000, 500),
+    "arc": (3_000, 1_000),
+    "apps": (10_000, 5_000),
+    "codecontests": (15_000, 8_000),
+    "bigcodebench": (10_000, 5_000),
+    "leetcode": (10_000, 5_000),
+    "codereval": (20_000, 8_000),
+    "repoqa": (30_000, 5_000),
+    "toolbench": (10_000, 5_000),
+    "aider-polyglot": (30_000, 10_000),
+    "terminalbench": (20_000, 8_000),
+    "gaia": (15_000, 5_000),
+    "agentbench": (20_000, 8_000),
+    "webarena": (20_000, 8_000),
+    "mlagentbench": (25_000, 10_000),
+    "intercode": (15_000, 5_000),
+    "cybergym": (30_000, 10_000),
+    "mcptoolbench": (10_000, 5_000),
+    "custom": (10_000, 5_000),
+    "mmmu": (5_000, 2_000),
+    "longbench": (30_000, 5_000),
+    "adversarial": (10_000, 5_000),
+}
+
+# Historical average minutes per task (wall-clock time).
+# Accounts for Docker setup, agent execution, and evaluation.
+_ESTIMATED_MINUTES_PER_TASK: dict[str, float] = {
+    "swe-bench-lite": 8.0,
+    "swe-bench-verified": 8.0,
+    "swe-bench-full": 8.0,
+    "humaneval": 2.0,
+    "mbpp": 2.0,
+    "gsm8k": 1.0,
+    "math": 2.0,
+    "truthfulqa": 0.5,
+    "bigbench-hard": 1.5,
+    "hellaswag": 0.5,
+    "arc": 1.0,
+    "apps": 4.0,
+    "codecontests": 6.0,
+    "bigcodebench": 4.0,
+    "leetcode": 4.0,
+    "codereval": 6.0,
+    "repoqa": 5.0,
+    "toolbench": 3.0,
+    "aider-polyglot": 7.0,
+    "terminalbench": 5.0,
+    "gaia": 5.0,
+    "agentbench": 6.0,
+    "webarena": 6.0,
+    "mlagentbench": 7.0,
+    "intercode": 4.0,
+    "cybergym": 8.0,
+    "mcptoolbench": 3.0,
+    "custom": 3.0,
+    "mmmu": 2.0,
+    "longbench": 5.0,
+    "adversarial": 3.0,
+}
+
+
+@dataclass
+class DryRunResult:
+    """Result of a dry-run evaluation preview.
+
+    Contains all information about what an evaluation would do, including
+    task details, cost estimates, configuration validation, and infrastructure
+    readiness checks.
+
+    Attributes:
+        benchmark_name: Name of the benchmark to run.
+        total_tasks: Total number of tasks that would be executed.
+        task_ids: List of task IDs that would be executed.
+        estimated_cost_usd: Estimated total cost in USD based on model pricing.
+        estimated_time_minutes: Estimated total wall-clock time in minutes.
+        config_valid: Whether the configuration passed validation.
+        config_errors: List of configuration validation error messages.
+        docker_available: Whether Docker is available and running.
+        mcp_servers_reachable: Mapping of MCP server names to reachability status.
+        warnings: List of warning messages about the evaluation.
+    """
+
+    benchmark_name: str
+    total_tasks: int
+    task_ids: list[str]
+    estimated_cost_usd: float | None
+    estimated_time_minutes: float | None
+    config_valid: bool
+    config_errors: list[str]
+    docker_available: bool
+    mcp_servers_reachable: dict[str, bool]
+    warnings: list[str] = field(default_factory=list)
+
+
+def _check_docker_available() -> bool:
+    """Check whether Docker is available and running.
+
+    Returns:
+        True if Docker daemon is reachable, False otherwise.
+    """
+    try:
+        client = docker.from_env()
+        client.ping()
+        return True
+    except Exception:
+        return False
+
+
+def _check_mcp_server_reachable(command: str) -> bool:
+    """Check whether an MCP server command is available in PATH.
+
+    This checks that the command is installed and executable, which is a
+    necessary prerequisite for the MCP server to be reachable at runtime.
+
+    Args:
+        command: The MCP server command to check (e.g., 'npx', 'uvx').
+
+    Returns:
+        True if the command is found in PATH, False otherwise.
+    """
+    return shutil.which(command) is not None
+
+
+def _validate_config_from_object(config: HarnessConfig) -> ValidationResult:
+    """Validate a HarnessConfig object using the config validator.
+
+    Since ConfigValidator works on files, we perform structural validation
+    directly on the config object instead.
+
+    Args:
+        config: The harness configuration to validate.
+
+    Returns:
+        ValidationResult with errors and warnings.
+    """
+    validator = ConfigValidator()
+
+    # Validate API key (Anthropic provider)
+    if config.provider == "anthropic":
+        validator._validate_api_key()
+
+    # If we have no errors from the validator and the config object was
+    # successfully created (Pydantic validated), it is valid.
+    return ValidationResult(
+        valid=not validator.has_errors,
+        errors=validator.errors,
+        warnings=validator.warnings,
+    )
+
+
+def _estimate_cost(
+    model_id: str,
+    benchmark_name: str,
+    num_tasks: int,
+) -> float | None:
+    """Estimate the cost of running an evaluation based on model pricing.
+
+    Uses historical token usage estimates per benchmark type and the model's
+    pricing to compute an approximate total cost.
+
+    Args:
+        model_id: The model identifier for pricing lookup.
+        benchmark_name: The benchmark name for token estimation.
+        num_tasks: The number of tasks to estimate for.
+
+    Returns:
+        Estimated cost in USD, or None if pricing is unavailable.
+    """
+    pricing = get_model_pricing(model_id)
+    if pricing is None:
+        return None
+
+    input_tokens, output_tokens = _ESTIMATED_TOKENS_PER_TASK.get(benchmark_name, (10_000, 5_000))
+
+    # Calculate per-task cost
+    input_cost = (input_tokens / 1_000_000) * pricing.input_price_per_mtok
+    output_cost = (output_tokens / 1_000_000) * pricing.output_price_per_mtok
+    per_task_cost = input_cost + output_cost
+
+    return per_task_cost * num_tasks
+
+
+def _estimate_time(
+    benchmark_name: str,
+    num_tasks: int,
+    max_concurrent: int,
+    timeout_seconds: int,
+) -> float:
+    """Estimate the wall-clock time for running an evaluation.
+
+    Uses historical per-task time estimates and accounts for concurrency.
+    The estimate is capped by the configured timeout per task.
+
+    Args:
+        benchmark_name: The benchmark name for time estimation.
+        num_tasks: The number of tasks to run.
+        max_concurrent: Maximum concurrent tasks.
+        timeout_seconds: Configured timeout per task in seconds.
+
+    Returns:
+        Estimated wall-clock time in minutes.
+    """
+    per_task_minutes = _ESTIMATED_MINUTES_PER_TASK.get(benchmark_name, 3.0)
+
+    # Cap per-task time at the configured timeout
+    timeout_minutes = timeout_seconds / 60.0
+    per_task_minutes = min(per_task_minutes, timeout_minutes)
+
+    # Account for concurrency: tasks run in batches of max_concurrent
+    effective_concurrency = max(1, min(max_concurrent, num_tasks)) if num_tasks > 0 else 1
+    total_minutes = (num_tasks / effective_concurrency) * per_task_minutes
+
+    return total_minutes
+
+
+async def dry_run(config: HarnessConfig, verbosity: int = 0) -> DryRunResult:
+    """Preview what an evaluation would do without executing it.
+
+    Loads the benchmark tasks, validates the configuration, checks Docker
+    availability, checks MCP server reachability, and estimates cost and
+    time. Does NOT make any API calls or run any tasks.
+
+    Args:
+        config: The harness configuration to preview.
+        verbosity: Verbosity level (0=minimal, 1=summary, 2=detailed).
+
+    Returns:
+        DryRunResult containing all preview information.
+    """
+    warnings: list[str] = []
+    config_errors: list[str] = []
+    task_ids: list[str] = []
+    total_tasks = 0
+
+    # 1. Validate configuration
+    validation_result = _validate_config_from_object(config)
+    config_valid = validation_result.valid
+    for error in validation_result.errors:
+        config_errors.append(f"{error.field}: {error.error}")
+    for warning in validation_result.warnings:
+        warnings.append(f"Config warning ({warning.field}): {warning.error}")
+
+    # 2. Load benchmark tasks
+    benchmark_name = config.benchmark
+    try:
+        benchmark_kwargs = {}
+        if config.benchmark == "cybergym":
+            benchmark_kwargs["level"] = config.cybergym_level
+
+        benchmark = create_benchmark(config.benchmark, **benchmark_kwargs)
+        tasks = benchmark.load_tasks(
+            sample_size=config.sample_size,
+            filter_difficulty=config.filter_difficulty,
+            filter_category=config.filter_category,
+            filter_tags=config.filter_tags,
+        )
+        total_tasks = len(tasks)
+        task_ids = [t.get("instance_id", f"task_{i}") for i, t in enumerate(tasks)]
+    except Exception as e:
+        warnings.append(f"Failed to load benchmark tasks: {e}")
+        total_tasks = config.sample_size if config.sample_size else 0
+
+    # 3. Check Docker availability
+    docker_available = _check_docker_available()
+    if not docker_available:
+        warnings.append(
+            "Docker is not available. Evaluation requires Docker to create "
+            "isolated task environments."
+        )
+
+    # 4. Check MCP server reachability
+    mcp_servers_reachable: dict[str, bool] = {}
+    if config.comparison_mode:
+        if config.mcp_server_a and config.mcp_server_a.command:
+            name_a = config.mcp_server_a.name or "mcp_server_a"
+            reachable = _check_mcp_server_reachable(config.mcp_server_a.command)
+            mcp_servers_reachable[name_a] = reachable
+            if not reachable:
+                warnings.append(
+                    f"MCP server A command '{config.mcp_server_a.command}' not found in PATH."
+                )
+        if config.mcp_server_b and config.mcp_server_b.command:
+            name_b = config.mcp_server_b.name or "mcp_server_b"
+            reachable = _check_mcp_server_reachable(config.mcp_server_b.command)
+            mcp_servers_reachable[name_b] = reachable
+            if not reachable:
+                warnings.append(
+                    f"MCP server B command '{config.mcp_server_b.command}' not found in PATH."
+                )
+    elif config.mcp_server and config.mcp_server.command:
+        name = config.mcp_server.name or "mcp_server"
+        reachable = _check_mcp_server_reachable(config.mcp_server.command)
+        mcp_servers_reachable[name] = reachable
+        if not reachable:
+            warnings.append(f"MCP server command '{config.mcp_server.command}' not found in PATH.")
+
+    # 5. Check API key
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        warnings.append(
+            "ANTHROPIC_API_KEY environment variable is not set. "
+            "Evaluation requires a valid API key."
+        )
+
+    # 6. Estimate cost
+    estimated_cost = _estimate_cost(config.model, benchmark_name, total_tasks)
+    if estimated_cost is None:
+        warnings.append(
+            f"Could not estimate cost: pricing unavailable for model '{config.model}'. "
+            f"Cost estimation uses historical token usage averages and may vary."
+        )
+    else:
+        # Add a note about estimation accuracy
+        if verbosity >= 1:
+            warnings.append(
+                "Cost estimate is based on historical averages and actual costs may vary "
+                "significantly depending on task complexity and agent behavior."
+            )
+
+    # 7. Estimate time
+    estimated_time = _estimate_time(
+        benchmark_name,
+        total_tasks,
+        config.max_concurrent,
+        config.timeout_seconds,
+    )
+
+    # 8. Budget warning
+    if config.budget is not None and estimated_cost is not None:
+        if estimated_cost > config.budget:
+            warnings.append(
+                f"Estimated cost ({format_cost(estimated_cost)}) exceeds budget "
+                f"({format_cost(config.budget)}). Evaluation may be halted early."
+            )
+
+    return DryRunResult(
+        benchmark_name=benchmark_name,
+        total_tasks=total_tasks,
+        task_ids=task_ids,
+        estimated_cost_usd=estimated_cost,
+        estimated_time_minutes=estimated_time,
+        config_valid=config_valid,
+        config_errors=config_errors,
+        docker_available=docker_available,
+        mcp_servers_reachable=mcp_servers_reachable,
+        warnings=warnings,
+    )
+
+
+def format_dry_run_report(result: DryRunResult) -> None:
+    """Print a rich-formatted dry-run report to the console.
+
+    Displays a comprehensive overview of what the evaluation would do,
+    including task details, cost estimates, infrastructure readiness,
+    and any warnings or errors.
+
+    Args:
+        result: The DryRunResult to format and display.
+    """
+    console = Console()
+
+    # Header
+    console.print()
+    console.print(
+        Panel(
+            "[bold]Dry Run Report[/bold]\n[dim]Preview of evaluation without executing tasks[/dim]",
+            border_style="cyan",
+        )
+    )
+
+    # Benchmark & Tasks table
+    task_table = Table(
+        title="Evaluation Overview",
+        show_header=True,
+        header_style="bold cyan",
+    )
+    task_table.add_column("Property", style="bold")
+    task_table.add_column("Value")
+
+    task_table.add_row("Benchmark", result.benchmark_name)
+    task_table.add_row("Total Tasks", str(result.total_tasks))
+    task_table.add_row(
+        "Estimated Cost",
+        format_cost(result.estimated_cost_usd) if result.estimated_cost_usd is not None else "N/A",
+    )
+
+    if result.estimated_time_minutes is not None:
+        hours = int(result.estimated_time_minutes // 60)
+        minutes = int(result.estimated_time_minutes % 60)
+        if hours > 0:
+            time_str = f"{hours}h {minutes}m"
+        else:
+            time_str = f"{minutes}m"
+        task_table.add_row("Estimated Time", time_str)
+    else:
+        task_table.add_row("Estimated Time", "N/A")
+
+    console.print()
+    console.print(task_table)
+
+    # Task IDs (show first 10, then summarize)
+    if result.task_ids:
+        console.print()
+        console.print("[bold]Task IDs:[/bold]")
+        display_count = min(10, len(result.task_ids))
+        for task_id in result.task_ids[:display_count]:
+            console.print(f" [dim]-[/dim] {task_id}")
+        if len(result.task_ids) > display_count:
+            console.print(f" [dim]... and {len(result.task_ids) - display_count} more[/dim]")
+
+    # Infrastructure Readiness table
+    infra_table = Table(
+        title="Infrastructure Readiness",
+        show_header=True,
+        header_style="bold cyan",
+    )
+    infra_table.add_column("Check", style="bold")
+    infra_table.add_column("Status", justify="center")
+    infra_table.add_column("Details")
+
+    # Config validation
+    if result.config_valid:
+        infra_table.add_row("Configuration", "[green]PASS[/green]", "Valid")
+    else:
+        error_summary = "; ".join(result.config_errors[:3])
+        if len(result.config_errors) > 3:
+            error_summary += f" (+{len(result.config_errors) - 3} more)"
+        infra_table.add_row("Configuration", "[red]FAIL[/red]", error_summary)
+
+    # Docker
+    if result.docker_available:
+        infra_table.add_row("Docker", "[green]PASS[/green]", "Running")
+    else:
+        infra_table.add_row("Docker", "[red]FAIL[/red]", "Not available")
+
+    # MCP servers
+    for server_name, reachable in result.mcp_servers_reachable.items():
+        if reachable:
+            infra_table.add_row(f"MCP: {server_name}", "[green]PASS[/green]", "Command found")
+        else:
+            infra_table.add_row(f"MCP: {server_name}", "[red]FAIL[/red]", "Command not found")
+
+    # API key
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if api_key:
+        masked = f"{api_key[:8]}...{api_key[-4:]}" if len(api_key) > 12 else "***"
+        infra_table.add_row("API Key", "[green]PASS[/green]", f"Set ({masked})")
+    else:
+        infra_table.add_row("API Key", "[red]FAIL[/red]", "Not set")
+
+    console.print()
+    console.print(infra_table)
+
+    # Warnings
+    if result.warnings:
+        console.print()
+        console.print("[yellow bold]Warnings:[/yellow bold]")
+        for warning in result.warnings:
+            console.print(f" [yellow]-[/yellow] {warning}")
+
+    # Config errors
+    if result.config_errors:
+        console.print()
+        console.print("[red bold]Configuration Errors:[/red bold]")
+        for error in result.config_errors:
+            console.print(f" [red]-[/red] {error}")
+
+    # Summary
+    console.print()
+    all_clear = (
+        result.config_valid
+        and result.docker_available
+        and all(result.mcp_servers_reachable.values())
+        and api_key is not None
+    )
+    if all_clear:
+        console.print(
+            Panel(
+                "[green bold]All checks passed.[/green bold]\nThe evaluation is ready to run.",
+                border_style="green",
+            )
+        )
+    else:
+        console.print(
+            Panel(
+                "[red bold]Some checks failed.[/red bold]\n"
+                "Please resolve the issues above before running the evaluation.",
+                border_style="red",
+            )
+        )
+
+    console.print()
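
For orientation, a minimal, hypothetical sketch of how the new module could be driven from caller code. Only dry_run() and format_dry_run_report() come from the file above; the HarnessConfig keyword arguments and model name shown here are illustrative assumptions, not the schema defined in mcpbr/config.py (which is outside this diff).

import asyncio

from mcpbr.config import HarnessConfig
from mcpbr.dry_run import dry_run, format_dry_run_report


async def main() -> None:
    # Illustrative configuration; real field names and values come from mcpbr/config.py.
    config = HarnessConfig(benchmark="humaneval", model="claude-sonnet-4-5", sample_size=5)

    # Preview only: validates the config, checks Docker / MCP servers / API key,
    # and estimates cost and time without calling the API or running tasks.
    result = await dry_run(config, verbosity=1)
    format_dry_run_report(result)

    # Gate a real evaluation on the preview outcome.
    if not (result.config_valid and result.docker_available):
        raise SystemExit(1)


if __name__ == "__main__":
    asyncio.run(main())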