mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/formatting.py ADDED
@@ -0,0 +1,444 @@
1
+ """Color and formatting options for CLI output.
2
+
3
+ Provides configurable themes and formatting utilities for consistent CLI output
4
+ across the mcpbr tool. Supports the NO_COLOR convention (https://no-color.org/)
5
+ and configurable themes via the MCPBR_THEME environment variable or CLI flags.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import sys
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+ from typing import Any
15
+
16
+ from rich.console import Console
17
+ from rich.progress import (
18
+ BarColumn,
19
+ MofNCompleteColumn,
20
+ Progress,
21
+ SpinnerColumn,
22
+ TextColumn,
23
+ TimeElapsedColumn,
24
+ TimeRemainingColumn,
25
+ )
26
+ from rich.table import Table
27
+ from rich.text import Text
28
+
29
+
30
class Theme(Enum):
    """Selectable visual themes for CLI output.

    The member values are the lowercase names accepted from users (CLI
    flags or the ``MCPBR_THEME`` environment variable).

    Members:
        DEFAULT: Rich colors combined with bold styles for maximum
            readability.
        MINIMAL: Subdued colors that produce less visual noise.
        PLAIN: No formatting and no color whatsoever.
    """

    DEFAULT = "default"
    MINIMAL = "minimal"
    PLAIN = "plain"
42
+
43
+
44
@dataclass(frozen=True)
class ThemeConfig:
    """Immutable bundle of Rich style strings that make up one theme.

    Every field holds a Rich markup style string (for example
    ``"bold green"``) applied to one category of message. An empty string
    means "no styling" for that category. The defaults below correspond to
    the bold, colorful look.

    Attributes:
        success_style: Applied to success messages.
        error_style: Applied to error messages.
        warning_style: Applied to warning messages.
        info_style: Applied to informational messages.
        header_style: Applied to section headers.
        dim_style: Applied to secondary / de-emphasized text.
        highlight_style: Applied to highlighted / emphasized text.
    """

    # Message-level styles.
    success_style: str = "bold green"
    error_style: str = "bold red"
    warning_style: str = "bold yellow"
    info_style: str = "bold blue"

    # Structural / emphasis styles.
    header_style: str = "bold magenta"
    dim_style: str = "dim"
    highlight_style: str = "bold cyan"
68
+
69
+
70
# Registry mapping every Theme member to the style configuration it uses.
THEME_CONFIGS: dict[Theme, ThemeConfig] = {
    # The dataclass defaults of ThemeConfig *are* the default theme
    # (bold green / red / yellow / blue / magenta, dim, bold cyan).
    Theme.DEFAULT: ThemeConfig(),
    # Same hues as the default theme, but without the bold attribute.
    Theme.MINIMAL: ThemeConfig(
        success_style="green",
        error_style="red",
        warning_style="yellow",
        info_style="blue",
        header_style="magenta",
        dim_style="dim",
        highlight_style="cyan",
    ),
    # Empty style strings everywhere: nothing is styled.
    Theme.PLAIN: ThemeConfig(
        success_style="",
        error_style="",
        warning_style="",
        info_style="",
        header_style="",
        dim_style="",
        highlight_style="",
    ),
}
99
+
100
+
101
def _resolve_theme(theme_name: str | None = None) -> Theme:
    """Resolve a theme name string to a ``Theme`` enum value.

    Resolution order:
        1. ``theme_name``, when it contains a non-blank value.
        2. The ``MCPBR_THEME`` environment variable, when it contains a
           non-blank value.
        3. ``Theme.DEFAULT``.

    Blank values (``None``, ``""`` or whitespace only) are treated as "not
    provided" at every step, so an exported-but-empty ``MCPBR_THEME=`` falls
    back to the default theme instead of raising.

    Args:
        theme_name: Optional theme name (case-insensitive, surrounding
            whitespace ignored). One of ``"default"``, ``"minimal"``, or
            ``"plain"``.

    Returns:
        The resolved ``Theme`` enum value.

    Raises:
        ValueError: If a non-blank theme name is not recognized.
    """
    candidates = (theme_name, os.environ.get("MCPBR_THEME"))
    # First candidate that still has content after stripping wins.
    name = next((raw.strip() for raw in candidates if raw and raw.strip()), None)
    if name is None:
        return Theme.DEFAULT

    try:
        return Theme(name.lower())
    except ValueError:
        valid = ", ".join(t.value for t in Theme)
        # ``from None``: the enum's own lookup error adds nothing for the user.
        raise ValueError(f"Unknown theme '{name}'. Valid themes: {valid}") from None
126
+
127
+
128
def detect_color_support(force_color: bool | None = None) -> bool:
    """Determine whether the current environment supports color output.

    Resolution order:
        1. ``force_color`` parameter (explicit override).
        2. ``NO_COLOR`` environment variable -- if present and not an empty
           string (regardless of its value), colors are disabled per
           https://no-color.org/.
        3. ``MCPBR_THEME`` environment variable -- if set to ``"plain"``
           (case-insensitive, surrounding whitespace ignored), colors are
           disabled.
        4. Terminal detection -- colors are enabled when stdout is a TTY.

    Args:
        force_color: Explicit override. ``True`` forces colors on, ``False``
            forces them off, ``None`` uses auto-detection.

    Returns:
        ``True`` if color output should be used, ``False`` otherwise.
    """
    if force_color is not None:
        return force_color

    # NO_COLOR convention: only a present, non-empty value disables color.
    # An empty value (``NO_COLOR=``) is treated the same as unset.
    if os.environ.get("NO_COLOR"):
        return False

    # MCPBR_THEME=plain disables color.
    if os.environ.get("MCPBR_THEME", "").strip().lower() == "plain":
        return False

    # Auto-detect: color only when stdout is a TTY. stdout may be missing
    # (None) or replaced by an object without ``isatty``.
    isatty = getattr(sys.stdout, "isatty", None)
    if not callable(isatty):
        return False
    try:
        return bool(isatty())
    except ValueError:
        # Raised by file objects whose underlying stream is already closed.
        return False
160
+
161
+
162
class OutputFormatter:
    """Formatted output for CLI messages.

    Provides methods to print and format success, error, warning, info, and
    header messages using the style strings of the active theme. Also
    supports table and progress bar rendering.

    The ``success``/``error``/... methods write straight to the console; the
    ``format_*`` counterparts return a string instead so callers can embed
    it in larger Rich renderables.

    Args:
        theme: The theme to use for formatting. Defaults to ``Theme.DEFAULT``.
        force_color: Explicit color override. ``True`` forces colors on,
            ``False`` forces them off, ``None`` uses auto-detection (see
            ``detect_color_support``).
        console: Optional Rich Console instance. If not provided, one is
            created based on the color support settings.
    """

    def __init__(
        self,
        theme: Theme = Theme.DEFAULT,
        force_color: bool | None = None,
        console: Console | None = None,
    ) -> None:
        self._theme = theme
        self._config = THEME_CONFIGS[theme]
        self._color_enabled = detect_color_support(force_color)

        if console is not None:
            # A caller-supplied console is used untouched.
            self._console = console
        else:
            # When color is disabled, ask Rich not to emit color at all.
            self._console = Console(no_color=not self._color_enabled)

    @property
    def theme(self) -> Theme:
        """The active theme."""
        return self._theme

    @property
    def config(self) -> ThemeConfig:
        """The active theme configuration."""
        return self._config

    @property
    def color_enabled(self) -> bool:
        """Whether color output is enabled."""
        return self._color_enabled

    @property
    def console(self) -> Console:
        """The underlying Rich console."""
        return self._console

    # ------------------------------------------------------------------
    # Print methods (write directly to console)
    # ------------------------------------------------------------------

    def success(self, message: str) -> None:
        """Print a success message prefixed with ``[ok]``.

        Args:
            message: The message text.
        """
        self._print_styled(message, self._config.success_style, prefix="[ok]")

    def error(self, message: str) -> None:
        """Print an error message prefixed with ``[error]``.

        Args:
            message: The message text.
        """
        self._print_styled(message, self._config.error_style, prefix="[error]")

    def warning(self, message: str) -> None:
        """Print a warning message prefixed with ``[warn]``.

        Args:
            message: The message text.
        """
        self._print_styled(message, self._config.warning_style, prefix="[warn]")

    def info(self, message: str) -> None:
        """Print an informational message prefixed with ``[info]``.

        Args:
            message: The message text.
        """
        self._print_styled(message, self._config.info_style, prefix="[info]")

    def header(self, message: str) -> None:
        """Print a section header (no prefix).

        Args:
            message: The header text.
        """
        self._print_styled(message, self._config.header_style)

    # ------------------------------------------------------------------
    # Format methods (return styled strings without printing)
    # ------------------------------------------------------------------

    def format_success(self, message: str) -> str:
        """Return a Rich-markup formatted success string.

        Args:
            message: The message text.

        Returns:
            Formatted string with Rich markup tags, or plain text when
            colors are disabled.
        """
        return self._format_styled(message, self._config.success_style, prefix="[ok]")

    def format_error(self, message: str) -> str:
        """Return a Rich-markup formatted error string.

        Args:
            message: The message text.

        Returns:
            Formatted string with Rich markup tags, or plain text when
            colors are disabled.
        """
        return self._format_styled(message, self._config.error_style, prefix="[error]")

    def format_warning(self, message: str) -> str:
        """Return a Rich-markup formatted warning string.

        Args:
            message: The message text.

        Returns:
            Formatted string with Rich markup tags, or plain text when
            colors are disabled.
        """
        return self._format_styled(message, self._config.warning_style, prefix="[warn]")

    def format_info(self, message: str) -> str:
        """Return a Rich-markup formatted info string.

        Args:
            message: The message text.

        Returns:
            Formatted string with Rich markup tags, or plain text when
            colors are disabled.
        """
        return self._format_styled(message, self._config.info_style, prefix="[info]")

    def format_header(self, message: str) -> str:
        """Return a Rich-markup formatted header string.

        Args:
            message: The message text.

        Returns:
            Formatted string with Rich markup tags, or plain text when
            colors are disabled.
        """
        return self._format_styled(message, self._config.header_style)

    # ------------------------------------------------------------------
    # Table rendering
    # ------------------------------------------------------------------

    def table(
        self,
        title: str,
        columns: list[str],
        rows: list[list[Any]],
    ) -> None:
        """Print a formatted Rich table.

        Args:
            title: Table title displayed above the table.
            columns: List of column header names.
            rows: List of rows, where each row is a list of cell values.
                Values are converted to strings automatically.
        """
        tbl = Table(title=title, show_header=True, header_style=self._config.header_style)
        for col in columns:
            tbl.add_column(col)
        for row in rows:
            tbl.add_row(*(str(cell) for cell in row))
        self._console.print(tbl)

    # ------------------------------------------------------------------
    # Progress bar
    # ------------------------------------------------------------------

    def progress_bar(self) -> Progress:
        """Return a configured Rich Progress instance.

        Returns:
            A ``rich.progress.Progress`` object with spinner, description,
            bar, completion count, elapsed time, and remaining time columns,
            bound to this formatter's console.
        """
        return Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            TimeRemainingColumn(),
            console=self._console,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _print_styled(self, message: str, style: str, prefix: str = "") -> None:
        """Print a message with a Rich style and optional prefix.

        Uses ``rich.text.Text`` objects throughout to prevent Rich from
        interpreting bracket-style prefixes (e.g. ``[ok]``) as markup tags.

        Args:
            message: The message text.
            style: Rich style string (e.g., ``"bold green"``).
            prefix: Optional prefix tag like ``"[ok]"`` or ``"[error]"``.
        """
        text = Text()
        if not self._color_enabled or not style:
            # Unstyled path: same content, no style attached.
            if prefix:
                text.append(f"{prefix} ")
            text.append(message)
        else:
            if prefix:
                text.append(f"{prefix} ", style=style)
            text.append(message, style=style)
        self._console.print(text)

    def _format_styled(self, message: str, style: str, prefix: str = "") -> str:
        """Return a message formatted with Rich markup.

        When colors are disabled or the style is empty, returns plain text
        (``"<prefix> <message>"`` or just the message).

        Otherwise the text is wrapped in ``[style]...[/style]`` markup. The
        bracketed prefix (e.g. ``[ok]``) is escaped first; left raw, Rich
        would parse it as a markup tag of its own and the prefix would not
        be rendered. ``message`` is passed through unchanged, so markup
        supplied by the caller is still honored.

        Args:
            message: The message text.
            style: Rich style string.
            prefix: Optional prefix tag.

        Returns:
            Formatted string.
        """
        if not self._color_enabled or not style:
            return f"{prefix} {message}" if prefix else message

        if prefix:
            # Imported here so the module's import block stays untouched.
            from rich.markup import escape

            return f"[{style}]{escape(prefix)} {message}[/{style}]"
        return f"[{style}]{message}[/{style}]"
412
+
413
+
414
def get_formatter(
    theme: str | None = None,
    no_color: bool = False,
    console: Console | None = None,
) -> OutputFormatter:
    """Build an ``OutputFormatter`` from user-facing settings.

    This is the primary entry point for obtaining a formatter. The theme is
    taken from ``theme``, then the ``MCPBR_THEME`` environment variable, then
    the default theme. Color handling honors both the ``no_color`` argument
    and the ``NO_COLOR`` environment variable.

    Args:
        theme: Theme name (``"default"``, ``"minimal"``, or ``"plain"``).
            Falls back to the ``MCPBR_THEME`` environment variable, then
            ``"default"``.
        no_color: If ``True``, forces color off regardless of other settings.
            ``False`` leaves the decision to auto-detection.
        console: Optional Rich Console instance to use.

    Returns:
        A configured ``OutputFormatter`` instance.

    Raises:
        ValueError: If the theme name is not recognized.
    """
    # ``no_color`` can only switch color off; otherwise pass ``None`` so the
    # formatter auto-detects.
    return OutputFormatter(
        theme=_resolve_theme(theme),
        force_color=False if no_color else None,
        console=console,
    )
mcpbr/gpu_support.py ADDED
@@ -0,0 +1,157 @@
1
+ """GPU support for Docker containers used in ML benchmark evaluations.
2
+
3
+ Provides detection of available GPUs (NVIDIA), Docker GPU runtime checks,
4
+ and Docker container configuration for GPU access.
5
+ """
6
+
7
+ import logging
8
+ import subprocess
9
+
10
+ import docker
11
+ import docker.types
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def _parse_nvidia_smi_csv(output: str) -> tuple[list[str], str]:
    """Parse ``name, driver_version`` CSV rows printed by nvidia-smi.

    Args:
        output: Raw stdout of the nvidia-smi query, one GPU per line.

    Returns:
        Tuple of (GPU names in output order, driver version). The driver
        version is the last non-empty one seen, or an empty string.
    """
    gpu_names: list[str] = []
    driver_version = ""
    for raw_line in output.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank rows do not describe a GPU.
            continue
        # Split on the LAST comma: the driver version never contains one,
        # so a comma inside the GPU name stays part of the name.
        name, sep, driver = line.rpartition(",")
        if not sep:
            # Row without a driver column: the whole row is the name.
            gpu_names.append(line)
            continue
        gpu_names.append(name.strip())
        if driver.strip():
            driver_version = driver.strip()
    return gpu_names, driver_version


def detect_gpus() -> dict:
    """Detect available GPUs on the host system.

    Checks for NVIDIA GPUs via ``nvidia-smi`` and verifies the Docker GPU
    runtime is available. Detection is best-effort: any failure while
    running ``nvidia-smi`` is logged and reported as "no GPUs" rather than
    raised.

    Returns:
        Dictionary with GPU detection results:
            - nvidia_available (bool): Whether NVIDIA GPUs were detected.
            - gpu_count (int): Number of GPUs found.
            - gpu_names (list[str]): Names of detected GPUs.
            - driver_version (str): NVIDIA driver version, or empty string.
            - docker_runtime_available (bool): Whether Docker NVIDIA runtime
              is available.
    """
    info: dict = {
        "nvidia_available": False,
        "gpu_count": 0,
        "gpu_names": [],
        "driver_version": "",
        "docker_runtime_available": False,
    }

    # Detect NVIDIA GPUs via nvidia-smi
    try:
        result = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=name,driver_version",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            logger.debug("nvidia-smi exited with status %s.", result.returncode)
        elif result.stdout.strip():
            gpu_names, driver_version = _parse_nvidia_smi_csv(result.stdout)
            info["nvidia_available"] = True
            info["gpu_count"] = len(gpu_names)
            info["gpu_names"] = gpu_names
            info["driver_version"] = driver_version
    except FileNotFoundError:
        logger.debug("nvidia-smi not found; no NVIDIA GPUs detected.")
    except subprocess.TimeoutExpired:
        logger.warning("nvidia-smi timed out while detecting GPUs.")
    except Exception as e:
        # Deliberately broad: GPU detection must never break the caller.
        logger.debug("GPU detection failed: %s", e)

    # Check Docker GPU runtime availability
    info["docker_runtime_available"] = check_gpu_runtime()

    return info
77
+
78
+
79
def get_docker_gpu_config(gpu_enabled: bool) -> dict:
    """Build the Docker container kwargs needed for GPU access.

    The returned dict is meant to be merged into the keyword arguments of
    ``docker.containers.run()`` or ``docker.containers.create()``.

    Args:
        gpu_enabled: Whether to enable GPU access in the container.

    Returns:
        An empty dict when ``gpu_enabled`` is False. Otherwise a dict whose
        ``device_requests`` entry holds a single ``DeviceRequest`` granting
        access to all available GPUs.
    """
    if gpu_enabled:
        # count=-1 requests every GPU rather than a fixed number.
        all_gpus = docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
        return {"device_requests": [all_gpus]}
    return {}
105
+
106
+
107
def check_gpu_runtime() -> bool:
    """Check if Docker has the NVIDIA runtime available.

    Queries the Docker daemon info for registered runtimes and checks
    whether the ``nvidia`` runtime is among them. The Docker client created
    for the query is always closed again, so repeated calls do not
    accumulate open connections.

    Returns:
        True if the NVIDIA Docker runtime is available, False otherwise
        (including when the Docker daemon cannot be queried).
    """
    client = None
    try:
        client = docker.from_env()
        # ``or {}`` also covers a daemon reporting ``"Runtimes": null``.
        runtimes = client.info().get("Runtimes") or {}
        return "nvidia" in runtimes
    except docker.errors.DockerException as e:
        logger.debug("Could not query Docker for GPU runtime: %s", e)
        return False
    except Exception as e:
        logger.debug("Unexpected error checking Docker GPU runtime: %s", e)
        return False
    finally:
        if client is not None:
            try:
                client.close()
            except Exception as e:
                # A failing close must not mask the result computed above.
                logger.debug("Failed to close Docker client: %s", e)
127
+
128
+
129
def format_gpu_info(info: dict) -> str:
    """Render GPU detection results as human-readable text.

    Args:
        info: Dictionary returned by ``detect_gpus()``. Missing keys are
            treated as "nothing detected".

    Returns:
        Multi-line string: either a "no GPUs" line or the GPU count, one
        line per GPU and the driver version (when known), always followed
        by the Docker NVIDIA runtime status.
    """
    if info.get("nvidia_available"):
        report = [f"NVIDIA GPUs detected: {info.get('gpu_count', 0)}"]
        report.extend(
            f" GPU {idx}: {gpu}" for idx, gpu in enumerate(info.get("gpu_names", []))
        )
        version = info.get("driver_version", "")
        if version:
            report.append(f"Driver version: {version}")
    else:
        report = ["No NVIDIA GPUs detected."]

    # The runtime status is reported regardless of whether GPUs were found.
    status = "available" if info.get("docker_runtime_available", False) else "not available"
    report.append(f"Docker NVIDIA runtime: {status}")

    return "\n".join(report)
mcpbr/harness.py CHANGED
@@ -418,6 +418,7 @@ async def _run_mcp_evaluation(
418
418
 
419
419
  start_time = time.time()
420
420
  env: TaskEnvironment | None = None
421
+ agent_result: AgentResult | None = None
421
422
  try:
422
423
  # Track Docker environment creation time
423
424
  docker_start = time.time()
@@ -480,10 +481,15 @@ async def _run_mcp_evaluation(
480
481
  return result
481
482
 
482
483
  except asyncio.TimeoutError:
483
- # Note: The agent harness should have captured partial statistics in the AgentResult
484
- # before raising TimeoutError, but this is a fallback for unexpected timeout locations
485
484
  end_time = time.time()
486
485
  runtime_seconds = end_time - start_time
486
+ # Preserve agent metrics if the agent completed before the timeout
487
+ # (timeout may have occurred during evaluation, not during agent solve)
488
+ if agent_result is not None:
489
+ result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
490
+ result["status"] = "timeout"
491
+ result["error"] = "Evaluation timed out after agent completed"
492
+ return result
487
493
  cost = calculate_cost(config.model, 0, 0)
488
494
  return {
489
495
  "resolved": False,
@@ -499,6 +505,11 @@ async def _run_mcp_evaluation(
499
505
  except Exception as e:
500
506
  end_time = time.time()
501
507
  runtime_seconds = end_time - start_time
508
+ # Preserve agent metrics if the agent completed before the error
509
+ if agent_result is not None:
510
+ result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
511
+ result["error"] = str(e)
512
+ return result
502
513
  cost = calculate_cost(config.model, 0, 0)
503
514
  return {
504
515
  "resolved": False,
@@ -562,6 +573,7 @@ async def _run_baseline_evaluation(
562
573
 
563
574
  start_time = time.time()
564
575
  env: TaskEnvironment | None = None
576
+ agent_result: AgentResult | None = None
565
577
  try:
566
578
  # Track Docker environment creation time
567
579
  docker_start = time.time()
@@ -622,10 +634,15 @@ async def _run_baseline_evaluation(
622
634
  return result
623
635
 
624
636
  except asyncio.TimeoutError:
625
- # Note: The agent harness should have captured partial statistics in the AgentResult
626
- # before raising TimeoutError, but this is a fallback for unexpected timeout locations
627
637
  end_time = time.time()
628
638
  runtime_seconds = end_time - start_time
639
+ # Preserve agent metrics if the agent completed before the timeout
640
+ # (timeout may have occurred during evaluation, not during agent solve)
641
+ if agent_result is not None:
642
+ result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
643
+ result["status"] = "timeout"
644
+ result["error"] = "Evaluation timed out after agent completed"
645
+ return result
629
646
  cost = calculate_cost(config.model, 0, 0)
630
647
  return {
631
648
  "resolved": False,
@@ -641,6 +658,11 @@ async def _run_baseline_evaluation(
641
658
  except Exception as e:
642
659
  end_time = time.time()
643
660
  runtime_seconds = end_time - start_time
661
+ # Preserve agent metrics if the agent completed before the error
662
+ if agent_result is not None:
663
+ result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
664
+ result["error"] = str(e)
665
+ return result
644
666
  cost = calculate_cost(config.model, 0, 0)
645
667
  return {
646
668
  "resolved": False,
@@ -1182,6 +1204,18 @@ async def run_evaluation(
1182
1204
  progress.stop()
1183
1205
  finally:
1184
1206
  await docker_manager.cleanup_all()
1207
+ # Force-shutdown the default executor to prevent asyncio.run() from
1208
+ # hanging during cleanup. Docker SDK background threads (urllib3
1209
+ # connection pool) may linger after client.close(), causing
1210
+ # executor.shutdown(wait=True) to block indefinitely.
1211
+ try:
1212
+ loop = asyncio.get_running_loop()
1213
+ executor = getattr(loop, "_default_executor", None)
1214
+ if executor is not None:
1215
+ executor.shutdown(wait=False, cancel_futures=True)
1216
+ loop._default_executor = None
1217
+ except RuntimeError as exc:
1218
+ console.print(f"[yellow]Default executor shutdown skipped: {exc}[/yellow]")
1185
1219
 
1186
1220
  # Check if we're in comparison mode
1187
1221
  if config.comparison_mode: