mcpbr 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/__init__.py CHANGED
@@ -3,4 +3,4 @@
3
3
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
4
4
  """
5
5
 
6
- __version__ = "0.5.3"
6
+ __version__ = "0.5.4"
mcpbr/evaluation.py CHANGED
@@ -93,43 +93,52 @@ async def apply_patch(
93
93
 
94
94
  workdir = workdir or env.workdir
95
95
 
96
- # Reset repository to clean state before applying patch
97
- # The agent modified files directly, so we need to restore HEAD state
98
- await env.exec_command("git reset --hard HEAD", timeout=30, workdir=workdir)
99
- await env.exec_command("git clean -fd", timeout=30, workdir=workdir)
100
-
101
- await env.write_file("fix.patch", patch, workdir=workdir)
96
+ # Use longer timeouts for git operations under concurrent load,
97
+ # Docker exec can be slow and 30s is insufficient (#399).
98
+ try:
99
+ # Reset repository to clean state before applying patch
100
+ # The agent modified files directly, so we need to restore HEAD state
101
+ await env.exec_command("git reset --hard HEAD", timeout=120, workdir=workdir)
102
+ await env.exec_command("git clean -fd", timeout=120, workdir=workdir)
102
103
 
103
- exit_code, stdout, stderr = await env.exec_command(
104
- "git apply --check fix.patch",
105
- timeout=30,
106
- workdir=workdir,
107
- )
104
+ await env.write_file("fix.patch", patch, workdir=workdir)
108
105
 
109
- if exit_code != 0:
110
- exit_code2, stdout2, stderr2 = await env.exec_command(
111
- "git apply --check -3 fix.patch",
112
- timeout=30,
113
- workdir=workdir,
114
- )
115
- if exit_code2 != 0:
116
- return False, f"Patch does not apply: {stderr or stderr2}"
117
106
  exit_code, stdout, stderr = await env.exec_command(
118
- "git apply -3 fix.patch",
119
- timeout=30,
120
- workdir=workdir,
121
- )
122
- else:
123
- exit_code, stdout, stderr = await env.exec_command(
124
- "git apply fix.patch",
125
- timeout=30,
107
+ "git apply --check fix.patch",
108
+ timeout=120,
126
109
  workdir=workdir,
127
110
  )
128
111
 
129
- if exit_code != 0:
130
- return False, f"Failed to apply patch: {stderr}"
112
+ if exit_code != 0:
113
+ exit_code2, stdout2, stderr2 = await env.exec_command(
114
+ "git apply --check -3 fix.patch",
115
+ timeout=120,
116
+ workdir=workdir,
117
+ )
118
+ if exit_code2 != 0:
119
+ return False, f"Patch does not apply: {stderr or stderr2}"
120
+ exit_code, stdout, stderr = await env.exec_command(
121
+ "git apply -3 fix.patch",
122
+ timeout=120,
123
+ workdir=workdir,
124
+ )
125
+ else:
126
+ exit_code, stdout, stderr = await env.exec_command(
127
+ "git apply fix.patch",
128
+ timeout=120,
129
+ workdir=workdir,
130
+ )
131
131
 
132
- return True, ""
132
+ if exit_code != 0:
133
+ return False, f"Failed to apply patch: {stderr}"
134
+
135
+ return True, ""
136
+
137
+ except (TimeoutError, asyncio.TimeoutError):
138
+ # Catch exec_command timeouts here so they don't bubble up as
139
+ # asyncio.TimeoutError to the harness, which would misclassify
140
+ # this as an agent/eval timeout (#399).
141
+ return False, "Docker exec timed out during patch application"
133
142
 
134
143
 
135
144
  async def run_tests(
@@ -282,38 +291,43 @@ async def _apply_test_patch(
282
291
 
283
292
  workdir = workdir or env.workdir
284
293
 
285
- await env.write_file("test.patch", test_patch, workdir=workdir)
286
-
287
- exit_code, stdout, stderr = await env.exec_command(
288
- "git apply --check test.patch",
289
- timeout=30,
290
- workdir=workdir,
291
- )
294
+ try:
295
+ await env.write_file("test.patch", test_patch, workdir=workdir)
292
296
 
293
- if exit_code != 0:
294
297
  exit_code, stdout, stderr = await env.exec_command(
295
- "git apply --check -3 test.patch",
296
- timeout=30,
298
+ "git apply --check test.patch",
299
+ timeout=120,
297
300
  workdir=workdir,
298
301
  )
302
+
303
+ if exit_code != 0:
304
+ exit_code, stdout, stderr = await env.exec_command(
305
+ "git apply --check -3 test.patch",
306
+ timeout=120,
307
+ workdir=workdir,
308
+ )
309
+ if exit_code != 0:
310
+ return True, ""
311
+ exit_code, stdout, stderr = await env.exec_command(
312
+ "git apply -3 test.patch",
313
+ timeout=120,
314
+ workdir=workdir,
315
+ )
316
+ else:
317
+ exit_code, stdout, stderr = await env.exec_command(
318
+ "git apply test.patch",
319
+ timeout=120,
320
+ workdir=workdir,
321
+ )
322
+
299
323
  if exit_code != 0:
300
324
  return True, ""
301
- exit_code, stdout, stderr = await env.exec_command(
302
- "git apply -3 test.patch",
303
- timeout=30,
304
- workdir=workdir,
305
- )
306
- else:
307
- exit_code, stdout, stderr = await env.exec_command(
308
- "git apply test.patch",
309
- timeout=30,
310
- workdir=workdir,
311
- )
312
325
 
313
- if exit_code != 0:
314
326
  return True, ""
315
327
 
316
- return True, ""
328
+ except (TimeoutError, asyncio.TimeoutError):
329
+ # Don't let exec timeouts bubble up to the harness (#399)
330
+ return True, ""
317
331
 
318
332
 
319
333
  async def evaluate_patch(
@@ -356,7 +370,14 @@ async def evaluate_patch(
356
370
 
357
371
  # Skip dependency installation for pre-built images (already done)
358
372
  if not env.uses_prebuilt:
359
- await _install_dependencies(env)
373
+ try:
374
+ await _install_dependencies(env)
375
+ except (TimeoutError, asyncio.TimeoutError):
376
+ return EvaluationResult(
377
+ resolved=False,
378
+ patch_applied=True,
379
+ error="Docker exec timed out during dependency installation",
380
+ )
360
381
 
361
382
  repo = task.get("repo")
362
383
 
mcpbr/harness.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Main evaluation harness orchestrating parallel task execution."""
2
2
 
3
3
  import asyncio
4
+ import logging
4
5
  import time
5
6
  from dataclasses import dataclass
6
7
  from datetime import datetime, timezone
@@ -29,6 +30,7 @@ from .pricing import calculate_cost
29
30
  from .profiler import PerformanceProfiler
30
31
 
31
32
  console = Console()
33
+ logger = logging.getLogger(__name__)
32
34
 
33
35
 
34
36
  class SimpleNamespace:
@@ -56,6 +58,57 @@ def dict_to_namespace(data: Any) -> Any:
56
58
  return data
57
59
 
58
60
 
61
+ # -- Cold-start mitigation helpers (#401) ------------------------------------
62
+
63
+ # Seconds between each task launch in the first concurrent batch.
64
+ _STAGGER_INTERVAL = 1.0
65
+
66
+
67
+ def _stagger_delay(task_index: int, max_concurrent: int) -> float:
68
+ """Return the startup delay for a task to avoid cold-start contention.
69
+
70
+ Only the first batch (indices 0 .. max_concurrent-1) is staggered.
71
+ The very first task starts immediately; subsequent tasks in the batch
72
+ get an increasing delay so Docker image pulls and container creation
73
+ don't all hit at once.
74
+
75
+ Args:
76
+ task_index: Zero-based index of the task in launch order.
77
+ max_concurrent: Semaphore size / max parallelism.
78
+
79
+ Returns:
80
+ Delay in seconds (0.0 means start immediately).
81
+ """
82
+ if max_concurrent <= 1:
83
+ return 0.0
84
+ # Only stagger the first batch
85
+ if task_index >= max_concurrent:
86
+ return 0.0
87
+ return task_index * _STAGGER_INTERVAL
88
+
89
+
90
+ def _should_retry_zero_iteration(result: dict[str, Any]) -> bool:
91
+ """Check whether a task result indicates a cold-start failure worth retrying.
92
+
93
+ A cold-start failure is characterised by zero iterations AND zero tokens
94
+ AND a timeout status — the agent never actually ran.
95
+
96
+ Args:
97
+ result: Single-run result dict from _run_mcp_evaluation or _run_baseline_evaluation.
98
+
99
+ Returns:
100
+ True if the result looks like a cold-start failure.
101
+ """
102
+ if result.get("status") != "timeout":
103
+ return False
104
+ if result.get("iterations", -1) != 0:
105
+ return False
106
+ tokens = result.get("tokens", {})
107
+ if tokens.get("input", -1) != 0 or tokens.get("output", -1) != 0:
108
+ return False
109
+ return True
110
+
111
+
59
112
  @dataclass
60
113
  class TaskResult:
61
114
  """Result for a single task."""
@@ -302,6 +355,24 @@ async def run_single_task(
302
355
  mcp_server_config=config.mcp_server_a,
303
356
  server_name="server_a",
304
357
  )
358
+ # Retry once on cold-start failure (#401)
359
+ if result.mcp_server_a and _should_retry_zero_iteration(result.mcp_server_a):
360
+ logger.info(
361
+ "Retrying MCP server_a task %s (zero-iteration cold-start)", instance_id
362
+ )
363
+ result.mcp_server_a = await _run_mcp_evaluation(
364
+ task,
365
+ config,
366
+ docker_manager,
367
+ benchmark,
368
+ verbose,
369
+ verbosity,
370
+ mcp_log_writer_a if mcp_log_writer_a else log_file,
371
+ cache,
372
+ mcp_logs_dir,
373
+ mcp_server_config=config.mcp_server_a,
374
+ server_name="server_a",
375
+ )
305
376
  finally:
306
377
  if mcp_log_writer_a:
307
378
  mcp_log_writer_a.close()
@@ -324,6 +395,24 @@ async def run_single_task(
324
395
  mcp_server_config=config.mcp_server_b,
325
396
  server_name="server_b",
326
397
  )
398
+ # Retry once on cold-start failure (#401)
399
+ if result.mcp_server_b and _should_retry_zero_iteration(result.mcp_server_b):
400
+ logger.info(
401
+ "Retrying MCP server_b task %s (zero-iteration cold-start)", instance_id
402
+ )
403
+ result.mcp_server_b = await _run_mcp_evaluation(
404
+ task,
405
+ config,
406
+ docker_manager,
407
+ benchmark,
408
+ verbose,
409
+ verbosity,
410
+ mcp_log_writer_b if mcp_log_writer_b else log_file,
411
+ cache,
412
+ mcp_logs_dir,
413
+ mcp_server_config=config.mcp_server_b,
414
+ server_name="server_b",
415
+ )
327
416
  finally:
328
417
  if mcp_log_writer_b:
329
418
  mcp_log_writer_b.close()
@@ -344,6 +433,20 @@ async def run_single_task(
344
433
  cache,
345
434
  mcp_logs_dir,
346
435
  )
436
+ # Retry once on cold-start failure (#401)
437
+ if result.mcp and _should_retry_zero_iteration(result.mcp):
438
+ logger.info("Retrying MCP task %s (zero-iteration cold-start)", instance_id)
439
+ result.mcp = await _run_mcp_evaluation(
440
+ task,
441
+ config,
442
+ docker_manager,
443
+ benchmark,
444
+ verbose,
445
+ verbosity,
446
+ mcp_log_writer if mcp_log_writer else log_file,
447
+ cache,
448
+ mcp_logs_dir,
449
+ )
347
450
  finally:
348
451
  if mcp_log_writer:
349
452
  mcp_log_writer.close()
@@ -363,6 +466,19 @@ async def run_single_task(
363
466
  baseline_log_writer if baseline_log_writer else log_file,
364
467
  cache,
365
468
  )
469
+ # Retry once on cold-start failure (#401)
470
+ if result.baseline and _should_retry_zero_iteration(result.baseline):
471
+ logger.info("Retrying baseline task %s (zero-iteration cold-start)", instance_id)
472
+ result.baseline = await _run_baseline_evaluation(
473
+ task,
474
+ config,
475
+ docker_manager,
476
+ benchmark,
477
+ verbose,
478
+ verbosity,
479
+ baseline_log_writer if baseline_log_writer else log_file,
480
+ cache,
481
+ )
366
482
  finally:
367
483
  if baseline_log_writer:
368
484
  baseline_log_writer.close()
@@ -539,7 +655,15 @@ async def _run_mcp_evaluation(
539
655
  if env:
540
656
  # Track Docker teardown time
541
657
  teardown_start = time.time()
542
- await env.cleanup()
658
+ try:
659
+ await asyncio.wait_for(env.cleanup(), timeout=60)
660
+ except (asyncio.TimeoutError, Exception) as cleanup_err:
661
+ logger.warning("Container cleanup failed for MCP task: %s", cleanup_err)
662
+ try:
663
+ if hasattr(env, "container") and env.container:
664
+ env.container.remove(force=True)
665
+ except Exception:
666
+ pass
543
667
  if profiler:
544
668
  teardown_end = time.time()
545
669
  profiler.record_docker_teardown(teardown_end - teardown_start)
@@ -695,7 +819,15 @@ async def _run_baseline_evaluation(
695
819
  if env:
696
820
  # Track Docker teardown time
697
821
  teardown_start = time.time()
698
- await env.cleanup()
822
+ try:
823
+ await asyncio.wait_for(env.cleanup(), timeout=60)
824
+ except (asyncio.TimeoutError, Exception) as cleanup_err:
825
+ logger.warning("Container cleanup failed for baseline task: %s", cleanup_err)
826
+ try:
827
+ if hasattr(env, "container") and env.container:
828
+ env.container.remove(force=True)
829
+ except Exception:
830
+ pass
699
831
  if profiler:
700
832
  teardown_end = time.time()
701
833
  profiler.record_docker_teardown(teardown_end - teardown_start)
@@ -1013,9 +1145,10 @@ async def run_evaluation(
1013
1145
  semaphore = asyncio.Semaphore(config.max_concurrent)
1014
1146
  budget_exceeded = False
1015
1147
  current_cost = 0.0
1148
+ _task_launch_counter = 0
1016
1149
 
1017
1150
  async def run_with_semaphore(task: dict[str, Any]) -> TaskResult | None:
1018
- nonlocal current_cost, budget_exceeded
1151
+ nonlocal current_cost, budget_exceeded, _task_launch_counter
1019
1152
 
1020
1153
  # Check budget before running task
1021
1154
  if config.budget and current_cost >= config.budget:
@@ -1023,6 +1156,15 @@ async def run_evaluation(
1023
1156
  return None
1024
1157
 
1025
1158
  async with semaphore:
1159
+ # Stagger first-batch launches to avoid cold-start contention (#401).
1160
+ # Delay is inside the semaphore so the sleeping task holds its slot
1161
+ # and later tasks cannot leapfrog ahead of the first batch.
1162
+ my_index = _task_launch_counter
1163
+ _task_launch_counter += 1
1164
+ delay = _stagger_delay(my_index, config.max_concurrent)
1165
+ if delay > 0:
1166
+ await asyncio.sleep(delay)
1167
+
1026
1168
  result = await run_single_task(
1027
1169
  task,
1028
1170
  config,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.5.3
3
+ Version: 0.5.4
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -1,4 +1,4 @@
1
- mcpbr/__init__.py,sha256=vWZ1BylD_FMXNCo_5ZWJq6kIJCv49VUU5O4I0Orpko8,151
1
+ mcpbr/__init__.py,sha256=BvGCejBsCXoZ7xBuf4RkvHWegXxymyexsnWe_h-GjwI,151
2
2
  mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
3
3
  mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
4
4
  mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
@@ -17,12 +17,12 @@ mcpbr/docker_env.py,sha256=dRhQamlEq05h4wOjZN76c0GIYR6FRx9aGB_Jrkmssss,33676
17
17
  mcpbr/docker_prewarm.py,sha256=GVRD2B10HA7OpWq_CC7CkNkJ1OUjAU7GzKOpJ5VFrXk,12638
18
18
  mcpbr/dry_run.py,sha256=w_1L5K4Bk3SzeXfZY2NDbXims_Qh6711wIGm6p3tr84,18218
19
19
  mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
20
- mcpbr/evaluation.py,sha256=NK_lId2fbmKZiAyalonhCuLY-pGSGy4tPYN-i84sx8Q,12804
20
+ mcpbr/evaluation.py,sha256=UbECTCxbUh0dLGmcYWVQdZjwtyyVe3lNMJgbCBa1858,13923
21
21
  mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
22
22
  mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
23
23
  mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
24
24
  mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
25
- mcpbr/harness.py,sha256=Ehq-Yxsvi9lWBHEqdhKx1S6LB4vbDttxHB-REcWBoNo,53935
25
+ mcpbr/harness.py,sha256=LO5viFF5uSfbYriCnIfww598ashHyN_sDT-D0ELN3dY,59999
26
26
  mcpbr/harnesses.py,sha256=iaGlRIXdvIqCrYQtXNRZT9HowgmPDVssT2_Qlj2eCkI,48294
27
27
  mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
28
28
  mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
@@ -92,15 +92,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
92
92
  mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
93
93
  mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
94
94
  mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
95
- mcpbr-0.5.3.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
96
- mcpbr-0.5.3.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
97
- mcpbr-0.5.3.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
98
- mcpbr-0.5.3.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
99
- mcpbr-0.5.3.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
100
- mcpbr-0.5.3.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
101
- mcpbr-0.5.3.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
102
- mcpbr-0.5.3.dist-info/METADATA,sha256=YJ05sM1v6ApgK9HD6wMn5mIUF96cKUbU4C8nJwwPSgQ,55068
103
- mcpbr-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
104
- mcpbr-0.5.3.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
105
- mcpbr-0.5.3.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
106
- mcpbr-0.5.3.dist-info/RECORD,,
95
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
96
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
97
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
98
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
99
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
100
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
101
+ mcpbr-0.5.4.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
102
+ mcpbr-0.5.4.dist-info/METADATA,sha256=3bZ7iyaLkIRs3-e6EpOVwfMkvSVStjW4GDKNYKJ9xfM,55068
103
+ mcpbr-0.5.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
104
+ mcpbr-0.5.4.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
105
+ mcpbr-0.5.4.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
106
+ mcpbr-0.5.4.dist-info/RECORD,,
File without changes