mcpbr 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/config.py CHANGED
@@ -339,6 +339,12 @@ class HarnessConfig(BaseModel):
339
339
  description="Timeout for each task in seconds",
340
340
  )
341
341
 
342
+ eval_timeout_seconds: int = Field(
343
+ default=600,
344
+ description="Timeout for post-agent evaluation (applying patches and running tests) "
345
+ "in seconds. Set higher for benchmarks with slow test suites (e.g., Django).",
346
+ )
347
+
342
348
  max_concurrent: int = Field(
343
349
  default=4,
344
350
  description="Maximum concurrent task evaluations",
mcpbr/docker_env.py CHANGED
@@ -139,6 +139,7 @@ class TaskEnvironment:
139
139
  timeout: int = 60,
140
140
  workdir: str | None = None,
141
141
  environment: dict[str, str] | None = None,
142
+ user: str | None = None,
142
143
  ) -> tuple[int, str, str]:
143
144
  """Execute a command in the container.
144
145
 
@@ -147,6 +148,7 @@ class TaskEnvironment:
147
148
  timeout: Timeout in seconds.
148
149
  workdir: Working directory (defaults to /workspace).
149
150
  environment: Optional environment variables to set.
151
+ user: Optional user to run the command as (e.g., "mcpbr").
150
152
 
151
153
  Returns:
152
154
  Tuple of (exit_code, stdout, stderr).
@@ -164,6 +166,7 @@ class TaskEnvironment:
164
166
  workdir=wd,
165
167
  demux=True,
166
168
  environment=environment,
169
+ user=user or "",
167
170
  )
168
171
  stdout = result.output[0].decode("utf-8") if result.output[0] else ""
169
172
  stderr = result.output[1].decode("utf-8") if result.output[1] else ""
@@ -438,17 +441,17 @@ CMD ["/bin/bash"]
438
441
  rm=True,
439
442
  )
440
443
  except Exception as e:
441
- # Last resort: just tag the base image
442
- logger.warning(
443
- f"Failed to build comprehensive fallback image: {e}. "
444
- f"Falling back to tagging python:3.11-slim as {self.FALLBACK_IMAGE}"
444
+ # Do NOT tag bare python:3.11-slim as the fallback image — it lacks
445
+ # git, build tools, etc. and will poison all subsequent tasks.
446
+ # Instead, mark the build as failed so callers get a clear error.
447
+ logger.error(
448
+ f"Failed to build fallback image: {e}. "
449
+ f"Tasks requiring the fallback image will fail."
445
450
  )
446
- try:
447
- img = self.client.images.get("python:3.11-slim")
448
- img.tag(self.FALLBACK_IMAGE)
449
- except Exception as tag_error:
450
- logger.error(f"Failed to create fallback image {self.FALLBACK_IMAGE}: {tag_error}")
451
- raise
451
+ raise RuntimeError(
452
+ f"Cannot create fallback image '{self.FALLBACK_IMAGE}': {e}. "
453
+ f"Ensure Docker has enough disk space and network access."
454
+ ) from e
452
455
 
453
456
  async def create_environment(
454
457
  self,
@@ -482,7 +485,10 @@ CMD ["/bin/bash"]
482
485
  self._temp_dirs.append(temp_dir)
483
486
  host_workdir = temp_dir.name
484
487
 
485
- container_name = f"mcpbr-{self._session_id}-{instance_id}"
488
+ # Use a unique suffix per container to prevent name collisions when
489
+ # MCP and baseline runs create containers for the same task.
490
+ unique_suffix = uuid.uuid4().hex[:6]
491
+ container_name = f"mcpbr-{self._session_id}-{instance_id}-{unique_suffix}"
486
492
 
487
493
  container_workdir = "/testbed" if uses_prebuilt else "/workspace"
488
494
 
@@ -520,18 +526,35 @@ CMD ["/bin/bash"]
520
526
  )
521
527
  return container
522
528
  except docker.errors.APIError as e:
523
- # Only retry on 500 errors (transient Docker daemon issues)
524
529
  response = getattr(e, "response", None)
525
- if response is not None and getattr(response, "status_code", None) == 500:
526
- if attempt < max_retries:
527
- delay = base_delay * (2**attempt) # Exponential backoff: 1s, 2s, 4s
528
- logger.warning(
529
- f"Docker API error (attempt {attempt + 1}/{max_retries + 1}): {e}. "
530
- f"Retrying in {delay}s..."
531
- )
532
- time.sleep(delay)
533
- continue
534
- # Re-raise for non-500 errors or after max retries
530
+ status_code = getattr(response, "status_code", None) if response else None
531
+
532
+ # On 409 Conflict (container name already in use), try to
533
+ # remove the stale container and retry once.
534
+ if status_code == 409 and attempt < max_retries:
535
+ logger.warning(
536
+ f"Container name conflict (attempt {attempt + 1}): {e}. "
537
+ f"Removing stale container and retrying..."
538
+ )
539
+ try:
540
+ stale = self.client.containers.get(container_name)
541
+ stale.remove(force=True)
542
+ except Exception:
543
+ pass # Container may already be gone
544
+ time.sleep(1)
545
+ continue
546
+
547
+ # Retry on 500 errors (transient Docker daemon issues)
548
+ if status_code == 500 and attempt < max_retries:
549
+ delay = base_delay * (2**attempt)
550
+ logger.warning(
551
+ f"Docker API error (attempt {attempt + 1}/{max_retries + 1}): {e}. "
552
+ f"Retrying in {delay}s..."
553
+ )
554
+ time.sleep(delay)
555
+ continue
556
+
557
+ # Re-raise for unrecoverable errors or after max retries
535
558
  raise
536
559
 
537
560
  loop = asyncio.get_event_loop()
@@ -584,6 +607,22 @@ CMD ["/bin/bash"]
584
607
  timeout=30,
585
608
  )
586
609
 
610
+ # Flush filesystem buffers to ensure all copied files are visible
611
+ # before any subsequent commands (e.g., setup_command) run.
612
+ await env.exec_command("sync", timeout=10)
613
+
614
+ # Verify workspace is populated — catch copy failures early
615
+ exit_code, stdout, _ = await env.exec_command(
616
+ "find /workspace -maxdepth 1 -mindepth 1 | head -5 | wc -l",
617
+ timeout=10,
618
+ )
619
+ file_count = int(stdout.strip()) if exit_code == 0 and stdout.strip().isdigit() else 0
620
+ if file_count == 0:
621
+ raise RuntimeError(
622
+ "Workspace /workspace appears empty after copy from /testbed. "
623
+ "The filesystem may not have synced correctly."
624
+ )
625
+
587
626
  env.workdir = "/workspace"
588
627
 
589
628
  async def _install_claude_cli(self, env: TaskEnvironment) -> None:
mcpbr/evaluation.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Evaluation logic for applying patches and running tests."""
2
2
 
3
3
  import ast
4
+ import asyncio
4
5
  import json
5
6
  from dataclasses import dataclass
6
7
  from typing import Any
@@ -182,7 +183,7 @@ async def run_tests(
182
183
  }
183
184
  )
184
185
 
185
- except TimeoutError:
186
+ except (TimeoutError, asyncio.TimeoutError):
186
187
  results.append(
187
188
  {
188
189
  "test": test,
mcpbr/harness.py CHANGED
@@ -466,7 +466,10 @@ async def _run_mcp_evaluation(
466
466
  profiler.sample_memory()
467
467
 
468
468
  if agent_result.patch:
469
- eval_result_dict = await benchmark.evaluate(env, task, agent_result.patch)
469
+ eval_result_dict = await asyncio.wait_for(
470
+ benchmark.evaluate(env, task, agent_result.patch),
471
+ timeout=config.eval_timeout_seconds,
472
+ )
470
473
  # Convert benchmark result format to EvaluationResult-like object
471
474
  eval_result = dict_to_namespace(eval_result_dict)
472
475
  else:
@@ -619,7 +622,10 @@ async def _run_baseline_evaluation(
619
622
  profiler.sample_memory()
620
623
 
621
624
  if agent_result.patch:
622
- eval_result_dict = await benchmark.evaluate(env, task, agent_result.patch)
625
+ eval_result_dict = await asyncio.wait_for(
626
+ benchmark.evaluate(env, task, agent_result.patch),
627
+ timeout=config.eval_timeout_seconds,
628
+ )
623
629
  # Convert benchmark result format to EvaluationResult-like object
624
630
  eval_result = dict_to_namespace(eval_result_dict)
625
631
  else:
mcpbr/harnesses.py CHANGED
@@ -455,7 +455,8 @@ MCP_PROMPT_SUFFIX = (
455
455
  "\n\nYou have access to an MCP server with additional tools for codebase analysis. "
456
456
  "Use these tools to understand the codebase structure, find definitions, trace call chains, "
457
457
  "and navigate dependencies before making changes. The MCP tools are especially useful for "
458
- "understanding how code is connected across files."
458
+ "understanding how code is connected across files. "
459
+ "The repository is located at {workdir}."
459
460
  )
460
461
 
461
462
 
@@ -581,18 +582,22 @@ class ClaudeCodeHarness:
581
582
  # Source the env file so setup_command has access to API keys etc.
582
583
  env_file = "/tmp/.mcpbr_env.sh"
583
584
  setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
585
+
586
+ # Run as the mcpbr user so files created by setup_command are
587
+ # accessible to the agent, which also runs as mcpbr.
588
+ setup_user = "mcpbr" if env.claude_cli_installed else None
589
+
584
590
  setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
585
591
  ["/bin/bash", "-c", setup_full_cmd],
586
592
  timeout=setup_timeout,
593
+ user=setup_user,
587
594
  )
588
595
 
589
596
  if setup_exit != 0:
590
- if verbose:
591
- self._console.print(
592
- f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
593
- )
594
- if setup_stderr:
595
- self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
597
+ # Always warn on failure so users can diagnose issues (#388)
598
+ self._console.print(f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]")
599
+ if verbose and setup_stderr:
600
+ self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
596
601
  # Non-fatal: continue with agent even if setup fails
597
602
  elif verbose:
598
603
  self._console.print("[green]✓ Setup command completed[/green]")
@@ -633,7 +638,7 @@ class ClaudeCodeHarness:
633
638
  )
634
639
 
635
640
  problem_statement = task.get("problem_statement", "")
636
- prompt = self.prompt_template.format(problem_statement=problem_statement)
641
+ prompt = self.prompt_template.format(problem_statement=problem_statement, workdir=workdir)
637
642
  instance_id = task_id or task.get("instance_id", "unknown")
638
643
 
639
644
  mcp_server_name = None
@@ -798,7 +803,9 @@ class ClaudeCodeHarness:
798
803
  ) -> AgentResult:
799
804
  """Solve task using Claude Code CLI inside Docker container."""
800
805
  problem_statement = task.get("problem_statement", "")
801
- prompt = self.prompt_template.format(problem_statement=problem_statement)
806
+ prompt = self.prompt_template.format(
807
+ problem_statement=problem_statement, workdir=env.workdir
808
+ )
802
809
  instance_id = task_id or task.get("instance_id", "unknown")
803
810
 
804
811
  api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -3,7 +3,7 @@ mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
3
3
  mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
4
4
  mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
5
5
  mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
6
- mcpbr/config.py,sha256=7lWV0ZtzyD6WZ07IR4yhT9lyBBPONzlanaO4XHm9OoE,18952
6
+ mcpbr/config.py,sha256=5MaV6jkPYC4t8MvJEGq6VR7bmxcWZ27behLnPAZu3zY,19200
7
7
  mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
8
8
  mcpbr/config_migration.py,sha256=vTs52uYLO0DusB07nHZT2Y27-_eZdZKhaXYWhDFcnJI,16098
9
9
  mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
@@ -13,17 +13,17 @@ mcpbr/dashboard.py,sha256=wt2A-yFgDvQc94wgPPJlz70gFAkyUi41xgfqPL9xRQY,21884
13
13
  mcpbr/dataset_streaming.py,sha256=XwQSdvy97yurlcAC5hUwto8bLuCf2A9FSMcwjTD_Tho,16720
14
14
  mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
15
15
  mcpbr/docker_cache.py,sha256=jn_9Ak2d8omNmedSCBwA7wrswtEQvB-Bu8TIP2cm-F0,18704
16
- mcpbr/docker_env.py,sha256=_45OUZKjUevE9O3YLF_1uvQtdOyJ7yZIYWmSvXN3cFw,31794
16
+ mcpbr/docker_env.py,sha256=dRhQamlEq05h4wOjZN76c0GIYR6FRx9aGB_Jrkmssss,33676
17
17
  mcpbr/docker_prewarm.py,sha256=GVRD2B10HA7OpWq_CC7CkNkJ1OUjAU7GzKOpJ5VFrXk,12638
18
18
  mcpbr/dry_run.py,sha256=w_1L5K4Bk3SzeXfZY2NDbXims_Qh6711wIGm6p3tr84,18218
19
19
  mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
20
- mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
20
+ mcpbr/evaluation.py,sha256=NK_lId2fbmKZiAyalonhCuLY-pGSGy4tPYN-i84sx8Q,12804
21
21
  mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
22
22
  mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
23
23
  mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
24
24
  mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
25
- mcpbr/harness.py,sha256=xfnD4si0DflBor1cfu_4wrCpECJ9_8eudLEsgVCU6Oo,53731
26
- mcpbr/harnesses.py,sha256=1FmUfFSQF0HBvmJsNEbyW_Km4ChsWhShY70aQP6_TBI,47947
25
+ mcpbr/harness.py,sha256=Ehq-Yxsvi9lWBHEqdhKx1S6LB4vbDttxHB-REcWBoNo,53935
26
+ mcpbr/harnesses.py,sha256=iaGlRIXdvIqCrYQtXNRZT9HowgmPDVssT2_Qlj2eCkI,48294
27
27
  mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
28
28
  mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
29
29
  mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
@@ -92,15 +92,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
92
92
  mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
93
93
  mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
94
94
  mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
95
- mcpbr-0.5.1.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
96
- mcpbr-0.5.1.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
97
- mcpbr-0.5.1.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
98
- mcpbr-0.5.1.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
99
- mcpbr-0.5.1.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
100
- mcpbr-0.5.1.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
101
- mcpbr-0.5.1.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
102
- mcpbr-0.5.1.dist-info/METADATA,sha256=1iupVSrsq687pZ0s77Hu5q0aDex74p-x7ODS876ey3E,55068
103
- mcpbr-0.5.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
104
- mcpbr-0.5.1.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
105
- mcpbr-0.5.1.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
106
- mcpbr-0.5.1.dist-info/RECORD,,
95
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
96
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
97
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
98
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
99
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
100
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
101
+ mcpbr-0.5.2.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
102
+ mcpbr-0.5.2.dist-info/METADATA,sha256=eZ6PAv2rJM_jGEy3zNPiljm3mkOc8kiO28wrWSLCXv8,55068
103
+ mcpbr-0.5.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
104
+ mcpbr-0.5.2.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
105
+ mcpbr-0.5.2.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
106
+ mcpbr-0.5.2.dist-info/RECORD,,
File without changes