mcpbr 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcpbr/config.py CHANGED
@@ -109,6 +109,16 @@ class MCPServerConfig(BaseModel):
109
109
  default=900000,
110
110
  description="Timeout in milliseconds for MCP tool execution (default: 15 min for long-running tools)",
111
111
  )
112
+ setup_command: str | None = Field(
113
+ default=None,
114
+ description="Shell command to run inside the container BEFORE the agent starts. "
115
+ "Runs outside the task timer (does not count against timeout_seconds). "
116
+ "Use {workdir} as placeholder. Useful for pre-computing caches.",
117
+ )
118
+ setup_timeout_ms: int = Field(
119
+ default=900000,
120
+ description="Timeout in milliseconds for the setup_command (default: 15 min)",
121
+ )
112
122
 
113
123
  def get_args_for_workdir(self, workdir: str) -> list[str]:
114
124
  """Replace {workdir} placeholder in args with actual path."""
@@ -117,6 +127,12 @@ class MCPServerConfig(BaseModel):
117
127
  result.append(arg.replace("{workdir}", workdir))
118
128
  return result
119
129
 
130
+ def get_setup_command_for_workdir(self, workdir: str) -> str | None:
131
+ """Replace {workdir} placeholder in setup_command with actual path."""
132
+ if self.setup_command is None:
133
+ return None
134
+ return self.setup_command.replace("{workdir}", workdir)
135
+
120
136
  def get_expanded_env(self) -> dict[str, str]:
121
137
  """Expand ${VAR} references in env values using os.environ.
122
138
 
@@ -400,6 +416,12 @@ class HarnessConfig(BaseModel):
400
416
  description="Enable comprehensive performance profiling (tool latency, memory, overhead)",
401
417
  )
402
418
 
419
+ volumes: dict[str, str] = Field(
420
+ default_factory=dict,
421
+ description="Additional volume mounts (read-write) for Docker containers (host_path: container_path). "
422
+ "Mounted into every container, persists across tasks. Useful for pre-computed caches.",
423
+ )
424
+
403
425
  infrastructure: InfrastructureConfig = Field(
404
426
  default_factory=InfrastructureConfig,
405
427
  description="Infrastructure configuration (local or azure)",
mcpbr/docker_env.py CHANGED
@@ -314,14 +314,18 @@ class DockerEnvironmentManager:
314
314
  FALLBACK_IMAGE = "mcpbr-env"
315
315
  DOCKERFILE_PATH = Path(__file__).parent.parent.parent / "Dockerfile"
316
316
 
317
- def __init__(self, use_prebuilt: bool = True) -> None:
317
+ def __init__(
318
+ self, use_prebuilt: bool = True, extra_volumes: dict[str, str] | None = None
319
+ ) -> None:
318
320
  """Initialize the Docker environment manager.
319
321
 
320
322
  Args:
321
323
  use_prebuilt: If True, try to use pre-built SWE-bench images first.
324
+ extra_volumes: Additional volume mounts (read-write) (host_path -> container_path).
322
325
  """
323
326
  self.client = docker.from_env()
324
327
  self.use_prebuilt = use_prebuilt
328
+ self._extra_volumes = extra_volumes or {}
325
329
  self._fallback_image_built = False
326
330
  self._temp_dirs: list[tempfile.TemporaryDirectory[str]] = []
327
331
  self._containers: list[Container] = []
@@ -488,6 +492,15 @@ CMD ["/bin/bash"]
488
492
 
489
493
  for attempt in range(max_retries + 1):
490
494
  try:
495
+ volumes_dict: dict[str, dict[str, str]] = {
496
+ host_workdir: {"bind": "/workspace", "mode": "rw"},
497
+ }
498
+ for host_path, container_path in self._extra_volumes.items():
499
+ volumes_dict[os.path.abspath(host_path)] = {
500
+ "bind": container_path,
501
+ "mode": "rw",
502
+ }
503
+
491
504
  container = self.client.containers.run(
492
505
  image_name,
493
506
  command="tail -f /dev/null",
@@ -495,9 +508,7 @@ CMD ["/bin/bash"]
495
508
  detach=True,
496
509
  platform="linux/amd64" if uses_prebuilt else None,
497
510
  network_mode="bridge", # Enable network for API calls
498
- volumes={
499
- host_workdir: {"bind": "/workspace", "mode": "rw"},
500
- },
511
+ volumes=volumes_dict,
501
512
  working_dir=container_workdir,
502
513
  remove=False,
503
514
  labels={
mcpbr/evaluation.py CHANGED
@@ -137,6 +137,7 @@ async def run_tests(
137
137
  timeout: int = 120,
138
138
  uses_prebuilt: bool = False,
139
139
  workdir: str | None = None,
140
+ repo: str | None = None,
140
141
  ) -> TestResults:
141
142
  """Run a list of tests and return results.
142
143
 
@@ -146,6 +147,7 @@ async def run_tests(
146
147
  timeout: Timeout per test in seconds.
147
148
  uses_prebuilt: Whether a pre-built SWE-bench image is being used.
148
149
  workdir: Working directory to run tests from. Defaults to env.workdir.
150
+ repo: Repository identifier for looking up the correct test runner.
149
151
 
150
152
  Returns:
151
153
  TestResults with pass/fail counts.
@@ -157,7 +159,7 @@ async def run_tests(
157
159
  passed = 0
158
160
 
159
161
  for test in tests:
160
- test_cmd = _build_test_command(test, uses_prebuilt)
162
+ test_cmd = _build_test_command(test, uses_prebuilt, repo=repo)
161
163
 
162
164
  try:
163
165
  exit_code, stdout, stderr = await env.exec_command(
@@ -198,7 +200,7 @@ async def run_tests(
198
200
  )
199
201
 
200
202
 
201
- def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
203
+ def _build_test_command(test: str, uses_prebuilt: bool = False, repo: str | None = None) -> str:
202
204
  """Build a test command for the given test identifier.
203
205
 
204
206
  Args:
@@ -206,18 +208,29 @@ def _build_test_command(test: str, uses_prebuilt: bool = False) -> str:
206
208
  - pytest: "tests/test_file.py::test_func" or "tests/test_file.py"
207
209
  - Django: "test_method (module.TestClass)" or "module.tests.TestClass.test_method"
208
210
  uses_prebuilt: If True, activate the testbed conda environment first.
211
+ repo: Repository identifier (e.g., "sympy/sympy") for looking up
212
+ the correct test runner from upstream SWE-bench specs.
209
213
 
210
214
  Returns:
211
215
  Shell command string to run the test.
212
216
  """
213
217
  import re
214
218
 
219
+ from .swebench_test_specs import get_repo_test_command
220
+
215
221
  # Pre-built SWE-bench images use a conda environment called 'testbed'
216
222
  if uses_prebuilt:
217
223
  activate = "source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed && "
218
224
  else:
219
225
  activate = ""
220
226
 
227
+ # Check upstream SWE-bench test command mapping for non-pytest runners
228
+ if repo:
229
+ upstream_cmd = get_repo_test_command(repo)
230
+ if upstream_cmd and "runtests.py" not in upstream_cmd and "pytest" not in upstream_cmd:
231
+ # Non-pytest, non-Django project (e.g., sympy uses bin/test)
232
+ return f"{activate}{upstream_cmd} {test}"
233
+
221
234
  # Detect Django test format: "test_method (module.TestClass)"
222
235
  if "(" in test and ")" in test and "." in test:
223
236
  # Extract module path from parentheses
@@ -344,12 +357,15 @@ async def evaluate_patch(
344
357
  if not env.uses_prebuilt:
345
358
  await _install_dependencies(env)
346
359
 
360
+ repo = task.get("repo")
361
+
347
362
  fail_to_pass_results = await run_tests(
348
363
  env,
349
364
  fail_to_pass_tests,
350
365
  timeout=test_timeout,
351
366
  uses_prebuilt=env.uses_prebuilt,
352
367
  workdir=eval_workdir,
368
+ repo=repo,
353
369
  )
354
370
 
355
371
  pass_to_pass_results = await run_tests(
@@ -358,6 +374,7 @@ async def evaluate_patch(
358
374
  timeout=test_timeout,
359
375
  uses_prebuilt=env.uses_prebuilt,
360
376
  workdir=eval_workdir,
377
+ repo=repo,
361
378
  )
362
379
 
363
380
  resolved = (
mcpbr/harness.py CHANGED
@@ -962,7 +962,10 @@ async def run_evaluation(
962
962
  "args": config.mcp_server.args if config.mcp_server else [],
963
963
  }
964
964
 
965
- docker_manager = DockerEnvironmentManager(use_prebuilt=config.use_prebuilt_images)
965
+ docker_manager = DockerEnvironmentManager(
966
+ use_prebuilt=config.use_prebuilt_images,
967
+ extra_volumes=config.volumes,
968
+ )
966
969
 
967
970
  results: list[TaskResult] = []
968
971
  # Add cached results if using state tracker
mcpbr/harnesses.py CHANGED
@@ -895,6 +895,35 @@ class ClaudeCodeHarness:
895
895
  cost_usd=None,
896
896
  )
897
897
 
898
+ # Run setup_command if configured (BEFORE agent, OUTSIDE task timer).
899
+ # This is the right place for expensive one-time operations like
900
+ # pre-computing caches that should not count against timeout_seconds.
901
+ if self.mcp_server and self.mcp_server.setup_command:
902
+ setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
903
+ setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
904
+
905
+ if verbose:
906
+ self._console.print(
907
+ f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
908
+ )
909
+
910
+ setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
911
+ setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
912
+ ["/bin/bash", "-c", setup_full_cmd],
913
+ timeout=setup_timeout,
914
+ )
915
+
916
+ if setup_exit != 0:
917
+ if verbose:
918
+ self._console.print(
919
+ f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
920
+ )
921
+ if setup_stderr:
922
+ self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
923
+ # Non-fatal: continue with agent even if setup fails
924
+ elif verbose:
925
+ self._console.print("[green]✓ Setup command completed[/green]")
926
+
898
927
  try:
899
928
  claude_args = [
900
929
  "--print",
@@ -0,0 +1,33 @@
1
+ """Test command specs from upstream SWE-bench harness.
2
+
3
+ Maps repositories to their correct test commands. mcpbr defaults to pytest
4
+ for all non-Django projects, but some projects (e.g., sympy) use custom test
5
+ runners that aren't pytest-compatible.
6
+
7
+ Source: https://github.com/SWE-bench/SWE-bench/blob/main/swebench/harness/constants/python.py
8
+ """
9
+
10
+ # Base test commands per framework (from upstream constants/python.py)
11
+ TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
12
+ TEST_DJANGO = "./tests/runtests.py --verbosity 2 --settings=test_sqlite --parallel 1"
13
+ TEST_SYMPY = "PYTHONWARNINGS='ignore::UserWarning,ignore::SyntaxWarning' bin/test -C --verbose"
14
+ TEST_SPHINX = "tox --current-env -epy39 -v --"
15
+ TEST_ASTROPY = "pytest -rA -vv -o console_output_style=classic --tb=no"
16
+ TEST_SEABORN = "pytest --no-header -rA"
17
+
18
+ # Repo → test command mapping
19
+ # Only non-pytest entries need to be here — pytest is the default fallback.
20
+ # Django is included for documentation but its existing handler takes precedence.
21
+ REPO_TO_TEST_CMD: dict[str, str] = {
22
+ "sympy/sympy": TEST_SYMPY,
23
+ "django/django": TEST_DJANGO,
24
+ "sphinx-doc/sphinx": TEST_SPHINX,
25
+ }
26
+
27
+
28
+ def get_repo_test_command(repo: str) -> str | None:
29
+ """Look up the upstream test command for a repo.
30
+
31
+ Returns None if repo uses standard pytest (handled by existing logic).
32
+ """
33
+ return REPO_TO_TEST_CMD.get(repo)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcpbr
3
- Version: 0.4.13
3
+ Version: 0.4.14
4
4
  Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
5
5
  Project-URL: Homepage, https://github.com/greynewell/mcpbr
6
6
  Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -3,14 +3,14 @@ mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
3
3
  mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
4
4
  mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
5
5
  mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
6
- mcpbr/config.py,sha256=quB2KPKsFY7Y86wTZr9GjlZRYsh13MngNikdOTBKEvY,17864
6
+ mcpbr/config.py,sha256=E9Icedjk_VFONnnEZbWW5WN7El5RaJD5pGi-JQlrlV0,18890
7
7
  mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
8
8
  mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
9
- mcpbr/docker_env.py,sha256=GKjQULslYANGSkyY8ZLaAEy9WWl0MYqS1LZ0VavmhXc,31085
9
+ mcpbr/docker_env.py,sha256=vpbjL227L9qLjrS7CzXevxzo9393qmOrrxWG7lP1s44,31629
10
10
  mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
11
- mcpbr/evaluation.py,sha256=LQXSLn_4yIkZ0jwZ85AaKku2dHcPirmj5c7-nhpPMfY,11994
12
- mcpbr/harness.py,sha256=6_p_MFrs8RulosXToVtB9-P4Ej8XzR6ZzCKDP4mUeGY,51026
13
- mcpbr/harnesses.py,sha256=yqILXc2q5Q7136XTF_t4EWi9q8JYBJSIqh2FrCmU8gY,45941
11
+ mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
12
+ mcpbr/harness.py,sha256=sEMP2PnrQP_BKK-4yixz05qXcY-0OsJNJ5e5JU2Rtsc,51079
13
+ mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
14
14
  mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
15
15
  mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
16
16
  mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
@@ -27,6 +27,7 @@ mcpbr/smoke_test.py,sha256=srYGOn_auspRbt_a6ebYDDDq_nujA_iZGman5nU1ikU,14925
27
27
  mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
28
28
  mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
29
29
  mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
30
+ mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
30
31
  mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
31
32
  mcpbr/benchmarks/__init__.py,sha256=RK0TxNTSqhUX_WtGs0CcV1MX2uiCBTUWkEHYpo_7T5M,4099
32
33
  mcpbr/benchmarks/agentbench.py,sha256=jQ8OG_5cn-PvOZizXivysLTw9xvtA8c_MWfw3jXq0TQ,6512
@@ -68,15 +69,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
68
69
  mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
69
70
  mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
70
71
  mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
71
- mcpbr-0.4.13.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
72
- mcpbr-0.4.13.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
73
- mcpbr-0.4.13.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
74
- mcpbr-0.4.13.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
75
- mcpbr-0.4.13.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
76
- mcpbr-0.4.13.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
77
- mcpbr-0.4.13.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
78
- mcpbr-0.4.13.dist-info/METADATA,sha256=ZK96nQ5BBKQzudWk_E0-QRzFJNRhQ08AAUQ_88HvQBg,54809
79
- mcpbr-0.4.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
80
- mcpbr-0.4.13.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
81
- mcpbr-0.4.13.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
82
- mcpbr-0.4.13.dist-info/RECORD,,
72
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
73
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
74
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
75
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
76
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
77
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
78
+ mcpbr-0.4.14.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
79
+ mcpbr-0.4.14.dist-info/METADATA,sha256=f2PEinjR_XbBOmFtDAZxoDHdBLwKxLX4V9kjYqh_UtA,54809
80
+ mcpbr-0.4.14.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
81
+ mcpbr-0.4.14.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
82
+ mcpbr-0.4.14.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
83
+ mcpbr-0.4.14.dist-info/RECORD,,
File without changes