PyPI - mcpbr - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

mcpbr 0.5.0-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

mcpbr/__init__.py CHANGED Viewed

@@ -3,4 +3,4 @@
 A benchmark runner for evaluating MCP servers against SWE-bench tasks.
 """
-__version__ = "0.3.23"
+__version__ = "0.5.1"

mcpbr/harness.py CHANGED Viewed

@@ -431,6 +431,17 @@ async def _run_mcp_evaluation(
             config, benchmark, verbosity, log_file, mcp_logs_dir, mcp_server_config
         )
+        # Run setup_command OUTSIDE the agent timer. This is for expensive
+        # one-time operations (e.g. pre-computing code graphs) that must not
+        # count against timeout_seconds.
+        if env and hasattr(agent, "run_setup_command"):
+            try:
+                await agent.run_setup_command(env, verbose=verbose)
+            except asyncio.TimeoutError:
+                # Setup timeout is non-fatal – the agent still gets its
+                # full timeout budget even if setup didn't finish.
+                pass
         # Sample memory before agent execution
         if profiler:
             profiler.sample_memory()

mcpbr/harnesses.py CHANGED Viewed

@@ -555,6 +555,48 @@ class ClaudeCodeHarness:
         self.thinking_budget = thinking_budget
         self._console = Console()
+    async def run_setup_command(
+        self,
+        env: TaskEnvironment,
+        verbose: bool = False,
+    ) -> None:
+        """Run MCP server setup_command inside the container.
+        This MUST be called from the evaluation harness BEFORE the agent timer
+        starts (i.e. before asyncio.wait_for wraps agent.solve()). Expensive
+        operations like pre-computing code graphs happen here and should never
+        count against the task timeout.
+        """
+        if not self.mcp_server or not self.mcp_server.setup_command:
+            return
+        setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
+        setup_timeout = max(1, int(self.mcp_server.setup_timeout_ms / 1000))
+        if verbose:
+            self._console.print(
+                f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
+            )
+        # Source the env file so setup_command has access to API keys etc.
+        env_file = "/tmp/.mcpbr_env.sh"
+        setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
+        setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
+            ["/bin/bash", "-c", setup_full_cmd],
+            timeout=setup_timeout,
+        )
+        if setup_exit != 0:
+            if verbose:
+                self._console.print(
+                    f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
+                )
+                if setup_stderr:
+                    self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
+            # Non-fatal: continue with agent even if setup fails
+        elif verbose:
+            self._console.print("[green]✓ Setup command completed[/green]")
     async def solve(
         self,
         task: dict[str, Any],
@@ -895,34 +937,10 @@ class ClaudeCodeHarness:
                     cost_usd=None,
                 )
-        # Run setup_command if configured (BEFORE agent, OUTSIDE task timer).
-        # This is the right place for expensive one-time operations like
-        # pre-computing caches that should not count against timeout_seconds.
-        if self.mcp_server and self.mcp_server.setup_command:
-            setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
-            setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
-            if verbose:
-                self._console.print(
-                    f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
-                )
-            setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
-            setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
-                ["/bin/bash", "-c", setup_full_cmd],
-                timeout=setup_timeout,
-            )
-            if setup_exit != 0:
-                if verbose:
-                    self._console.print(
-                        f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
-                    )
-                    if setup_stderr:
-                        self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
-                # Non-fatal: continue with agent even if setup fails
-            elif verbose:
-                self._console.print("[green]✓ Setup command completed[/green]")
+        # NOTE: setup_command is intentionally NOT run here. It must be called
+        # from the evaluation harness (harness.py) BEFORE the agent timer starts,
+        # using run_setup_command(). Running it here would include it in the
+        # asyncio.wait_for() timeout that wraps agent.solve().
         try:
             claude_args = [

{mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcpbr
-Version: 0.5.0
+Version: 0.5.1
 Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
 Project-URL: Homepage, https://github.com/greynewell/mcpbr
 Project-URL: Repository, https://github.com/greynewell/mcpbr

{mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-mcpbr/__init__.py,sha256=3vhpKV9kVECjuPapKpCPEHTjlOsyhuoiLZxBv9O1eL0,152
+mcpbr/__init__.py,sha256=fGX9CC8F1Z1g8nbw8yXbxV3_0aRYxlbv5UAtXDYllgo,151
 mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
 mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
 mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
@@ -22,8 +22,8 @@ mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,198
 mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
 mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
 mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
-mcpbr/harness.py,sha256=Rc6CqzZOMJyuHqfuOIDisLOoPka-cqAqYiL7zr7ALFg,53193
-mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
+mcpbr/harness.py,sha256=xfnD4si0DflBor1cfu_4wrCpECJ9_8eudLEsgVCU6Oo,53731
+mcpbr/harnesses.py,sha256=1FmUfFSQF0HBvmJsNEbyW_Km4ChsWhShY70aQP6_TBI,47947
 mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
 mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
 mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
@@ -92,15 +92,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
 mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
 mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
 mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
-mcpbr-0.5.0.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
-mcpbr-0.5.0.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
-mcpbr-0.5.0.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
-mcpbr-0.5.0.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
-mcpbr-0.5.0.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
-mcpbr-0.5.0.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
-mcpbr-0.5.0.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
-mcpbr-0.5.0.dist-info/METADATA,sha256=fMqq-Q3zU5arV5f777AXScxNJ2C7sHAEbUqliT7rOn4,55068
-mcpbr-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-mcpbr-0.5.0.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
-mcpbr-0.5.0.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
-mcpbr-0.5.0.dist-info/RECORD,,
+mcpbr-0.5.1.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
+mcpbr-0.5.1.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
+mcpbr-0.5.1.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
+mcpbr-0.5.1.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
+mcpbr-0.5.1.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
+mcpbr-0.5.1.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
+mcpbr-0.5.1.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
+mcpbr-0.5.1.dist-info/METADATA,sha256=1iupVSrsq687pZ0s77Hu5q0aDex74p-x7ODS876ey3E,55068
+mcpbr-0.5.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mcpbr-0.5.1.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
+mcpbr-0.5.1.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
+mcpbr-0.5.1.dist-info/RECORD,,