mcpbr 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +1 -1
- mcpbr/harness.py +11 -0
- mcpbr/harnesses.py +46 -28
- {mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/METADATA +1 -1
- {mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/RECORD +15 -15
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.5.0.data → mcpbr-0.5.1.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/WHEEL +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.5.0.dist-info → mcpbr-0.5.1.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py
CHANGED
mcpbr/harness.py
CHANGED
|
@@ -431,6 +431,17 @@ async def _run_mcp_evaluation(
|
|
|
431
431
|
config, benchmark, verbosity, log_file, mcp_logs_dir, mcp_server_config
|
|
432
432
|
)
|
|
433
433
|
|
|
434
|
+
# Run setup_command OUTSIDE the agent timer. This is for expensive
|
|
435
|
+
# one-time operations (e.g. pre-computing code graphs) that must not
|
|
436
|
+
# count against timeout_seconds.
|
|
437
|
+
if env and hasattr(agent, "run_setup_command"):
|
|
438
|
+
try:
|
|
439
|
+
await agent.run_setup_command(env, verbose=verbose)
|
|
440
|
+
except asyncio.TimeoutError:
|
|
441
|
+
# Setup timeout is non-fatal – the agent still gets its
|
|
442
|
+
# full timeout budget even if setup didn't finish.
|
|
443
|
+
pass
|
|
444
|
+
|
|
434
445
|
# Sample memory before agent execution
|
|
435
446
|
if profiler:
|
|
436
447
|
profiler.sample_memory()
|
mcpbr/harnesses.py
CHANGED
|
@@ -555,6 +555,48 @@ class ClaudeCodeHarness:
|
|
|
555
555
|
self.thinking_budget = thinking_budget
|
|
556
556
|
self._console = Console()
|
|
557
557
|
|
|
558
|
+
async def run_setup_command(
|
|
559
|
+
self,
|
|
560
|
+
env: TaskEnvironment,
|
|
561
|
+
verbose: bool = False,
|
|
562
|
+
) -> None:
|
|
563
|
+
"""Run MCP server setup_command inside the container.
|
|
564
|
+
|
|
565
|
+
This MUST be called from the evaluation harness BEFORE the agent timer
|
|
566
|
+
starts (i.e. before asyncio.wait_for wraps agent.solve()). Expensive
|
|
567
|
+
operations like pre-computing code graphs happen here and should never
|
|
568
|
+
count against the task timeout.
|
|
569
|
+
"""
|
|
570
|
+
if not self.mcp_server or not self.mcp_server.setup_command:
|
|
571
|
+
return
|
|
572
|
+
|
|
573
|
+
setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
|
|
574
|
+
setup_timeout = max(1, int(self.mcp_server.setup_timeout_ms / 1000))
|
|
575
|
+
|
|
576
|
+
if verbose:
|
|
577
|
+
self._console.print(
|
|
578
|
+
f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
|
|
579
|
+
)
|
|
580
|
+
|
|
581
|
+
# Source the env file so setup_command has access to API keys etc.
|
|
582
|
+
env_file = "/tmp/.mcpbr_env.sh"
|
|
583
|
+
setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
|
|
584
|
+
setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
|
|
585
|
+
["/bin/bash", "-c", setup_full_cmd],
|
|
586
|
+
timeout=setup_timeout,
|
|
587
|
+
)
|
|
588
|
+
|
|
589
|
+
if setup_exit != 0:
|
|
590
|
+
if verbose:
|
|
591
|
+
self._console.print(
|
|
592
|
+
f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
|
|
593
|
+
)
|
|
594
|
+
if setup_stderr:
|
|
595
|
+
self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
|
|
596
|
+
# Non-fatal: continue with agent even if setup fails
|
|
597
|
+
elif verbose:
|
|
598
|
+
self._console.print("[green]✓ Setup command completed[/green]")
|
|
599
|
+
|
|
558
600
|
async def solve(
|
|
559
601
|
self,
|
|
560
602
|
task: dict[str, Any],
|
|
@@ -895,34 +937,10 @@ class ClaudeCodeHarness:
|
|
|
895
937
|
cost_usd=None,
|
|
896
938
|
)
|
|
897
939
|
|
|
898
|
-
#
|
|
899
|
-
#
|
|
900
|
-
#
|
|
901
|
-
|
|
902
|
-
setup_cmd = self.mcp_server.get_setup_command_for_workdir(env.workdir)
|
|
903
|
-
setup_timeout = int(self.mcp_server.setup_timeout_ms / 1000)
|
|
904
|
-
|
|
905
|
-
if verbose:
|
|
906
|
-
self._console.print(
|
|
907
|
-
f"[cyan]Running setup command (timeout: {setup_timeout:.0f}s)...[/cyan]"
|
|
908
|
-
)
|
|
909
|
-
|
|
910
|
-
setup_full_cmd = f"source {shlex.quote(env_file)} && {setup_cmd}"
|
|
911
|
-
setup_exit, _setup_stdout, setup_stderr = await env.exec_command(
|
|
912
|
-
["/bin/bash", "-c", setup_full_cmd],
|
|
913
|
-
timeout=setup_timeout,
|
|
914
|
-
)
|
|
915
|
-
|
|
916
|
-
if setup_exit != 0:
|
|
917
|
-
if verbose:
|
|
918
|
-
self._console.print(
|
|
919
|
-
f"[yellow]⚠ Setup command exited with code {setup_exit}[/yellow]"
|
|
920
|
-
)
|
|
921
|
-
if setup_stderr:
|
|
922
|
-
self._console.print(f"[dim]{setup_stderr[:500]}[/dim]")
|
|
923
|
-
# Non-fatal: continue with agent even if setup fails
|
|
924
|
-
elif verbose:
|
|
925
|
-
self._console.print("[green]✓ Setup command completed[/green]")
|
|
940
|
+
# NOTE: setup_command is intentionally NOT run here. It must be called
|
|
941
|
+
# from the evaluation harness (harness.py) BEFORE the agent timer starts,
|
|
942
|
+
# using run_setup_command(). Running it here would include it in the
|
|
943
|
+
# asyncio.wait_for() timeout that wraps agent.solve().
|
|
926
944
|
|
|
927
945
|
try:
|
|
928
946
|
claude_args = [
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mcpbr
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
|
|
5
5
|
Project-URL: Homepage, https://github.com/greynewell/mcpbr
|
|
6
6
|
Project-URL: Repository, https://github.com/greynewell/mcpbr
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
mcpbr/__init__.py,sha256=
|
|
1
|
+
mcpbr/__init__.py,sha256=fGX9CC8F1Z1g8nbw8yXbxV3_0aRYxlbv5UAtXDYllgo,151
|
|
2
2
|
mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
|
|
3
3
|
mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
|
|
4
4
|
mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
|
|
@@ -22,8 +22,8 @@ mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,198
|
|
|
22
22
|
mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
|
|
23
23
|
mcpbr/formatting.py,sha256=lwZcb4fD5osBzJlerICyvAVb4KHSm_nRTBg1dVfD6Lo,14193
|
|
24
24
|
mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
|
|
25
|
-
mcpbr/harness.py,sha256=
|
|
26
|
-
mcpbr/harnesses.py,sha256=
|
|
25
|
+
mcpbr/harness.py,sha256=xfnD4si0DflBor1cfu_4wrCpECJ9_8eudLEsgVCU6Oo,53731
|
|
26
|
+
mcpbr/harnesses.py,sha256=1FmUfFSQF0HBvmJsNEbyW_Km4ChsWhShY70aQP6_TBI,47947
|
|
27
27
|
mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
|
|
28
28
|
mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
|
|
29
29
|
mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
|
|
@@ -92,15 +92,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
|
|
|
92
92
|
mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
|
|
93
93
|
mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
|
|
94
94
|
mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
|
|
95
|
-
mcpbr-0.5.
|
|
96
|
-
mcpbr-0.5.
|
|
97
|
-
mcpbr-0.5.
|
|
98
|
-
mcpbr-0.5.
|
|
99
|
-
mcpbr-0.5.
|
|
100
|
-
mcpbr-0.5.
|
|
101
|
-
mcpbr-0.5.
|
|
102
|
-
mcpbr-0.5.
|
|
103
|
-
mcpbr-0.5.
|
|
104
|
-
mcpbr-0.5.
|
|
105
|
-
mcpbr-0.5.
|
|
106
|
-
mcpbr-0.5.
|
|
95
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
|
|
96
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
|
|
97
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
|
|
98
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
|
|
99
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
|
|
100
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
|
|
101
|
+
mcpbr-0.5.1.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
|
|
102
|
+
mcpbr-0.5.1.dist-info/METADATA,sha256=1iupVSrsq687pZ0s77Hu5q0aDex74p-x7ODS876ey3E,55068
|
|
103
|
+
mcpbr-0.5.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
104
|
+
mcpbr-0.5.1.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
|
|
105
|
+
mcpbr-0.5.1.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
|
|
106
|
+
mcpbr-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|