PyPI - eval-protocol - Versions diffs - 0.2.4__tar.gz → 0.2.5.dev1__tar.gz - Mend

eval-protocol 0.2.4 (tar.gz) → 0.2.5.dev1 (tar.gz)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (308) hide show

{eval_protocol-0.2.4/eval_protocol.egg-info → eval_protocol-0.2.5.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.4
+Version: 0.2.5.dev1
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
 Requires-Dist: pandas>=1.5.0
 Requires-Dist: watchdog>=2.1.0
 Requires-Dist: websockets>=15.0.1
-Requires-Dist: fireworks-ai>=0.19.12
 Requires-Dist: fastapi>=0.116.1
 Provides-Extra: dev
 Requires-Dist: build; extra == "dev"
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
 Provides-Extra: openevals
 Requires-Dist: openevals>=0.1.0; extra == "openevals"
 Provides-Extra: fireworks
-Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
+Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
 Provides-Extra: box2d
 Requires-Dist: swig; extra == "box2d"
 Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2025-08-05T23:22:49-0700",
+ "date": "2025-08-06T17:51:29-0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "4dbac4d9116bdb2888bd145e779eba9086c59096",
- "version": "0.2.4"
+ "full-revisionid": "a807140937b9002c71ee42a6afef594ea6377c2d",
+ "version": "0.2.5-dev1"
 }
 '''  # END VERSION_JSON

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/client/connection.py RENAMED Viewed

@@ -101,7 +101,7 @@ class MCPConnectionManager:
                 # Update the session ID to match what the server generated
                 session.session_id = server_session_id
-                logger.debug(f"Updated session ID to match server: {server_session_id}")
+                logger.info(f"Updated session ID to match server: {server_session_id}")
         # PRE-WARM: Discover and cache tools immediately after session initialization
         # This prevents concurrent list_tools() calls later
@@ -133,6 +133,24 @@ class MCPConnectionManager:
                 self._tools_cache[cache_key] = tool_schemas
                 logger.debug(f"✅ PRE-WARMED {len(tool_schemas)} tools for{cache_key}")
+    async def reset_session(self, session: MCPSession) -> None:
+        """
+        Clean session data in remote mcp server for the given session
+        """
+        import httpx
+        base_url = session.base_url.rstrip("/").removesuffix("/mcp")
+        url = f"{base_url}/control/reset_session"
+        headers = {"mcp-session-id": session.session_id}
+        body = {"seed": session.seed}
+        timeout = httpx.Timeout(3.0)
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(url, headers=headers, json=body)
+            resp.raise_for_status()
+            logger.debug(f"Session {session.session_id}: reset_session -> {resp.json()}")
     async def discover_tools(self, session: MCPSession) -> List[Dict]:
         """
         Discover available tools from an MCP session.

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/manager.py RENAMED Viewed

@@ -22,7 +22,6 @@ from vendor.tau2.user.user_simulator import UserSimulator
 from ...models import CompletionParams, EvaluationRow, InputMetadata, Message
 from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory
-from ..client.connection import MCPConnectionManager
 if TYPE_CHECKING:
     from ..session.manager import GeneralMCPVectorEnv
@@ -33,43 +32,9 @@ logger = logging.getLogger(__name__)
 class ExecutionManager:
     """
-    Unified manager that handles both MCP session lifecycle and rollout execution.
-    Combines the functionality of SessionManager and RolloutManager for better
-    organization and reduced complexity.
+    Manage rollout for MCP environments.
     """
-    def __init__(self):
-        """Initialize the execution manager."""
-        self.connection_manager = MCPConnectionManager()
-    async def initialize_sessions(self, sessions: List[MCPSession]) -> None:
-        """
-        Initialize multiple MCP sessions in parallel.
-        Args:
-            sessions: List of MCPSessions to initialize
-        """
-        tasks = [self.connection_manager.initialize_session(session) for session in sessions]
-        await asyncio.gather(*tasks)
-    async def close_sessions(self, sessions: List[MCPSession]) -> None:
-        """
-        Close multiple MCP sessions in parallel.
-        Args:
-            sessions: List of MCPSessions to close
-        """
-        tasks = [asyncio.create_task(self.connection_manager.close_session(session)) for session in sessions]
-        if tasks:
-            try:
-                # Wait for all close operations to complete
-                await asyncio.gather(*tasks, return_exceptions=True)
-            except asyncio.CancelledError:
-                # Handle cancellation gracefully (especially important for Python 3.12)
-                logger.debug("Close operation was cancelled, but sessions are marked as closed")
     async def execute_rollouts(
         self,
         envs: "GeneralMCPVectorEnv",
@@ -178,7 +143,7 @@ class ExecutionManager:
             for msg in trajectory.conversation_history:
                 # Create a copy to avoid modifying the original
                 msg_dict = dict(msg)
                 # Handle multimodal content (list of content blocks) by extracting text
                 if isinstance(msg_dict.get("content"), list):
                     text_content = None
@@ -187,7 +152,7 @@ class ExecutionManager:
                             text_content = content_block.get("text")
                             break
                     msg_dict["content"] = text_content or ""
                 messages.append(Message.model_validate(msg_dict))
             input_metadata = InputMetadata(

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/mcpgym.py RENAMED Viewed

@@ -116,6 +116,7 @@ class McpGym(ABC):
         # Register tools and control plane endpoints
         self._register_tools()
         self._discover_and_register_control_plane_endpoints()
+        self._register_session_reset_endpoint()
     def _get_session_id(self, ctx: Context) -> str:
         """
@@ -227,6 +228,28 @@ class McpGym(ABC):
             return self.sessions[session_id]
+    def _register_session_reset_endpoint(self):
+        @self.mcp.custom_route("/control/reset_session", methods=["POST"])
+        async def reset_session_endpoint(request: Request) -> JSONResponse:
+            session_id = request.headers.get("mcp-session-id")
+            body = await request.json()
+            seed = body.get("seed", None)
+            print(f"🔍 _register_session_reset_endpoint: Resetting session, session_id: {session_id}, seed: {seed}")
+            if not session_id:
+                return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
+            with self.session_lock:
+                if session_id in self.sessions:
+                    env, obs, _ = self._new_env(seed=seed)
+                    self.sessions[session_id] = {
+                        "env": env,
+                        "obs": obs,
+                        "session_data": {},
+                        "session_id": session_id,
+                    }
+                    print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
+            return JSONResponse({"message": "Session reset successfully"})
     def _discover_and_register_control_plane_endpoints(self):
         """
         Discover and register control plane endpoints on the subclass instance.
@@ -323,7 +346,7 @@ class McpGym(ABC):
         # Log control plane update (for debugging)
         print(
-            f"🎛️  Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}"
+            f"🎛️  Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}, total_reward={self.control_plane_state['total_reward']}"
         )
     def _get_or_create_session_control_plane(self, session_id: str) -> Dict[str, Any]:
@@ -365,7 +388,7 @@ class McpGym(ABC):
         # Log control plane update
         print(
-            f"🎛️  Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}"
+            f"🎛️  Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}, total_reward={control_plane['total_reward']}"
         )
     def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/session/manager.py RENAMED Viewed

@@ -11,7 +11,7 @@ import logging
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from ...types import DatasetRow, MCPSession, MCPToolCall
-from ..execution.manager import ExecutionManager
+from ..client.connection import MCPConnectionManager
 logger = logging.getLogger(__name__)
@@ -44,7 +44,7 @@ class GeneralMCPVectorEnv:
         self.user_prompt_formatter = user_prompt_formatter or self._default_formatter
         self.n = len(sessions)
         self.tool_schemas = []  # Discovered from MCP servers
-        self.execution_manager = ExecutionManager()
+        self.connection_manager = MCPConnectionManager()
         self.usage_stats = {}  # llm usage stats for monitoring
         if len(sessions) != len(dataset_rows):
@@ -58,17 +58,14 @@ class GeneralMCPVectorEnv:
         This is thread-safe and can be called from worker threads.
         """
-        # Establish a persistent session for each environment.
-        await self.execution_manager.connection_manager.initialize_session(session)
         # Get available tools from MCP server
-        tool_schemas = await self.execution_manager.connection_manager.discover_tools(session)
+        tool_schemas = await self.connection_manager.discover_tools(session)
         if not self.tool_schemas:
             self.tool_schemas = tool_schemas
         # PROPER MCP PATTERN: Get initial state from resources during session establishment
-        initial_observation = await self.execution_manager.connection_manager.get_initial_state(session)
+        initial_observation = await self.connection_manager.get_initial_state(session)
         # Update session state
         session.terminated = False
@@ -119,7 +116,7 @@ class GeneralMCPVectorEnv:
             )
         # Execute the tool call via MCP protocol
-        observation, reward, done, info = await self.execution_manager.connection_manager.call_tool(
+        observation, reward, done, info = await self.connection_manager.call_tool(
             session, tool_call.tool_name, tool_call.arguments
         )
@@ -223,5 +220,6 @@ class GeneralMCPVectorEnv:
     async def close(self):
         """Closes all MCP sessions."""
         print(f"🧹 Closing {self.n} MCP sessions...")
-        await self.execution_manager.close_sessions(self.sessions)
+        tasks = [self.connection_manager.close_session(session) for session in self.sessions]
+        await asyncio.gather(*tasks)
         print(f"✅ All MCP sessions closed.")

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_env.py RENAMED Viewed

@@ -17,7 +17,7 @@ Usage remains the same:
     policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
     # Create environments with evaluation_rows configuration
-    envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
+    envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
     # Execute tool-calling rollouts
     evaluation_rows = await ep.rollout(envs, policy=policy, steps=512)
@@ -51,11 +51,20 @@ from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolic
 from .mcp.session.manager import GeneralMCPVectorEnv
 from .models import EvaluationRow
 from .types import DatasetRow, MCPSession, MCPToolCall
+import asyncio
 logger = logging.getLogger(__name__)
-def make(
+async def reset_mcp_sessions(envs: GeneralMCPVectorEnv):
+    """
+    Reset mcp server sessions
+    """
+    tasks = [envs.connection_manager.reset_session(session) for session in envs.sessions]
+    await asyncio.gather(*tasks)
+async def make(
     env_spec: str,
     evaluation_rows: Optional[List[EvaluationRow]] = None,
     dataset: Optional[List[Dict]] = None,
@@ -63,6 +72,7 @@ def make(
     seeds: Optional[List[int]] = None,
     model_id: str = "unknown",
     user_prompt_formatter: Optional[Callable] = None,
+    reset_sessions: bool = False,
 ) -> GeneralMCPVectorEnv:
     """
     Create general MCP environments driven by evaluation_rows configuration.
@@ -75,19 +85,20 @@ def make(
         seeds: List of seeds (for backward compatibility)
         model_id: Model identifier
         user_prompt_formatter: Optional callback for formatting user prompts
+        reset_sessions: Whether to reset sessions before returning the environment
     Returns:
         General MCP environment that works with any MCP server
     Example:
         # EvaluationRow approach (preferred)
-        envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
+        envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
         # Dataset approach (backward compatibility)
-        envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
+        envs = await ep.make("http://localhost:8000/mcp", dataset=dataset)
         # Legacy approach (backward compatibility)
-        envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
+        envs = await ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
     """
     # Parse environment specification - make sure URL format is correct
     base_url = env_spec
@@ -160,8 +171,6 @@ def make(
             )
             sessions.append(session)
-        return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
     else:
         # Legacy approach for backward compatibility
         if n is None:
@@ -198,7 +207,14 @@ def make(
             )
             sessions.append(session)
-        return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
+    mcp_envs = GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
+    tasks = [mcp_envs.connection_manager.initialize_session(session) for session in sessions]
+    await asyncio.gather(*tasks)
+    if reset_sessions:
+        await reset_mcp_sessions(mcp_envs)
+    return mcp_envs
 async def rollout(
@@ -266,7 +282,7 @@ async def rollout(
             raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL")
         auto_model_id = model_id or getattr(policy, "model_id", "unknown")
-        envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
+        envs = await make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
     # Use the new ExecutionManager for execution
     execution_manager = ExecutionManager()

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py RENAMED Viewed

@@ -182,49 +182,47 @@ class MCPServerManager:
         return False  # Don't suppress exceptions
-async def default_mcp_gym_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]:
+async def default_mcp_gym_rollout_processor(
+    rows: List[EvaluationRow], config: RolloutProcessorConfig
+) -> List[EvaluationRow]:
     """
     Rollout processor for tau bench environments.
     This processor starts an MCP server, creates tau bench environments, and runs rollouts
     using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
     Args:
         rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
         config: RolloutProcessorConfig with model and other parameters
     Returns:
         List of EvaluationRow objects with completed conversations
     """
     server = MCPServerManager(config.server_script_path, port=9700)
     try:
         server.start()
         policy = ep.LiteLLMPolicy(
             model_id=config.model,
-            temperature=config.input_params.get('temperature', 0.0),
-            max_tokens=config.input_params.get('max_tokens', 4096),
+            temperature=config.input_params.get("temperature", 0.0),
+            max_tokens=config.input_params.get("max_tokens", 4096),
         )
         # Create MCP environments directly from evaluation_rows
-        envs = ep.make(
-            'http://localhost:9700/mcp/',
+        envs = await ep.make(
+            "http://localhost:9700/mcp/",
             evaluation_rows=rows,
             model_id=policy.model_id,
         )
         # Run rollout with environments and policy
         evaluation_rows = await ep.rollout(
-            envs,
-            policy=policy,
-            steps=config.steps,
-            max_concurrent_rollouts=config.max_concurrent_rollouts
+            envs, policy=policy, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts
         )
         return evaluation_rows
     finally:
         # Always clean up the server
         server.stop()

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/types/types.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Dict, List, Optional
+from mcp.client.session import ClientSession
+from contextlib import AsyncExitStack
 class TerminationReason(str, Enum):
@@ -50,8 +52,8 @@ class MCPSession:
     last_observation: Any = None
     # Persistent MCP connection components
-    _exit_stack: Optional[Any] = None
-    _mcp_session: Optional[Any] = None
+    _exit_stack: Optional[AsyncExitStack] = None
+    _mcp_session: Optional[ClientSession] = None
 @dataclass

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1/eval_protocol.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.4
+Version: 0.2.5.dev1
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
 Requires-Dist: pandas>=1.5.0
 Requires-Dist: watchdog>=2.1.0
 Requires-Dist: websockets>=15.0.1
-Requires-Dist: fireworks-ai>=0.19.12
 Requires-Dist: fastapi>=0.116.1
 Provides-Extra: dev
 Requires-Dist: build; extra == "dev"
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
 Provides-Extra: openevals
 Requires-Dist: openevals>=0.1.0; extra == "openevals"
 Provides-Extra: fireworks
-Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
+Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
 Provides-Extra: box2d
 Requires-Dist: swig; extra == "box2d"
 Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/requires.txt RENAMED Viewed

@@ -28,7 +28,6 @@ deepdiff>=6.0.0
 pandas>=1.5.0
 watchdog>=2.1.0
 websockets>=15.0.1
-fireworks-ai>=0.19.12
 fastapi>=0.116.1
 [adapters]
@@ -71,7 +70,7 @@ pip>=25.1.1
 haikus==0.3.8
 [fireworks]
-fireworks-ai>=0.19.10
+fireworks-ai>=0.19.12
 [huggingface]
 datasets>=2.0.0

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/pyproject.toml RENAMED Viewed

@@ -48,7 +48,6 @@ dependencies = [
     "pandas>=1.5.0",
     "watchdog>=2.1.0",
     "websockets>=15.0.1",
-    "fireworks-ai>=0.19.12",
     "fastapi>=0.116.1",
 ]
@@ -96,7 +95,7 @@ openevals = [
     "openevals>=0.1.0",
 ]
 fireworks = [
-    "fireworks-ai>=0.19.10",
+    "fireworks-ai>=0.19.12",
 ]
 box2d = [
     "swig",

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_parallel_rollouts.py RENAMED Viewed

@@ -138,7 +138,7 @@ async def _test_seed_handling_and_type_compatibility_impl():
             )
         # 3. Test that environments are created with proper seed isolation
-        envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
+        envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
         # Verify we have the right number of environments
         assert len(envs.sessions) == len(test_seeds), f"Expected {len(test_seeds)} sessions, got {len(envs.sessions)}"
@@ -273,7 +273,7 @@ async def _run_simplified_compatibility_test():
         )
     # This should work even without a server (just creates session objects)
-    envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
+    envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
     assert len(envs.sessions) == len(test_seeds)
     print("✅ Environment creation works")

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rollout_control_plane_integration.py RENAMED Viewed

@@ -489,7 +489,7 @@ class TestRolloutControlPlaneIntegration:
         policy = MockPolicy(["right"])
         with (
-            patch("eval_protocol.mcp_env.make") as mock_make,
+            patch("eval_protocol.mcp_env.make", new_callable=AsyncMock) as mock_make,
             patch("eval_protocol.mcp_env.ExecutionManager") as MockManager,
         ):
             mock_env = MagicMock()
@@ -512,7 +512,15 @@ class TestRolloutControlPlaneIntegration:
                 dataset=dataset,
                 model_id="test_model",
             )
-            manager_instance.execute_rollouts.assert_called_once_with(mock_env, policy, 5, None, 8)
+            manager_instance.execute_rollouts.assert_called_once_with(
+                mock_make.return_value,
+                policy,
+                5,
+                None,
+                8,
+            )
             assert result == ["ok"]
     def test_control_plane_trajectory_serialization(self):

{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_url_handling.py RENAMED Viewed

@@ -1,5 +1,4 @@
-import asyncio
+from unittest.mock import AsyncMock, patch
 import httpx
 import pytest
 from werkzeug.wrappers import Response
@@ -7,31 +6,46 @@ from werkzeug.wrappers import Response
 import eval_protocol as ep
-# Sync tests for the ep.make() function
-def test_mcp_env_make_appends_trailing_slash():
+# Sync tests for the await ep.make() function
+@pytest.mark.asyncio
+async def test_mcp_env_make_appends_trailing_slash():
     """
-    Verify that ep.make() appends a trailing slash to the MCP server URL if it's missing.
+    Verify that await ep.make() appends a trailing slash to the MCP server URL if it's missing.
     This prevents 307 redirects that can break HTTP clients.
     """
     base_url = "http://localhost:8000/mcp"
     corrected_url = "http://localhost:8000/mcp/"
-    # Use n and seeds to avoid needing a full dataset
-    envs = ep.make(base_url, n=1, seeds=[42])
+    with patch(
+        "eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
+        new_callable=AsyncMock,
+    ) as mock_init:
+        mock_init.return_value = None
+        envs = await ep.make(base_url, n=1, seeds=[42])
+        mock_init.assert_awaited_once()
     assert len(envs.sessions) == 1
-    # The session's base_url should have the trailing slash
     assert envs.sessions[0].base_url == corrected_url
-def test_mcp_env_make_keeps_existing_trailing_slash():
+@pytest.mark.asyncio
+async def test_mcp_env_make_keeps_existing_trailing_slash():
     """
-    Verify that ep.make() does not add an extra slash if one is already present.
+    Verify that await ep.make() does not add an extra slash if one is already present.
     """
     base_url = "http://localhost:8000/mcp/"
-    # Use n and seeds to avoid needing a full dataset
-    envs = ep.make(base_url, n=1, seeds=[42])
+    with patch(
+        "eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
+        new_callable=AsyncMock,
+    ) as mock_init:
+        mock_init.return_value = None
+        envs = await ep.make(base_url, n=1, seeds=[42])
+        mock_init.assert_awaited_once()
     assert len(envs.sessions) == 1
     # The session's base_url should remain unchanged