PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/mcp/process_manager.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""
+Generic process manager for MCP servers running in isolated Conda environments.
+This module provides a reusable helper class to manage the lifecycle of server
+subprocesses within dedicated Conda environments, ensuring dependency isolation.
+"""
+import os
+import socket
+import subprocess
+import time
+import uuid
+from typing import Dict, Tuple
+class CondaServerProcessManager:
+    """Manages the lifecycle of server subprocesses inside Conda environments.
+    Every server started through this manager gets its own freshly cloned
+    Conda environment and a TCP port picked from ``port_range``. Stopping a
+    server terminates the process and removes that environment again.
+    """
+    def __init__(
+        self,
+        script_path: str,
+        requirements_path: str,
+        conda_base_env: str = "base",
+        port_range: Tuple[int, int] = (10000, 11000),
+    ):
+        """
+        Initialize the process manager.
+        Args:
+            script_path: Path to the server script to run
+            requirements_path: Path to requirements.txt for the environment
+            conda_base_env: Base conda environment to clone from
+            port_range: Tuple of (min_port, max_port) for server instances;
+                min_port is inclusive, max_port is exclusive
+        """
+        self.script_path = script_path
+        self.requirements_path = requirements_path
+        self.conda_base_env = conda_base_env
+        self.port_range = port_range
+        self.processes: Dict[int, Tuple[subprocess.Popen, str]] = {}  # port -> (process, conda_env_name)
+        self.used_ports: set = set()  # Ports handed out by find_free_port and not yet released
+    def _create_conda_env(self, env_name: str):
+        """
+        Creates a new conda environment by cloning the base, then pip-installs
+        the requirements file into it.
+        Raises:
+            subprocess.CalledProcessError: If either conda command exits non-zero
+        """
+        print(f"Creating conda environment '{env_name}'...")
+        # Clone the base environment
+        clone_cmd = [
+            "conda",
+            "create",
+            "--name",
+            env_name,
+            "--clone",
+            self.conda_base_env,
+            "-y",
+        ]
+        subprocess.run(clone_cmd, check=True, capture_output=True, text=True)
+        # Install specific requirements into the new environment
+        pip_install_cmd = [
+            "conda",
+            "run",
+            "-n",
+            env_name,
+            "pip",
+            "install",
+            "-r",
+            self.requirements_path,
+        ]
+        subprocess.run(pip_install_cmd, check=True, capture_output=True, text=True)
+        print(f"Environment '{env_name}' created and dependencies installed.")
+    def _remove_conda_env(self, env_name: str, check: bool = True):
+        """Runs ``conda env remove`` for env_name; raises on failure only when check is True."""
+        rm_cmd = ["conda", "env", "remove", "--name", env_name, "-y"]
+        subprocess.run(rm_cmd, check=check, capture_output=True, text=True)
+    def find_free_port(self) -> int:
+        """
+        Finds and returns an available TCP port within the configured range.
+        Ports not currently tracked in ``used_ports`` are tried first, in
+        random order; tracked ports are only tried as a fallback. Every port
+        in the range is probed at most once, so the search cannot give up
+        while an untested port remains.
+        Returns:
+            Available port number (also recorded in ``used_ports``)
+        Raises:
+            RuntimeError: If no ports are available in the range
+        """
+        import random
+        min_port, max_port = self.port_range
+        fresh_ports = [p for p in range(min_port, max_port) if p not in self.used_ports]
+        tracked_ports = [p for p in range(min_port, max_port) if p in self.used_ports]
+        # Randomize within each group so concurrent managers rarely collide
+        random.shuffle(fresh_ports)
+        random.shuffle(tracked_ports)
+        for candidate_port in fresh_ports + tracked_ports:
+            # Test if the port is actually available by binding to it
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("localhost", candidate_port))
+            except OSError:
+                # Port is in use, try next one
+                continue
+            self.used_ports.add(candidate_port)
+            print(f"Allocated port {candidate_port} from range {min_port}-{max_port}")
+            return candidate_port
+        # No available ports found
+        raise RuntimeError(f"No available ports in range {min_port}-{max_port}. Used ports: {len(self.used_ports)}")
+    def start_server(self, seed: int) -> int:
+        """
+        Creates a new Conda env and starts a server instance within it.
+        Args:
+            seed: Value forwarded to the server script as ``--seed``
+        Returns:
+            The port the server was started with
+        Raises:
+            RuntimeError: If no port is available
+            subprocess.CalledProcessError: If environment creation fails
+            OSError: If the ``conda`` executable cannot be launched
+        """
+        port = self.find_free_port()
+        env_name = f"mcp-sim-env-{uuid.uuid4().hex[:8]}"
+        try:
+            self._create_conda_env(env_name)
+            env = os.environ.copy()
+            env["PORT"] = str(port)
+            # Command to run the server inside the new conda environment
+            cmd = [
+                "conda",
+                "run",
+                "-n",
+                env_name,
+                "python",
+                self.script_path,
+                "--port",
+                str(port),
+                "--seed",
+                str(seed),
+            ]
+            # env must be passed explicitly, otherwise the PORT variable set
+            # above never reaches the server process.
+            # NOTE(review): stdout/stderr are piped but nothing in this class
+            # reads them; a chatty server may block once the pipe buffer
+            # fills -- confirm whether callers drain these pipes.
+            process = subprocess.Popen(
+                cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+            )
+        except Exception:
+            # Startup failed: release the reserved port and try to delete a
+            # possibly half-built environment before re-raising.
+            self.used_ports.discard(port)
+            try:
+                self._remove_conda_env(env_name, check=False)
+            except OSError:
+                pass  # deliberate best-effort (e.g. conda itself is missing)
+            raise
+        self.processes[port] = (process, env_name)
+        time.sleep(3)  # Give the server more time to start up after env creation
+        return port
+    def stop_server(self, port: int):
+        """
+        Stops the server and removes its Conda environment.
+        Does nothing when no server is tracked for ``port``. If removing the
+        environment fails, the entry stays tracked so the call can be retried.
+        Raises:
+            subprocess.CalledProcessError: If ``conda env remove`` exits non-zero
+        """
+        if port not in self.processes:
+            return
+        process, env_name = self.processes[port]
+        print(f"Stopping server on port {port} and cleaning up environment '{env_name}'")
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            print(f"Force killing server on port {port}")
+            process.kill()
+            process.wait()
+        # The process has exited; close our ends of its pipes
+        for stream in (process.stdout, process.stderr):
+            if stream is not None:
+                stream.close()
+        # Remove the conda environment
+        print(f"Removing conda environment '{env_name}'...")
+        self._remove_conda_env(env_name)
+        # Clean up tracking
+        del self.processes[port]
+        self.used_ports.discard(port)
+        print(f"✅ Environment '{env_name}' removed and port {port} freed")
+    def stop_all(self):
+        """
+        Stops all managed servers and cleans up all environments.
+        A failure while stopping one server does not prevent the remaining
+        servers from being stopped; the first error is re-raised at the end.
+        """
+        first_error = None
+        for port in list(self.processes):
+            try:
+                self.stop_server(port)
+            except Exception as exc:
+                print(f"Failed to stop server on port {port}: {exc}")
+                if first_error is None:
+                    first_error = exc
+        if first_error is not None:
+            raise first_error

eval_protocol/mcp/session/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""
+MCP Session Management
+This module handles session management and vector environment operations.
+"""
+from .manager import GeneralMCPVectorEnv
+# Names exported by `from eval_protocol.mcp.session import *`.
+__all__ = [
+    "GeneralMCPVectorEnv",
+]

eval_protocol/mcp/session/manager.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""
+Session Management and Vector Environment
+Handles MCPSession management and vector environment operations.
+Extracted from mcp_env.py to improve modularity.
+"""
+import asyncio
+import json
+import logging
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from ..execution.manager import ExecutionManager
+from ..types import DatasetRow, MCPSession, MCPToolCall
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+# TODO: rename this file or the other manager.py
+# (presumably ..execution/manager.py, which this module imports -- confirm)
+class GeneralMCPVectorEnv:
+    """
+    General MCP vector environment that works with any MCP server.
+    Manages on-demand MCP sessions for rollouts.
+    Driven by dataset prompts and MCP tool discovery, not hardcoded logic.
+    Session ``i`` is paired with dataset row ``i``; all server communication
+    is delegated to the ExecutionManager's ``connection_manager``.
+    """
+    def __init__(
+        self,
+        sessions: List[MCPSession],
+        dataset_rows: List[DatasetRow],
+        user_prompt_formatter: Optional[Callable] = None,
+    ):
+        """
+        Initialize with dataset-driven configuration.
+        Args:
+            sessions: MCP sessions
+            dataset_rows: Full dataset rows with prompts and context
+            user_prompt_formatter: Callback to format user prompts dynamically;
+                called as formatter(template, observation, context).
+                Defaults to ``_default_formatter``.
+        Raises:
+            ValueError: If sessions and dataset_rows differ in length
+        """
+        # Validate first so a bad call does not build an ExecutionManager
+        if len(sessions) != len(dataset_rows):
+            raise ValueError(
+                f"Sessions ({len(sessions)}) and dataset rows ({len(dataset_rows)}) must have same length"
+            )
+        self.sessions = sessions
+        self.dataset_rows = dataset_rows
+        self.user_prompt_formatter = user_prompt_formatter or self._default_formatter
+        self.n = len(sessions)
+        self.tool_schemas = []  # Discovered from MCP servers
+        self.execution_manager = ExecutionManager()
+        self.usage_stats = {}  # llm usage stats for monitoring
+    def _validate_env_index(self, env_index: int) -> None:
+        """Raises ValueError unless 0 <= env_index < self.n."""
+        if env_index >= self.n or env_index < 0:
+            raise ValueError(f"Environment index {env_index} out of range [0, {self.n})")
+    async def reset(self, session: MCPSession) -> Tuple[Any, List[Dict]]:
+        """
+        Reset a single session - establish connection, get tools and initial state.
+        NOTE(review): an earlier docstring described this as thread-safe;
+        nothing in this method enforces that -- confirm against the
+        connection manager before calling it from worker threads.
+        Returns:
+            Tuple of (initial_observation, tool_schemas)
+        """
+        connection_manager = self.execution_manager.connection_manager
+        # Establish a persistent session for each environment.
+        await connection_manager.initialize_session(session)
+        # Get available tools from MCP server
+        tool_schemas = await connection_manager.discover_tools(session)
+        # PROPER MCP PATTERN: Get initial state from resources during session establishment
+        initial_observation = await connection_manager.get_initial_state(session)
+        # Update session state
+        session.terminated = False
+        session.last_observation = initial_observation
+        return initial_observation, tool_schemas
+    async def step(self, env_index: int, tool_call: MCPToolCall) -> Tuple[Any, float, bool, Dict]:
+        """
+        Execute a tool call for a single environment.
+        Args:
+            env_index: Index of the environment to step
+            tool_call: Tool call to execute
+        Returns:
+            observation: New observation after executing the tool call
+            reward: Reward from the environment
+            done: Whether the environment is terminated
+            info: Additional info from the environment
+        Raises:
+            ValueError: If env_index is outside [0, n)
+        """
+        self._validate_env_index(env_index)
+        session = self.sessions[env_index]
+        # An already-terminated session just replays its last observation
+        if session.terminated:
+            return session.last_observation, 0.0, True, {}
+        # Handle special playback termination signal
+        if tool_call.tool_name == "_playback_terminate":
+            logger.info(f"🎬 Session {session.session_id}: Received playback termination signal")
+            session.terminated = True
+            return session.last_observation, 0.0, True, {"playback_terminated": True}
+        # Handle special no-tool-call signal
+        if tool_call.tool_name == "_no_tool_call":
+            logger.info(f"🏁 Session {session.session_id}: No tool call generated, episode likely ended")
+            session.terminated = True
+            return (
+                session.last_observation,
+                0.0,
+                True,
+                {
+                    "no_tool_call": True,
+                    "reason": tool_call.arguments.get("reason", "unknown"),
+                },
+            )
+        # Execute the tool call via MCP protocol
+        observation, reward, done, info = await self.execution_manager.connection_manager.call_tool(
+            session, tool_call.tool_name, tool_call.arguments
+        )
+        # Update session state
+        session.last_observation = observation
+        session.terminated = done
+        return observation, reward, done, info
+    def format_user_prompt(self, env_index: int, observation: Any) -> Union[str, List[Dict[str, Any]]]:
+        """
+        Format user prompt dynamically for a single environment based on current observation.
+        Raises:
+            ValueError: If env_index is outside [0, n)
+        """
+        self._validate_env_index(env_index)
+        dataset_row = self.dataset_rows[env_index]
+        # Use the callback to format the prompt
+        return self.user_prompt_formatter(
+            dataset_row.user_prompt_template,
+            observation,
+            dataset_row.environment_context,
+        )
+    def format_tool_response(self, obs: Any) -> Union[str, List[Dict[str, Any]]]:
+        """
+        Format observation to tool response. If there's an image_url, it will be returned as a multimodal content. If not, it will be returned as a string.
+        This is what gets filled in for the tool responses content.
+        The observation itself is never modified.
+        """
+        if isinstance(obs, dict) and obs.get("image_url"):
+            image_url = obs["image_url"]["url"]
+            # Build the text part from a filtered copy: the caller's dict may
+            # be the same object stored as session.last_observation.
+            text_obs = {key: value for key, value in obs.items() if key != "image_url"}
+            return [
+                {
+                    "type": "text",
+                    "text": json.dumps(text_obs),
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                },
+            ]
+        return json.dumps(obs) if isinstance(obs, dict) else str(obs)
+    def _default_formatter(self, template: str, obs: Any, context: Dict) -> Union[str, List[Dict[str, Any]]]:
+        """
+        Default user prompt formatter.
+        Extracts meaningful display data from MCP observations.
+        For FrozenLake: extracts grid_layout if available, otherwise uses raw observation.
+        For visual environments: returns multimodal content with both text and images.
+        The observation itself is never modified.
+        Args:
+            template: ``str.format`` template; receives ``observation`` plus the context keys
+            obs: Observation from the environment
+            context: Extra template fields; None is treated as empty
+        Returns:
+            Either a string (text-only) or a list of content parts (multimodal content)
+        """
+        # Extract formatted display from observation if available
+        display_obs = obs
+        image_dict = None
+        if isinstance(obs, dict):
+            # For visual environments like LunarLander, we have image_url
+            if "image_url" in obs:
+                image_dict = obs["image_url"]
+                # Filtered copy instead of pop() so the caller's dict stays intact
+                display_obs = {key: value for key, value in obs.items() if key != "image_url"}
+            # For other structured observations, try to extract meaningful display
+            elif "observation" in obs and obs["observation"] != "default_initial_state":
+                display_obs = obs["observation"]
+            # If we still have default_initial_state, try to use position info
+            elif obs.get("observation") == "default_initial_state" and "session_id" in obs:
+                # This is the fallback case - we should have gotten the proper initial state from MCP resources
+                display_obs = (
+                    f"Initial game state (Session: {obs['session_id']})\nWaiting for grid data from server..."
+                )
+        formatted_prompt = template.format(observation=display_obs, **(context or {}))
+        # If we have image data, return multimodal content
+        if image_dict:
+            return [
+                {
+                    "type": "text",
+                    "text": formatted_prompt,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": image_dict,
+                },
+            ]
+        return formatted_prompt
+    async def close(self):
+        """Closes all MCP sessions."""
+        print(f"🧹 Closing {self.n} MCP sessions...")
+        await self.execution_manager.close_sessions(self.sessions)
+        print("✅ All MCP sessions closed.")
+# Keep the old MCPVectorEnv for backward compatibility:
+# MCPVectorEnv is bound to the very same class object as GeneralMCPVectorEnv.
+MCPVectorEnv = GeneralMCPVectorEnv