PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/utils/packaging_utils.py ADDED Viewed

@@ -0,0 +1,108 @@
+import logging
+import os
+import subprocess
+import sys
+import tempfile
+from typing import List, Optional
+logger = logging.getLogger(__name__)
+def get_pip_executable(venv_pip_path: Optional[str] = None) -> List[str]:
+    """Determines the pip executable command parts."""
+    if venv_pip_path and os.path.exists(venv_pip_path) and os.access(venv_pip_path, os.X_OK):
+        logger.info(f"Using specified pip executable: {venv_pip_path}")
+        return [venv_pip_path]
+    # Try to find pip in the current virtual environment's scripts/bin directory
+    # sys.executable should be /path/to/.venv/bin/python
+    # So, pip should be /path/to/.venv/bin/pip
+    # On Windows, it might be /path/to/.venv/Scripts/pip.exe
+    potential_pip_path = os.path.join(os.path.dirname(sys.executable), "pip")
+    if os.name == "nt":  # Windows check
+        potential_pip_path += ".exe"
+    if os.path.exists(potential_pip_path) and os.access(potential_pip_path, os.X_OK):
+        logger.info(f"Using auto-detected pip executable: {potential_pip_path}")
+        return [potential_pip_path]
+    # Fallback to sys.executable -m pip (should generally work if python is from the venv)
+    logger.info(f"Using pip via: {sys.executable} -m pip")
+    return [sys.executable, "-m", "pip"]
+def install_requirements(
+    requirements_list: List[str],
+    venv_pip_path: Optional[str] = None,  # User can specify e.g. ".venv/bin/pip"
+    log_output: bool = True,
+) -> None:
+    """
+    Installs a list of Python package requirements using pip.
+    Args:
+        requirements_list: A list of requirement strings (e.g., ["package_a==1.0", "package_b>=2.0"]).
+        venv_pip_path: Optional path to the specific pip executable to use.
+        log_output: If True, logs the stdout and stderr of the pip command.
+    """
+    if not requirements_list:
+        logger.debug("No requirements provided to install.")
+        return
+    unique_requirements = sorted(list(set(req.strip() for req in requirements_list if req.strip())))
+    if not unique_requirements:
+        logger.debug("No unique, non-empty requirements to install after stripping.")
+        return
+    pip_command_parts = get_pip_executable(venv_pip_path)
+    # Create a temporary requirements file
+    # delete=False is used because on Windows, a file opened for writing cannot be opened by another process.
+    # We will manually delete it in the finally block.
+    tmp_req_fd, tmp_req_file_path = tempfile.mkstemp(suffix=".txt", prefix="rk_reqs_")
+    try:
+        with os.fdopen(tmp_req_fd, "w") as tmp_req_file:
+            for req in unique_requirements:
+                tmp_req_file.write(req + "\n")
+        logger.info(
+            f"Attempting to install requirements: {unique_requirements} using pip command: {' '.join(pip_command_parts)} -r {tmp_req_file_path}"
+        )
+        command = pip_command_parts + ["install", "-r", tmp_req_file_path]
+        process = subprocess.run(
+            command,
+            check=True,  # Raise CalledProcessError on non-zero exit
+            capture_output=True,
+            text=True,  # Decodes stdout/stderr as text
+            encoding="utf-8",  # Explicit encoding
+            errors="replace",  # Handle potential encoding errors in pip output
+        )
+        if log_output and process.stdout:
+            logger.info(f"Pip install stdout:\n{process.stdout.strip()}")
+        # pip often uses stderr for progress/warnings even on success
+        if log_output and process.stderr:
+            logger.info(f"Pip install stderr:\n{process.stderr.strip()}")
+        logger.info(f"Successfully installed requirements: {unique_requirements}")
+    except subprocess.CalledProcessError as e:
+        error_message = f"Error installing requirements from {tmp_req_file_path}.\n"
+        error_message += f"Command: {' '.join(e.cmd)}\n"
+        if e.stdout:
+            error_message += f"Pip stdout:\n{e.stdout.strip()}\n"
+        if e.stderr:
+            error_message += f"Pip stderr:\n{e.stderr.strip()}\n"
+        logger.error(error_message)
+        raise RuntimeError(
+            f"Failed to install requirements: {unique_requirements}. Details:\n{e.stderr or e.stdout or str(e)}"
+        )
+    except FileNotFoundError:
+        logger.error(
+            f"Pip executable not found: {' '.join(pip_command_parts)}. Please ensure pip is installed and in PATH, or venv_pip_path is correct."
+        )
+        raise
+    finally:
+        if os.path.exists(tmp_req_file_path):
+            os.remove(tmp_req_file_path)
+            logger.debug(f"Removed temporary requirements file: {tmp_req_file_path}")

eval_protocol/utils/static_policy.py ADDED Viewed

@@ -0,0 +1,305 @@
+"""
+General Static Policy for MCP Environment Testing
+This policy provides a deterministic, non-LLM action sequence for fast iteration
+across different MCP environments. It can be configured with custom tool names
+and action sequences.
+This is useful for:
+- Fast testing of multi-session functionality
+- Debugging environment behavior
+- Performance testing without LLM overhead
+"""
+import asyncio
+import json
+import logging
+import os
+import random
+from typing import Any, Dict, List, Optional, Tuple, Union
+# Import the base policy and types for proper recording functionality
+from eval_protocol.mcp.types import LLMUsageStats, MCPToolCall
+from eval_protocol.playback_policy import PlaybackPolicyBase
+logger = logging.getLogger(__name__)
+class StaticPolicy(PlaybackPolicyBase):
+    """
+    Static policy that follows a predetermined action sequence.
+    Can be configured for different environments with custom tool names and actions.
+    """
+    def __init__(
+        self,
+        tool_name: str,
+        action_sequence: Optional[List[str]] = None,
+        available_actions: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize static policy with recording/playback support.
+        Args:
+            tool_name: Name of the tool to call for actions (e.g., "lake_move", "lander_action")
+            action_sequence: List of actions to execute. If None, uses a default sequence.
+            available_actions: List of all available actions for this environment.
+            **kwargs: Additional arguments passed to PlaybackPolicyBase
+        """
+        # Initialize parent class for recording/playback functionality
+        super().__init__(**kwargs)
+        self.tool_name = tool_name
+        self.available_actions = available_actions or []
+        # Set default action sequence if not provided
+        if action_sequence is None:
+            if self.available_actions:
+                # Use first few actions as default sequence
+                self.action_sequence = self.available_actions[: min(6, len(self.available_actions))]
+            else:
+                self.action_sequence = ["DEFAULT_ACTION"]
+        else:
+            self.action_sequence = action_sequence
+        self.step_counts = {}  # Track step count per environment
+    async def _generate_live_tool_calls(
+        self,
+        tool_schemas: List[Dict],
+        env_index: int,
+        conversation_history: List[Dict[str, Any]],
+    ) -> Tuple[List[MCPToolCall], LLMUsageStats]:
+        """
+        Generate tool calls in live mode using the static action sequence.
+        This implements the abstract method from PlaybackPolicyBase.
+        Args:
+            tool_schemas: Available tools for this environment
+            env_index: Environment index
+            conversation_history: Current conversation history for this environment
+        Returns:
+            List of MCPToolCall objects
+        """
+        # Get current step count for this environment
+        step_count = self.step_counts.get(env_index, 0)
+        # Determine action based on step count
+        if step_count < len(self.action_sequence):
+            action = self.action_sequence[step_count]
+        else:
+            # After sequence completes, repeat the last action
+            action = self.action_sequence[-1]
+        # Create tool call in MCPToolCall format
+        tool_call = MCPToolCall(tool_name=self.tool_name, arguments={"action": action})
+        # Update step count
+        self.step_counts[env_index] = step_count + 1
+        logger.debug(f"🎮 Env {env_index} step {step_count}: {action}")
+        return [tool_call], None
+    def add_tool_response(
+        self,
+        env_index: int,
+        tool_call: MCPToolCall,
+        tool_response: Union[str, List[Dict[str, Any]]],
+        conversation_history: List[Dict[str, Any]],
+        reward: float = 0.0,
+        terminated: bool = False,
+        info: Dict[str, Any] = None,
+    ):
+        """Add tool call and response to conversation history for recording."""
+        # Find the most recent assistant message with tool calls to get the correct call_id
+        call_id = None
+        for i in range(len(conversation_history) - 1, -1, -1):
+            if conversation_history[i]["role"] == "assistant" and "tool_calls" in conversation_history[i]:
+                # Find the tool call that matches our tool_name
+                for tc in conversation_history[i]["tool_calls"]:
+                    if tc["function"]["name"] == tool_call.tool_name:
+                        call_id = tc["id"]
+                        break
+                if call_id:
+                    break
+        # Fallback if no matching tool call found
+        if not call_id:
+            call_id = f"call_{env_index}_{len(conversation_history)}"
+        # Add tool response with control plane metadata
+        tool_message = {
+            "role": "tool",
+            "tool_call_id": call_id,
+            "content": tool_response,
+        }
+        # Add control plane metadata if provided
+        if reward != 0.0 or terminated or info:
+            tool_message["metadata"] = {
+                "reward": reward,
+                "terminated": terminated,
+                "info": info or {},
+            }
+        conversation_history.append(tool_message)
+    def log_conversation_state_for_playback(
+        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
+    ):
+        """
+        Log the current conversation state in the format required for playback.
+        Expected format: {"env_index": 0, "step": 0, "messages": [{..}, {..}]}
+        Args:
+            env_index: Environment index
+            step: Current step number
+            conversation_history: List of conversation messages
+        """
+        # Use EP_PLAYBACK_FILE environment variable for recording
+        playback_file = os.environ.get("EP_PLAYBACK_FILE")
+        if not playback_file:
+            return  # No recording file specified
+        playback_entry = {
+            "env_index": env_index,
+            "step": step,
+            "messages": conversation_history.copy(),
+        }
+        with open(playback_file, "a") as f:
+            f.write(json.dumps(playback_entry) + "\n")
+    @property
+    def model_id(self) -> str:
+        """Model identifier for static policy."""
+        return f"static-policy-{self.tool_name}-v1"
+class RandomPolicy(PlaybackPolicyBase):
+    """
+    Random policy that selects random actions.
+    Useful for testing environment robustness.
+    """
+    def __init__(
+        self,
+        tool_name: str,
+        available_actions: List[str],
+        seed: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Initialize random policy with recording/playback support.
+        Args:
+            tool_name: Name of the tool to call for actions
+            available_actions: List of all available actions for this environment
+            seed: Random seed for reproducibility
+            **kwargs: Additional arguments passed to PlaybackPolicyBase
+        """
+        # Initialize parent class for recording/playback functionality
+        super().__init__(**kwargs)
+        self.tool_name = tool_name
+        self.available_actions = available_actions
+        self.random = random.Random(seed)
+    async def _generate_live_tool_calls(
+        self,
+        tool_schemas: List[Dict],
+        env_index: int,
+        conversation_history: List[Dict[str, Any]],
+    ) -> Tuple[List[MCPToolCall], LLMUsageStats]:
+        """
+        Generate random tool calls in live mode.
+        Args:
+            tool_schemas: Available tools for this environment
+            env_index: Environment index
+            conversation_history: Current conversation history for this environment
+        Returns:
+            List of MCPToolCall objects
+        """
+        # Select random action
+        action = self.random.choice(self.available_actions)
+        # Create tool call
+        tool_call = MCPToolCall(tool_name=self.tool_name, arguments={"action": action})
+        logger.debug(f"🎲 Env {env_index}: {action}")
+        return [tool_call], None
+    def add_tool_response(
+        self,
+        env_index: int,
+        tool_call: MCPToolCall,
+        tool_response: Union[str, List[Dict[str, Any]]],
+        conversation_history: List[Dict[str, Any]],
+        reward: float = 0.0,
+        terminated: bool = False,
+        info: Dict[str, Any] = None,
+    ):
+        """Add tool call and response to conversation history for recording."""
+        # Find the most recent assistant message with tool calls
+        call_id = None
+        for i in range(len(conversation_history) - 1, -1, -1):
+            if conversation_history[i]["role"] == "assistant" and "tool_calls" in conversation_history[i]:
+                for tc in conversation_history[i]["tool_calls"]:
+                    if tc["function"]["name"] == tool_call.tool_name:
+                        call_id = tc["id"]
+                        break
+                if call_id:
+                    break
+        if not call_id:
+            call_id = f"call_{env_index}_{len(conversation_history)}"
+        # Add tool response with control plane metadata
+        tool_message = {
+            "role": "tool",
+            "tool_call_id": call_id,
+            "content": tool_response,
+        }
+        # Add control plane metadata if provided
+        if reward != 0.0 or terminated or info:
+            tool_message["metadata"] = {
+                "reward": reward,
+                "terminated": terminated,
+                "info": info or {},
+            }
+        conversation_history.append(tool_message)
+    def log_conversation_state_for_playback(
+        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
+    ):
+        """Log the current conversation state for playback recording."""
+        playback_file = os.environ.get("EP_PLAYBACK_FILE")
+        if not playback_file:
+            return
+        playback_entry = {
+            "env_index": env_index,
+            "step": step,
+            "messages": conversation_history.copy(),
+        }
+        with open(playback_file, "a") as f:
+            f.write(json.dumps(playback_entry) + "\n")
+    @property
+    def model_id(self) -> str:
+        """Model identifier for random policy."""
+        return f"random-policy-{self.tool_name}-v1"