PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/mcp/execution/policy.py ADDED Viewed

@@ -0,0 +1,421 @@
+"""
+LLM Policy Execution and Tool Calling
+Base classes and implementations for LLM policies that work with MCP environments.
+Extracted from mcp_env.py to improve modularity and enable OpenAI integration.
+"""
+import asyncio
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional, Tuple, Union
+from concurrent.futures import ThreadPoolExecutor
+from .base_policy import LLMBasePolicy
+from ..types import LLMUsageStats, MCPToolCall
+# Try to import FireworksPolicy from separate module - it's optional
+try:
+    from .fireworks_policy import FireworksPolicy
+except ImportError:
+    # FireworksPolicy not available (fireworks-ai package not installed)
+    FireworksPolicy = None
+logger = logging.getLogger(__name__)
class OpenAIPolicy(LLMBasePolicy):
    """
    OpenAI policy implementation that works with ANY MCP environment via tool calling.

    NO environment-specific logic - everything comes from MCP tools and dataset prompts.
    Supports both live mode (using OpenAI API) and playback mode (replaying recorded
    trajectories). In playback mode no OpenAI client is created and ``self.client``
    is ``None``.
    """

    def __init__(
        self,
        model_id: str,
        temperature: float = 0.2,
        max_tokens: int = 4096,
        max_tools_per_turn: Optional[int] = None,
        **kwargs,
    ):
        """
        Initialize OpenAI policy.

        Args:
            model_id: OpenAI model identifier (e.g., "gpt-4o", "gpt-4o-mini", "gpt-4-turbo")
            temperature: Sampling temperature (0.0 to 2.0)
            max_tokens: Maximum tokens to generate per request
            max_tools_per_turn: Maximum number of tool calls per turn (None = unlimited, 1 = single tool)
            **kwargs: Forwarded unchanged to ``LLMBasePolicy.__init__``.

        Raises:
            ImportError: Live mode and the ``openai`` package is not installed.
            ValueError: Live mode and ``OPENAI_API_KEY`` is unset or empty.
            RuntimeError: Live mode and the OpenAI client could not be constructed.
        """
        super().__init__(model_id, temperature, max_tokens, max_tools_per_turn, **kwargs)

        # NOTE(review): `_is_playback` is not defined in this class; presumably the
        # base-class initializer sets it - confirm against LLMBasePolicy.
        if self._is_playback:
            # In playback mode, skip expensive client initialization
            self.client = None
            logger.info("🎬 Playback mode: Skipping OpenAI client initialization for performance")
            return

        # The SDK is imported lazily so this module stays importable without it.
        try:
            from openai import AsyncOpenAI
        except ImportError as exc:
            raise ImportError(
                "The 'openai' package is required for OpenAIPolicy. " "Please install it with 'pip install openai'"
            ) from exc

        # Verify authentication
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OPENAI_API_KEY environment variable is required "
                "to use OpenAIPolicy. Set this variable before running."
            )

        # Initialize the OpenAI client, keeping the original exception as the cause.
        try:
            self.client = AsyncOpenAI(api_key=api_key)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize OpenAI client for '{self.model_id}': {e}") from e
        logger.info(f"✅ Initialized OpenAI client: {self.model_id}")

    def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]:
        """
        Clean messages by removing metadata fields that OpenAI API doesn't accept.

        Args:
            messages: Conversation messages with potential metadata

        Returns:
            New list of shallow-copied messages without the top-level ``"metadata"``
            key; the input messages are not modified.
        """
        return [{key: value for key, value in msg.items() if key != "metadata"} for msg in messages]

    async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
        """
        Make an OpenAI API call.

        Args:
            messages: Conversation messages (may contain metadata)
            tools: Available tools in OpenAI format (may be empty)

        Returns:
            Dict with ``"choices"`` (a single entry whose ``"message"`` holds
            ``"content"`` and a ``"tool_calls"`` list, empty when the model made no
            tool calls) and ``"usage"`` token counts.

        Raises:
            RuntimeError: The client was not initialized (e.g. playback mode).
        """
        if self.client is None:
            raise RuntimeError("OpenAI client not initialized")

        current_request: Dict[str, Any] = {
            "model": self.model_id,
            # Clean messages by removing metadata before sending to API
            "messages": self._clean_messages_for_api(messages),
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }
        # Only send `tools` when there is at least one, mirroring AnthropicPolicy;
        # an empty tools array is not a valid request payload.
        if tools:
            current_request["tools"] = tools

        # Make the API call
        response = await self.client.chat.completions.create(**current_request)

        # Convert OpenAI response to standard format
        message = response.choices[0].message
        tool_calls = [
            {
                "id": tc.id,
                "type": tc.type,
                "function": {
                    "name": tc.function.name,
                    "arguments": tc.function.arguments,
                },
            }
            for tc in (message.tool_calls or [])
        ]

        # Guard against a response without usage accounting instead of raising
        # AttributeError after a successful (and billed) call.
        usage = response.usage
        return {
            "choices": [{"message": {"content": message.content, "tool_calls": tool_calls}}],
            "usage": {
                "prompt_tokens": usage.prompt_tokens if usage is not None else 0,
                "completion_tokens": usage.completion_tokens if usage is not None else 0,
                "total_tokens": usage.total_tokens if usage is not None else 0,
            },
        }

    def _convert_mcp_tools_to_llm_format(self, mcp_tools: List[Dict]) -> List[Dict]:
        """
        Convert MCP tool schemas to OpenAI function calling format.

        Args:
            mcp_tools: List of MCP tool definitions. Each needs a ``"name"``;
                ``"description"`` and the parameter schema are optional. The schema
                is read from ``"input_schema"`` or, failing that, ``"inputSchema"``.

        Returns:
            List of OpenAI-compatible tool definitions
        """
        openai_tools = []
        for mcp_tool in mcp_tools:
            name = mcp_tool["name"]
            parameters = (
                mcp_tool.get("input_schema")
                or mcp_tool.get("inputSchema")
                # Fresh dict per tool so callers can mutate one result safely.
                or {"type": "object", "properties": {}, "required": []}
            )
            openai_tools.append(
                {
                    "type": "function",
                    "function": {
                        "name": name,
                        "description": mcp_tool.get("description", f"Execute {name} action"),
                        "parameters": parameters,
                    },
                }
            )
        return openai_tools
class AnthropicPolicy(LLMBasePolicy):
    """
    Anthropic policy implementation that works with ANY MCP environment via tool calling.

    NO environment-specific logic - everything comes from MCP tools and dataset prompts.
    Supports both live mode (using Anthropic API) and playback mode (replaying recorded
    trajectories). In playback mode no Anthropic client is created and ``self.client``
    is ``None``.
    """

    def __init__(
        self,
        model_id: str,
        temperature: float = 0.2,
        max_tokens: int = 4096,
        max_tools_per_turn: Optional[int] = None,
        **kwargs,
    ):
        """
        Initialize Anthropic policy.

        Args:
            model_id: Anthropic model identifier (e.g., "claude-3-5-sonnet-20241022", "claude-3-opus-20240229")
            temperature: Sampling temperature (0.0 to 1.0)
            max_tokens: Maximum tokens to generate per request
            max_tools_per_turn: Maximum number of tool calls per turn (None = unlimited, 1 = single tool)
            **kwargs: Forwarded unchanged to ``LLMBasePolicy.__init__``.

        Raises:
            ImportError: Live mode and the ``anthropic`` package is not installed.
            ValueError: Live mode and ``ANTHROPIC_API_KEY`` is unset or empty.
            RuntimeError: Live mode and the Anthropic client could not be constructed.
        """
        super().__init__(model_id, temperature, max_tokens, max_tools_per_turn, **kwargs)

        # NOTE(review): `_is_playback` is not defined in this class; presumably the
        # base-class initializer sets it - confirm against LLMBasePolicy.
        if self._is_playback:
            # In playback mode, skip expensive client initialization
            self.client = None
            logger.info("🎬 Playback mode: Skipping Anthropic client initialization for performance")
            return

        # The SDK is imported lazily so this module stays importable without it.
        try:
            from anthropic import AsyncAnthropic
        except ImportError as exc:
            raise ImportError(
                "The 'anthropic' package is required for AnthropicPolicy. "
                "Please install it with 'pip install anthropic'"
            ) from exc

        # Verify authentication
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            raise ValueError(
                "ANTHROPIC_API_KEY environment variable is required "
                "to use AnthropicPolicy. Set this variable before running."
            )

        # Initialize the Anthropic client, keeping the original exception as the cause.
        try:
            self.client = AsyncAnthropic(api_key=api_key)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Anthropic client for '{self.model_id}': {e}") from e
        logger.info(f"✅ Initialized Anthropic client: {self.model_id}")

    @staticmethod
    def _parse_tool_arguments(arguments: Any) -> Any:
        """Return tool-call arguments as a decoded object (JSON strings are parsed, empty ones become ``{}``)."""
        if arguments is None:
            return {}
        if isinstance(arguments, str):
            # An empty string is not valid JSON; treat it as "no arguments".
            return json.loads(arguments) if arguments.strip() else {}
        return arguments

    def _assistant_content_blocks(self, message: Dict) -> List[Dict]:
        """Build Anthropic content blocks (text + tool_use) from an OpenAI-style assistant message."""
        content_blocks: List[Dict] = []
        # Add text content if present
        if message.get("content"):
            content_blocks.append({"type": "text", "text": message["content"]})
        # `tool_calls` may be present but None (this class's own _make_llm_call
        # emits that shape), so normalize before iterating.
        for tool_call in message.get("tool_calls") or []:
            if tool_call.get("type") != "function":
                continue
            function = tool_call["function"]
            content_blocks.append(
                {
                    "type": "tool_use",
                    "id": tool_call["id"],
                    "name": function["name"],
                    "input": self._parse_tool_arguments(function["arguments"]),
                }
            )
        return content_blocks

    def _clean_messages_for_api(self, messages: List[Dict]) -> Tuple[List[Dict], Optional[str]]:
        """
        Clean messages by removing metadata fields, extracting system message, and converting tool messages.

        Anthropic handles system messages separately and doesn't support "tool" role messages.
        Tool results are converted to "user" messages with tool_result content blocks;
        consecutive tool results are grouped into a single user message. Assistant
        messages carrying a ``"tool_calls"`` key are converted to text / tool_use
        content blocks, and are dropped when that leaves no content at all.

        Args:
            messages: Conversation messages with potential metadata and system messages

        Returns:
            Tuple of (clean_messages_without_system, system_message_content). If
            several system messages are present, the last one wins.

        Raises:
            ValueError: A tool call carries a non-empty arguments string that is
                not valid JSON (``json.JSONDecodeError``).
        """
        clean_messages: List[Dict] = []
        system_message = None
        # Content list of the tool-result user message currently being filled.
        # Only lists created here are tracked, so caller-owned data is never mutated.
        open_tool_results: Optional[List[Dict]] = None

        for msg in messages:
            # Shallow copy without the metadata field
            clean_msg = {key: value for key, value in msg.items() if key != "metadata"}
            role = clean_msg.get("role")

            if role == "system":
                # Extract system message separately - Anthropic handles it differently
                system_message = clean_msg["content"]
                continue

            if role == "tool":
                # Anthropic expects tool results as content blocks in user messages
                result_block = {
                    "type": "tool_result",
                    "tool_use_id": clean_msg.get("tool_call_id", "unknown"),
                    "content": clean_msg.get("content", ""),
                }
                if open_tool_results is None:
                    open_tool_results = [result_block]
                    clean_messages.append({"role": "user", "content": open_tool_results})
                else:
                    open_tool_results.append(result_block)
                continue

            # Any other message ends the current run of tool results.
            open_tool_results = None

            if role == "assistant" and "tool_calls" in clean_msg:
                # Anthropic uses content blocks instead of a tool_calls field
                content_blocks = self._assistant_content_blocks(clean_msg)
                if content_blocks:
                    clean_messages.append({"role": "assistant", "content": content_blocks})
                else:
                    # Neither text nor tool calls: an empty content list is not a
                    # meaningful turn, so leave it out of the request.
                    logger.debug("Skipping assistant message with no text and no tool calls")
                continue

            clean_messages.append(clean_msg)

        return clean_messages, system_message

    async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
        """
        Make an Anthropic API call.

        Args:
            messages: Conversation messages (may contain metadata and system messages)
            tools: Available tools in Anthropic format (omitted from the request when empty)

        Returns:
            API response in OpenAI-compatible format: ``"choices"`` with a single
            ``"message"`` (``"content"`` is the concatenated text blocks,
            ``"tool_calls"`` is a list or ``None`` when there are none) and
            ``"usage"`` token counts.

        Raises:
            RuntimeError: The client was not initialized (e.g. playback mode).
        """
        if self.client is None:
            raise RuntimeError("Anthropic client not initialized")

        # Clean messages and extract system message
        clean_messages, system_message = self._clean_messages_for_api(messages)
        current_request: Dict[str, Any] = {
            "model": self.model_id,
            "messages": clean_messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }
        # Add system message if present
        if system_message:
            current_request["system"] = system_message
        # Add tools if present
        if tools:
            current_request["tools"] = tools

        # Make the API call
        response = await self.client.messages.create(**current_request)

        # Convert Anthropic response to OpenAI-compatible format in one pass.
        tool_calls = []
        text_parts = []
        for content_block in getattr(response, "content", None) or []:
            block_type = getattr(content_block, "type", None)
            if block_type == "tool_use":
                tool_calls.append(
                    {
                        "id": content_block.id,
                        "type": "function",
                        "function": {
                            "name": content_block.name,
                            "arguments": json.dumps(content_block.input),
                        },
                    }
                )
            elif block_type == "text":
                # Keep every text block; a response may contain more than one.
                text_parts.append(content_block.text)

        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        return {
            "choices": [
                {
                    "message": {
                        "content": "".join(text_parts),
                        # None (not []) is kept for callers that rely on it.
                        "tool_calls": tool_calls if tool_calls else None,
                    }
                }
            ],
            "usage": {
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens,
            },
        }

    def _convert_mcp_tools_to_llm_format(self, mcp_tools: List[Dict]) -> List[Dict]:
        """
        Convert MCP tool schemas to Anthropic tool calling format.

        Args:
            mcp_tools: List of MCP tool definitions. Each needs a ``"name"``;
                ``"description"`` and the parameter schema are optional. The schema
                is read from ``"input_schema"`` or, failing that, ``"inputSchema"``.

        Returns:
            List of Anthropic-compatible tool definitions
        """
        anthropic_tools = []
        for mcp_tool in mcp_tools:
            name = mcp_tool["name"]
            input_schema = (
                mcp_tool.get("input_schema")
                or mcp_tool.get("inputSchema")
                # Fresh dict per tool so callers can mutate one result safely.
                or {"type": "object", "properties": {}, "required": []}
            )
            anthropic_tools.append(
                {
                    "name": name,
                    "description": mcp_tool.get("description", f"Execute {name} action"),
                    "input_schema": input_schema,
                }
            )
        return anthropic_tools

eval_protocol/mcp/grid_renderer.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""
+Grid Rendering Utilities
+Utilities for rendering grid-based environments in a human-readable format.
+"""
+from typing import Any
def render_grid(desc, position: int) -> str:
    """
    Render a grid environment showing the current player position.

    The player's cell is drawn as ``P``, or as ``W`` when that cell is the goal
    (``G``). All other cells are shown unchanged; ``bytes`` cells are decoded
    as UTF-8.

    Args:
        desc: Grid description (usually from env.desc). Must expose ``.shape``
            and iterate row by row to be rendered; anything else gets the
            plain-text fallback.
        position: Current player position as 1D row-major index
            (``row * n_columns + column``).

    Returns:
        String representation of the grid with player position marked, one line
        per row, or a ``"Position: ..."`` fallback when no grid can be rendered.
    """
    if desc is None:
        return f"Position: {position} (no grid available)"

    shape = getattr(desc, "shape", None)
    if not shape:
        # Fallback for other grid formats (no shape, or a 0-d shape)
        return f"Position: {position}"

    # A row-major index is decoded with the number of COLUMNS. Using the row
    # count (shape[0]) is only correct for square grids.
    n_cols = shape[1] if len(shape) > 1 else shape[0]
    # With zero columns there is no cell to mark; (-1, -1) matches nothing.
    player_row, player_col = divmod(position, n_cols) if n_cols else (-1, -1)

    grid_lines = []
    for r, desc_row in enumerate(desc):
        cells = []
        for c, cell in enumerate(desc_row):
            # Convert bytes to string if needed
            cell_char = cell.decode("utf-8") if isinstance(cell, bytes) else str(cell)
            if r == player_row and c == player_col:
                # Show player position with 'P', unless it's the goal ('W' = won)
                cell_char = "W" if cell_char == "G" else "P"
            cells.append(cell_char)
        grid_lines.append("".join(cells))
    return "\n".join(grid_lines)