PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/agent/resources/http_rollout_resource.py ADDED Viewed

@@ -0,0 +1,325 @@
+"""
+HTTP Rollout Resource implementation for the agent evaluation framework.
+This resource bridges the HTTP rollout protocol with the ForkableResource interface,
+allowing HTTP-based environments to be used in agent evaluations.
+"""
+import json
+import uuid
+from typing import Any, Dict, List, Optional
+import httpx
+from ..resource_abc import ForkableResource
+from .http_rollout_protocol import (
+    EndEpisodeRequest,
+    GameObservation,
+    HttpRolloutConfig,
+    StartEpisodeRequest,
+    StartEpisodeResponse,
+    StepRequest,
+    StepResponse,
+)
+class HttpRolloutResource(ForkableResource):
+    """
+    A ForkableResource implementation that communicates with HTTP rollout servers.
+    This resource allows the agent evaluation framework to interact with
+    HTTP-based environments through a standardized rollout protocol.
+    """
+    def __init__(self):
+        """Initialize the HTTP rollout resource."""
+        super().__init__()
+        self.config: Optional[HttpRolloutConfig] = None
+        self.episode_id: Optional[str] = None
+        self.current_observation: Optional[Dict[str, Any]] = None
+        self.is_episode_active = False
+        self.client: Optional[httpx.Client] = None
+        # Set up logging
+        import logging
+        self.logger = logging.getLogger(f"{self.__class__.__name__}")
+    async def setup(self, config: Dict[str, Any]) -> None:
+        """
+        Set up the resource with the provided configuration.
+        Args:
+            config: Configuration dictionary from the task definition
+        """
+        self.config = HttpRolloutConfig(**config)
+        self.client = httpx.Client(timeout=self.config.timeout)
+    async def fork(self) -> "HttpRolloutResource":
+        """
+        Create a new independent instance of this resource.
+        For HTTP rollout, forking means creating a new resource instance
+        that will start its own episode when initialized.
+        """
+        if not self.config:
+            raise RuntimeError("Resource not set up. Call setup() first.")
+        # Create a new instance with the same config
+        new_resource = HttpRolloutResource()
+        await new_resource.setup(self.config.model_dump())
+        return new_resource
+    async def get_state(self) -> Dict[str, Any]:
+        """
+        Get the current state of the resource.
+        Returns the current observation and episode metadata.
+        """
+        return {
+            "episode_id": self.episode_id,
+            "observation": self.current_observation,
+            "is_episode_active": self.is_episode_active,
+            "type": "http_rollout",
+        }
+    async def initialize(self, **kwargs) -> None:
+        """
+        Initialize the resource by starting a new episode.
+        Passes any provided kwargs (like seed) to the server in the request body.
+        """
+        try:
+            url = f"{self.config.base_url}{self.config.start_episode_endpoint}"
+            # Include any sample data (like seed) in the request body
+            if kwargs:
+                self.logger.info(f"Sending initialization data to server: {kwargs}")
+                response = self.client.post(url, json=kwargs)
+            else:
+                response = self.client.post(url)
+            response.raise_for_status()
+            episode_data = response.json()
+            self.episode_id = episode_data["episode_id"]
+            self.current_observation = episode_data["observation"]
+            self.is_episode_active = True
+        except Exception as e:
+            raise RuntimeError(f"Failed to start HTTP rollout episode: {e}")
+    async def get_initial_state_description(self) -> str:
+        """
+        Get a formatted description of the initial game state for the agent.
+        Uses the observation from start_episode to build the prompt.
+        """
+        # Start episode to get current game state
+        if not self.is_episode_active:
+            await self.initialize()
+        if not self.current_observation:
+            return "No initial state available."
+        obs = self.current_observation
+        # Build comprehensive game prompt
+        content = """🎮 FROZEN LAKE GAME - AUTONOMOUS PLAY MODE
+🎯 OBJECTIVE: Navigate from S to G without hitting H
+📋 GAME RULES: S=start, F=safe, H=hole(death), G=goal(win)
+🤖 AUTONOMOUS MODE INSTRUCTIONS:
+- You are playing this game AUTONOMOUSLY until completion
+- KEEP MAKING MOVES using the step tool until you reach G or hit H
+- DO NOT ask for user input or wait for confirmation
+- DO NOT stop after one move - continue until the game ends
+- Each move should be followed immediately by another move
+- Game only ends when you reach G (win) or hit H (lose)
+🎮 ACTION: Use step tool with: "left", "right", "up", or "down"
+⚡ START NOW - Make your first move and continue until the game is complete!"""
+        description_parts = [content]
+        if obs.get("message"):
+            description_parts.append(f"\nEnvironment: {obs['message']}")
+        if obs.get("visual"):
+            description_parts.append(f"\nGame Board:\n{obs['visual']}")
+        if obs.get("position"):
+            description_parts.append(f"\nStarting Position: {obs['position']}")
+        description_parts.append("\nGame Rules:")
+        description_parts.append("- S = Start position")
+        description_parts.append("- F = Frozen (safe to step on)")
+        description_parts.append("- H = Hole (game over if you step here)")
+        description_parts.append("- G = Goal (reach this to win)")
+        description_parts.append("- [X] = Your current position")
+        return "\n".join(description_parts)
+    async def cleanup(self) -> None:
+        """
+        Clean up the resource by ending the current episode.
+        """
+        if self.is_episode_active and self.episode_id:
+            try:
+                url = f"{self.config.base_url}{self.config.end_episode_endpoint}"
+                response = self.client.post(url, json={"episode_id": self.episode_id})
+                response.raise_for_status()
+            except Exception as e:
+                # Log but don't raise - cleanup should be best effort
+                print(f"Warning: Failed to properly end episode {self.episode_id}: {e}")
+            finally:
+                self.episode_id = None
+                self.current_observation = None
+                self.is_episode_active = False
+        # Close the HTTP client
+        self.client.close()
+    async def get_tools_spec(self) -> List[Dict[str, Any]]:
+        """
+        Get the list of available tools for this resource.
+        For HTTP rollout, this returns the 'step' tool that allows
+        the agent to take actions in the environment.
+        """
+        return [
+            {
+                "name": "step",
+                "description": "Take a step in the Frozen Lake game by choosing a direction to move",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "action": {
+                            "type": "string",
+                            "enum": ["left", "down", "right", "up"],
+                            "description": "The direction to move in the game: 'left', 'down', 'right', or 'up'",
+                        }
+                    },
+                    "required": ["action"],
+                },
+            }
+        ]
+    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
+        """
+        Execute a tool call on this resource.
+        For HTTP rollout, this handles the 'step' tool by sending
+        the action to the HTTP rollout server.
+        """
+        if not self.is_episode_active or not self.episode_id:
+            # If no active episode, start one first
+            await self.initialize()
+        if action_name == "step":
+            action = action_params.get("action")
+            return await self._handle_step_tool(action)
+        else:
+            raise ValueError(f"Unknown action: {action_name}")
+    async def get_observation(self) -> Any:
+        """
+        Get the current observation from the environment.
+        """
+        if self.current_observation:
+            return self.current_observation
+        else:
+            return {"message": "No observation available. Start an episode first."}
+    async def checkpoint(self) -> Dict[str, Any]:
+        """
+        Create a checkpoint of the current resource state.
+        For HTTP rollout, this saves the episode ID and current observation.
+        """
+        return {
+            "episode_id": self.episode_id,
+            "current_observation": self.current_observation,
+            "is_episode_active": self.is_episode_active,
+        }
+    async def restore(self, state_data: Dict[str, Any]) -> None:
+        """
+        Restore the resource state from a checkpoint.
+        Note: This is limited for HTTP rollout since we can't restore
+        arbitrary server-side state.
+        """
+        self.episode_id = state_data.get("episode_id")
+        self.current_observation = state_data.get("current_observation")
+        self.is_episode_active = state_data.get("is_episode_active", False)
+    async def close(self) -> None:
+        """
+        Clean up and close the resource.
+        """
+        await self.cleanup()
+    async def _handle_step_tool(self, action: Any) -> Dict[str, Any]:
+        """
+        Handle the 'step' tool by sending an action to the HTTP rollout server.
+        """
+        try:
+            # Convert string action to integer for the server
+            action_map = {"left": 0, "down": 1, "right": 2, "up": 3}
+            if isinstance(action, str):
+                if action.lower() not in action_map:
+                    raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up")
+                numeric_action = action_map[action.lower()]
+            else:
+                # Backward compatibility with numeric actions
+                numeric_action = action
+            url = f"{self.config.base_url}{self.config.step_endpoint}"
+            step_data = {"episode_id": self.episode_id, "action": numeric_action}
+            response = self.client.post(url, json=step_data)
+            response.raise_for_status()
+            step_result = response.json()
+            self.current_observation = step_result["observation"]
+            # If the episode is done, mark it as inactive
+            if step_result.get("is_done", False):
+                self.is_episode_active = False
+            # Format the response for the agent
+            observation = step_result["observation"]
+            message = observation.get("message", "")
+            visual = observation.get("visual", "")
+            # Create a comprehensive response
+            response_content = []
+            if message:
+                response_content.append(f"Environment: {message}")
+            if visual:
+                response_content.append(f"Visual State:\n{visual}")
+            # Add structured data
+            response_content.append(f"Position: {observation.get('position', 'unknown')}")
+            response_content.append(f"Done: {step_result.get('is_done', False)}")
+            if step_result.get("is_done", False):
+                won = observation.get("won", False)
+                response_content.append(f"Result: {'Victory!' if won else 'Game Over'}")
+            return {"content": [{"type": "text", "text": "\n".join(response_content)}]}
+        except Exception as e:
+            raise RuntimeError(f"Failed to execute step: {e}")
+    def __del__(self):
+        """Ensure cleanup on deletion."""
+        if hasattr(self, "client") and self.client:
+            try:
+                self.client.close()
+            except Exception:
+                pass  # Ignore cleanup errors during deletion

eval_protocol/agent/resources/python_state_resource.py ADDED Viewed

@@ -0,0 +1,170 @@
+"""
+PythonStateResource: A ForkableResource that manages state as a Python dictionary.
+"""
+import copy
+import pickle
+from typing import Any, Dict, List, Optional
+from ..resource_abc import ForkableResource
+class PythonStateResource(ForkableResource):
+    """
+    A ForkableResource that manages its state as an in-memory Python dictionary.
+    This resource is useful for tasks where the environment's state can be
+    represented and manipulated directly as Python objects.
+    Attributes:
+        _state (Dict[str, Any]): The internal dictionary holding the resource's state.
+        _config (Dict[str, Any]): The configuration passed during setup.
+    """
+    def __init__(self) -> None:
+        self._state: Dict[str, Any] = {}
+        self._config: Dict[str, Any] = {}
+    async def setup(self, config: Dict[str, Any]) -> None:
+        """
+        Initializes the resource with a given configuration.
+        The configuration can specify an 'initial_state' dictionary.
+        Args:
+            config: Configuration dictionary.
+                    Expected keys:
+                    - 'initial_state' (Optional[Dict[str, Any]]):
+                      A dictionary to set as the initial state.
+        """
+        self._config = copy.deepcopy(config)
+        self._state = copy.deepcopy(self._config.get("initial_state", {}))
+    async def fork(self) -> "PythonStateResource":
+        """
+        Creates and returns a new, independent instance of this resource
+        with an identical copy of the current state.
+        """
+        forked_resource = PythonStateResource()
+        forked_resource._config = copy.deepcopy(self._config)
+        forked_resource._state = copy.deepcopy(self._state)
+        return forked_resource
+    async def checkpoint(self) -> bytes:
+        """
+        Returns a serializable representation of the resource's current state
+        using pickle.
+        """
+        return pickle.dumps(self._state)
+    async def restore(self, state_data: bytes) -> None:
+        """
+        Restores the resource's state from previously checkpointed state_data
+        (pickle format).
+        """
+        self._state = pickle.loads(state_data)
+    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
+        """
+        Executes a named action with given parameters on the resource.
+        This implementation provides a generic 'update_state' action
+        that merges action_params into the current state.
+        Subclasses could override this for more specific actions.
+        Args:
+            action_name: The name of the action to perform.
+                         Currently supports 'update_state'.
+            action_params: A dictionary of parameters for the action.
+                           For 'update_state', these are key-value pairs
+                           to update in the state.
+        Returns:
+            A copy of the updated state.
+        Raises:
+            NotImplementedError: If action_name is not 'update_state'.
+        """
+        if action_name == "update_state":
+            self._state.update(action_params)
+            return copy.deepcopy(self._state)
+        elif action_name == "get_value":
+            key = action_params.get("key")
+            if key is None:
+                raise ValueError("Missing 'key' in action_params for 'get_value'")
+            return self._state.get(key)
+        else:
+            raise NotImplementedError(f"Action '{action_name}' is not implemented for PythonStateResource.")
+    async def get_observation(self) -> Dict[str, Any]:
+        """
+        Returns a deep copy of the current observable state of the resource.
+        """
+        return copy.deepcopy(self._state)
+    def get_state(self) -> Dict[str, Any]:
+        """
+        Returns a deep copy of the current state dictionary.
+        This is a synchronous version of get_observation for compatibility with test tasks.
+        """
+        return copy.deepcopy(self._state)
+    def set_state(self, state: Dict[str, Any]) -> None:
+        """
+        Sets the resource's state to the provided dictionary.
+        Args:
+            state: A dictionary containing the new state.
+        """
+        self._state = copy.deepcopy(state)
+    async def get_tools_spec(self) -> List[Dict[str, Any]]:
+        """
+        Returns a list of tool specifications available for this resource.
+        Provides generic 'update_state' and 'get_value' tools.
+        """
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": "update_state",
+                    "description": "Updates the current state dictionary with the provided key-value pairs.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "updates": {
+                                "type": "object",
+                                "description": "Key-value pairs to update in the state.",
+                            }
+                        },
+                        "required": ["updates"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_value",
+                    "description": "Retrieves a value from the state dictionary for a given key.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "key": {
+                                "type": "string",
+                                "description": "The key of the value to retrieve.",
+                            }
+                        },
+                        "required": ["key"],
+                    },
+                },
+            },
+        ]
+    async def close(self) -> None:
+        """
+        Performs any necessary cleanup for the resource.
+        For PythonStateResource, this is a no-op as state is in-memory.
+        """
+        self._state = {}
+        self._config = {}