PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/playback_policy.py ADDED Viewed

@@ -0,0 +1,396 @@
+"""
+Playback policy base class for record-and-replay functionality.
+This module implements the abstract base class that handles all playback logic,
+allowing concrete policy classes to inherit replay functionality while focusing
+on their specific implementation details.
+"""
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional, Tuple
+from .mcp.types import LLMUsageStats, MCPToolCall
+# Module-level logger; its name is this module's __name__.
+logger = logging.getLogger(__name__)
+class PlaybackPolicyBase(ABC):
+    """
+    Abstract base class for policies that support record-and-playback functionality.
+    This class handles all playback logic including trajectory loading, parsing,
+    and step management. Concrete policy classes inherit from this to get
+    replay functionality while implementing their own live mode logic.
+    """
+    def __init__(
+        self,
+        _playback_actions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize policy with optional playback actions.
+        Args:
+            _playback_actions: Pre-parsed playback actions organized by environment.
+                              Format: {env_index: [{"step": int, "messages": [...]}]}
+            **kwargs: Additional arguments passed to concrete implementations
+        """
+        # Playback state management
+        self._playback_actions = _playback_actions
+        self._is_playback = _playback_actions is not None
+        self._playback_step_counters = {}  # {env_index: current_step}
+        # Environment variable override
+        playback_file = os.environ.get("EP_PLAYBACK_FILE")
+        if playback_file and not self._is_playback:
+            logger.info(f"🎬 Auto-enabling playback mode from environment variable: {playback_file}")
+            self._playback_actions = self._load_trajectory_file(playback_file)
+            self._is_playback = self._playback_actions is not None
+        # Initialize step counters if in playback mode
+        if self._is_playback and self._playback_actions:
+            for env_index in self._playback_actions.keys():
+                self._playback_step_counters[env_index] = 0
+        logger.debug(f"PlaybackPolicyBase initialized: playback_mode={self._is_playback}")
+    @staticmethod
+    def _load_trajectory_file(
+        filepath: str,
+    ) -> Optional[Dict[str, List[Dict[str, Any]]]]:
+        """
+        Load and parse trajectory file into organized playback actions.
+        Expected JSONL format per design document:
+        {"env_index": 0, "step": 0, "messages": [{..}, {..}]}
+        {"env_index": 1, "step": 0, "messages": [{..}, {..}]}
+        {"env_index": 0, "step": 1, "messages": [{..}, {..}]}
+        Args:
+            filepath: Path to trajectory JSONL file
+        Returns:
+            Organized playback actions: {env_index: [{"step": int, "messages": [...]}]}
+        """
+        if not os.path.exists(filepath):
+            logger.error(f"Trajectory file not found: {filepath}")
+            return None
+        try:
+            playback_actions = {}
+            valid_entries = 0
+            with open(filepath, "r") as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        entry = json.loads(line)
+                        # Validate required fields
+                        if not isinstance(entry, dict):
+                            logger.warning(f"Line {line_num}: Entry is not a dictionary, skipping")
+                            continue
+                        env_index = entry.get("env_index")
+                        step = entry.get("step")
+                        messages = entry.get("messages")
+                        if env_index is None or step is None or messages is None:
+                            logger.warning(
+                                f"Line {line_num}: Missing required fields (env_index, step, messages), skipping"
+                            )
+                            continue
+                        # Convert env_index to string for consistent dictionary keys
+                        env_key = str(env_index)
+                        # Initialize environment list if needed
+                        if env_key not in playback_actions:
+                            playback_actions[env_key] = []
+                        # Add step entry
+                        playback_actions[env_key].append({"step": step, "messages": messages})
+                        valid_entries += 1
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Line {line_num}: Invalid JSON - {e}")
+                        continue
+            # Sort each environment's actions by step
+            for env_key in playback_actions:
+                playback_actions[env_key].sort(key=lambda x: x["step"])
+            if playback_actions:
+                logger.info(f"✅ Loaded {valid_entries} trajectory entries for {len(playback_actions)} environments")
+                return playback_actions
+            else:
+                logger.warning(
+                    f"⚠️  Trajectory file {filepath} exists but contains no valid entries. "
+                    f"Falling back to recording mode. Please check file format - expected JSONL with "
+                    f"'env_index', 'step', and 'messages' fields."
+                )
+                return None
+        except Exception as e:
+            logger.error(f"Error loading trajectory file {filepath}: {e}")
+            return None
+    def _get_playback_messages(self, env_index: int) -> Optional[List[Dict[str, Any]]]:
+        """
+        Get the next playback messages for the specified environment.
+        Args:
+            env_index: Environment index
+        Returns:
+            Messages list for the current step, or None if no more steps
+        """
+        if not self._is_playback or not self._playback_actions:
+            return None
+        env_key = str(env_index)
+        if env_key not in self._playback_actions:
+            logger.warning(f"No playback data for environment {env_index}")
+            return None
+        current_step = self._playback_step_counters.get(str(env_index), 0)
+        env_actions = self._playback_actions[env_key]
+        # Find action for current step
+        for action in env_actions:
+            if action["step"] == current_step:
+                # Increment step counter for next call
+                self._playback_step_counters[str(env_index)] = current_step + 1
+                logger.debug(f"🎬 Environment {env_index}: Returning playback messages for step {current_step}")
+                return action["messages"]
+        # No more recorded actions available
+        logger.debug(f"🎬 Environment {env_index}: No more playback data (step {current_step})")
+        return None
+    def has_more_playback_data(self, env_index: int) -> bool:
+        """
+        Check if there are more playback actions available for an environment.
+        Args:
+            env_index: Environment index
+        Returns:
+            True if more actions are available, False otherwise
+        """
+        if not self._is_playback or not self._playback_actions:
+            return False
+        env_key = str(env_index)
+        if env_key not in self._playback_actions:
+            return False
+        current_step = self._playback_step_counters.get(str(env_index), 0)
+        env_actions = self._playback_actions[env_key]
+        # Check if there's an action for the current step
+        return any(action["step"] == current_step for action in env_actions)
+    @abstractmethod
+    async def _generate_live_tool_calls(
+        self,
+        tool_schemas: List[Dict],
+        env_index: int,
+        conversation_history: List[Dict[str, Any]],
+    ) -> Tuple[List["MCPToolCall"], LLMUsageStats]:
+        """
+        Generate tool calls in live mode. Concrete classes must implement this.
+        Args:
+            tool_schemas: Available tools for this environment
+            env_index: Environment index
+            conversation_history: Current conversation history for this environment
+        Returns:
+            List of ToolCall objects and LLM interation usage stats
+        """
+        pass
+    async def __call__(
+        self,
+        tool_schemas: List[Dict],
+        env_index: int,
+        conversation_history: List[Dict[str, Any]],
+    ):
+        """
+        Main policy call method. Delegates to playback or live mode.
+        Args:
+            tool_schemas: Available tools for each environment
+            observations: Current observations from environments
+            system_prompts: System prompts for each environment
+            user_prompts: User prompts for each environment
+        Returns:
+            List of ToolCall objects and LLM interation usage stats for each environment
+        """
+        if self._is_playback:
+            # In playback mode, get recorded messages
+            messages = self._get_playback_messages(env_index)
+            if messages is None:
+                # No more recorded actions - signal early termination
+                return [
+                    MCPToolCall(
+                        "_playback_terminate",
+                        {"reason": "no_more_recorded_actions"},
+                    )
+                ]
+            # Return the recorded tool call
+            return self._extract_tool_call_from_messages(messages, env_index), None
+        else:
+            # Live mode - generate tool call using provided conversation history
+            return await self._generate_live_tool_calls(tool_schemas, env_index, conversation_history)
+    def _extract_tool_call_from_messages(self, messages: List[Dict[str, Any]], env_index: int) -> List[MCPToolCall]:
+        """
+        Extract tool calls from recorded conversation messages.
+        Args:
+            messages: List of conversation messages
+            env_index: Environment index for logging
+        Returns:
+            List of MCPToolCall objects
+        """
+        # Look for the last assistant message with tool_calls
+        for message in reversed(messages):
+            if message.get("role") == "assistant" and message.get("tool_calls"):
+                tool_calls = message["tool_calls"]
+                if tool_calls and len(tool_calls) > 0:
+                    # Process ALL tool calls, not just the first one
+                    mcp_tool_calls = []
+                    for tool_call in tool_calls:
+                        # Extract function name and arguments
+                        function = tool_call.get("function", {})
+                        tool_name = function.get("name", "unknown")
+                        tool_call_id = tool_call.get("id", "unknown")
+                        # Parse arguments if they're a string
+                        arguments = function.get("arguments", {})
+                        if isinstance(arguments, str):
+                            try:
+                                arguments = json.loads(arguments)
+                            except json.JSONDecodeError:
+                                logger.warning(
+                                    f"🎬 Environment {env_index}: Failed to parse tool call arguments: {arguments}"
+                                )
+                                arguments = {}
+                        mcp_tool_calls.append(MCPToolCall(tool_name, arguments, tool_call_id))
+                    logger.debug(f"🎬 Environment {env_index}: Extracted {len(mcp_tool_calls)} tool calls")
+                    return mcp_tool_calls
+        # Fallback if no tool calls found
+        logger.warning(f"🎬 Environment {env_index}: No tool calls found in messages, using unknown tool")
+        return [MCPToolCall("unknown", {})]
+    def is_playback_mode(self) -> bool:
+        """
+        Check if the policy is in playback mode.
+        Returns:
+            True if in playback mode, False otherwise
+        """
+        return self._is_playback
+    def get_playback_progress(self) -> Dict[str, Any]:
+        """
+        Get playback progress information.
+        Returns:
+            Dictionary with playback progress details
+        """
+        if not self._is_playback:
+            return {"playback_mode": False}
+        progress = {
+            "playback_mode": True,
+            "environments": {},
+            "total_environments": (len(self._playback_actions) if self._playback_actions else 0),
+        }
+        if self._playback_actions:
+            for env_key, actions in self._playback_actions.items():
+                env_index = int(env_key)
+                current_step = self._playback_step_counters.get(str(env_index), 0)
+                total_steps = len(actions)
+                progress["environments"][env_index] = {
+                    "current_step": current_step,
+                    "total_steps": total_steps,
+                    "completed": current_step >= total_steps,
+                }
+        return progress
+    def log_conversation_state_for_playback(
+        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
+    ):
+        """
+        Log the current conversation state in the format required for playback.
+        Base implementation that subclasses can override with specific behavior.
+        Expected format: {"env_index": 0, "step": 0, "messages": [{..}, {..}]}
+        Args:
+            env_index: Environment index
+            step: Current step number
+            conversation_history: List of conversation messages
+        """
+        # Use EP_PLAYBACK_FILE environment variable for recording
+        playback_file = os.environ.get("EP_PLAYBACK_FILE")
+        if not playback_file:
+            return  # No recording file specified
+        playback_entry = {
+            "env_index": env_index,
+            "step": step,
+            "messages": conversation_history.copy(),
+        }
+        with open(playback_file, "a") as f:
+            f.write(json.dumps(playback_entry) + "\n")
+    def log_conversation_state_for_playback(
+        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
+    ):
+        """
+        Log the current conversation state in the format required for playback.
+        Base implementation that subclasses can override with specific behavior.
+        Expected format: {"env_index": 0, "step": 0, "messages": [{..}, {..}]}
+        Args:
+            env_index: Environment index
+            step: Current step number
+            conversation_history: List of conversation messages
+        """
+        # Use EP_PLAYBACK_FILE environment variable for recording
+        playback_file = os.environ.get("EP_PLAYBACK_FILE")
+        if not playback_file:
+            return  # No recording file specified
+        playback_entry = {
+            "env_index": env_index,
+            "step": step,
+            "messages": conversation_history.copy(),
+        }
+        with open(playback_file, "a") as f:
+            f.write(json.dumps(playback_entry) + "\n")

eval_protocol/resources.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+Resource management for reward functions.
+This module provides resource wrappers for external services like LLMs,
+databases, etc. Resources are automatically setup and cleaned up by the
+reward function decorator.
+"""
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, TypeVar
+# Module-level logger; its name is this module's __name__.
+logger = logging.getLogger(__name__)
+# Type definitions
+# Generic type variable (not referenced by the definitions visible in this hunk).
+T = TypeVar("T")
+# Alias: string key -> list of resource wrappers. "ResourceWrapper" is quoted
+# because the class is only defined further down in this module.
+ResourceDict = Dict[str, List["ResourceWrapper"]]
+class ResourceWrapper(ABC):
+    """Abstract base class for all resource wrappers."""
+    @abstractmethod
+    def setup(self) -> None:
+        """Setup the resource (e.g., start deployment, create connection)."""
+        pass
+    @abstractmethod
+    def cleanup(self) -> None:
+        """Cleanup the resource (e.g., stop deployment, close connection)."""
+        pass
+    @abstractmethod
+    def get_client(self) -> Any:
+        """Get the client object for using this resource."""
+        pass
+class LLMResourceWrapper(ResourceWrapper):
+    """Resource wrapper for Fireworks LLM deployments."""
+    def __init__(self, llm_instance: Any):
+        """
+        Initialize LLM resource wrapper.
+        Args:
+            llm_instance: A Fireworks LLM instance from the Build SDK
+        """
+        self.llm_instance = llm_instance
+        self._client = None
+        self._is_setup = False
+    def setup(self) -> None:
+        """Setup the LLM deployment."""
+        if self._is_setup:
+            logger.debug(f"LLM resource already setup for model: {self.llm_instance.model}")
+            return
+        try:
+            logger.debug(f"Setting up LLM deployment for model: " f"{self.llm_instance.model}")
+            # For on-demand deployments, call apply()
+            if hasattr(self.llm_instance, "deployment_type") and self.llm_instance.deployment_type == "on-demand":
+                logger.info("Applying on-demand LLM deployment...")
+                self.llm_instance.apply()
+                logger.info("On-demand LLM deployment applied successfully")
+            self._client = self.llm_instance
+            self._is_setup = True
+            logger.info(f"LLM resource setup completed for model: " f"{self.llm_instance.model}")
+        except Exception as e:
+            logger.error(f"Failed to setup LLM resource: {e}")
+            raise
+    def cleanup(self) -> None:
+        """Cleanup the LLM deployment."""
+        if not self._is_setup:
+            logger.debug("LLM resource not setup, nothing to cleanup")
+            return
+        try:
+            logger.debug("Cleaning up LLM resource")
+            # For Fireworks Build SDK, we typically don't need explicit
+            # cleanup as deployments are managed by the platform
+            self._client = None
+            self._is_setup = False
+            logger.debug("LLM resource cleanup completed")
+        except Exception as e:
+            logger.error(f"Error during LLM resource cleanup: {e}")
+            # Don't re-raise cleanup errors to avoid masking original
+            # exceptions
+    def get_client(self) -> Any:
+        """Get the LLM client for making API calls."""
+        if not self._is_setup or self._client is None:
+            raise RuntimeError("LLM resource not setup. Call setup() first.")
+        return self._client
+def create_llm_resource(llm_instance: Any) -> LLMResourceWrapper:
+    """
+    Create an LLM resource wrapper from a Fireworks LLM instance.
+    Args:
+        llm_instance: A Fireworks LLM instance from the Build SDK
+    Returns:
+        LLMResourceWrapper instance
+    Example:
+        ```python
+        from fireworks import LLM
+        from eval_protocol import create_llm_resource
+        llm = LLM(
+            model="accounts/fireworks/models/llama-v3p1-8b-instruct",
+            deployment_type="on-demand",
+        )
+        llm_resource = create_llm_resource(llm)
+        ```
+    """
+    return LLMResourceWrapper(llm_instance)