PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/mcp_agent/main.py ADDED Viewed

@@ -0,0 +1,210 @@
+import asyncio
+import logging
+import signal
+from contextlib import asynccontextmanager
+from typing import Optional
+import click
+import uvicorn
+import yaml
+from mcp.server.streamable_http_manager import (  # MCP SDK component
+    StreamableHTTPSessionManager,
+)
+from starlette.applications import Starlette
+from starlette.routing import Mount, Route  # Import Mount
+from eval_protocol.mcp_agent.config import AppConfig
+from eval_protocol.mcp_agent.intermediary_server import RewardKitIntermediaryServer
+logger = logging.getLogger(__name__)
+# Module-level handle on the running Uvicorn server. main_async() assigns it; nothing
+# visible in this module reads it back, because signal handling is left to Uvicorn.
+_uvicorn_server_instance_ref: Optional[uvicorn.Server] = None  # Assigned by main_async()
+# Reference to the intermediary MCP server; read by the Starlette lifespan defined in main_async().
+_mcp_server_instance_ref: Optional[RewardKitIntermediaryServer] = None
+# The session manager is intentionally not global: main_async() keeps it as a local variable.
+# NOTE: there is no StreamableHTTPSessionManager.lifespan_wrapper; main_async() uses a custom lifespan.
+async def main_async(config_path: str, host: str, port: int):
+    """
+    Asynchronous main function to load config, set up the ASGI application,
+    and run it with Uvicorn.
+    """
+    global _uvicorn_server_instance_ref, _mcp_server_instance_ref  # _session_manager_ref removed from globals
+    try:
+        with open(config_path, "r") as f:
+            raw_config = yaml.safe_load(f)
+        app_config = AppConfig(**raw_config)
+    except FileNotFoundError:
+        logger.error(f"Configuration file not found: {config_path}")
+        return
+    except yaml.YAMLError as e:
+        logger.error(f"Error parsing YAML configuration file {config_path}: {e}")
+        return
+    except Exception as e:
+        logger.error(f"Error loading or validating AppConfig from {config_path}: {e}")
+        return
+    # Configure logging early
+    server_root_log_level_str = app_config.log_level.upper()
+    server_root_log_level = getattr(logging, server_root_log_level_str, logging.INFO)
+    logging.basicConfig(
+        level=server_root_log_level,  # Root logger for the server process
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",  # Added datefmt for consistency
+    )
+    logger.info(f"Configuration loaded from {config_path}. Server root log level set to {server_root_log_level_str}.")
+    # Ensure eval_protocol.mcp_agent namespace respects this level
+    rk_mcp_agent_logger = logging.getLogger("eval_protocol.mcp_agent")
+    rk_mcp_agent_logger.setLevel(server_root_log_level)
+    # Be very explicit for the intermediary_server logger as well
+    intermediary_server_logger = logging.getLogger("eval_protocol.mcp_agent.intermediary_server")
+    intermediary_server_logger.setLevel(server_root_log_level)
+    # Also ensure its handlers respect this level
+    for handler in intermediary_server_logger.handlers:
+        handler.setLevel(server_root_log_level)
+    # If it's propagating to the 'eval_protocol.mcp_agent' parent, ensure that parent's handlers are also correct.
+    # The parent rk_mcp_agent_logger already had its level set.
+    # Quiet down other noisy libraries for the server unless server itself is in DEBUG mode
+    if server_root_log_level > logging.DEBUG:  # e.g. if INFO or WARNING
+        libraries_to_quiet = [
+            "httpx",
+            "mcp",
+            "uvicorn",
+            "starlette",
+            "asyncio",
+            "hpack",
+            "httpcore",
+        ]
+        for lib_name in libraries_to_quiet:
+            logging.getLogger(lib_name).setLevel(logging.WARNING)
+    logger.info(
+        f"Log level for 'eval_protocol.mcp_agent' namespace set to {logging.getLevelName(logging.getLogger('eval_protocol.mcp_agent').getEffectiveLevel())}"
+    )
+    # 1. Instantiate RewardKitIntermediaryServer
+    _mcp_server_instance_ref = RewardKitIntermediaryServer(
+        app_config=app_config
+    )  # Store globally for lifespan_wrapper
+    # 2. Instantiate StreamableHTTPSessionManager
+    # Pass the internal _mcp_server (the MCPServer instance) from our FastMCP subclass
+    session_manager = StreamableHTTPSessionManager(
+        app=_mcp_server_instance_ref._mcp_server,
+        event_store=None,
+        json_response=True,  # Changed to True
+    )
+    # 3. Create Starlette app, using session_manager.lifespan_wrapper
+    # This wrapper should handle the startup/shutdown of both the session_manager's task group
+    # and the underlying _mcp_server_instance_ref.
+    routes = [
+        Mount("/mcp", app=session_manager.handle_request),
+    ]
+    # The lifespan_wrapper approach was incorrect as the method doesn't exist.
+    # We will now use a custom lifespan for the MCPServer and run Uvicorn
+    # within the context of session_manager.run() if it's an async context manager.
+    @asynccontextmanager
+    async def mcp_server_lifespan_only(app_for_lifespan: Starlette):
+        # This lifespan only manages the _mcp_server_instance_ref
+        if _mcp_server_instance_ref:
+            logger.info("MCP Server Lifespan: Starting up RewardKitIntermediaryServer...")
+            await _mcp_server_instance_ref.startup()
+            logger.info("MCP Server Lifespan: RewardKitIntermediaryServer startup complete.")
+        yield
+        if _mcp_server_instance_ref:
+            logger.info("MCP Server Lifespan: Shutting down RewardKitIntermediaryServer...")
+            await _mcp_server_instance_ref.shutdown()
+            logger.info("MCP Server Lifespan: RewardKitIntermediaryServer shutdown complete.")
+    routes = [
+        Mount("/mcp", app=session_manager.handle_request),
+    ]
+    starlette_app = Starlette(routes=routes, lifespan=mcp_server_lifespan_only)
+    # 4. Configure Uvicorn
+    config = uvicorn.Config(
+        app=starlette_app,  # Starlette app with its own lifespan for MCPServer
+        host=host,
+        port=port,
+        log_level=app_config.log_level.lower(),
+        log_config=None,  # Prevent Uvicorn from overriding our basicConfig for app loggers
+    )
+    uvicorn_server = uvicorn.Server(config)
+    _uvicorn_server_instance_ref = uvicorn_server
+    logger.info(f"Starting RewardKit Intermediary MCP Server on {host}:{port}/mcp.")
+    try:
+        if hasattr(session_manager, "run"):
+            # Call run() to get the potential context manager
+            sm_context_manager = session_manager.run()
+            if hasattr(sm_context_manager, "__aenter__") and hasattr(sm_context_manager, "__aexit__"):
+                logger.info(
+                    "Attempting to run Uvicorn server within context returned by StreamableHTTPSessionManager.run()..."
+                )
+                async with sm_context_manager:  # type: ignore
+                    logger.info("Context from StreamableHTTPSessionManager.run() entered. Serving Uvicorn...")
+                    await uvicorn_server.serve()
+            else:
+                logger.error(
+                    "Object returned by StreamableHTTPSessionManager.run() is not an async context manager. Falling back to direct Uvicorn serve."
+                )
+                await uvicorn_server.serve()
+        else:
+            logger.error(
+                "StreamableHTTPSessionManager does not have a 'run' method. Falling back to direct Uvicorn serve."
+            )
+            await uvicorn_server.serve()
+    except asyncio.CancelledError:
+        logger.info("Server operation cancelled (main_async level).")
+    except Exception as e:
+        logger.error(
+            f"An error occurred during server operation (main_async level): {e}",
+            exc_info=True,
+        )
+    finally:
+        logger.info("Uvicorn server has shut down (main_async finally).")
+# Signal handling is now primarily managed by Uvicorn.
+# If we needed custom logic *before* Uvicorn handles signals, it would be more complex.
+# For now, relying on Uvicorn's graceful shutdown which triggers the ASGI lifespan.
+# Console entry point: click parses the options below and passes them to main_cli().
+# The function docstring doubles as the command's help text, so it is user-facing.
+@click.command()
+@click.option(
+    "--config",
+    "config_path",
+    default="mcp_agent_config.yaml",
+    help="Path to the YAML configuration file for the MCP agent server.",
+    # The path must exist and must not be a directory; click rejects it otherwise.
+    # NOTE(review): presumably this check also applies to the default value - confirm.
+    type=click.Path(exists=True, dir_okay=False),
+)
+# Default host 0.0.0.0 listens on all network interfaces.
+@click.option("--host", default="0.0.0.0", help="Host for the server to listen on.")
+@click.option("--port", default=8001, type=int, help="Port for the server to listen on.")
+def main_cli(config_path: str, host: str, port: int):
+    """
+    CLI entry point to run the RewardKit Intermediary MCP Server using Uvicorn.
+    """
+    try:
+        # Runs main_async() to completion on a new event loop.
+        asyncio.run(main_async(config_path, host, port))
+    except KeyboardInterrupt:  # This will be caught by Uvicorn first usually
+        logger.info("CLI interrupted by KeyboardInterrupt. Uvicorn should handle shutdown.")
+    finally:
+        # Logged on every exit path, including errors that propagate out of asyncio.run().
+        logger.info("MCP Agent Server CLI finished.")
+if __name__ == "__main__":
+    main_cli()

eval_protocol/mcp_agent/orchestration/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+# MCP Agent Orchestration Package

eval_protocol/mcp_agent/orchestration/base_client.py ADDED Viewed

@@ -0,0 +1,132 @@
+import abc
+from typing import Any, Dict, List, Literal, Optional
+from mcp import types as mcp_types  # Added import
+from pydantic import BaseModel, Field
+from eval_protocol.mcp_agent.config import BackendServerConfig
+class ManagedInstanceInfo(BaseModel):
+    """
+    Stores all necessary details to interact with a provisioned backend instance.
+    """
+    # Fields declared with Field(...) are required; the others default to None or an empty dict.
+    instance_id: str = Field(..., description="Client-facing ID for this instance within a session.")
+    backend_name_ref: str = Field(..., description="Reference name of the backend configuration used.")
+    # Only these two orchestration modes are accepted.
+    orchestration_mode: Literal["local_docker", "remote_http_api"] = Field(
+        ..., description="Orchestration mode used for this instance."
+    )
+    # Only "http" and "stdio" transports are accepted.
+    mcp_transport: Literal["http", "stdio"] = Field(..., description="MCP transport protocol used by this instance.")
+    mcp_endpoint_url: Optional[str] = Field(
+        None,
+        description="The full MCP endpoint URL for this instance if using HTTP transport (e.g., 'http://localhost:12345/mcp'). None for stdio.",
+    )
+    # default_factory=dict gives each instance its own dict rather than a shared one.
+    internal_instance_details: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Orchestrator-specific details, e.g., {'container_id': '...', 'host_port': ...} for Docker or {'remote_instance_id': '...'}. Not directly used by the intermediary server logic after provisioning, but useful for deprovisioning.",
+    )
+    committed_image_tag: Optional[str] = Field(
+        None,
+        description="If local Docker orchestration created a temporary image via 'docker commit', this stores its tag for later cleanup.",
+    )
+    # extra = "forbid" rejects unknown fields when the model is built.
+    # NOTE(review): the nested Config class is the pydantic v1 style - confirm the pinned pydantic version.
+    class Config:
+        extra = "forbid"
+class AbstractOrchestrationClient(abc.ABC):
+    """
+    Abstract base class for orchestration clients.
+    Orchestration clients are responsible for provisioning, deprovisioning,
+    and interacting with backend MCP server instances.
+    """
+    @abc.abstractmethod
+    async def provision_instances(
+        self,
+        backend_config: BackendServerConfig,
+        num_instances: int,
+        session_id: str,
+        # template_details might be specific to the backend type,
+        # e.g., path to a database dump for DuckDB, or a directory for filesystem.
+        template_details: Optional[Any] = None,
+    ) -> List[ManagedInstanceInfo]:
+        """
+        Provisions a number of backend instances based on the given configuration.
+        For stateful backends requiring a unique state from a template (e.g., local Docker with a template data path),
+        this method might involve:
+        1. Creating a temporary "template" instance/container.
+        2. Seeding it with data from `template_details` or `backend_config.template_data_path_host`.
+        3. Committing this template instance to a new, temporary image (for Docker).
+        4. Starting `num_instances` from this temporary image.
+        For stateless backends or those not requiring template-based forking, this is simpler.
+        Args:
+            backend_config: Configuration for the backend type to provision.
+            num_instances: Number of instances to provision.
+            session_id: The ID of the current intermediary session, useful for naming/tagging resources.
+            template_details: Optional backend-specific details for initializing stateful instances.
+                              This could be a path to a data file, a directory, or other structured data.
+        Returns:
+            A list of ManagedInstanceInfo objects, one for each provisioned instance.
+        """
+        pass
+    @abc.abstractmethod
+    async def deprovision_instances(self, instances: List[ManagedInstanceInfo]) -> None:
+        """
+        Deprovisions (e.g., stops and removes) the specified backend instances.
+        Also handles cleanup of any temporary resources like committed Docker images.
+        Args:
+            instances: A list of ManagedInstanceInfo objects for the instances to deprovision.
+        """
+        pass
+    @abc.abstractmethod
+    async def call_tool_on_instance(
+        self, instance: ManagedInstanceInfo, tool_name: str, tool_args: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Calls a specific MCP tool on a given backend instance.
+        Args:
+            instance: The ManagedInstanceInfo for the target backend instance.
+            tool_name: The name of the MCP tool to call.
+            tool_args: A dictionary of arguments for the tool.
+        Returns:
+            A dictionary representing the JSON response from the tool call.
+        """
+        pass
+    @abc.abstractmethod
+    async def list_tools_on_instance(self, instance: ManagedInstanceInfo) -> mcp_types.ListToolsResult:
+        """
+        Lists all available tools on a given backend instance.
+        Args:
+            instance: The ManagedInstanceInfo for the target backend instance.
+        Returns:
+            A ListToolsResult object containing the tools available on the instance.
+        """
+        pass
+    async def startup(self) -> None:
+        """
+        Optional: Perform any setup required when the orchestration client is initialized.
+        e.g., check Docker connection, authenticate with remote API.
+        """
+        pass
+    async def shutdown(self) -> None:
+        """
+        Optional: Perform any cleanup required when the orchestration client is shut down.
+        e.g., clean up globally shared resources if any were managed by this client.
+        """
+        pass