eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,210 @@
1
+ import asyncio
2
+ import logging
3
+ import signal
4
+ from contextlib import asynccontextmanager
5
+ from typing import Optional
6
+
7
+ import click
8
+ import uvicorn
9
+ import yaml
10
+ from mcp.server.streamable_http_manager import ( # MCP SDK component
11
+ StreamableHTTPSessionManager,
12
+ )
13
+ from starlette.applications import Starlette
14
+ from starlette.routing import Mount, Route # Import Mount
15
+
16
+ from eval_protocol.mcp_agent.config import AppConfig
17
+ from eval_protocol.mcp_agent.intermediary_server import RewardKitIntermediaryServer
18
+
19
logger = logging.getLogger(__name__)

# Module-level handles populated by main_async() once the server objects exist.
# They start out as None and are only assigned inside main_async().

# The running Uvicorn server; kept at module scope so that code outside
# main_async() (for example a signal handler) could reach it if needed.
_uvicorn_server_instance_ref: Optional[uvicorn.Server] = None

# The intermediary MCP server whose startup()/shutdown() are driven by the
# ASGI lifespan defined in main_async().
_mcp_server_instance_ref: Optional[RewardKitIntermediaryServer] = None

# NOTE: no module-level reference to the StreamableHTTPSessionManager is kept;
# main_async() owns it as a local and enters its run() context directly.
30
+
31
+
32
def _load_app_config(config_path: str) -> "Optional[AppConfig]":
    """
    Read the YAML file at ``config_path`` and build an ``AppConfig`` from it.

    Every failure is logged and reported by returning ``None`` rather than by
    raising, so the caller can simply stop.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The constructed ``AppConfig``, or ``None`` if the file is missing,
        is not valid YAML, is not a top-level mapping, or is rejected by
        ``AppConfig``.
    """
    try:
        # Explicit encoding: do not let the platform's locale decide how the
        # configuration file is decoded.
        with open(config_path, "r", encoding="utf-8") as f:
            raw_config = yaml.safe_load(f)
        # An empty file yields None and a scalar/list document yields a
        # non-dict; both would otherwise fail inside ``AppConfig(**raw_config)``
        # with an unhelpful "argument after ** must be a mapping" message.
        if not isinstance(raw_config, dict):
            raise TypeError(f"expected a top-level YAML mapping, got {type(raw_config).__name__}")
        return AppConfig(**raw_config)
    except FileNotFoundError:
        logger.error("Configuration file not found: %s", config_path)
    except yaml.YAMLError as e:
        logger.error("Error parsing YAML configuration file %s: %s", config_path, e)
    except Exception as e:
        # Deliberately broad: this is the top-level boundary for config
        # loading (I/O errors, validation errors raised by AppConfig, ...).
        logger.error("Error loading or validating AppConfig from %s: %s", config_path, e)
    return None


def _configure_server_logging(log_level_name: str, config_path: str) -> int:
    """
    Configure process-wide logging for the server.

    Args:
        log_level_name: Level name taken from the configuration (any case).
            Names that ``logging`` does not define fall back to ``INFO``.
        config_path: Path of the loaded configuration; only used in a log line.

    Returns:
        The numeric level that was applied to the root logger.
    """
    server_root_log_level_str = log_level_name.upper()
    server_root_log_level = getattr(logging, server_root_log_level_str, logging.INFO)

    logging.basicConfig(
        level=server_root_log_level,  # Root logger for the server process
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger.info(
        "Configuration loaded from %s. Server root log level set to %s.",
        config_path,
        server_root_log_level_str,
    )

    # Make the eval_protocol.mcp_agent namespace follow the configured level.
    rk_mcp_agent_logger = logging.getLogger("eval_protocol.mcp_agent")
    rk_mcp_agent_logger.setLevel(server_root_log_level)

    # Be explicit for the intermediary_server logger and any handlers
    # attached directly to it.
    intermediary_server_logger = logging.getLogger("eval_protocol.mcp_agent.intermediary_server")
    intermediary_server_logger.setLevel(server_root_log_level)
    for handler in intermediary_server_logger.handlers:
        handler.setLevel(server_root_log_level)

    # Quiet noisy third-party libraries unless the server itself runs at DEBUG.
    if server_root_log_level > logging.DEBUG:
        for lib_name in ("httpx", "mcp", "uvicorn", "starlette", "asyncio", "hpack", "httpcore"):
            logging.getLogger(lib_name).setLevel(logging.WARNING)

    logger.info(
        "Log level for 'eval_protocol.mcp_agent' namespace set to %s",
        logging.getLevelName(rk_mcp_agent_logger.getEffectiveLevel()),
    )
    return server_root_log_level


async def main_async(config_path: str, host: str, port: int):
    """
    Asynchronous main function to load config, set up the ASGI application,
    and run it with Uvicorn.

    Args:
        config_path: Path to the YAML configuration file.
        host: Interface for Uvicorn to bind to.
        port: TCP port for Uvicorn to listen on.

    Returns:
        None. If the configuration cannot be loaded the error is logged and
        the function returns without starting a server.
    """
    global _uvicorn_server_instance_ref, _mcp_server_instance_ref

    app_config = _load_app_config(config_path)
    if app_config is None:
        return

    # Configure logging early, before any server object is created.
    _configure_server_logging(app_config.log_level, config_path)

    # 1. Instantiate RewardKitIntermediaryServer (also published at module
    #    scope, as before).
    mcp_server = RewardKitIntermediaryServer(app_config=app_config)
    _mcp_server_instance_ref = mcp_server

    # 2. Instantiate StreamableHTTPSessionManager around the server's
    #    internal ``_mcp_server`` object.
    session_manager = StreamableHTTPSessionManager(
        app=mcp_server._mcp_server,
        event_store=None,
        json_response=True,
    )

    # 3. Create the Starlette app. Its lifespan manages only the
    #    RewardKitIntermediaryServer; the session manager is handled below by
    #    serving Uvicorn inside the context returned by session_manager.run().
    @asynccontextmanager
    async def mcp_server_lifespan_only(app_for_lifespan: Starlette):
        logger.info("MCP Server Lifespan: Starting up RewardKitIntermediaryServer...")
        await mcp_server.startup()
        logger.info("MCP Server Lifespan: RewardKitIntermediaryServer startup complete.")
        try:
            yield
        finally:
            # Run shutdown even when an exception or cancellation is thrown
            # into the lifespan at the yield point, so backend resources the
            # server acquired during startup are not leaked.
            logger.info("MCP Server Lifespan: Shutting down RewardKitIntermediaryServer...")
            await mcp_server.shutdown()
            logger.info("MCP Server Lifespan: RewardKitIntermediaryServer shutdown complete.")

    routes = [
        Mount("/mcp", app=session_manager.handle_request),
    ]
    starlette_app = Starlette(routes=routes, lifespan=mcp_server_lifespan_only)

    # 4. Configure Uvicorn
    config = uvicorn.Config(
        app=starlette_app,
        host=host,
        port=port,
        log_level=app_config.log_level.lower(),
        log_config=None,  # Prevent Uvicorn from overriding our basicConfig for app loggers
    )
    uvicorn_server = uvicorn.Server(config)
    _uvicorn_server_instance_ref = uvicorn_server

    logger.info("Starting RewardKit Intermediary MCP Server on %s:%s/mcp.", host, port)

    try:
        if hasattr(session_manager, "run"):
            # run() is expected to return an async context manager; verify
            # that before entering it so an unexpected object degrades to a
            # plain Uvicorn serve instead of crashing.
            sm_context_manager = session_manager.run()
            if hasattr(sm_context_manager, "__aenter__") and hasattr(sm_context_manager, "__aexit__"):
                logger.info(
                    "Attempting to run Uvicorn server within context returned by StreamableHTTPSessionManager.run()..."
                )
                async with sm_context_manager:  # type: ignore
                    logger.info("Context from StreamableHTTPSessionManager.run() entered. Serving Uvicorn...")
                    await uvicorn_server.serve()
            else:
                logger.error(
                    "Object returned by StreamableHTTPSessionManager.run() is not an async context manager. Falling back to direct Uvicorn serve."
                )
                await uvicorn_server.serve()
        else:
            logger.error(
                "StreamableHTTPSessionManager does not have a 'run' method. Falling back to direct Uvicorn serve."
            )
            await uvicorn_server.serve()

    except asyncio.CancelledError:
        logger.info("Server operation cancelled (main_async level).")
    except Exception as e:
        logger.error(
            f"An error occurred during server operation (main_async level): {e}",
            exc_info=True,
        )
    finally:
        logger.info("Uvicorn server has shut down (main_async finally).")
180
+
181
+
182
+ # Signal handling is now primarily managed by Uvicorn.
183
+ # If we needed custom logic *before* Uvicorn handles signals, it would be more complex.
184
+ # For now, relying on Uvicorn's graceful shutdown which triggers the ASGI lifespan.
185
+
186
+
187
# NOTE: click uses the function docstring below as the command's --help text,
# so it is user-facing output rather than plain documentation.
@click.command()
@click.option(
    "--config",
    "config_path",
    type=click.Path(exists=True, dir_okay=False),
    default="mcp_agent_config.yaml",
    help="Path to the YAML configuration file for the MCP agent server.",
)
@click.option(
    "--host",
    default="0.0.0.0",
    help="Host for the server to listen on.",
)
@click.option(
    "--port",
    type=int,
    default=8001,
    help="Port for the server to listen on.",
)
def main_cli(config_path: str, host: str, port: int):
    """
    CLI entry point to run the RewardKit Intermediary MCP Server using Uvicorn.
    """
    # Build the coroutine first, then hand it to a fresh event loop.
    server_coro = main_async(config_path, host, port)
    try:
        asyncio.run(server_coro)
    except KeyboardInterrupt:
        # Uvicorn normally intercepts Ctrl+C itself; this only covers the case
        # where the interrupt surfaces here.
        logger.info("CLI interrupted by KeyboardInterrupt. Uvicorn should handle shutdown.")
    finally:
        # Logged on every exit path, including errors.
        logger.info("MCP Agent Server CLI finished.")


if __name__ == "__main__":
    main_cli()
@@ -0,0 +1 @@
1
+ # MCP Agent Orchestration Package
@@ -0,0 +1,132 @@
1
+ import abc
2
+ from typing import Any, Dict, List, Literal, Optional
3
+
4
+ from mcp import types as mcp_types # Added import
5
+ from pydantic import BaseModel, Field
6
+
7
+ from eval_protocol.mcp_agent.config import BackendServerConfig
8
+
9
+
10
class ManagedInstanceInfo(BaseModel):
    """
    Stores all necessary details to interact with a provisioned backend instance.
    """

    # --- Required identification fields (no default) ---
    instance_id: str = Field(
        default=...,
        description="Client-facing ID for this instance within a session.",
    )
    backend_name_ref: str = Field(
        default=...,
        description="Reference name of the backend configuration used.",
    )

    # --- How the instance was created and how to talk to it ---
    orchestration_mode: Literal["local_docker", "remote_http_api"] = Field(
        default=...,
        description="Orchestration mode used for this instance.",
    )
    mcp_transport: Literal["http", "stdio"] = Field(
        default=...,
        description="MCP transport protocol used by this instance.",
    )
    mcp_endpoint_url: Optional[str] = Field(
        default=None,
        description="The full MCP endpoint URL for this instance if using HTTP transport (e.g., 'http://localhost:12345/mcp'). None for stdio.",
    )

    # --- Orchestrator bookkeeping ---
    # A fresh dict is created per instance via default_factory.
    internal_instance_details: Dict[str, Any] = Field(
        default_factory=dict,
        description="Orchestrator-specific details, e.g., {'container_id': '...', 'host_port': ...} for Docker or {'remote_instance_id': '...'}. Not directly used by the intermediary server logic after provisioning, but useful for deprovisioning.",
    )
    committed_image_tag: Optional[str] = Field(
        default=None,
        description="If local Docker orchestration created a temporary image via 'docker commit', this stores its tag for later cleanup.",
    )

    class Config:
        # Reject any field that is not declared above.
        extra = "forbid"
36
+
37
+
38
class AbstractOrchestrationClient(abc.ABC):
    """
    Interface that every orchestration client implements.

    An orchestration client owns the lifecycle of backend MCP server
    instances: it provisions them, deprovisions them, and relays tool
    listings and tool calls to them.
    """

    @abc.abstractmethod
    async def provision_instances(
        self,
        backend_config: BackendServerConfig,
        num_instances: int,
        session_id: str,
        # template_details might be specific to the backend type,
        # e.g., path to a database dump for DuckDB, or a directory for filesystem.
        template_details: Optional[Any] = None,
    ) -> List[ManagedInstanceInfo]:
        """
        Create ``num_instances`` backend instances from ``backend_config``.

        Stateful backends that fork their state from a template (e.g. local
        Docker with a template data path) might implement this as:

        1. Start a temporary "template" instance/container.
        2. Seed it from ``template_details`` or
           ``backend_config.template_data_path_host``.
        3. Commit that instance to a new temporary image (Docker).
        4. Launch ``num_instances`` from the temporary image.

        Stateless backends, or ones that need no template-based forking,
        can provision directly.

        Args:
            backend_config: Configuration of the backend type to provision.
            num_instances: How many instances to create.
            session_id: ID of the current intermediary session; useful for
                naming or tagging the created resources.
            template_details: Optional backend-specific initialization data
                for stateful instances, such as a data file path, a
                directory, or other structured data.

        Returns:
            One ManagedInstanceInfo per provisioned instance.
        """

    @abc.abstractmethod
    async def deprovision_instances(self, instances: List[ManagedInstanceInfo]) -> None:
        """
        Tear down (e.g. stop and remove) the given backend instances.

        Implementations are also responsible for cleaning up temporary
        resources such as committed Docker images.

        Args:
            instances: ManagedInstanceInfo objects for the instances to
                deprovision.
        """

    @abc.abstractmethod
    async def call_tool_on_instance(
        self,
        instance: ManagedInstanceInfo,
        tool_name: str,
        tool_args: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Invoke one MCP tool on a backend instance.

        Args:
            instance: ManagedInstanceInfo of the target backend instance.
            tool_name: Name of the MCP tool to call.
            tool_args: Arguments for the tool, as a dictionary.

        Returns:
            The tool call's JSON response as a dictionary.
        """

    @abc.abstractmethod
    async def list_tools_on_instance(self, instance: ManagedInstanceInfo) -> mcp_types.ListToolsResult:
        """
        Enumerate the tools a backend instance exposes.

        Args:
            instance: ManagedInstanceInfo of the target backend instance.

        Returns:
            A ListToolsResult describing the tools available on the instance.
        """

    async def startup(self) -> None:
        """
        Optional hook for setup work when the client is initialized,
        e.g. checking the Docker connection or authenticating with a remote
        API. The default implementation does nothing.
        """
        return None

    async def shutdown(self) -> None:
        """
        Optional hook for cleanup work when the client is shut down,
        e.g. releasing globally shared resources managed by this client.
        The default implementation does nothing.
        """
        return None