eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,177 @@
1
+ """
2
+ Generic process manager for MCP servers running in isolated Conda environments.
3
+
4
+ This module provides a reusable helper class to manage the lifecycle of server
5
+ subprocesses within dedicated Conda environments, ensuring dependency isolation.
6
+ """
7
+
8
+ import os
9
+ import socket
10
+ import subprocess
11
+ import time
12
+ import uuid
13
+ from typing import Dict, Tuple
14
+
15
+
16
class CondaServerProcessManager:
    """Manage server subprocesses that each run inside a dedicated Conda environment.

    For every server started, a fresh environment is cloned from
    ``conda_base_env``, the packages listed in ``requirements_path`` are
    installed into it with pip, and ``script_path`` is launched inside it via
    ``conda run``.  Stopping a server terminates the process and removes the
    environment again.
    """

    def __init__(
        self,
        script_path: str,
        requirements_path: str,
        conda_base_env: str = "base",
        port_range: Tuple[int, int] = (10000, 11000),
    ):
        """
        Initialize the process manager.

        Args:
            script_path: Path to the server script to run
            requirements_path: Path to requirements.txt for the environment
            conda_base_env: Base conda environment to clone from
            port_range: Tuple of (min_port, max_port) for server instances;
                ``max_port`` itself is excluded (half-open range)
        """
        self.script_path = script_path
        self.requirements_path = requirements_path
        self.conda_base_env = conda_base_env
        self.port_range = port_range
        # port -> (process, conda_env_name)
        self.processes: Dict[int, Tuple[subprocess.Popen, str]] = {}
        # Ports handed out by find_free_port() and not yet released.
        self.used_ports: set = set()

    def _create_conda_env(self, env_name: str):
        """Create ``env_name`` by cloning the base env and installing requirements.

        Raises:
            subprocess.CalledProcessError: If cloning or ``pip install`` exits
                with a non-zero status (both commands run with ``check=True``).
        """
        print(f"Creating conda environment '{env_name}'...")
        # Clone the base environment
        clone_cmd = [
            "conda",
            "create",
            "--name",
            env_name,
            "--clone",
            self.conda_base_env,
            "-y",
        ]
        subprocess.run(clone_cmd, check=True, capture_output=True, text=True)

        # Install specific requirements into the new environment
        pip_install_cmd = [
            "conda",
            "run",
            "-n",
            env_name,
            "pip",
            "install",
            "-r",
            self.requirements_path,
        ]
        subprocess.run(pip_install_cmd, check=True, capture_output=True, text=True)
        print(f"Environment '{env_name}' created and dependencies installed.")

    def find_free_port(self) -> int:
        """
        Find and reserve an available TCP port within the configured range.

        Every port in ``[min_port, max_port)`` is probed at most once, in
        random order.  Ports this manager has not handed out are tried first;
        ports it already handed out are only a fallback.  A port counts as
        available when a socket can be bound to it on ``localhost``.

        Returns:
            Available port number (also recorded in ``self.used_ports``)

        Raises:
            RuntimeError: If no ports are available in the range
        """
        import random

        min_port, max_port = self.port_range

        fresh_ports = []
        recycled_ports = []
        for port in range(min_port, max_port):
            (recycled_ports if port in self.used_ports else fresh_ports).append(port)
        random.shuffle(fresh_ports)
        random.shuffle(recycled_ports)

        # Walking a shuffled list guarantees each port is probed exactly once,
        # so the search cannot give up while an untested free port remains.
        for candidate_port in fresh_ports + recycled_ports:
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", candidate_port))
            except OSError:
                # Port is in use, try next one
                continue

            self.used_ports.add(candidate_port)
            print(f"Allocated port {candidate_port} from range {min_port}-{max_port}")
            return candidate_port

        # No available ports found
        raise RuntimeError(f"No available ports in range {min_port}-{max_port}. Used ports: {len(self.used_ports)}")

    def start_server(self, seed: int) -> int:
        """Create a new Conda env and start a server instance within it.

        Args:
            seed: Value passed to the server script as ``--seed``.

        Returns:
            The port the server was started with (passed as ``--port`` and
            exported as the ``PORT`` environment variable).

        Raises:
            RuntimeError: If no port is available.
            subprocess.CalledProcessError: If the environment cannot be created.
        """
        port = self.find_free_port()
        env_name = f"mcp-sim-env-{uuid.uuid4().hex[:8]}"

        try:
            self._create_conda_env(env_name)

            env = os.environ.copy()
            env["PORT"] = str(port)

            # Command to run the server inside the new conda environment
            cmd = [
                "conda",
                "run",
                "-n",
                env_name,
                "python",
                self.script_path,
                "--port",
                str(port),
                "--seed",
                str(seed),
            ]

            # NOTE(review): stdout/stderr are piped but never drained by this
            # class; a very chatty server could block once the pipe buffer
            # fills.  Kept as-is because callers may read the pipes through
            # self.processes.
            process = subprocess.Popen(
                cmd,
                env=env,  # the PORT variable prepared above was previously never handed to the child
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except Exception:
            # Nothing is listening on the port, so release the reservation.
            self.used_ports.discard(port)
            raise

        self.processes[port] = (process, env_name)
        time.sleep(3)  # Give the server more time to start up after env creation
        return port

    def stop_server(self, port: int):
        """Stop the server on ``port`` and remove its Conda environment.

        Unknown ports are ignored.  The port is dropped from the tracking
        structures even when shutdown or environment removal fails, so a
        failed cleanup does not leave a stale entry behind; the error itself
        is still propagated.

        Raises:
            subprocess.CalledProcessError: If ``conda env remove`` fails.
        """
        if port not in self.processes:
            return

        process, env_name = self.processes[port]
        print(f"Stopping server on port {port} and cleaning up environment '{env_name}'")

        try:
            process.terminate()
            try:
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                print(f"Force killing server on port {port}")
                process.kill()
                process.wait()

            # Remove the conda environment
            print(f"Removing conda environment '{env_name}'...")
            rm_cmd = ["conda", "env", "remove", "--name", env_name, "-y"]
            subprocess.run(rm_cmd, check=True, capture_output=True, text=True)
        finally:
            # Clean up tracking
            self.processes.pop(port, None)
            self.used_ports.discard(port)

        print(f"✅ Environment '{env_name}' removed and port {port} freed")

    def stop_all(self):
        """Stop all managed servers and clean up all environments.

        Every server is attempted even if an earlier one fails; the first
        error encountered is re-raised once all servers have been processed.
        """
        first_error = None
        for port in list(self.processes):
            try:
                self.stop_server(port)
            except Exception as exc:
                if first_error is None:
                    first_error = exc
        if first_error is not None:
            raise first_error
@@ -0,0 +1,11 @@
1
+ """
2
+ MCP Session Management
3
+
4
+ This module handles session management and vector environment operations.
5
+ """
6
+
7
+ from .manager import GeneralMCPVectorEnv
8
+
9
+ __all__ = [
10
+ "GeneralMCPVectorEnv",
11
+ ]
@@ -0,0 +1,228 @@
1
+ """
2
+ Session Management and Vector Environment
3
+
4
+ Handles MCPSession management and vector environment operations.
5
+ Extracted from mcp_env.py to improve modularity.
6
+ """
7
+
8
+ import asyncio
9
+ import json
10
+ import logging
11
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
12
+
13
+ from ..execution.manager import ExecutionManager
14
+ from ..types import DatasetRow, MCPSession, MCPToolCall
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ # TODO: rename this file or the other manager.py
20
class GeneralMCPVectorEnv:
    """
    General MCP vector environment that works with any MCP server.

    Holds one MCP session per dataset row and delegates connection handling
    and tool execution to an ``ExecutionManager``.  Prompts are produced from
    each row's template via a pluggable formatter rather than hardcoded logic.
    """

    def __init__(
        self,
        sessions: "List[MCPSession]",
        dataset_rows: "List[DatasetRow]",
        user_prompt_formatter: Optional[Callable] = None,
    ):
        """
        Initialize with dataset-driven configuration.

        Args:
            sessions: MCP sessions
            dataset_rows: Full dataset rows with prompts and context; must be
                the same length as ``sessions`` (row ``i`` pairs with session ``i``)
            user_prompt_formatter: Callback ``(template, observation, context)``
                used to format user prompts; defaults to ``_default_formatter``

        Raises:
            ValueError: If ``sessions`` and ``dataset_rows`` differ in length.
        """
        # Validate before building any collaborators.
        if len(sessions) != len(dataset_rows):
            raise ValueError(
                f"Sessions ({len(sessions)}) and dataset rows ({len(dataset_rows)}) must have same length"
            )

        self.sessions = sessions
        self.dataset_rows = dataset_rows
        self.user_prompt_formatter = user_prompt_formatter or self._default_formatter
        self.n = len(sessions)
        self.tool_schemas = []  # Discovered from MCP servers
        self.execution_manager = ExecutionManager()
        self.usage_stats = {}  # llm usage stats for monitoring

    async def reset(self, session: "MCPSession") -> Tuple[Any, List[Dict]]:
        """
        Reset a single session - establish connection, get tools and initial state.

        NOTE(review): earlier documentation described this as thread-safe and
        callable from worker threads; nothing in this method enforces that, so
        confirm against the connection manager before relying on it.

        Returns:
            Tuple of (initial observation, tool schemas discovered for the session).
        """
        connection_manager = self.execution_manager.connection_manager

        # Establish a persistent session for each environment.
        await connection_manager.initialize_session(session)

        # Get available tools from MCP server
        tool_schemas = await connection_manager.discover_tools(session)

        # PROPER MCP PATTERN: Get initial state from resources during session establishment
        initial_observation = await connection_manager.get_initial_state(session)

        # Update session state
        session.terminated = False
        session.last_observation = initial_observation

        return initial_observation, tool_schemas

    async def step(self, env_index: int, tool_call: "MCPToolCall") -> Tuple[Any, float, bool, Dict]:
        """
        Execute a tool call for a single environment.

        Two sentinel tool names end the episode without contacting the server:
        ``_playback_terminate`` and ``_no_tool_call``.  An already terminated
        session returns its last observation unchanged.

        Args:
            env_index: Index of the environment to step
            tool_call: Tool call to execute

        Returns:
            observation: New observation after executing the tool call
            reward: Reward from the environment
            done: Whether the environment is terminated
            info: Additional info from the environment

        Raises:
            ValueError: If ``env_index`` is outside ``[0, n)``.
        """
        if env_index >= self.n or env_index < 0:
            raise ValueError(f"Environment index {env_index} out of range [0, {self.n})")

        session = self.sessions[env_index]

        if session.terminated:
            return session.last_observation, 0.0, True, {}

        # Handle special playback termination signal
        if tool_call.tool_name == "_playback_terminate":
            logger.info("🎬 Session %s: Received playback termination signal", session.session_id)
            session.terminated = True
            return session.last_observation, 0.0, True, {"playback_terminated": True}

        # Handle special no-tool-call signal
        if tool_call.tool_name == "_no_tool_call":
            logger.info("🏁 Session %s: No tool call generated, episode likely ended", session.session_id)
            session.terminated = True
            # Tolerate a missing arguments mapping on the sentinel call.
            reason = (tool_call.arguments or {}).get("reason", "unknown")
            return (
                session.last_observation,
                0.0,
                True,
                {
                    "no_tool_call": True,
                    "reason": reason,
                },
            )

        # Execute the tool call via MCP protocol
        observation, reward, done, info = await self.execution_manager.connection_manager.call_tool(
            session, tool_call.tool_name, tool_call.arguments
        )

        # Update session state
        session.last_observation = observation
        session.terminated = done

        return observation, reward, done, info

    def format_user_prompt(self, env_index: int, observation: Any) -> Union[str, List[Dict[str, Any]]]:
        """
        Format user prompt dynamically for a single environment based on current observation.

        The row's ``user_prompt_template`` and ``environment_context`` are
        passed, together with ``observation``, to the configured formatter.

        Raises:
            ValueError: If ``env_index`` is outside ``[0, n)``.
        """
        if env_index >= self.n or env_index < 0:
            raise ValueError(f"Environment index {env_index} out of range [0, {self.n})")

        dataset_row = self.dataset_rows[env_index]

        # Use the callback to format the prompt
        return self.user_prompt_formatter(
            dataset_row.user_prompt_template,
            observation,
            dataset_row.environment_context,
        )

    def format_tool_response(self, obs: Any) -> Union[str, List[Dict[str, Any]]]:
        """
        Format an observation as the content of a tool response message.

        If ``obs`` is a dict with a truthy ``image_url`` entry, a two-part
        multimodal list is returned: the remaining keys as JSON text, plus the
        image URL.  Otherwise a string is returned (JSON for dicts, ``str()``
        for anything else).  ``obs`` itself is never modified.
        """
        if isinstance(obs, dict) and obs.get("image_url"):
            image_url = obs["image_url"]["url"]
            # Work on a copy: the observation may also be stored as session state.
            text_payload = {key: value for key, value in obs.items() if key != "image_url"}

            return [
                {
                    "type": "text",
                    "text": json.dumps(text_payload),
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                },
            ]

        return json.dumps(obs) if isinstance(obs, dict) else str(obs)

    def _default_formatter(self, template: str, obs: Any, context: Dict) -> Union[str, List[Dict[str, Any]]]:
        """
        Default user prompt formatter.

        Chooses what to substitute for ``{observation}`` in ``template``:

        * dict with ``image_url``: the dict without that key (the image is
          returned separately as multimodal content);
        * dict with an ``observation`` entry other than
          ``"default_initial_state"``: that entry;
        * dict still at ``"default_initial_state"`` with a ``session_id``: a
          placeholder message naming the session;
        * anything else: ``obs`` unchanged.

        ``obs`` itself is never modified.  ``context`` supplies the remaining
        template fields; ``None`` is treated as an empty mapping.

        Returns:
            Either a string (text-only) or a list of content parts
            (text + image) when a truthy ``image_url`` was present
        """
        # Extract formatted display from observation if available
        display_obs = obs
        image_dict = None

        if isinstance(obs, dict):
            # For visual environments like LunarLander, we have image_url
            if "image_url" in obs:
                image_dict = obs["image_url"]
                # Copy instead of pop(): obs may be the session's stored observation.
                display_obs = {key: value for key, value in obs.items() if key != "image_url"}
            # For other structured observations, try to extract meaningful display
            elif "observation" in obs and obs["observation"] != "default_initial_state":
                display_obs = obs["observation"]
            # If we still have default_initial_state, try to use position info
            elif obs.get("observation") == "default_initial_state" and "session_id" in obs:
                # This is the fallback case - we should have gotten the proper initial state from MCP resources
                display_obs = (
                    f"Initial game state (Session: {obs['session_id']})\nWaiting for grid data from server..."
                )

        formatted_prompt = template.format(observation=display_obs, **(context or {}))

        # If we have image data, return multimodal content
        if image_dict:
            return [
                {
                    "type": "text",
                    "text": formatted_prompt,
                },
                {
                    "type": "image_url",
                    "image_url": image_dict,
                },
            ]

        return formatted_prompt

    async def close(self):
        """Closes all MCP sessions."""
        print(f"🧹 Closing {self.n} MCP sessions...")
        await self.execution_manager.close_sessions(self.sessions)
        print("✅ All MCP sessions closed.")
225
+
226
+
227
+ # Keep the old MCPVectorEnv for backward compatibility
228
+ MCPVectorEnv = GeneralMCPVectorEnv