eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simplified process manager for MCP servers running in separate processes.
|
|
3
|
+
|
|
4
|
+
This module provides a simpler alternative to the conda-based process manager
|
|
5
|
+
for testing and development scenarios where full environment isolation is not required.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import os
|
|
10
|
+
import socket
|
|
11
|
+
import subprocess
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
import uuid
|
|
15
|
+
from contextlib import AsyncExitStack
|
|
16
|
+
from typing import Dict, Tuple
|
|
17
|
+
|
|
18
|
+
from mcp.client.session import ClientSession
|
|
19
|
+
from mcp.client.streamable_http import streamablehttp_client
|
|
20
|
+
from mcp.types import Implementation
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SimpleServerProcessManager:
    """Manages the lifecycle of server subprocesses using the current Python environment.

    Each server is launched as
    ``<python_executable> <script_path> --port <port> --seed <seed>`` with
    ``PORT`` also exported in the child's environment.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(
        self,
        script_path: str,
        python_executable: str = None,
        port_range: Tuple[int, int] = (10000, 11000),
    ):
        """
        Initialize the process manager.

        Args:
            script_path: Path to the server script to run
            python_executable: Python executable to use (defaults to current Python)
            port_range: Tuple of (min_port, max_port) for server instances;
                ``max_port`` is exclusive when candidate ports are generated
        """
        self.script_path = script_path
        self.python_executable = python_executable or sys.executable
        self.port_range = port_range
        # port -> (process, instance_id) for every server this manager started
        self.processes: Dict[int, Tuple[subprocess.Popen, str]] = {}
        # Ports handed out by find_free_port() and not yet released
        self.used_ports: set = set()

    def find_free_port(self) -> int:
        """
        Finds and returns an available TCP port within the configured range.

        Ports not yet handed out by this manager are tried in random order.
        Only when every port in the range is already tracked in
        ``used_ports`` are tracked ports probed as well. Each candidate is
        tried at most once, so every eligible port is covered before giving up.

        Returns:
            Available port number (also recorded in ``used_ports``)

        Raises:
            RuntimeError: If no ports are available in the range
        """
        # Function-local import: the module does not import ``random`` at file level.
        import random

        min_port, max_port = self.port_range
        all_ports = range(min_port, max_port)

        candidates = [p for p in all_ports if p not in self.used_ports]
        if not candidates:
            # If all ports have been used, try any port in range
            candidates = list(all_ports)
        # Shuffle once instead of sampling with replacement, so no attempt is
        # wasted on a port that was already probed.
        random.shuffle(candidates)

        for candidate_port in candidates:
            # Test if the port is actually available by binding to it
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", candidate_port))
            except OSError:
                # Port is in use, try next one
                continue

            self.used_ports.add(candidate_port)
            print(f"Allocated port {candidate_port} from range {min_port}-{max_port}")
            return candidate_port

        # No available ports found
        raise RuntimeError(f"No available ports in range {min_port}-{max_port}. Used ports: {len(self.used_ports)}")

    def start_server(self, seed: int) -> int:
        """
        Starts a server instance with the given seed.

        Args:
            seed: Value passed to the server script via ``--seed``

        Returns:
            The port the new server instance was started on

        Raises:
            RuntimeError: If no port is available, or if the server exits
                early or does not accept connections in time. On failure the
                child process is terminated and its port is released.
        """
        port = self.find_free_port()
        instance_id = f"simple-server-{uuid.uuid4().hex[:8]}"

        print(f"Starting server instance '{instance_id}' on port {port} with seed {seed}")

        env = os.environ.copy()
        env["PORT"] = str(port)

        # Command to run the server with the configured Python executable
        cmd = [
            self.python_executable,
            self.script_path,
            "--port",
            str(port),
            "--seed",
            str(seed),
        ]

        # NOTE(review): stdout/stderr are captured but only read if the process
        # exits during startup; a long-running server that writes a lot of
        # output could block once the pipe buffers fill. Confirm whether the
        # servers launched here are quiet enough for this to be safe.
        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,  # Keep stderr separate to see error output
                text=True,
                env=env,
            )
        except Exception:
            # The launch itself failed, so the reserved port must not stay marked as used
            self.used_ports.discard(port)
            raise

        self.processes[port] = (process, instance_id)

        # Wait for the server to start accepting connections
        if not self._wait_for_server_ready(port, instance_id, process):
            self._cleanup_failed_start(port, process)
            raise RuntimeError(f"Server instance '{instance_id}' failed to start or become ready")

        print(f"Server instance '{instance_id}' started successfully on port {port}")
        return port

    def _cleanup_failed_start(self, port: int, process: subprocess.Popen) -> None:
        """Terminate a server that never became ready and release its tracking state."""
        if process.poll() is None:
            # Still running (readiness timed out): do not leave an orphaned child behind
            process.terminate()
            try:
                process.communicate(timeout=5)
            except subprocess.TimeoutExpired:
                process.kill()
                process.communicate()

        self.processes.pop(port, None)
        self.used_ports.discard(port)

    def _wait_for_server_ready(
        self, port: int, instance_id: str, process: subprocess.Popen, timeout: int = 15
    ) -> bool:
        """
        Wait for the server to be ready by polling its TCP port.

        Readiness here only means that a TCP connection to ``localhost:port``
        succeeds; no MCP-level request is made.

        Args:
            port: Server port
            instance_id: Server instance ID for logging
            process: Server process
            timeout: Maximum time to wait in seconds

        Returns:
            True if server is ready, False if the process exited early or the
            port did not open within ``timeout`` seconds
        """
        # Monotonic clock so wall-clock adjustments cannot stretch or cut the timeout
        deadline = time.monotonic() + timeout
        health_check_failures = 0

        while time.monotonic() < deadline:
            # Check if process is still running
            if process.poll() is not None:
                stdout, stderr = process.communicate()
                print(f"Server instance '{instance_id}' process exited early")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

            # Simple TCP socket check instead of a full MCP health check
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1)
                    if s.connect_ex(("localhost", port)) == 0:
                        # Port is open, server is likely ready
                        return True
            except OSError as e:
                health_check_failures += 1
                # Print first few failures for debugging
                if health_check_failures <= 3:
                    print(f"Health check failed for instance '{instance_id}': {e}")

            # Wait before next check
            time.sleep(0.5)

        print(f"Server instance '{instance_id}' failed to become ready within {timeout} seconds")
        return False

    async def _check_mcp_health(self, port: int, instance_id: str) -> bool:
        """
        Check if MCP server is responding to requests.

        Args:
            port: Server port
            instance_id: Server instance ID for logging

        Returns:
            True if the MCP health check succeeded within 5 seconds,
            False otherwise
        """
        server_url = f"http://localhost:{port}/mcp/"

        try:
            # Propagate the check's own verdict: _do_health_check reports
            # failures by returning False rather than raising.
            return await asyncio.wait_for(self._do_health_check(server_url), timeout=5.0)
        except asyncio.TimeoutError:
            return False
        except Exception as e:
            # Reduce verbosity - connection errors are normal during startup
            error_str = str(e).lower()
            if not any(keyword in error_str for keyword in ["connection", "refused", "timeout", "unreachable"]):
                print(f"MCP health check error for instance '{instance_id}' on port {port}: {e}")
            return False

    async def _do_health_check(self, server_url: str) -> bool:
        """
        Perform the actual health check.

        Opens a streamable-HTTP MCP client session against ``server_url``,
        initializes it and lists the server's tools.

        Args:
            server_url: Full URL of the MCP endpoint to probe

        Returns:
            True if the session initialized and ``list_tools`` completed,
            False if any exception was raised along the way
        """
        try:
            async with AsyncExitStack() as exit_stack:
                read_stream, write_stream, _ = await exit_stack.enter_async_context(
                    streamablehttp_client(server_url, terminate_on_close=True)
                )

                client_info = Implementation(name="health-check", version="1.0.0")
                mcp_client = await exit_stack.enter_async_context(
                    ClientSession(read_stream, write_stream, client_info=client_info)
                )
                await mcp_client.initialize()

                # Try to list tools; only success/failure matters, not the listing itself
                await mcp_client.list_tools()
                return True
        except Exception:
            # Deliberate best-effort probe: any failure means "not healthy"
            return False

    def stop_server(self, port: int) -> None:
        """
        Stops the server instance and verifies port cleanup.

        Does nothing if no server is tracked for ``port``.

        Args:
            port: Port of the server instance to stop
        """
        if port not in self.processes:
            return

        process, instance_id = self.processes[port]
        print(f"Stopping server instance '{instance_id}' on port {port}")

        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            print(f"Force killing server instance '{instance_id}'")
            process.kill()
            process.wait()

        # Release the pipe objects created by start_server
        for stream in (process.stdout, process.stderr):
            if stream is not None:
                stream.close()

        # Verify port is actually freed
        if self._verify_port_freed(port):
            print(f"✅ Port {port} successfully freed")
        else:
            print(f"⚠️ Warning: Port {port} may still be in use after server stop")

        # Clean up tracking
        del self.processes[port]
        self.used_ports.discard(port)

        print(f"Server instance '{instance_id}' stopped and cleaned up")

    def _verify_port_freed(self, port: int, max_retries: int = 3) -> bool:
        """
        Verify that a port is actually freed after stopping a server.

        Args:
            port: The port to check
            max_retries: Number of times to retry the check

        Returns:
            True if port is freed, False otherwise
        """
        for attempt in range(max_retries):
            try:
                # Try to bind to the port - if successful, it's free
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", port))
                    return True
            except OSError:
                # Port still in use, wait a bit and retry (no sleep after the last attempt)
                if attempt < max_retries - 1:
                    time.sleep(0.5)

        return False

    def stop_all(self) -> None:
        """Stops all managed servers."""
        # Snapshot the keys: stop_server() removes entries while we iterate
        for port in list(self.processes):
            self.stop_server(port)
|