eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ Simplified process manager for MCP servers running in separate processes.
3
+
4
+ This module provides a simpler alternative to the conda-based process manager
5
+ for testing and development scenarios where full environment isolation is not required.
6
+ """
7
+
8
+ import asyncio
9
+ import os
10
+ import socket
11
+ import subprocess
12
+ import sys
13
+ import time
14
+ import uuid
15
+ from contextlib import AsyncExitStack
16
+ from typing import Dict, Tuple
17
+
18
+ from mcp.client.session import ClientSession
19
+ from mcp.client.streamable_http import streamablehttp_client
20
+ from mcp.types import Implementation
21
+
22
+
23
class SimpleServerProcessManager:
    """Manage the lifecycle of server subprocesses using the current Python environment.

    Each server is launched as
    ``<python_executable> <script_path> --port <port> --seed <seed>`` on a TCP
    port picked from ``port_range`` and is tracked by that port until stopped.
    """

    def __init__(
        self,
        script_path: str,
        python_executable: str = None,
        port_range: Tuple[int, int] = (10000, 11000),
    ):
        """
        Initialize the process manager.

        Args:
            script_path: Path to the server script to run
            python_executable: Python executable to use (``None`` or empty
                means the interpreter running this code, ``sys.executable``)
            port_range: Tuple of (min_port, max_port) for server instances;
                ``max_port`` itself is excluded
        """
        self.script_path = script_path
        self.python_executable = python_executable or sys.executable
        self.port_range = port_range
        # port -> (process, instance_id)
        self.processes: Dict[int, Tuple[subprocess.Popen, str]] = {}
        # Ports handed out by find_free_port() and not yet released.
        self.used_ports: set = set()

    def find_free_port(self) -> int:
        """
        Find and reserve an available TCP port within the configured range.

        Ports this manager has not handed out are probed in random order; only
        when every port in the range has already been handed out is the whole
        range probed again. Every candidate is tried exactly once, so a
        bindable port is never missed because of repeated random draws.

        Returns:
            Available port number (also recorded in ``used_ports``)

        Raises:
            RuntimeError: If no ports are available in the range
        """
        import random  # local import: not part of the module-level imports

        min_port, max_port = self.port_range
        port_span = range(min_port, max_port)

        candidates = [port for port in port_span if port not in self.used_ports]
        if not candidates:
            # Everything was handed out at some point; fall back to the full range.
            candidates = list(port_span)
        random.shuffle(candidates)

        for candidate_port in candidates:
            # A successful bind on localhost is the availability test.
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", candidate_port))
            except OSError:
                # Port is in use, try the next one
                continue

            self.used_ports.add(candidate_port)
            print(f"Allocated port {candidate_port} from range {min_port}-{max_port}")
            return candidate_port

        # No available ports found
        raise RuntimeError(f"No available ports in range {min_port}-{max_port}. Used ports: {len(self.used_ports)}")

    def start_server(self, seed: int) -> int:
        """
        Start a server instance with the given seed.

        Args:
            seed: Value passed to the server script via ``--seed``

        Returns:
            The port the new server instance was started on

        Raises:
            RuntimeError: If no port is available, or the server exits or does
                not accept connections before the readiness timeout
            OSError: If the subprocess cannot be launched at all
        """
        port = self.find_free_port()
        instance_id = f"simple-server-{uuid.uuid4().hex[:8]}"

        print(f"Starting server instance '{instance_id}' on port {port} with seed {seed}")

        env = os.environ.copy()
        env["PORT"] = str(port)

        # Command to run the server with the current Python environment
        cmd = [
            self.python_executable,
            self.script_path,
            "--port",
            str(port),
            "--seed",
            str(seed),
        ]

        # NOTE(review): stdout/stderr are captured through pipes that are only
        # read if the process exits during startup. A server writing more than
        # the OS pipe buffer holds could block on output - confirm whether the
        # output should be redirected elsewhere.
        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,  # Keep stderr separate to see error output
                text=True,
                env=env,
            )
        except Exception:
            # Nothing was launched, so do not keep the port reserved.
            self.used_ports.discard(port)
            raise

        self.processes[port] = (process, instance_id)

        # Wait for server to be ready with health check polling
        if not self._wait_for_server_ready(port, instance_id, process):
            # Clean up the failed instance: stop tracking it, release its port
            # and make sure a still-running process is not left behind.
            self.processes.pop(port, None)
            self.used_ports.discard(port)
            if process.poll() is None:
                self._terminate_process(process, instance_id)
            raise RuntimeError(f"Server instance '{instance_id}' failed to start or become ready")

        print(f"Server instance '{instance_id}' started successfully on port {port}")
        return port

    def _wait_for_server_ready(
        self, port: int, instance_id: str, process: subprocess.Popen, timeout: int = 15
    ) -> bool:
        """
        Wait for the server to be ready by polling its TCP port.

        Args:
            port: Server port
            instance_id: Server instance ID for logging
            process: Server process
            timeout: Maximum time to wait in seconds

        Returns:
            True if the port accepted a connection, False if the process
            exited first or the timeout elapsed
        """
        # Monotonic clock: unaffected by wall-clock adjustments while waiting.
        deadline = time.monotonic() + timeout
        health_check_failures = 0

        while time.monotonic() < deadline:
            # Check if process is still running
            if process.poll() is not None:
                stdout, stderr = process.communicate()
                print(f"Server instance '{instance_id}' process exited early")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

            # Simple TCP socket check instead of a full MCP health check
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1)
                    if s.connect_ex(("localhost", port)) == 0:
                        # Port is open, server is likely ready
                        return True
            except OSError as e:
                health_check_failures += 1
                # Print first few failures for debugging
                if health_check_failures <= 3:
                    print(f"Health check failed for instance '{instance_id}': {e}")

            # Wait before next check
            time.sleep(0.5)

        print(f"Server instance '{instance_id}' failed to become ready within {timeout} seconds")
        return False

    async def _check_mcp_health(self, port: int, instance_id: str) -> bool:
        """
        Check if MCP server is responding to requests.

        Args:
            port: Server port
            instance_id: Server instance ID for logging

        Returns:
            True if the MCP health check succeeded within 5 seconds,
            False otherwise
        """
        try:
            # MCP endpoint is served under the /mcp/ path
            server_url = f"http://localhost:{port}/mcp/"

            # Use asyncio timeout to prevent hanging (compatible with older Python versions)
            try:
                # Propagate the check's own verdict: _do_health_check reports
                # failures by returning False rather than raising.
                return await asyncio.wait_for(self._do_health_check(server_url), timeout=5.0)
            except asyncio.TimeoutError:
                return False

        except Exception as e:
            # Reduce verbosity - only show errors that are not plain connection failures
            error_str = str(e).lower()
            if not any(keyword in error_str for keyword in ["connection", "refused", "timeout", "unreachable"]):
                print(f"MCP health check error for instance '{instance_id}' on port {port}: {e}")
            # Connection errors are normal during startup
            return False

    async def _do_health_check(self, server_url: str) -> bool:
        """
        Perform the actual health check against ``server_url``.

        Opens a streamable-HTTP MCP client session, initializes it and lists
        the server's tools.

        Returns:
            True if all of those steps completed, False if any of them raised
        """
        try:
            async with AsyncExitStack() as exit_stack:
                read_stream, write_stream, _ = await exit_stack.enter_async_context(
                    streamablehttp_client(server_url, terminate_on_close=True)
                )

                client_info = Implementation(name="health-check", version="1.0.0")
                mcp_client = await exit_stack.enter_async_context(
                    ClientSession(read_stream, write_stream, client_info=client_info)
                )
                await mcp_client.initialize()

                # Try to list tools - this should be available for all MCP servers
                await mcp_client.list_tools()
                return True  # If we got here, MCP server is responding
        except Exception:
            return False
        return False  # This should never be reached, but added for mypy

    @staticmethod
    def _terminate_process(process: subprocess.Popen, instance_id: str) -> None:
        """Terminate ``process`` (kill after 5 seconds) and close its output pipes."""
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            print(f"Force killing server instance '{instance_id}'")
            process.kill()
            process.wait()

        # Release the pipe file objects created when the process was started.
        for stream in (process.stdout, process.stderr):
            if stream is not None:
                stream.close()

    def stop_server(self, port: int) -> None:
        """
        Stop the server instance on ``port`` and verify port cleanup.

        Does nothing if no server is tracked for ``port``.

        Args:
            port: Port of the server instance to stop
        """
        if port not in self.processes:
            return

        process, instance_id = self.processes[port]
        print(f"Stopping server instance '{instance_id}' on port {port}")

        self._terminate_process(process, instance_id)

        # Verify port is actually freed
        if self._verify_port_freed(port):
            print(f"✅ Port {port} successfully freed")
        else:
            print(f"⚠️ Warning: Port {port} may still be in use after server stop")

        # Clean up tracking
        del self.processes[port]
        self.used_ports.discard(port)

        print(f"Server instance '{instance_id}' stopped and cleaned up")

    def _verify_port_freed(self, port: int, max_retries: int = 3) -> bool:
        """
        Verify that a port is actually freed after stopping a server.

        Args:
            port: The port to check
            max_retries: Number of times to retry the check

        Returns:
            True if port is freed, False otherwise
        """
        for attempt in range(max_retries):
            try:
                # Try to bind to the port - if successful, it's free
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", port))
                    return True
            except OSError:
                # Port still in use, wait a bit and retry (no wait after the last attempt)
                if attempt < max_retries - 1:
                    time.sleep(0.5)

        return False

    def stop_all(self) -> None:
        """Stop all managed servers."""
        # Iterate over a snapshot: stop_server() removes entries as it goes.
        for port in list(self.processes):
            self.stop_server(port)