eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,481 @@
1
+ import json
2
+ import os
3
+ import re # Added for Serveo URL parsing
4
+ import shutil # Added for checking ssh availability
5
+ import signal
6
+ import subprocess
7
+ import time
8
+ from typing import IO, Any, Dict, List, Optional # Added IO, Any, List, Dict, Optional
9
+
10
+ try:
11
+ import requests
12
+
13
+ REQUESTS_AVAILABLE = True
14
+ except ImportError:
15
+ REQUESTS_AVAILABLE = False
16
+
17
+ # Store PIDs of started processes
18
+ managed_processes: Dict[int, Dict[str, Any]] = (
19
+ {}
20
+ ) # pid -> {process, command, log_file, log_file_path, env}
21
+
22
+ NGROK_API_URL = "http://127.0.0.1:4040/api/tunnels"
23
+
24
+
25
def start_process(
    command: List[str],
    log_file_path: str,
    cwd: Optional[str] = None,
    new_process_group: bool = True,
    env: Optional[Dict[str, str]] = None,
) -> Optional[subprocess.Popen]:
    """
    Starts a process in the background and logs its output.
    Stores the process information in ``managed_processes`` for later management.

    Args:
        command: A list representing the command and its arguments.
        log_file_path: Path to the file where stdout and stderr will be logged.
            The file is truncated; a missing parent directory is created.
        cwd: The working directory for the command. Defaults to current directory.
        new_process_group: Whether to start the process in a new session, and
            therefore a new process group (Unix-like systems only).
        env: Optional dictionary of environment variables layered on top of a
            copy of the current environment for the new process.

    Returns:
        The Popen object for the started process, or None if the process
        could not be started.
    """
    print(f"Starting process: {' '.join(str(c) for c in command)}")
    print(f"Logging output to: {log_file_path}")

    # A bare file name has an empty dirname and os.makedirs("") raises, so the
    # directory is only created when the path actually names one.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    log_file: IO[Any] = open(log_file_path, "w")

    process_env = os.environ.copy()
    if env:
        process_env.update(env)

    try:
        process = subprocess.Popen(
            command,
            stdout=log_file,
            stderr=log_file,
            cwd=cwd if cwd else os.getcwd(),
            # Same effect as preexec_fn=os.setsid (setsid() runs in the child),
            # without preexec_fn's documented thread-safety caveats.
            start_new_session=(os.name != "nt" and new_process_group),
            env=process_env,
        )
    except Exception as e:
        print(f"Failed to start process: {e}")
        log_file.close()  # Nothing was registered, so nobody else will close it
        return None

    # Registration happens outside the try block: once the child exists it must
    # always end up in the registry so it can be stopped later.
    managed_processes[process.pid] = {
        "process": process,
        "command": command,
        "log_file": log_file,
        "log_file_path": log_file_path,
        "is_ngrok": bool(command) and "ngrok" in str(command[0]),
        "env": env,
    }
    print(f"Process started with PID: {process.pid}")
    return process
84
+
85
+
86
+ # --- Ngrok functions ---
87
def get_ngrok_public_url(retries: int = 5, delay: int = 3) -> Optional[str]:
    """
    Queries the local ngrok API to get the public HTTPS URL.

    Args:
        retries: Maximum number of times the API at ``NGROK_API_URL`` is polled.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The ``public_url`` of the first tunnel whose ``proto`` is ``https``,
        or None when ``requests`` is not installed or no such tunnel was
        reported within ``retries`` attempts.
    """
    if not REQUESTS_AVAILABLE:
        print(
            "ERROR: 'requests' library is not installed. Cannot fetch ngrok URL automatically."
        )
        print("Please install it, e.g., pip install requests")
        return None

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        # Only announce (and perform) a retry when another attempt follows.
        retry_note = "" if is_last_attempt else f" Retrying in {delay}s..."
        prefix = f"Attempt {attempt + 1}/{retries}:"
        try:
            response = requests.get(NGROK_API_URL, timeout=5)
            response.raise_for_status()
            tunnels_data = response.json()
            for tunnel in tunnels_data.get("tunnels", []):
                public_url = tunnel.get("public_url") or ""
                if tunnel.get("proto") == "https" and public_url.startswith(
                    "https://"
                ):
                    print(f"Found ngrok public URL: {public_url}")
                    return public_url
            print(
                f"{prefix} HTTPS tunnel not found yet in ngrok API response.{retry_note}"
            )
        except requests.exceptions.ConnectionError:
            print(
                f"{prefix} ngrok API not yet available at {NGROK_API_URL}.{retry_note}"
            )
        except Exception as e:
            # Best-effort polling: any other failure (HTTP error, malformed
            # JSON, unexpected payload shape) just counts as a failed attempt.
            print(f"{prefix} Error fetching ngrok URL: {e}.{retry_note}")

        if not is_last_attempt:
            time.sleep(delay)

    print("ERROR: Failed to get ngrok public URL after multiple retries.")
    return None
124
+
125
+
126
def start_ngrok_and_get_url(
    local_port: int, ngrok_log_file: str, authtoken: Optional[str] = None
) -> tuple[Optional[subprocess.Popen], Optional[str]]:
    """
    Starts ngrok to expose a local port and retrieves its public HTTPS URL.

    Args:
        local_port: The local port number ngrok should forward to.
        ngrok_log_file: Path of the file receiving ngrok's output
            (ngrok is started with ``--log=stdout``).
        authtoken: Optional ngrok authtoken. When given it is handed to the
            ngrok subprocess through the ``NGROK_AUTHTOKEN`` environment
            variable; otherwise ngrok uses whatever the user pre-configured.

    Returns:
        A tuple (ngrok_process, public_url). (None, None) on failure, in which
        case any ngrok process started here has been stopped and unregistered.
    """
    # --log=stdout lets start_process capture ngrok's log in ngrok_log_file.
    ngrok_command = ["ngrok", "http", str(local_port), "--log=stdout"]

    try:
        # Check if ngrok command is available
        ngrok_version_process = subprocess.run(
            ["ngrok", "--version"], capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers both a missing binary (FileNotFoundError) and a
        # non-executable one (PermissionError).
        print(
            "ERROR: ngrok command not found or not executable. Please ensure ngrok is installed and in your PATH."
        )
        return None, None
    print(f"Found ngrok version: {ngrok_version_process.stdout.strip()}")

    # start_process merges this on top of a copy of os.environ.
    ngrok_env = {"NGROK_AUTHTOKEN": authtoken} if authtoken else None

    print(f"Attempting to start ngrok for port {local_port}...")
    # ngrok gets its own process group: stop_process signals the child's whole
    # group, which must never be the group this interpreter lives in.
    ngrok_process = start_process(
        ngrok_command, ngrok_log_file, new_process_group=True, env=ngrok_env
    )

    if not ngrok_process or ngrok_process.poll() is not None:
        print(f"ERROR: Failed to start ngrok. Check log: {ngrok_log_file}")
        # The caller only receives (None, None) and cannot clean up itself, so
        # an ngrok that started and exited right away is unregistered here
        # (this also closes its log file).
        if ngrok_process and ngrok_process.pid in managed_processes:
            stop_process(ngrok_process.pid)
        return None, None

    print(
        f"ngrok process started with PID {ngrok_process.pid}. Waiting for tunnel URL..."
    )
    # ngrok can take a moment to establish the tunnel and update its local API.
    time.sleep(8)

    public_url = get_ngrok_public_url()

    if not public_url:
        print(
            f"ERROR: Could not retrieve public URL from ngrok API. Check log: {ngrok_log_file}"
        )
        # If URL fetch fails, stop the ngrok process we started;
        # stop_process also removes it from managed_processes.
        stop_process(ngrok_process.pid)
        return None, None

    print(f"Successfully started ngrok and retrieved public URL: {public_url}")
    return ngrok_process, public_url
202
+
203
+
204
+ # --- End of Ngrok functions ---
205
+
206
+
207
def _discard_failed_tunnel(
    ssh_process: Optional[subprocess.Popen], log_file: Optional[IO[Any]]
) -> None:
    """Release whatever a failed Serveo start-up attempt had already acquired."""
    if ssh_process is not None and ssh_process.pid in managed_processes:
        # stop_process terminates ssh and closes the registered log file.
        stop_process(ssh_process.pid)
    elif log_file is not None and not log_file.closed:
        # Popen never succeeded, so nothing else owns the log file yet.
        log_file.close()


def start_serveo_and_get_url(
    local_port: int, log_file_path: str, timeout_seconds: int = 20
) -> tuple[subprocess.Popen | None, str | None]:
    """
    Starts Serveo.net SSH tunnel to expose a local port and retrieves its public HTTPS URL.
    The SSH process is added to managed_processes.

    Args:
        local_port: The local port number to expose (e.g., 8001).
        log_file_path: Path to the file where Serveo SSH client output will be logged.
        timeout_seconds: How long to wait for Serveo to provide a URL.
            NOTE: the deadline is only checked after a line of ssh output has
            been read, so an ssh client that prints nothing at all can block
            longer than this.

    Returns:
        A tuple (ssh_process, public_url). (None, None) on failure.
    """
    if not shutil.which("ssh"):
        print(
            "ERROR: 'ssh' command not found. Please ensure OpenSSH client is installed and in your PATH."
        )
        return None, None

    # A bare file name has an empty dirname and os.makedirs("") raises.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # /dev/null (NUL on Windows) keeps ssh from persisting the host key.
    known_hosts_file = "/dev/null" if os.name != "nt" else "NUL"

    # NOTE(review): StrictHostKeyChecking=no disables host key verification
    # for serveo.net; acceptable for development tunnels only.
    serveo_command = [
        "ssh",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        f"UserKnownHostsFile={known_hosts_file}",
        "-R",
        f"80:localhost:{local_port}",
        "serveo.net",
    ]

    print(f"Attempting to start Serveo.net tunnel for localhost:{local_port}...")
    print(f"Command: {' '.join(serveo_command)}")
    print(f"Logging Serveo SSH client output to: {log_file_path}")

    public_url = None
    ssh_process = None
    log_file = None

    try:
        log_file = open(log_file_path, "w")
        # stdout is a pipe so the URL can be parsed; stderr shares that pipe.
        ssh_process = subprocess.Popen(
            serveo_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,  # Decode output as text
            bufsize=1,  # Line-buffered
            # New session => own process group, so stop_process can signal it.
            start_new_session=(os.name != "nt"),
        )

        # Add to managed_processes early so it can be cleaned up if something goes wrong
        managed_processes[ssh_process.pid] = {
            "process": ssh_process,
            "command": serveo_command,
            "log_file": log_file,  # Stores what is read from the pipe below
            "log_file_path": log_file_path,
            "is_ngrok": False,
            "env": None,  # Same key set as entries created by start_process
        }
        print(
            f"Serveo SSH process started with PID: {ssh_process.pid}. Waiting for URL..."
        )

        url_pattern = re.compile(
            r"Forwarding HTTP traffic from (https://\S+\.serveo\.net)"
        )

        start_time = time.time()
        if ssh_process.stdout:
            for line in iter(ssh_process.stdout.readline, ""):
                log_file.write(line)
                log_file.flush()
                # Also print to console for live feedback
                print(f"[Serveo PID {ssh_process.pid}]: {line.strip()}")

                match = url_pattern.search(line)
                if match:
                    public_url = match.group(1)
                    print(f"Found Serveo public URL: {public_url}")
                    break

                if time.time() - start_time > timeout_seconds:
                    print(
                        f"ERROR: Timeout ({timeout_seconds}s) waiting for Serveo URL."
                    )
                    break
                if ssh_process.poll() is not None:  # Process terminated unexpectedly
                    print(
                        f"ERROR: Serveo SSH process terminated unexpectedly. Check log: {log_file_path}"
                    )
                    break

            # If loop exited because readline returned '', process ended.
            if ssh_process.poll() is not None and not public_url:
                print(
                    f"ERROR: Serveo SSH process ended before URL was found. Check log: {log_file_path}"
                )
        else:  # Should not happen if Popen was successful
            print("ERROR: SSH process stdout stream not available.")

    except FileNotFoundError:
        print(
            "ERROR: 'ssh' command not found. Please ensure OpenSSH client is installed and in your PATH."
        )
        _discard_failed_tunnel(ssh_process, log_file)
        return None, None
    except Exception as e:
        print(f"ERROR: An exception occurred while starting or monitoring Serveo: {e}")
        _discard_failed_tunnel(ssh_process, log_file)
        return None, None

    if not public_url:
        print("ERROR: Could not retrieve public URL from Serveo.net.")
        if ssh_process:  # If process was started, try to stop it
            stop_process(ssh_process.pid)
        return None, None

    # The ssh_process is kept running in the background by Popen.
    # It's up to the caller to manage its lifecycle (e.g., via stop_process or stop_all_processes).
    return ssh_process, public_url
343
+
344
+
345
def _signal_process_tree(process: subprocess.Popen, sig: int) -> None:
    """Send ``sig`` to the process group of ``process`` (POSIX only).

    When the child shares the caller's own process group (it was started
    without a new session), only the child itself is signalled: signalling
    that group would also hit the calling interpreter.

    Raises:
        ProcessLookupError: If the process no longer exists.
    """
    pgid = os.getpgid(process.pid)
    if pgid == os.getpgrp():
        process.send_signal(sig)
    else:
        os.killpg(pgid, sig)


def stop_process(pid: int):
    """
    Stops a managed process and its process group.
    Closes its log file and removes it from ``managed_processes``.

    A SIGTERM (``taskkill /F /T`` on Windows) is sent first; if the process
    has not exited after 5 seconds it is killed. Failures are reported on
    stdout and never raised.

    Args:
        pid: The PID of the process to stop.
    """
    proc_info = managed_processes.get(pid)
    if proc_info is None:
        print(f"Process with PID {pid} not found in managed list.")
        return

    process: subprocess.Popen = proc_info["process"]
    log_file: IO[Any] = proc_info["log_file"]
    command_str = " ".join(str(c) for c in proc_info["command"])

    print(f"Stopping process PID {pid} ({command_str})...")
    try:
        if os.name == "nt":
            # For Windows, taskkill is more reliable for process trees
            subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(pid)],
                check=True,
                capture_output=True,
            )
        else:
            _signal_process_tree(process, signal.SIGTERM)

        # Close our end of an output pipe (only present for piped processes).
        if process.stdout and not process.stdout.closed:
            process.stdout.close()
        process.wait(timeout=5)  # Wait for graceful termination
        print(f"Process PID {pid} terminated gracefully.")
    except subprocess.TimeoutExpired:
        print(f"Process PID {pid} did not terminate gracefully, sending SIGKILL...")
        # Errors raised here would bypass the sibling handlers below, so the
        # escalation step carries its own handling.
        try:
            if os.name == "nt":
                process.kill()
            else:
                _signal_process_tree(process, signal.SIGKILL)
            process.wait(timeout=5)  # Wait for kill
            print(f"Process PID {pid} killed.")
        except ProcessLookupError:
            print(f"Process PID {pid} already exited.")
        except Exception as e:
            print(f"Error stopping process PID {pid}: {e}")
    except ProcessLookupError:  # Process might have already exited
        print(f"Process PID {pid} already exited.")
    except Exception as e:
        print(f"Error stopping process PID {pid}: {e}")
    finally:
        if log_file and not log_file.closed:
            log_file.close()
        managed_processes.pop(pid, None)
405
+
406
+
407
def stop_all_processes():
    """
    Stops every process currently tracked in ``managed_processes``.
    """
    print("Stopping all managed processes...")
    # Snapshot the PIDs first: stop_process removes entries from the
    # dictionary while this loop is running.
    pending_pids = tuple(managed_processes)
    for managed_pid in pending_pids:
        stop_process(managed_pid)
    print("All managed processes have been requested to stop.")
416
+
417
+
418
if __name__ == "__main__":
    import atexit
    import sys

    # Example usage of the subprocess manager.

    # Register the cleanup before anything is started, so processes are
    # stopped even if the example aborts halfway through.
    atexit.register(stop_all_processes)

    log_dir = os.path.join(os.getcwd(), "logs")
    os.makedirs(log_dir, exist_ok=True)

    # Sleep through the running interpreter: identical on every platform and
    # independent of `sleep` / `timeout` being available on PATH.
    # For a real server, this would be [sys.executable, 'path/to/server_script.py'].
    sleep_command = [sys.executable, "-c", "import time; time.sleep(10)"]

    # Test basic process start/stop
    print("Starting a sample sleep process...")
    sleep_log = os.path.join(log_dir, "sleep_test.log")
    proc = start_process(sleep_command, sleep_log)

    if proc and proc.pid:
        print(f"Sleep process PID: {proc.pid}")
        print("Waiting for a few seconds before stopping...")
        time.sleep(3)
        stop_process(proc.pid)
    else:
        print("Failed to start sleep process.")

    print("\nStarting another process to test stop_all...")
    another_log = os.path.join(log_dir, "another_test.log")
    proc2 = start_process(sleep_command, another_log)
    if proc2 and proc2.pid:
        print(f"Another process PID: {proc2.pid}")
        time.sleep(1)
    else:
        print("Failed to start another process.")

    # Test ngrok (manual execution needed if you want to see this run;
    # requires ngrok in PATH and a service on port 8888):
    # ngrok_test_log = os.path.join(log_dir, "ngrok_test.log")
    # ngrok_proc, ngrok_url = start_ngrok_and_get_url(8888, ngrok_test_log)
    # if ngrok_proc and ngrok_url:
    #     print(f"Ngrok started: PID {ngrok_proc.pid}, URL {ngrok_url}")
    #     time.sleep(5)  # Keep it running for 5s
    # else:
    #     print("Failed to start ngrok for testing.")

    # Stop whatever is still running so the summary below is accurate; the
    # atexit hook registered above stays in place as a safety net.
    stop_all_processes()
    print("Subprocess manager example finished. Check logs in 'logs' directory.")
    print(
        "Remaining managed processes (should be empty if all stopped):",
        list(managed_processes),
    )
@@ -0,0 +1,86 @@
1
+ """
2
+ Fireworks Eval Protocol - Simplify reward modeling and evaluation for LLM RL fine-tuning.
3
+
4
+ A Python library for defining, testing, deploying, and using reward functions
5
+ for LLM fine-tuning, including launching full RL jobs on the Fireworks platform.
6
+
7
+ The library also provides an agent evaluation framework for testing and evaluating
8
+ tool-augmented models using self-contained task bundles.
9
+ """
10
+
11
+ import warnings
12
+
13
+ from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
14
+ from .auth import get_fireworks_api_key, get_fireworks_account_id
15
+ from .common_utils import load_jsonl
16
+ from .config import load_config, get_config, RewardKitConfig
17
+ from .mcp_env import (
18
+ OpenAIPolicy,
19
+ AnthropicPolicy,
20
+ MCPVectorEnv,
21
+ make,
22
+ rollout,
23
+ test_mcp,
24
+ )
25
+
26
+ # Try to import FireworksPolicy if available
27
+ try:
28
+ from .mcp_env import FireworksPolicy
29
+
30
+ _FIREWORKS_AVAILABLE = True
31
+ except (ImportError, AttributeError):
32
+ _FIREWORKS_AVAILABLE = False
33
+ from .models import EvaluateResult, Message, MetricResult
34
+ from .playback_policy import PlaybackPolicyBase
35
+ from .resources import create_llm_resource
36
+ from .reward_function import RewardFunction
37
+ from .typed_interface import reward_function
38
+
39
+ # Import submodules to make them available via eval_protocol.rewards, etc.
40
+ from . import rewards
41
+ from . import mcp
42
+
43
+ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
44
+
45
# Public API of the package. "FireworksPolicy" is deliberately absent from the
# static list: it is an optional import and is appended below only when it is
# actually available, so `from eval_protocol import *` never references a
# missing name and the entry is never duplicated.
__all__ = [
    # Core interfaces
    "Message",
    "MetricResult",
    "EvaluateResult",
    "reward_function",
    "RewardFunction",
    "scorer_to_reward_fn",
    "reward_fn_to_scorer",
    # Authentication
    "get_fireworks_api_key",
    "get_fireworks_account_id",
    # Configuration
    "load_config",
    "get_config",
    "RewardKitConfig",
    # Utilities
    "load_jsonl",
    # MCP Environment API
    "make",
    "rollout",
    "AnthropicPolicy",
    "OpenAIPolicy",
    "MCPVectorEnv",
    "test_mcp",
    # Playback functionality
    "PlaybackPolicyBase",
    # Resource management
    "create_llm_resource",
    # Submodules
    "rewards",
    "mcp",
]

# Add FireworksPolicy to exports if available
if _FIREWORKS_AVAILABLE:
    __all__.insert(__all__.index("OpenAIPolicy") + 1, "FireworksPolicy")
83
+
84
+ from . import _version
85
+
86
+ __version__ = _version.get_versions()["version"]
@@ -0,0 +1,10 @@
1
+ """
2
+ Main entry point for running eval_protocol as a module (``python -m eval_protocol``).
3
+ """
4
+
5
+ import sys
6
+
7
+ from eval_protocol.cli import main
8
+
9
+ if __name__ == "__main__":
10
+ sys.exit(main())
@@ -0,0 +1,21 @@
1
+
2
+ # This file was generated by 'versioneer.py' (0.29) from
3
+ # revision-control system data, or from the parent directory name of an
4
+ # unpacked source archive. Distribution tarballs contain a pre-generated copy
5
+ # of this file.
6
+
7
+ import json
8
+
9
+ version_json = '''
10
+ {
11
+ "date": "2025-07-26T21:49:07-0700",
12
+ "dirty": false,
13
+ "error": null,
14
+ "full-revisionid": "aca1f0cedf1248dc578ea7d3e3f6e962b6d9de69",
15
+ "version": "0.0.3"
16
+ }
17
+ ''' # END VERSION_JSON
18
+
19
+
20
def get_versions():
    """Decode the embedded ``version_json`` blob and return the parsed result."""
    parsed_versions = json.loads(version_json)
    return parsed_versions
@@ -0,0 +1 @@
1
+ # This file makes the 'adapters' directory a Python package.
@@ -0,0 +1,8 @@
1
+ """Deprecated adapter wrappers for Braintrust.
2
+
3
+ This module forwards imports to :mod:`eval_protocol.integrations.braintrust`.
4
+ """
5
+
6
+ from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
7
+
8
+ __all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer"]
@@ -0,0 +1,8 @@
1
+ """Deprecated adapter wrappers for TRL.
2
+
3
+ This module forwards imports to :mod:`eval_protocol.integrations.trl`.
4
+ """
5
+
6
+ from ..integrations.trl import create_trl_adapter
7
+
8
+ __all__ = ["create_trl_adapter"]
@@ -0,0 +1,29 @@
1
+ """
2
+ Reward Kit Agent Evaluation Framework V2 Components.
3
+
4
+ This package contains the core components for the new, resource-centric
5
+ agent evaluation framework, including the ForkableResource ABC, Orchestrator,
6
+ and concrete resource implementations.
7
+ """
8
+
9
+ from .orchestrator import Orchestrator
10
+
11
+ # Make key components easily importable from eval_protocol.agent
12
+ from .resource_abc import ForkableResource
13
+ from .resources import (
14
+ DockerResource,
15
+ FileSystemResource,
16
+ PythonStateResource,
17
+ SQLResource,
18
+ )
19
+ from .tool_registry import ToolRegistry
20
+
21
+ __all__ = [
22
+ "ForkableResource",
23
+ "Orchestrator",
24
+ "PythonStateResource",
25
+ "SQLResource",
26
+ "FileSystemResource",
27
+ "DockerResource",
28
+ "ToolRegistry",
29
+ ]