eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import re # Added for Serveo URL parsing
|
|
4
|
+
import shutil # Added for checking ssh availability
|
|
5
|
+
import signal
|
|
6
|
+
import subprocess
|
|
7
|
+
import time
|
|
8
|
+
from typing import IO, Any, Dict, List, Optional # Added IO, Any, List, Dict, Optional
|
|
9
|
+
|
|
10
|
+
# Optional dependency: `requests` is only used to query the local ngrok API.
try:
    import requests
except ImportError:
    REQUESTS_AVAILABLE = False
else:
    REQUESTS_AVAILABLE = True

# Registry of processes started by this module, keyed by PID.
# pid -> {process, command, log_file, log_file_path, is_ngrok, env}
managed_processes: Dict[int, Dict[str, Any]] = {}

# URL of the local ngrok API that is queried for tunnel information.
NGROK_API_URL = "http://127.0.0.1:4040/api/tunnels"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def start_process(
    command: List[str],
    log_file_path: str,
    cwd: Optional[str] = None,
    new_process_group: bool = True,
    env: Optional[Dict[str, str]] = None,
) -> Optional[subprocess.Popen]:
    """
    Starts a process in the background and logs its output.
    Stores the process information in ``managed_processes`` for later management.

    Args:
        command: A list representing the command and its arguments.
        log_file_path: Path to the file where stdout and stderr will be logged.
            A bare file name (no directory component) is written to the
            current working directory.
        cwd: The working directory for the command. Defaults to current directory.
        new_process_group: Whether to start the process in a new session/process
            group (Unix-like systems only; ignored on Windows).
        env: Optional dictionary of environment variables layered on top of a
            copy of the current environment.

    Returns:
        The Popen object for the started process, or None if it could not be
        started.

    Raises:
        OSError: If the log directory or log file cannot be created.
    """
    print(f"Starting process: {' '.join(str(c) for c in command)}")
    print(f"Logging output to: {log_file_path}")

    # Ensure log directory exists. os.path.dirname() returns "" for a bare file
    # name, and os.makedirs("") raises FileNotFoundError, so skip it in that case.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    log_file: IO[Any] = open(log_file_path, "w")

    process_env = os.environ.copy()
    if env:
        process_env.update(env)

    try:
        process = subprocess.Popen(
            command,
            stdout=log_file,
            stderr=log_file,
            cwd=cwd if cwd else os.getcwd(),
            # start_new_session performs setsid() in the child without running
            # Python code between fork and exec (unlike preexec_fn).
            start_new_session=(os.name != "nt" and new_process_group),
            env=process_env,
        )

        managed_processes[process.pid] = {
            "process": process,
            "command": command,
            "log_file": log_file,
            "log_file_path": log_file_path,
            "is_ngrok": "ngrok" in command[0] if command else False,
            "env": env,
        }
        print(f"Process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start process: {e}")
        log_file.close()  # Do not leak the log handle when the launch fails
        return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# --- Ngrok functions ---
|
|
87
|
+
def get_ngrok_public_url(retries: int = 5, delay: int = 3) -> Optional[str]:
    """
    Queries the local ngrok API to get the public HTTPS URL.

    Args:
        retries: Maximum number of times the API at ``NGROK_API_URL`` is queried.
        delay: Seconds to wait between two attempts. No wait happens after the
            final attempt.

    Returns:
        The first tunnel ``public_url`` whose ``proto`` is ``"https"``, or None
        if ``requests`` is unavailable or no such tunnel was found.
    """
    if not REQUESTS_AVAILABLE:
        print(
            "ERROR: 'requests' library is not installed. Cannot fetch ngrok URL automatically."
        )
        print("Please install it, e.g., pip install requests")
        return None

    for attempt in range(retries):
        attempts_remain = attempt < retries - 1
        outlook = f"Retrying in {delay}s..." if attempts_remain else "No retries left."
        prefix = f"Attempt {attempt + 1}/{retries}:"
        try:
            response = requests.get(NGROK_API_URL, timeout=5)
            response.raise_for_status()
            tunnels_data = response.json()
            for tunnel in tunnels_data.get("tunnels", []):
                # `or ""` guards against an explicit null public_url in the payload.
                public_url = tunnel.get("public_url") or ""
                if tunnel.get("proto") == "https" and public_url.startswith("https://"):
                    print(f"Found ngrok public URL: {public_url}")
                    return public_url
            print(
                f"{prefix} HTTPS tunnel not found yet in ngrok API response. {outlook}"
            )
        except requests.exceptions.ConnectionError:
            print(
                f"{prefix} ngrok API not yet available at {NGROK_API_URL}. {outlook}"
            )
        except Exception as e:
            print(f"{prefix} Error fetching ngrok URL: {e}. {outlook}")

        # Sleeping after the last attempt would only delay the failure report.
        if attempts_remain:
            time.sleep(delay)

    print("ERROR: Failed to get ngrok public URL after multiple retries.")
    return None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def start_ngrok_and_get_url(
    local_port: int, ngrok_log_file: str, authtoken: Optional[str] = None
) -> tuple[Optional[subprocess.Popen], Optional[str]]:
    """
    Starts ngrok to expose a local port and retrieves its public HTTPS URL.

    Args:
        local_port: The local port number ngrok should forward to.
        ngrok_log_file: Path of the file that receives ngrok's output.
        authtoken: Optional ngrok authtoken. When given it is handed to the
            ngrok process through the NGROK_AUTHTOKEN environment variable;
            when omitted ngrok relies on whatever the user pre-configured.

    Returns:
        A tuple (ngrok_process, public_url). (None, None) on failure.
    """
    ngrok_command = [
        "ngrok",
        "http",
        str(local_port),
        "--log=stdout",  # log to stdout so start_process captures it in the log file
    ]

    try:
        # Check if ngrok command is available
        ngrok_version_process = subprocess.run(
            ["ngrok", "--version"], capture_output=True, text=True, check=True
        )
        print(f"Found ngrok version: {ngrok_version_process.stdout.strip()}")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print(
            "ERROR: ngrok command not found or not executable. Please ensure ngrok is installed and in your PATH."
        )
        return None, None

    ngrok_env: Optional[Dict[str, str]] = None
    if authtoken:
        # Previously the argument was accepted but never used.
        ngrok_env = {"NGROK_AUTHTOKEN": authtoken}
        print(
            "Passing the supplied ngrok authtoken to ngrok via the NGROK_AUTHTOKEN environment variable."
        )

    print(f"Attempting to start ngrok for port {local_port}...")
    # ngrok gets its own process group: stop_process() signals the child's whole
    # group, which would include this manager if ngrok shared our group.
    ngrok_process = start_process(
        ngrok_command, ngrok_log_file, new_process_group=True, env=ngrok_env
    )

    if not ngrok_process or ngrok_process.poll() is not None:
        print(f"ERROR: Failed to start ngrok. Check log: {ngrok_log_file}")
        if ngrok_process:
            # ngrok started and exited immediately: it is already reaped by
            # poll(), so just drop the stale registry entry and close its log.
            stale_entry = managed_processes.pop(ngrok_process.pid, None)
            if stale_entry:
                stale_log = stale_entry.get("log_file")
                if stale_log and not stale_log.closed:
                    stale_log.close()
        return None, None

    print(
        f"ngrok process started with PID {ngrok_process.pid}. Waiting for tunnel URL..."
    )
    # ngrok can take a moment to establish the tunnel and update its API.
    time.sleep(8)

    public_url = get_ngrok_public_url()

    if not public_url:
        print(
            f"ERROR: Could not retrieve public URL from ngrok API. Check log: {ngrok_log_file}"
        )
        # If URL fetch fails, stop the ngrok process we started
        # (stop_process also removes it from managed_processes).
        stop_process(ngrok_process.pid)
        return None, None

    print(f"Successfully started ngrok and retrieved public URL: {public_url}")
    return ngrok_process, public_url
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# --- End of Ngrok functions ---
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def start_serveo_and_get_url(
    local_port: int, log_file_path: str, timeout_seconds: int = 20
) -> tuple[subprocess.Popen | None, str | None]:
    """
    Starts Serveo.net SSH tunnel to expose a local port and retrieves its public HTTPS URL.
    The SSH process is added to managed_processes.

    Args:
        local_port: The local port number to expose (e.g., 8001).
        log_file_path: Path to the file where Serveo SSH client output will be logged.
            A bare file name is written to the current working directory.
        timeout_seconds: How long to wait for Serveo to provide a URL. The
            deadline is only checked after each line of ssh output, because
            reading the pipe blocks.

    Returns:
        A tuple (ssh_process, public_url). (None, None) on failure.
    """
    if not shutil.which("ssh"):
        print(
            "ERROR: 'ssh' command not found. Please ensure OpenSSH client is installed and in your PATH."
        )
        return None, None

    # Ensure log directory exists. os.path.dirname() is "" for a bare file name
    # and os.makedirs("") raises, so only create a directory when there is one.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Discard host keys instead of persisting them; NUL is the Windows /dev/null.
    known_hosts_file = "/dev/null" if os.name != "nt" else "NUL"

    # NOTE(review): StrictHostKeyChecking=no means the serveo.net host key is
    # not verified.
    serveo_command = [
        "ssh",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        f"UserKnownHostsFile={known_hosts_file}",
        "-R",
        f"80:localhost:{local_port}",
        "serveo.net",
    ]

    print(f"Attempting to start Serveo.net tunnel for localhost:{local_port}...")
    print(f"Command: {' '.join(serveo_command)}")
    print(f"Logging Serveo SSH client output to: {log_file_path}")

    public_url = None
    ssh_process = None
    log_file = None

    def _cleanup_failed_start() -> None:
        """Stop a registered ssh process, or close the log if nothing was registered."""
        if ssh_process and ssh_process.pid in managed_processes:
            stop_process(ssh_process.pid)  # also closes the log file
        elif log_file and not log_file.closed:
            log_file.close()

    try:
        log_file = open(log_file_path, "w")
        # stdout is piped so the URL can be parsed; stderr is merged into it.
        ssh_process = subprocess.Popen(
            serveo_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # Line-buffered
            # Own session/process group so stop_process can terminate the group.
            start_new_session=(os.name != "nt"),
        )

        # Add to managed_processes early so it can be cleaned up if something goes wrong
        managed_processes[ssh_process.pid] = {
            "process": ssh_process,
            "command": serveo_command,
            "log_file": log_file,  # Receives the lines read from the pipe below
            "log_file_path": log_file_path,
            "is_ngrok": False,
            "env": None,
        }
        print(
            f"Serveo SSH process started with PID: {ssh_process.pid}. Waiting for URL..."
        )

        url_pattern = re.compile(
            r"Forwarding HTTP traffic from (https://\S+\.serveo\.net)"
        )

        start_time = time.time()
        if ssh_process.stdout:
            # NOTE: readline() blocks, so a silent ssh process can exceed
            # timeout_seconds; the checks below run once per received line.
            for line in iter(ssh_process.stdout.readline, ""):
                log_file.write(line)
                log_file.flush()
                print(f"[Serveo PID {ssh_process.pid}]: {line.strip()}")

                match = url_pattern.search(line)
                if match:
                    public_url = match.group(1)
                    print(f"Found Serveo public URL: {public_url}")
                    # NOTE(review): nothing reads the pipe after this point, so
                    # later ssh output is not logged.
                    break

                if time.time() - start_time > timeout_seconds:
                    print(
                        f"ERROR: Timeout ({timeout_seconds}s) waiting for Serveo URL."
                    )
                    break
                if ssh_process.poll() is not None:  # Process terminated unexpectedly
                    print(
                        f"ERROR: Serveo SSH process terminated unexpectedly. Check log: {log_file_path}"
                    )
                    break

            # If loop exited because readline returned '', process ended.
            if ssh_process.poll() is not None and not public_url:
                print(
                    f"ERROR: Serveo SSH process ended before URL was found. Check log: {log_file_path}"
                )
        else:  # Should not happen if Popen was successful
            print("ERROR: SSH process stdout stream not available.")

    except FileNotFoundError:
        print(
            "ERROR: 'ssh' command not found. Please ensure OpenSSH client is installed and in your PATH."
        )
        _cleanup_failed_start()
        return None, None
    except Exception as e:
        print(f"ERROR: An exception occurred while starting or monitoring Serveo: {e}")
        _cleanup_failed_start()
        return None, None

    if not public_url:
        print("ERROR: Could not retrieve public URL from Serveo.net.")
        if ssh_process:  # If process was started, try to stop it
            stop_process(ssh_process.pid)
        return None, None

    # The ssh_process keeps running in the background; the caller manages its
    # lifecycle (e.g., via stop_process or stop_all_processes).
    return ssh_process, public_url
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def stop_process(pid: int):
    """
    Stops a managed process and its process group.
    Closes its log file and removes it from ``managed_processes``.

    On POSIX the process receives SIGTERM, then SIGKILL if it has not exited
    within 5 seconds. When the child shares this process's group, only the
    child is signalled so the manager does not signal itself.

    Args:
        pid: The PID of the process to stop.
    """
    if pid not in managed_processes:
        print(f"Process with PID {pid} not found in managed list.")
        return

    proc_info = managed_processes[pid]
    process: subprocess.Popen = proc_info["process"]
    log_file: IO[Any] = proc_info["log_file"]
    command_str = " ".join(str(c) for c in proc_info["command"])

    print(f"Stopping process PID {pid} ({command_str})...")
    try:
        if os.name == "nt":
            # For Windows, taskkill is more reliable for process trees
            subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(pid)],
                check=True,
                capture_output=True,
            )
        else:
            _signal_posix_process(process, signal.SIGTERM)

        if process.stdout and not process.stdout.closed:
            process.stdout.close()

        try:
            process.wait(timeout=5)  # Wait for graceful termination
            print(f"Process PID {pid} terminated gracefully.")
        except subprocess.TimeoutExpired:
            print(f"Process PID {pid} did not terminate gracefully, sending SIGKILL...")
            if os.name == "nt":
                process.kill()
            else:
                _signal_posix_process(process, signal.SIGKILL)
            # Errors raised here are handled by the outer except clauses
            # instead of escaping the function.
            process.wait(timeout=5)
            print(f"Process PID {pid} killed.")
    except ProcessLookupError:  # Process might have already exited
        print(f"Process PID {pid} already exited.")
    except subprocess.TimeoutExpired:
        print(f"Error stopping process PID {pid}: still running after SIGKILL.")
    except Exception as e:
        print(f"Error stopping process PID {pid}: {e}")
    finally:
        if log_file and not log_file.closed:
            log_file.close()
        managed_processes.pop(pid, None)


def _signal_posix_process(process: subprocess.Popen, sig: int) -> None:
    """Send *sig* to the child's process group, or only to the child if it shares ours."""
    child_pgid = os.getpgid(process.pid)
    if child_pgid == os.getpgrp():
        # The child was started without its own session; killpg() on this group
        # would also signal the calling process.
        process.send_signal(sig)
    else:
        os.killpg(child_pgid, sig)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def stop_all_processes():
    """
    Stops every process currently tracked in ``managed_processes``.
    """
    print("Stopping all managed processes...")
    # Snapshot the PIDs first: stop_process() removes entries as it goes.
    tracked_pids = list(managed_processes)
    for tracked_pid in tracked_pids:
        stop_process(tracked_pid)
    print("All managed processes have been requested to stop.")
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
if __name__ == "__main__":
    import atexit

    # Example usage. Cleanup is registered first so that any process started
    # below is stopped at interpreter exit, even if the demo raises midway.
    atexit.register(stop_all_processes)

    log_dir = os.path.join(os.getcwd(), "logs")
    os.makedirs(log_dir, exist_ok=True)

    # Test basic process start/stop
    print("Starting a sample sleep process...")
    sleep_log = os.path.join(log_dir, "sleep_test.log")
    # Simple sleep command for the example (differs between Windows and POSIX)
    sleep_command = ["timeout", "10"] if os.name == "nt" else ["sleep", "10"]
    proc = start_process(sleep_command, sleep_log)

    if proc and proc.pid:  # Check if proc is not None and has a pid
        print(f"Sleep process PID: {proc.pid}")
        print("Waiting for a few seconds before stopping...")
        time.sleep(3)
        stop_process(proc.pid)
    else:
        print("Failed to start sleep process.")

    print("\nStarting another process to test stop_all...")
    another_log = os.path.join(log_dir, "another_test.log")
    another_sleep_command = ["timeout", "10"] if os.name == "nt" else ["sleep", "10"]
    proc2 = start_process(another_sleep_command, another_log)
    if proc2 and proc2.pid:
        print(f"Another process PID: {proc2.pid}")
        time.sleep(1)
        # Left running on purpose: the atexit handler stops it.
    else:
        print("Failed to start another process.")

    # Test ngrok (manual execution needed if you want to see this run)
    # print("\nTesting ngrok start (requires ngrok in PATH and a service on port 8888)...")
    # ngrok_test_log = os.path.join(log_dir, "ngrok_test.log")
    # ngrok_proc, ngrok_url = start_ngrok_and_get_url(8888, ngrok_test_log)
    # if ngrok_proc and ngrok_url:
    #     print(f"Ngrok started: PID {ngrok_proc.pid}, URL {ngrok_url}")
    #     time.sleep(5)  # Keep it running for 5s
    #     # stop_process(ngrok_proc.pid)  # stop_all_processes will handle it
    # else:
    #     print("Failed to start ngrok for testing.")

    print("Subprocess manager example finished. Check logs in 'logs' directory.")
    # The second process is still registered here; it is stopped by the atexit
    # handler after this line prints.
    print(
        "Remaining managed processes (should be empty if all stopped):",
        managed_processes.keys(),
    )
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fireworks Eval Protocol - Simplify reward modeling and evaluation for LLM RL fine-tuning.
|
|
3
|
+
|
|
4
|
+
A Python library for defining, testing, deploying, and using reward functions
|
|
5
|
+
for LLM fine-tuning, including launching full RL jobs on the Fireworks platform.
|
|
6
|
+
|
|
7
|
+
The library also provides an agent evaluation framework for testing and evaluating
|
|
8
|
+
tool-augmented models using self-contained task bundles.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
|
|
14
|
+
from .auth import get_fireworks_api_key, get_fireworks_account_id
|
|
15
|
+
from .common_utils import load_jsonl
|
|
16
|
+
from .config import load_config, get_config, RewardKitConfig
|
|
17
|
+
from .mcp_env import (
|
|
18
|
+
OpenAIPolicy,
|
|
19
|
+
AnthropicPolicy,
|
|
20
|
+
MCPVectorEnv,
|
|
21
|
+
make,
|
|
22
|
+
rollout,
|
|
23
|
+
test_mcp,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Try to import FireworksPolicy if available
|
|
27
|
+
try:
|
|
28
|
+
from .mcp_env import FireworksPolicy
|
|
29
|
+
|
|
30
|
+
_FIREWORKS_AVAILABLE = True
|
|
31
|
+
except (ImportError, AttributeError):
|
|
32
|
+
_FIREWORKS_AVAILABLE = False
|
|
33
|
+
from .models import EvaluateResult, Message, MetricResult
|
|
34
|
+
from .playback_policy import PlaybackPolicyBase
|
|
35
|
+
from .resources import create_llm_resource
|
|
36
|
+
from .reward_function import RewardFunction
|
|
37
|
+
from .typed_interface import reward_function
|
|
38
|
+
|
|
39
|
+
# Import submodules to make them available via eval_protocol.rewards, etc.
|
|
40
|
+
from . import rewards
|
|
41
|
+
from . import mcp
|
|
42
|
+
|
|
43
|
+
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")

# Public API of the package. "FireworksPolicy" is intentionally not listed
# here: it is inserted below only when its import succeeded, so that
# `from eval_protocol import *` never references a missing name and the entry
# is never duplicated.
__all__ = [
    # Core interfaces
    "Message",
    "MetricResult",
    "EvaluateResult",
    "reward_function",
    "RewardFunction",
    "scorer_to_reward_fn",
    "reward_fn_to_scorer",
    # Authentication
    "get_fireworks_api_key",
    "get_fireworks_account_id",
    # Configuration
    "load_config",
    "get_config",
    "RewardKitConfig",
    # Utilities
    "load_jsonl",
    # MCP Environment API
    "make",
    "rollout",
    "AnthropicPolicy",
    "OpenAIPolicy",
    "MCPVectorEnv",
    "test_mcp",
    # Playback functionality
    "PlaybackPolicyBase",
    # Resource management
    "create_llm_resource",
    # Submodules
    "rewards",
    "mcp",
]

# Add FireworksPolicy to exports if available
if _FIREWORKS_AVAILABLE:
    __all__.insert(__all__.index("OpenAIPolicy") + 1, "FireworksPolicy")
|
|
83
|
+
|
|
84
|
+
from . import _version

# Package version string, taken from the "version" entry returned by
# _version.get_versions().
__version__ = _version.get_versions()["version"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
# This file was generated by 'versioneer.py' (0.29) from
|
|
3
|
+
# revision-control system data, or from the parent directory name of an
|
|
4
|
+
# unpacked source archive. Distribution tarballs contain a pre-generated copy
|
|
5
|
+
# of this file.
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
version_json = '''
|
|
10
|
+
{
|
|
11
|
+
"date": "2025-07-26T21:49:07-0700",
|
|
12
|
+
"dirty": false,
|
|
13
|
+
"error": null,
|
|
14
|
+
"full-revisionid": "aca1f0cedf1248dc578ea7d3e3f6e962b6d9de69",
|
|
15
|
+
"version": "0.0.3"
|
|
16
|
+
}
|
|
17
|
+
''' # END VERSION_JSON
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_versions():
    """Return the version metadata parsed from the module-level ``version_json`` string."""
    return json.loads(version_json)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This file makes the 'adapters' directory a Python package.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Deprecated adapter wrappers for Braintrust.
|
|
2
|
+
|
|
3
|
+
This module forwards imports to :mod:`eval_protocol.integrations.braintrust`.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
|
|
7
|
+
|
|
8
|
+
__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reward Kit Agent Evaluation Framework V2 Components.
|
|
3
|
+
|
|
4
|
+
This package contains the core components for the new, resource-centric
|
|
5
|
+
agent evaluation framework, including the ForkableResource ABC, Orchestrator,
|
|
6
|
+
and concrete resource implementations.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .orchestrator import Orchestrator
|
|
10
|
+
|
|
11
|
+
# Make key components easily importable from eval_protocol.agent
|
|
12
|
+
from .resource_abc import ForkableResource
|
|
13
|
+
from .resources import (
|
|
14
|
+
DockerResource,
|
|
15
|
+
FileSystemResource,
|
|
16
|
+
PythonStateResource,
|
|
17
|
+
SQLResource,
|
|
18
|
+
)
|
|
19
|
+
from .tool_registry import ToolRegistry
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"ForkableResource",
|
|
23
|
+
"Orchestrator",
|
|
24
|
+
"PythonStateResource",
|
|
25
|
+
"SQLResource",
|
|
26
|
+
"FileSystemResource",
|
|
27
|
+
"DockerResource",
|
|
28
|
+
"ToolRegistry",
|
|
29
|
+
]
|