eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQLResource: A ForkableResource for managing SQL database states, initially focusing on SQLite.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import sqlite3
|
|
8
|
+
import uuid
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from ..resource_abc import ForkableResource
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SQLResource(ForkableResource):
    """
    A ForkableResource for managing SQL database states, primarily SQLite.

    Manages a SQLite database file, allowing it to be initialized with a schema
    and seed data, forked (by copying the DB file), checkpointed (by copying),
    and restored.

    Attributes:
        _config (Dict[str, Any]): Configuration for the resource.
        _db_path (Optional[Path]): Path to the current SQLite database file.
        _base_db_path (Optional[Path]): Path of the database this instance was
            derived from. Set by setup(), fork() and restore(); fork() itself
            always copies the file at _db_path.
        _temp_dir (Path): Directory to store database files.
    """

    def __init__(self) -> None:
        """Create an unconfigured resource and make sure the temp DB directory exists."""
        self._config: Dict[str, Any] = {}
        self._db_path: Optional[Path] = None
        self._base_db_path: Optional[Path] = None
        # Consider making temp_dir configurable or using a more robust temp solution
        self._temp_dir = Path("./.rk_temp_dbs").resolve()  # Ensure absolute path
        self._temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_db_connection(self) -> sqlite3.Connection:
        """
        Open a new connection to the current database file.

        Raises:
            ConnectionError: If no database path has been assigned yet.
        """
        if not self._db_path:
            raise ConnectionError("Database path not set. Call setup() or fork() first.")
        # Set timeout to prevent indefinite hangs
        return sqlite3.connect(str(self._db_path), timeout=10)

    async def setup(self, config: Dict[str, Any]) -> None:
        """
        Initializes the SQLite database.

        Any existing file with the same name in the temp directory is removed
        first. Scripts are then applied in this order: schema file, schema SQL,
        seed data file, seed SQL. A configured file path that does not exist is
        skipped without error.

        Args:
            config: Configuration dictionary. Expected keys:
                - 'db_type' (str): Must be 'sqlite' (the default when omitted).
                - 'db_name' (Optional[str]): Name for the database file. Defaults to a UUID-based name.
                - 'schema_file' (Optional[str]): Path to an SQL file to execute for schema setup.
                - 'seed_data_file' (Optional[str]): Path to an SQL file for initial data seeding.
                - 'schema_sql' (Optional[str]): SQL string for schema setup.
                - 'seed_sql' (Optional[str]): SQL string for initial data seeding.

        Raises:
            ValueError: If 'db_type' is anything other than 'sqlite'.
        """
        self._config = config.copy()
        db_type = self._config.get("db_type", "sqlite")
        if db_type != "sqlite":
            raise ValueError("SQLResource currently only supports 'sqlite'.")

        db_name = self._config.get("db_name", f"db_{uuid.uuid4().hex}.sqlite")
        self._base_db_path = self._temp_dir / db_name
        self._db_path = self._base_db_path  # Initially, the current DB is the base DB

        # Ensure a fresh start if the base DB file already exists from a previous run
        if self._base_db_path.exists():
            self._base_db_path.unlink()

        conn = self._get_db_connection()
        try:
            with conn:
                # Schema first, then seed data; within each stage the file is
                # applied before the inline SQL string.
                for file_key, sql_key in (
                    ("schema_file", "schema_sql"),
                    ("seed_data_file", "seed_sql"),
                ):
                    script_file = self._config.get(file_key)
                    if script_file and Path(script_file).exists():
                        # Read as UTF-8 explicitly so the result does not depend
                        # on the platform's default encoding.
                        with open(script_file, "r", encoding="utf-8") as f:
                            conn.executescript(f.read())

                    inline_sql = self._config.get(sql_key)
                    if inline_sql:
                        conn.executescript(inline_sql)
        finally:
            conn.close()

    async def fork(self) -> "SQLResource":
        """
        Creates a new SQLResource instance with a copy of this resource's
        current database file.

        If called on an already forked resource, it forks from its current state.

        Raises:
            RuntimeError: If the current database file does not exist.
        """
        if not self._db_path or not self._db_path.exists():
            raise RuntimeError("Cannot fork: original database does not exist or setup was not called.")

        forked_resource = SQLResource()
        forked_resource._config = self._config.copy()
        forked_resource._temp_dir = self._temp_dir  # Share the same temp dir base

        # The new fork's base is the current state of this resource
        forked_resource._base_db_path = self._db_path

        # Create a new unique DB file for this fork
        forked_db_name = f"fork_{uuid.uuid4().hex}.sqlite"
        forked_resource._db_path = self._temp_dir / forked_db_name

        shutil.copyfile(str(self._db_path), str(forked_resource._db_path))
        return forked_resource

    async def checkpoint(self) -> Dict[str, Any]:
        """
        Returns a serializable representation of the resource's current state.

        For SQLite, this involves copying the database file to a checkpoint
        location in the temp directory and returning the path.

        Returns:
            A dict with 'db_type' ("sqlite") and 'checkpoint_path' (str).

        Raises:
            RuntimeError: If the current database file does not exist.
        """
        if not self._db_path or not self._db_path.exists():
            raise RuntimeError("Cannot checkpoint: database does not exist.")

        checkpoint_name = f"checkpoint_{self._db_path.stem}_{uuid.uuid4().hex}.sqlite"
        checkpoint_path = self._temp_dir / checkpoint_name
        shutil.copyfile(str(self._db_path), str(checkpoint_path))
        return {"db_type": "sqlite", "checkpoint_path": str(checkpoint_path)}

    async def restore(self, state_data: Dict[str, Any]) -> None:
        """
        Restores the resource's state from a previously checkpointed state.

        For SQLite, this means copying the checkpointed DB file to become the current DB.

        Args:
            state_data: A dict as returned by checkpoint().

        Raises:
            ValueError: If 'db_type' is not 'sqlite' or 'checkpoint_path' is missing.
            FileNotFoundError: If the checkpoint file does not exist.
        """
        db_type = state_data.get("db_type")
        checkpoint_path_str = state_data.get("checkpoint_path")

        if db_type != "sqlite" or not checkpoint_path_str:
            raise ValueError("Invalid state_data for SQLite restore.")

        checkpoint_path = Path(checkpoint_path_str)
        if not checkpoint_path.exists():
            raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")

        # If current db_path is not set (e.g. fresh resource), assign one
        if not self._db_path:
            self._db_path = self._temp_dir / f"restored_{uuid.uuid4().hex}.sqlite"

        shutil.copyfile(str(checkpoint_path), str(self._db_path))
        self._base_db_path = self._db_path  # The restored state becomes the new base for future forks

    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
        """
        Executes a SQL query on the database.

        The statement runs inside a transaction that is committed on success
        and rolled back if it raises.

        Args:
            action_name: Should be 'execute_sql'.
            action_params: Dictionary containing:
                - 'query' (str): The SQL query to execute.
                - 'parameters' (Optional[Dict | List]): Parameters for the query.
                  A missing key or an explicit None means no parameters.
                - 'fetch_mode' (Optional[str]): 'one', 'all', or 'val'. If None, no fetch.

        Returns:
            - 'one': the first row as a column-name -> value dict, or None if there is no row.
            - 'all': a list of such dicts (empty if there are no rows).
            - 'val': the first column of the first row, or None if there is no row.
            - anything else: {"rowcount": <cursor.rowcount>}.
            A statement that produces no result set (e.g. INSERT/UPDATE) yields
            None for 'one'/'val' and [] for 'all'.

        Raises:
            NotImplementedError: If action_name is not 'execute_sql'.
            ValueError: If 'query' is missing or empty.
        """
        if action_name != "execute_sql":
            raise NotImplementedError(f"Action '{action_name}' not supported by SQLResource.")

        query = action_params.get("query")
        if not query:
            raise ValueError("Missing 'query' in action_params for 'execute_sql'.")

        # sqlite3 rejects None as the parameters argument, so an explicit None
        # (e.g. a JSON null coming from a tool call) is treated as "no parameters".
        params = action_params.get("parameters")
        if params is None:
            params = []
        fetch_mode = action_params.get("fetch_mode")  # 'one', 'all', 'val'

        conn = self._get_db_connection()
        try:
            with conn:
                cursor = conn.cursor()
                cursor.execute(query, params)

                if fetch_mode == "val":
                    row = cursor.fetchone()
                    return row[0] if row else None

                if fetch_mode in ("one", "all"):
                    # cursor.description is None when the statement returned no
                    # result set; report "no rows" instead of failing on it.
                    if cursor.description is None:
                        return None if fetch_mode == "one" else []

                    columns = [desc[0] for desc in cursor.description]
                    if fetch_mode == "one":
                        row = cursor.fetchone()
                        return dict(zip(columns, row)) if row else None
                    return [dict(zip(columns, row)) for row in cursor.fetchall()]

                # DML or no fetch needed
                return {"rowcount": cursor.rowcount}
        finally:
            conn.close()

    async def get_observation(self) -> Dict[str, Any]:
        """
        Returns the current observable state of the resource.

        Returns:
            A dict with 'db_type', 'db_path' (str or None) and 'status', which
            is "ready" when the database file exists and "uninitialized" otherwise.
        """
        return {
            "db_type": "sqlite",
            "db_path": str(self._db_path) if self._db_path else None,
            "status": ("ready" if self._db_path and self._db_path.exists() else "uninitialized"),
        }

    async def get_tools_spec(self) -> List[Dict[str, Any]]:
        """
        Returns tool specifications for interacting with the SQL database.

        The single tool, 'execute_sql', mirrors the action accepted by step().
        """
        return [
            {
                "type": "function",
                "function": {
                    "name": "execute_sql",
                    "description": "Executes a SQL query against the database. "
                    "Use 'fetch_mode' to control return value: "
                    "'one' for a single row, "
                    "'all' for all rows, "
                    "'val' for a single value from the first row. "
                    "If 'fetch_mode' is not provided, returns rowcount for DML statements.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The SQL query to execute.",
                            },
                            "parameters": {
                                "type": "array",  # Or object for named parameters, sqlite3 supports both
                                "description": "Parameters for the SQL query (optional).",
                                # "any" is not a JSON Schema type; the empty
                                # schema is the valid way to accept any value.
                                "items": {},
                            },
                            "fetch_mode": {
                                "type": "string",
                                "enum": ["one", "all", "val"],
                                "description": "Specifies how to fetch results (optional).",
                            },
                        },
                        "required": ["query"],
                    },
                },
            }
        ]

    async def close(self) -> None:
        """
        Cleans up by deleting the current database file and clearing both paths.

        A failure to delete the file is reported on stdout rather than raised.
        More robust cleanup of the _temp_dir might be needed if it's shared or persistent.
        """
        if self._db_path and self._db_path.exists():
            try:
                self._db_path.unlink()
            except OSError as e:
                print(f"Error deleting database file {self._db_path}: {e}")

        # NOTE: _base_db_path is deliberately not deleted here. For a fork it
        # points at the parent's live database file, so removing it would
        # destroy the parent's data.

        # For now, we don't delete the _temp_dir itself, as it might contain checkpoints
        # or other DBs from concurrent operations. A more sophisticated cleanup strategy
        # for _temp_dir might be needed for long-running processes.
        self._db_path = None
        self._base_db_path = None
|