eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,271 @@
1
+ """
2
+ SQLResource: A ForkableResource for managing SQL database states, initially focusing on SQLite.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ import sqlite3
8
+ import uuid
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from ..resource_abc import ForkableResource
13
+
14
+
15
class SQLResource(ForkableResource):
    """
    A ForkableResource for managing SQL database states, primarily SQLite.

    Manages a SQLite database file, allowing it to be initialized with a schema
    and seed data, forked (by copying the DB file), checkpointed (by copying),
    and restored. No connection is kept open between calls: every operation
    opens its own short-lived connection, which is what makes plain file
    copies usable for fork/checkpoint/restore.

    Attributes:
        _config (Dict[str, Any]): Configuration for the resource.
        _db_path (Optional[Path]): Path to the current SQLite database file.
        _base_db_path (Optional[Path]): Path to the database this instance was
            set up, forked, or restored from.
        _temp_dir (Path): Directory to store database files.
    """

    def __init__(self) -> None:
        """Create an unconfigured resource and make sure the temp directory exists."""
        self._config: Dict[str, Any] = {}
        self._db_path: Optional[Path] = None
        self._base_db_path: Optional[Path] = None
        # Consider making temp_dir configurable or using a more robust temp solution.
        # Resolved to an absolute path so later chdir() calls do not move it.
        self._temp_dir = Path("./.rk_temp_dbs").resolve()
        self._temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_db_connection(self) -> sqlite3.Connection:
        """
        Open a new connection to the current database file.

        Returns:
            A fresh sqlite3 connection; the caller is responsible for closing it.

        Raises:
            ConnectionError: If no database path has been assigned yet.
        """
        if not self._db_path:
            raise ConnectionError("Database path not set. Call setup() or fork() first.")
        # Set timeout to prevent indefinite hangs on a locked database.
        return sqlite3.connect(str(self._db_path), timeout=10)

    async def setup(self, config: Dict[str, Any]) -> None:
        """
        Initializes the SQLite database.

        Any existing file with the same name in the temp directory is deleted
        first. Scripts are applied in this order: schema file, schema SQL,
        seed data file, seed SQL. A configured script file that does not exist
        on disk is skipped without error.

        Args:
            config: Configuration dictionary. Expected keys:
                - 'db_type' (str): Must be 'sqlite'.
                - 'db_name' (Optional[str]): Name for the database file. Defaults to a UUID.
                - 'schema_file' (Optional[str]): Path to an SQL file to execute for schema setup.
                - 'seed_data_file' (Optional[str]): Path to an SQL file for initial data seeding.
                - 'schema_sql' (Optional[str]): SQL string for schema setup.
                - 'seed_sql' (Optional[str]): SQL string for initial data seeding.

        Raises:
            ValueError: If 'db_type' is anything other than 'sqlite'.
        """
        self._config = config.copy()
        db_type = self._config.get("db_type", "sqlite")
        if db_type != "sqlite":
            raise ValueError("SQLResource currently only supports 'sqlite'.")

        # `or` (rather than a .get() default) also covers an explicit None or
        # empty string, which would otherwise break the path join below or
        # resolve to the temp directory itself.
        db_name = self._config.get("db_name") or f"db_{uuid.uuid4().hex}.sqlite"
        self._base_db_path = self._temp_dir / db_name
        self._db_path = self._base_db_path  # Initially, the current DB is the base DB

        # Ensure a fresh start if the base DB file already exists from a previous run
        if self._base_db_path.exists():
            self._base_db_path.unlink()

        # (file key, inline SQL key) pairs, in the order they must be applied.
        script_sources = (
            ("schema_file", "schema_sql"),
            ("seed_data_file", "seed_sql"),
        )

        conn = self._get_db_connection()
        try:
            with conn:
                for file_key, sql_key in script_sources:
                    script_file = self._config.get(file_key)
                    if script_file and Path(script_file).exists():
                        with open(script_file, "r") as f:
                            conn.executescript(f.read())

                    inline_sql = self._config.get(sql_key)
                    if inline_sql:
                        conn.executescript(inline_sql)
        finally:
            conn.close()

    async def fork(self) -> "SQLResource":
        """
        Creates a new SQLResource instance backed by a copy of this resource's
        current database file.

        The fork shares this resource's temp directory and gets its own
        uniquely named database file; its base path points at the file it was
        copied from.

        Returns:
            The new, independent SQLResource.

        Raises:
            RuntimeError: If this resource has no database file on disk.
        """
        if not self._db_path or not self._db_path.exists():
            raise RuntimeError("Cannot fork: original database does not exist or setup was not called.")

        forked_resource = SQLResource()
        forked_resource._config = self._config.copy()
        forked_resource._temp_dir = self._temp_dir  # Share the same temp dir base

        # The new fork's base is the current state of this resource
        forked_resource._base_db_path = self._db_path

        # Create a new unique DB file for this fork
        forked_db_name = f"fork_{uuid.uuid4().hex}.sqlite"
        forked_resource._db_path = self._temp_dir / forked_db_name

        shutil.copyfile(str(self._db_path), str(forked_resource._db_path))
        return forked_resource

    async def checkpoint(self) -> Dict[str, Any]:
        """
        Returns a serializable representation of the resource's current state.

        For SQLite, this copies the database file to a uniquely named
        checkpoint file in the temp directory and returns its path.

        Returns:
            A dict with 'db_type' ("sqlite") and 'checkpoint_path' (str).

        Raises:
            RuntimeError: If this resource has no database file on disk.
        """
        if not self._db_path or not self._db_path.exists():
            raise RuntimeError("Cannot checkpoint: database does not exist.")

        checkpoint_name = f"checkpoint_{self._db_path.stem}_{uuid.uuid4().hex}.sqlite"
        checkpoint_path = self._temp_dir / checkpoint_name
        shutil.copyfile(str(self._db_path), str(checkpoint_path))
        return {"db_type": "sqlite", "checkpoint_path": str(checkpoint_path)}

    async def restore(self, state_data: Dict[str, Any]) -> None:
        """
        Restores the resource's state from a previously checkpointed state.

        For SQLite, the checkpoint file is copied over the current database
        file. A resource that has no database path yet is given a new one.

        Args:
            state_data: Dict as produced by checkpoint(), containing
                'db_type' == "sqlite" and a non-empty 'checkpoint_path'.

        Raises:
            ValueError: If 'db_type' is not "sqlite" or 'checkpoint_path' is missing.
            FileNotFoundError: If the checkpoint file does not exist.
        """
        db_type = state_data.get("db_type")
        checkpoint_path_str = state_data.get("checkpoint_path")

        if db_type != "sqlite" or not checkpoint_path_str:
            raise ValueError("Invalid state_data for SQLite restore.")

        checkpoint_path = Path(checkpoint_path_str)
        if not checkpoint_path.exists():
            raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")

        # If current db_path is not set (e.g. fresh resource), assign one
        if not self._db_path:
            self._db_path = self._temp_dir / f"restored_{uuid.uuid4().hex}.sqlite"

        shutil.copyfile(str(checkpoint_path), str(self._db_path))
        self._base_db_path = self._db_path  # The restored state becomes the new base for future forks

    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
        """
        Executes a SQL query on the database.

        The statement runs inside a transaction that is committed on success
        and rolled back if an exception is raised.

        Args:
            action_name: Should be 'execute_sql'.
            action_params: Dictionary containing:
                - 'query' (str): The SQL query to execute.
                - 'parameters' (Optional[Dict | List]): Parameters for the query.
                  A missing or None value means "no parameters".
                - 'fetch_mode' (Optional[str]): 'one', 'all', or 'val'. If None, no fetch.

        Returns:
            - 'one': the first row as a column-name -> value dict, or None.
            - 'all': a list of such dicts (empty if there are no rows).
            - 'val': the first column of the first row, or None.
            - anything else: {"rowcount": <cursor.rowcount>}.
            With a fetch mode on a statement that yields no result set (e.g.
            INSERT), the "no rows" value for that mode is returned.

        Raises:
            NotImplementedError: If action_name is not 'execute_sql'.
            ValueError: If 'query' is missing or empty.
        """
        if action_name != "execute_sql":
            raise NotImplementedError(f"Action '{action_name}' not supported by SQLResource.")

        query = action_params.get("query")
        if not query:
            raise ValueError("Missing 'query' in action_params for 'execute_sql'.")

        # An explicit None (e.g. a null optional tool argument) is rejected by
        # sqlite3, so normalise it to "no parameters".
        params = action_params.get("parameters")
        if params is None:
            params = []
        fetch_mode = action_params.get("fetch_mode")  # 'one', 'all', 'val'

        conn = self._get_db_connection()
        try:
            with conn:
                cursor = conn.cursor()
                cursor.execute(query, params)

                if fetch_mode in ("one", "all"):
                    # cursor.description is None for statements without a
                    # result set; iterating it directly would raise TypeError
                    # and roll back the statement that just succeeded.
                    columns = [desc[0] for desc in cursor.description or ()]
                    if fetch_mode == "one":
                        row = cursor.fetchone()
                        return dict(zip(columns, row)) if row else None
                    rows = cursor.fetchall()
                    return [dict(zip(columns, row)) for row in rows]
                if fetch_mode == "val":
                    row = cursor.fetchone()
                    return row[0] if row else None
                # DML or no fetch needed
                return {"rowcount": cursor.rowcount}
        finally:
            conn.close()

    async def get_observation(self) -> Dict[str, Any]:
        """
        Returns the current observable state of the resource.

        Returns:
            A dict with 'db_type', 'db_path' (str or None) and 'status', which
            is "ready" when the database file exists and "uninitialized" otherwise.
        """
        return {
            "db_type": "sqlite",
            "db_path": str(self._db_path) if self._db_path else None,
            "status": ("ready" if self._db_path and self._db_path.exists() else "uninitialized"),
        }

    async def get_tools_spec(self) -> List[Dict[str, Any]]:
        """
        Returns tool specifications for interacting with the SQL database.

        Returns:
            A single-element list describing the 'execute_sql' function tool,
            whose arguments mirror the action_params accepted by step().
        """
        return [
            {
                "type": "function",
                "function": {
                    "name": "execute_sql",
                    "description": "Executes a SQL query against the database. "
                    "Use 'fetch_mode' to control return value: "
                    "'one' for a single row, "
                    "'all' for all rows, "
                    "'val' for a single value from the first row. "
                    "If 'fetch_mode' is not provided, returns rowcount for DML statements.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The SQL query to execute.",
                            },
                            "parameters": {
                                "type": "array",  # Or object for named parameters, sqlite3 supports both
                                "description": "Parameters for the SQL query (optional).",
                                # NOTE(review): "any" is not a type name in JSON Schema
                                # draft-04 and later; confirm consumers accept it, or
                                # switch to an empty schema ({}) to mean "any value".
                                "items": {"type": "any"},
                            },
                            "fetch_mode": {
                                "type": "string",
                                "enum": ["one", "all", "val"],
                                "description": "Specifies how to fetch results (optional).",
                            },
                        },
                        "required": ["query"],
                    },
                },
            }
        ]

    async def close(self) -> None:
        """
        Cleans up by deleting this resource's current database file.

        Deletion is best-effort: an OSError is reported and otherwise ignored.
        The base database file (when it differs from the current one), any
        checkpoint files, and the temp directory itself are left in place,
        since they may still be in use by other resources. A more
        sophisticated cleanup strategy for _temp_dir might be needed for
        long-running processes.
        """
        if self._db_path and self._db_path.exists():
            try:
                self._db_path.unlink()
            except OSError as e:
                print(f"Error deleting database file {self._db_path}: {e}")

        # Forget both paths so the instance reports itself as uninitialized.
        self._db_path = None
        self._base_db_path = None