benchmax 0.1.2.dev4__tar.gz → 0.1.2.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/PKG-INFO +1 -1
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/pyproject.toml +1 -1
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/server_pool.py +1 -1
- benchmax-0.1.2.dev6/src/benchmax/envs/search/search_env.py +269 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/PKG-INFO +1 -1
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/SOURCES.txt +1 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/LICENSE +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/README.md +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/setup.cfg +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/benchmax_wrapper.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/skyrl/benchmax_data_process.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/skyrl/skyrl_adapter.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/base_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/crm_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/workdir/salesforce_mcp.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/data_utils.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/excel_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/math/math_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/parallel_mcp_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/utils.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/proxy_server.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/utils.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/types.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/wikipedia/utils.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/wikipedia/wiki_env.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/prompts/__init__.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/prompts/tools.py +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/dependency_links.txt +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/requires.txt +0 -0
- {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/top_level.txt +0 -0
|
@@ -116,7 +116,7 @@ class ServerPool:
|
|
|
116
116
|
async with self._server_available:
|
|
117
117
|
# Wait until a server is available
|
|
118
118
|
while not self._unassigned_servers:
|
|
119
|
-
logger.
|
|
119
|
+
logger.debug(
|
|
120
120
|
f"[{rollout_id}] Waiting for available server "
|
|
121
121
|
f"(pool: 0 available, {len(self._rollout_to_server)} assigned)"
|
|
122
122
|
)
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
from difflib import SequenceMatcher
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import aiohttp
|
|
6
|
+
|
|
7
|
+
from benchmax.envs.base_env import BaseEnv
|
|
8
|
+
from benchmax.envs.types import ToolDefinition, StandardizedExample
|
|
9
|
+
|
|
10
|
+
SYSTEM_PROMPT = """Please use the search tool provided to find relevant information from the corpus.
|
|
11
|
+
Formulate effective search queries to retrieve the most relevant chunks.
|
|
12
|
+
You can filter by metadata or filename to narrow your search.
|
|
13
|
+
Write your complete answer on the final line only as a concise entity, within the xml tags <answer></answer>.\n
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def percent_of_text_a_in_text_b(text_a, text_b, *, autojunk=False):
    """Return the fraction of ``text_a`` that can be matched inside ``text_b``.

    The two texts are aligned character by character with
    ``difflib.SequenceMatcher`` and the sizes of all matching blocks are
    summed, then divided by ``len(text_a)``.

    Args:
        text_a: The text whose coverage is measured (the denominator).
        text_b: The text that is searched for pieces of ``text_a``.
        autojunk: Forwarded to ``SequenceMatcher``. It defaults to ``False``
            because the automatic junk heuristic treats every character that
            is frequent in a ``text_b`` of 200+ characters as junk, which can
            make a verbatim copy of ``text_a`` inside a long ``text_b`` score
            0.0. Pass ``True`` to trade accuracy for speed on very long texts.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when either text is empty or None.
    """
    # Nothing can match when either side is empty (also avoids len() == 0
    # division and a TypeError when text_b is None).
    if not text_a or not text_b:
        return 0.0

    matcher = SequenceMatcher(None, text_a, text_b, autojunk=autojunk)
    # The final block returned by get_matching_blocks() is a size-0 sentinel,
    # so it does not affect the sum.
    matched_chars = sum(block.size for block in matcher.get_matching_blocks())
    return matched_chars / len(text_a)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def chunk_overlap_reward_function(
    completion: Any,
    ground_truth: str,
    **kwargs: Any
) -> float:
    """
    Reward how much of the reference chunks shows up in the retrieved text.

    The reference text is the space-joined ``reference_chunks`` keyword
    argument. The text it is compared against is:

    * ``completion`` itself when it is a string, or
    * the space-joined ``content`` of every non-assistant message when
      ``completion`` is a list of chat-message dicts.

    Args:
        completion: Either the generated text or a list of message dicts
            with ``role`` / ``content`` keys.
        ground_truth: Accepted for interface compatibility; not used by this
            reward.
        **kwargs: ``reference_chunks`` (list of strings, or a single string)
            is read from here; other keys are ignored.

    Returns:
        float: The overlap fraction when it is at least 0.25, otherwise 0.0.
        Also 0.0 when there are no reference chunks or when any single
        assistant message contains 4 or more ``<tool_call>`` markers.
    """
    # A single assistant message with this many tool calls zeroes the reward
    # (presumably to discourage spamming parallel searches -- TODO confirm).
    tool_call_limit = 4
    # Overlap below this fraction is treated as no overlap at all.
    min_overlap = 0.25

    # ``or []`` also covers an explicit reference_chunks=None.
    reference_chunks = kwargs.get("reference_chunks") or []
    if isinstance(reference_chunks, str):
        # Joining a bare string would interleave spaces between characters.
        reference_chunks = [reference_chunks]
    reference_string = " ".join(reference_chunks)

    def _text_of(message):
        # ``content`` may be None or a non-string payload (for example on
        # messages that only carry tool calls); treat those as empty text.
        content = message.get("content")
        return content if isinstance(content, str) else ""

    if isinstance(completion, str):
        completion_str = completion
    elif isinstance(completion, list):
        messages = [m for m in completion if isinstance(m, dict)]
        for message in messages:
            if message.get("role", "") != "assistant":
                continue
            if _text_of(message).count("<tool_call>") >= tool_call_limit:
                return 0.0
        completion_str = " ".join(
            _text_of(m) for m in messages if m.get("role", "") != "assistant"
        )
    else:
        completion_str = ""

    if reference_string:
        overlap_score = percent_of_text_a_in_text_b(reference_string, completion_str)
        if overlap_score >= min_overlap:
            return overlap_score
    return 0.0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SearchEnv(BaseEnv):
    """Search environment with BM25 corpus search tool.

    The environment exposes a single tool, ``search_corpus``, which POSTs the
    query to ``{base_url}/api/corpora/{corpus_id}/search`` and returns the
    results as formatted text. Rollout initialisation/release and the
    workspace copy methods are no-ops.
    """

    system_prompt: str = SYSTEM_PROMPT

    def __init__(
        self,
        api_key: str,
        corpus_id: str,
        base_url: str,
        **kwargs,
    ):
        """
        Initialize the search environment.

        Args:
            api_key: API key for authentication (required)
            corpus_id: ID of the corpus to search (required)
            base_url: Base URL of the search API (required)
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``api_key``, ``corpus_id`` or ``base_url`` is empty.
        """
        if not api_key:
            raise ValueError("api_key is required")
        if not corpus_id:
            raise ValueError("corpus_id is required")
        # Validate up front instead of failing with AttributeError on rstrip().
        if not base_url:
            raise ValueError("base_url is required")

        self._api_key = api_key
        self._corpus_id = corpus_id
        # Strip trailing slashes so URL building never yields "//api/...".
        self._base_url = base_url.rstrip("/")

        search_tool_definition = ToolDefinition(
            name="search_corpus",
            description="Search the corpus using BM25 with optional metadata and filename filtering.",
            input_schema={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query string.",
                    },
                    "metadata": {
                        "type": "object",
                        "description": "Optional metadata filters (e.g., {'ticker': 'DDOG', 'year': 2024}).",
                    },
                    "filename": {
                        "type": "string",
                        "description": "Optional filename filter. Simple string for substring match (e.g., 'config') or regex pattern (e.g., '.*\\.json$').",
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Max number of results to return (default 10).",
                    },
                },
                "required": ["query"],
            },
        )

        # Tool name -> (definition, coroutine implementing the tool).
        self._tools: Dict[str, Tuple[ToolDefinition, Callable]] = {
            search_tool_definition.name: (search_tool_definition, self._search_corpus_tool)
        }

    async def _search_corpus_tool(
        self,
        query: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        filename: Optional[str] = None,
        limit: int = 10,
        **kwargs
    ) -> str:
        """
        Search the corpus using BM25.

        Failures are reported as ``"Error: ..."`` strings rather than raised,
        so the text can be handed straight back to the model.

        Args:
            query: Search query string. Defaults to None so that an omitted
                query produces the error message below instead of a TypeError.
            metadata: Optional metadata filters
            filename: Optional filename filter (substring or regex)
            limit: Maximum number of results
            **kwargs: Extra tool arguments; ignored.

        Returns:
            Formatted search results or error message
        """
        if not query:
            return "Error: Missing required parameter: 'query'"

        # Build request body; optional filters are only sent when provided.
        request_body = {"query": query, "limit": limit}
        if metadata:
            request_body["metadata"] = metadata
        if filename:
            request_body["filename"] = filename

        # Build URL
        url = f"{self._base_url}/api/corpora/{self._corpus_id}/search"
        headers = {
            "x-api-key": self._api_key,
            "Content-Type": "application/json",
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    url,
                    json=request_body,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=10.0),
                ) as resp:
                    if resp.status != 200:
                        error_text = await resp.text()
                        return f"Error: API request failed with status {resp.status}: {error_text}"

                    data = await resp.json()

            results = data.get("results", [])
            total = data.get("total", 0)

            if not results:
                return "No results found."

            # Format results
            lines = []
            for i, item in enumerate(results, start=1):
                filename_val = item.get("filename", "—")
                score = item.get("score")
                score_str = f"(score: {score:.2f})" if score is not None else "(filtered)"
                content = item.get("content", "")
                metadata_val = item.get("metadata", {})

                lines.append(f"{i}. {filename_val} {score_str}")
                lines.append(f" Content: {content}")
                if metadata_val:
                    lines.append(f" Metadata: {metadata_val}")

            lines.append(f"\nTotal: {total} results")
            return "\n".join(lines)

        except aiohttp.ClientError as e:
            return f"Error: Network error: {str(e)}"
        except Exception as e:
            # Some exceptions (e.g. timeouts) have an empty message; fall back
            # to the exception class name so the error is never just "Error: ".
            detail = str(e) or type(e).__name__
            return f"Error: {detail}"

    async def shutdown(self):
        """Shut down the environment (no cleanup required)."""
        pass

    @classmethod
    def dataset_preprocess(cls, example: Any, **kwargs) -> StandardizedExample:
        """Build a StandardizedExample from a raw dataset row.

        The prompt is read from the ``"Question"`` key (default ``""``) and
        the ground truth from the ``"Answer"`` key (default ``None``).
        """
        return StandardizedExample(
            prompt=example.get("Question", ""),
            ground_truth=example.get("Answer", None),
            init_rollout_args={},
        )

    async def list_tools(self) -> List[ToolDefinition]:
        """List available tools, ordered by tool name."""
        return [self._tools[k][0] for k in sorted(self._tools)]

    async def run_tool(self, rollout_id: str, tool_name: str, **tool_args) -> Any:
        """
        Execute a tool.

        Args:
            rollout_id: Identifier for current rollout (unused for stateless env)
            tool_name: Name of the tool (e.g., "search_corpus")
            **tool_args: Arguments for the tool function

        Returns:
            Tool execution result or error message
        """
        entry = self._tools.get(tool_name)
        if entry is None:
            # Report unknown tools as text, matching the documented contract,
            # instead of raising KeyError.
            return f"Error: Unknown tool: '{tool_name}'"
        _, tool_function = entry
        return await tool_function(**tool_args)

    async def init_rollout(self, rollout_id: str, **rollout_args) -> None:
        """Initialize rollout (no-op for stateless environment)."""
        pass

    async def release_rollout(self, rollout_id: str) -> None:
        """Release rollout (no-op for stateless environment)."""
        pass

    async def copy_to_workspace(
        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def copy_content_to_workspace(
        self, rollout_id: str, src_content: str | bytes, dst_filename: str
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def copy_from_workspace(
        self, rollout_id: str, src_filename: str, dst_path: Path
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def compute_reward(
        self, rollout_id: str, completion: str, ground_truth: Any, **kwargs: Any
    ) -> Dict[str, float]:
        """Compute rewards using the chunk overlap reward function.

        Returns:
            A dict with the single key ``"chunk_overlap"``.
        """
        return {
            "chunk_overlap": await chunk_overlap_reward_function(completion, ground_truth, **kwargs)
        }
|
|
@@ -37,6 +37,7 @@ src/benchmax/envs/mcp/provisioners/local_provisioner.py
|
|
|
37
37
|
src/benchmax/envs/mcp/provisioners/manual_provisioner.py
|
|
38
38
|
src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py
|
|
39
39
|
src/benchmax/envs/mcp/provisioners/utils.py
|
|
40
|
+
src/benchmax/envs/search/search_env.py
|
|
40
41
|
src/benchmax/envs/wikipedia/utils.py
|
|
41
42
|
src/benchmax/envs/wikipedia/wiki_env.py
|
|
42
43
|
src/benchmax/prompts/__init__.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/skyrl/benchmax_data_process.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py
RENAMED
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/reward_fn.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/base_provisioner.py
RENAMED
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/local_provisioner.py
RENAMED
|
File without changes
|
{benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|