benchmax 0.1.2.dev4__tar.gz → 0.1.2.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/PKG-INFO +1 -1
  2. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/pyproject.toml +1 -1
  3. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/server_pool.py +1 -1
  4. benchmax-0.1.2.dev6/src/benchmax/envs/search/search_env.py +269 -0
  5. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/PKG-INFO +1 -1
  6. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/SOURCES.txt +1 -0
  7. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/LICENSE +0 -0
  8. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/README.md +0 -0
  9. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/setup.cfg +0 -0
  10. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/__init__.py +0 -0
  11. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/benchmax_wrapper.py +0 -0
  12. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/skyrl/benchmax_data_process.py +0 -0
  13. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/adapters/skyrl/skyrl_adapter.py +0 -0
  14. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/__init__.py +0 -0
  15. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/base_env.py +0 -0
  16. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/crm_env.py +0 -0
  17. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
  18. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/crm/workdir/salesforce_mcp.py +0 -0
  19. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/data_utils.py +0 -0
  20. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/excel_env.py +0 -0
  21. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/__init__.py +0 -0
  22. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
  23. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
  24. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
  25. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/math/math_env.py +0 -0
  26. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
  27. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/__init__.py +0 -0
  28. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
  29. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
  30. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/parallel_mcp_env.py +0 -0
  31. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
  32. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
  33. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
  34. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
  35. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
  36. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/provisioners/utils.py +0 -0
  37. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/proxy_server.py +0 -0
  38. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/mcp/utils.py +0 -0
  39. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/types.py +0 -0
  40. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/wikipedia/utils.py +0 -0
  41. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/envs/wikipedia/wiki_env.py +0 -0
  42. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/prompts/__init__.py +0 -0
  43. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax/prompts/tools.py +0 -0
  44. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/dependency_links.txt +0 -0
  45. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/requires.txt +0 -0
  46. {benchmax-0.1.2.dev4 → benchmax-0.1.2.dev6}/src/benchmax.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev4
3
+ Version: 0.1.2.dev6
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: cgft.io
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchmax"
3
- version = "0.1.2.dev4"
3
+ version = "0.1.2.dev6"
4
4
  description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "cgft.io" }]
@@ -116,7 +116,7 @@ class ServerPool:
116
116
  async with self._server_available:
117
117
  # Wait until a server is available
118
118
  while not self._unassigned_servers:
119
- logger.info(
119
+ logger.debug(
120
120
  f"[{rollout_id}] Waiting for available server "
121
121
  f"(pool: 0 available, {len(self._rollout_to_server)} assigned)"
122
122
  )
@@ -0,0 +1,269 @@
1
+ from difflib import SequenceMatcher
2
+ from pathlib import Path
3
+ from typing import Any, Callable, Dict, List, Optional, Tuple
4
+
5
+ import aiohttp
6
+
7
+ from benchmax.envs.base_env import BaseEnv
8
+ from benchmax.envs.types import ToolDefinition, StandardizedExample
9
+
10
# Instructions handed to the model: use the search tool, then put the final
# answer inside <answer></answer> tags on the last line.
SYSTEM_PROMPT = (
    "Please use the search tool provided to find relevant information from the corpus.\n"
    "Formulate effective search queries to retrieve the most relevant chunks.\n"
    "You can filter by metadata or filename to narrow your search.\n"
    "Write your complete answer on the final line only as a concise entity, "
    "within the xml tags <answer></answer>.\n"
    "\n"
)
15
+
16
+
17
def percent_of_text_a_in_text_b(text_a, text_b, *, autojunk=True):
    """Return the fraction of ``text_a`` that ``difflib`` can match inside ``text_b``.

    The score is the total size of the matching blocks found by
    ``SequenceMatcher(None, text_a, text_b)`` divided by ``len(text_a)``.

    Args:
        text_a: Sequence whose coverage is measured (the denominator).
        text_b: Sequence searched for pieces of ``text_a``.
        autojunk: Forwarded to ``SequenceMatcher``. The default (``True``)
            keeps the historical behaviour. NOTE: with the heuristic on, once
            ``text_b`` has 200 or more items, any item making up more than
            about 1% of it is not used to seed matches, which can noticeably
            under-count overlap on long character strings. Pass ``False`` for
            exact (slower) matching.

    Returns:
        float: ``0.0`` when ``text_a`` is empty, otherwise matched size
        divided by ``len(text_a)``.
    """
    if not text_a:
        # Guard the division below; an empty reference has nothing to cover.
        return 0.0

    matcher = SequenceMatcher(None, text_a, text_b, autojunk=autojunk)
    # The trailing sentinel block has size 0, so it does not affect the sum.
    matched_chars = sum(block.size for block in matcher.get_matching_blocks())
    return matched_chars / len(text_a)
26
+
27
+
28
async def chunk_overlap_reward_function(
    completion: str,
    ground_truth: str,
    **kwargs: Any
) -> float:
    """
    Reward function that scores how much of the reference chunks' text shows
    up in the non-assistant part of the completion.

    Args:
        completion: Either a plain string, or a list of chat-message dicts
            with ``role`` and ``content`` keys. For a list, only messages
            whose role is not ``"assistant"`` contribute text to the overlap.
        ground_truth: Not used by this reward.
        **kwargs: ``reference_chunks`` (an iterable of strings) is read from
            here; it is joined with spaces to form the reference text.
            Missing or ``None`` is treated as no reference.
    Returns:
        float: ``0.0`` if any single assistant message contains four or more
        ``<tool_call>`` markers, if there is no reference text, or if the
        overlap is below 0.25; otherwise the overlap score.
    """
    max_tool_calls_per_message = 4
    min_overlap = 0.25

    def _text_of(message: dict) -> str:
        # ``content`` may be present but None (or a non-string payload);
        # treat anything that is not a string as empty text.
        content = message.get("content", "")
        return content if isinstance(content, str) else ""

    reference_chunks = kwargs.get("reference_chunks") or []
    reference_string = " ".join(reference_chunks)

    completion_str = completion if isinstance(completion, str) else ""
    if isinstance(completion, list):
        messages = [msg for msg in completion if isinstance(msg, dict)]

        # Zero reward when one assistant turn emits too many tool calls.
        for msg in messages:
            if msg.get("role", "") != "assistant":
                continue
            if _text_of(msg).count("<tool_call>") >= max_tool_calls_per_message:
                return 0.0

        # Only non-assistant messages count towards the overlap.
        completion_str = " ".join(
            _text_of(msg) for msg in messages if msg.get("role", "") != "assistant"
        )

    if reference_string:
        overlap_score = percent_of_text_a_in_text_b(reference_string, completion_str)
        if overlap_score >= min_overlap:
            return overlap_score
    return 0.0
65
+
66
+
67
class SearchEnv(BaseEnv):
    """Search environment with BM25 corpus search tool.

    Exposes a single ``search_corpus`` tool that POSTs to
    ``{base_url}/api/corpora/{corpus_id}/search`` and scores completions with
    ``chunk_overlap_reward_function``. No per-rollout state is kept, so the
    rollout and workspace hooks below are no-ops.
    """

    system_prompt: str = SYSTEM_PROMPT

    def __init__(
        self,
        api_key: str,
        corpus_id: str,
        base_url: str,
        **kwargs,
    ):
        """
        Initialize the search environment.

        Args:
            api_key: API key for authentication (required)
            corpus_id: ID of the corpus to search (required)
            base_url: Base URL of the search API (required)
            **kwargs: Accepted but not used by this environment.

        Raises:
            ValueError: If ``api_key``, ``corpus_id`` or ``base_url`` is empty.
        """
        if not api_key:
            raise ValueError("api_key is required")
        if not corpus_id:
            raise ValueError("corpus_id is required")
        if not base_url:
            # Fail early with a clear message instead of an AttributeError on
            # ``None.rstrip`` or a malformed URL at request time.
            raise ValueError("base_url is required")

        self._api_key = api_key
        self._corpus_id = corpus_id
        # Drop trailing slashes so URL building never yields "//api/...".
        self._base_url = base_url.rstrip("/")

        search_tool_definition = ToolDefinition(
            name="search_corpus",
            description="Search the corpus using BM25 with optional metadata and filename filtering.",
            input_schema={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query string.",
                    },
                    "metadata": {
                        "type": "object",
                        "description": "Optional metadata filters (e.g., {'ticker': 'DDOG', 'year': 2024}).",
                    },
                    "filename": {
                        "type": "string",
                        "description": "Optional filename filter. Simple string for substring match (e.g., 'config') or regex pattern (e.g., '.*\\.json$').",
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Max number of results to return (default 10).",
                    },
                },
                "required": ["query"],
            },
        )

        # Tool name -> (definition, coroutine implementing the tool).
        self._tools: Dict[str, Tuple[ToolDefinition, Callable]] = {
            search_tool_definition.name: (search_tool_definition, self._search_corpus_tool)
        }

    @staticmethod
    def _format_search_results(results: List[Dict[str, Any]], total: Any) -> str:
        """Render result items as a numbered listing followed by the total count."""
        lines = []
        for i, item in enumerate(results, start=1):
            filename_val = item.get("filename", "—")
            score = item.get("score")
            # Items without a score are labelled "(filtered)".
            score_str = f"(score: {score:.2f})" if score is not None else "(filtered)"
            content = item.get("content", "")
            metadata_val = item.get("metadata", {})

            lines.append(f"{i}. {filename_val} {score_str}")
            lines.append(f"   Content: {content}")
            if metadata_val:
                lines.append(f"   Metadata: {metadata_val}")

        lines.append(f"\nTotal: {total} results")
        return "\n".join(lines)

    async def _search_corpus_tool(
        self,
        query: str = "",
        metadata: Optional[Dict[str, Any]] = None,
        filename: Optional[str] = None,
        limit: int = 10,
        **kwargs
    ) -> str:
        """
        Search the corpus using BM25.

        Failures are reported as ``"Error: ..."`` strings rather than raised,
        so the caller always gets text back.

        Args:
            query: Search query string. Defaults to empty so that a call that
                omits it yields the "missing parameter" message instead of a
                ``TypeError``.
            metadata: Optional metadata filters
            filename: Optional filename filter (substring or regex)
            limit: Maximum number of results
            **kwargs: Extra tool arguments; ignored.

        Returns:
            Formatted search results or error message
        """
        # Local import: only needed for the timeout exception type.
        import asyncio

        if not query:
            return "Error: Missing required parameter: 'query'"

        # Build request body; optional filters are sent only when provided.
        request_body = {"query": query, "limit": limit}
        if metadata:
            request_body["metadata"] = metadata
        if filename:
            request_body["filename"] = filename

        # Build URL
        url = f"{self._base_url}/api/corpora/{self._corpus_id}/search"
        headers = {
            "x-api-key": self._api_key,
            "Content-Type": "application/json",
        }
        timeout_seconds = 10.0

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    url,
                    json=request_body,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=timeout_seconds),
                ) as resp:
                    if resp.status != 200:
                        error_text = await resp.text()
                        return f"Error: API request failed with status {resp.status}: {error_text}"

                    data = await resp.json()

            results = data.get("results", [])
            total = data.get("total", 0)

            if not results:
                return "No results found."

            return self._format_search_results(results, total)

        except asyncio.TimeoutError:
            # A timeout is not an aiohttp.ClientError and stringifies to "",
            # so without this branch the caller would only see "Error: ".
            return f"Error: Request timed out after {timeout_seconds:g} seconds"
        except aiohttp.ClientError as e:
            return f"Error: Network error: {str(e)}"
        except Exception as e:
            # Last-resort boundary: tool calls report failures as text.
            return f"Error: {str(e)}"

    async def shutdown(self):
        """Shut down the environment (nothing to clean up)."""
        # no cleanup required
        pass

    @classmethod
    def dataset_preprocess(cls, example: Any, **kwargs) -> StandardizedExample:
        """Map a raw example's ``Question``/``Answer`` fields to a StandardizedExample."""
        return StandardizedExample(
            prompt=example.get("Question", ""),
            ground_truth=example.get("Answer", None),
            init_rollout_args={},
        )

    async def list_tools(self) -> List[ToolDefinition]:
        """List available tools, ordered by tool name."""
        return [self._tools[k][0] for k in sorted(self._tools)]

    async def run_tool(self, rollout_id: str, tool_name: str, **tool_args) -> Any:
        """
        Execute a tool.

        Args:
            rollout_id: Identifier for current rollout (unused for stateless env)
            tool_name: Name of the tool (e.g., "search_corpus")
            **tool_args: Arguments for the tool function

        Returns:
            Tool execution result or error message
        """
        entry = self._tools.get(tool_name)
        if entry is None:
            # Report an unknown tool as text, as the docstring promises,
            # instead of raising KeyError.
            return f"Error: Unknown tool '{tool_name}'"
        _, tool_function = entry
        return await tool_function(**tool_args)

    async def init_rollout(self, rollout_id: str, **rollout_args) -> None:
        """Initialize rollout (no-op for stateless environment)."""
        pass

    async def release_rollout(self, rollout_id: str) -> None:
        """Release rollout (no-op for stateless environment)."""
        pass

    async def copy_to_workspace(
        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def copy_content_to_workspace(
        self, rollout_id: str, src_content: str | bytes, dst_filename: str
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def copy_from_workspace(
        self, rollout_id: str, src_filename: str, dst_path: Path
    ) -> None:
        """Not implemented for this environment."""
        pass

    async def compute_reward(
        self, rollout_id: str, completion: str, ground_truth: Any, **kwargs: Any
    ) -> Dict[str, float]:
        """Compute rewards using the chunk overlap reward function.

        Returns:
            A dict with the single key ``"chunk_overlap"``.
        """
        return {
            "chunk_overlap": await chunk_overlap_reward_function(completion, ground_truth, **kwargs)
        }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev4
3
+ Version: 0.1.2.dev6
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: cgft.io
6
6
  Classifier: Programming Language :: Python :: 3
@@ -37,6 +37,7 @@ src/benchmax/envs/mcp/provisioners/local_provisioner.py
37
37
  src/benchmax/envs/mcp/provisioners/manual_provisioner.py
38
38
  src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py
39
39
  src/benchmax/envs/mcp/provisioners/utils.py
40
+ src/benchmax/envs/search/search_env.py
40
41
  src/benchmax/envs/wikipedia/utils.py
41
42
  src/benchmax/envs/wikipedia/wiki_env.py
42
43
  src/benchmax/prompts/__init__.py
File without changes
File without changes
File without changes