PyPI - benchmax - Versions diffs - 0.1.1.dev4__tar.gz → 0.1.1.dev6__tar.gz - Mend

benchmax 0.1.1.dev4__tar.gz → 0.1.1.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: benchmax
-Version: 0.1.1.dev4
+Version: 0.1.1.dev6
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: cgft.io
 Requires-Python: >=3.11,<3.13
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: crm
 Provides-Extra: excel
 Provides-Extra: excel-linux
+Provides-Extra: skypilot
 Provides-Extra: verifiers
 Provides-Extra: verl
 Requires-Dist: fastmcp (>=2.10.0,<2.11.0)
@@ -17,6 +18,7 @@ Requires-Dist: openpyxl (==3.1.5) ; extra == "excel-linux" or extra == "excel"
 Requires-Dist: python-dateutil (>=2.9.0,<2.10.0) ; extra == "crm"
 Requires-Dist: sglang[all] (==0.4.9) ; extra == "verl"
 Requires-Dist: simple-salesforce (>=1.12.3) ; extra == "crm"
+Requires-Dist: skypilot (==0.8.1) ; extra == "skypilot"
 Requires-Dist: verifiers[train] (>=0.1.1,<0.2.0) ; extra == "verifiers"
 Requires-Dist: verl-cgft-fork (==0.5.0.dev2) ; extra == "verl"
 Requires-Dist: xlwings (==0.33.15) ; extra == "excel"
@@ -68,7 +70,7 @@ Get started with ready to use recipes, from Wikipedia search to spreadsheet mani
 **Trainer Integrations**
-Use your own trainer or training framework - no lock-in. `benchmax` is already Integrated into verl and verifiers, with more integrations (SkyRL, etc.) coming soon!
+Use your own trainer or training framework - no lock-in. `benchmax` is already integrated into verl and verifiers, with more integrations (SkyRL, etc.) coming soon!
 **MCP Support**
 Tap into the growing MCP ecosystem and integrate them as tools within your environments.
@@ -89,6 +91,8 @@ Tap into the growing MCP ecosystem and integrate them as tools within your envir
     `pip install benchmax[verl]`
+    \* Note that benchmax installs our verl fork (temporary until [PR gets merged](https://github.com/volcengine/verl/pull/2792))
 1. **Prepare the dataset**
     ```bash
@@ -387,7 +391,7 @@ Open an issue and tag us & we will look into building you one!
     - Facilitate easy deployment and scalability in cloud environments.
 - **MCP as a first class citizen**:
-    There has been an explosion of MCP servers/tools built out for usecases ranging from browser use to excel to game creation.`benchmax` allow folks to leverage and composes these existing MCP servers to build environment integrated with real world systems e.g. excel
+    There has been an explosion of MCP servers/tools built out for usecases ranging from browser use to excel to game creation.`benchmax` allows folks to leverage and compose these existing MCP servers to build environments integrated with real world systems e.g. excel
 ## 🤝 Contributing
@@ -399,3 +403,4 @@ We welcome new environment recipes, bug reports, and trainer integrations!
 ## 📜 License
 Apache 2.0 © 2025 CGFT Inc.

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/README.md RENAMED Viewed

@@ -44,7 +44,7 @@ Get started with ready to use recipes, from Wikipedia search to spreadsheet mani
 **Trainer Integrations**
-Use your own trainer or training framework - no lock-in. `benchmax` is already Integrated into verl and verifiers, with more integrations (SkyRL, etc.) coming soon!
+Use your own trainer or training framework - no lock-in. `benchmax` is already integrated into verl and verifiers, with more integrations (SkyRL, etc.) coming soon!
 **MCP Support**
 Tap into the growing MCP ecosystem and integrate them as tools within your environments.
@@ -65,6 +65,8 @@ Tap into the growing MCP ecosystem and integrate them as tools within your envir
     `pip install benchmax[verl]`
+    \* Note that benchmax installs our verl fork (temporary until [PR gets merged](https://github.com/volcengine/verl/pull/2792))
 1. **Prepare the dataset**
     ```bash
@@ -363,7 +365,7 @@ Open an issue and tag us & we will look into building you one!
     - Facilitate easy deployment and scalability in cloud environments.
 - **MCP as a first class citizen**:
-    There has been an explosion of MCP servers/tools built out for usecases ranging from browser use to excel to game creation.`benchmax` allow folks to leverage and composes these existing MCP servers to build environment integrated with real world systems e.g. excel
+    There has been an explosion of MCP servers/tools built out for usecases ranging from browser use to excel to game creation.`benchmax` allows folks to leverage and compose these existing MCP servers to build environments integrated with real world systems e.g. excel
 ## 🤝 Contributing
@@ -374,4 +376,4 @@ We welcome new environment recipes, bug reports, and trainer integrations!
 ## 📜 License
-Apache 2.0 © 2025 CGFT Inc.
+Apache 2.0 © 2025 CGFT Inc.

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/base_env.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List, Any, Tuple
+from typing import Dict, List, Any, Optional, Tuple
 from pathlib import Path
 from datasets import (
@@ -59,34 +59,38 @@ class BaseEnv(ABC):
         return load_dataset(dataset_name, **kwargs), None
     @abstractmethod
-    def list_tools(self) -> List[ToolDefinition]:
+    async def list_tools(self) -> List[ToolDefinition]:
         """Return list of available tools"""
         pass
     @abstractmethod
-    def run_tool(self, rollout_id: str, tool_name: str, **tool_args) -> Any:
+    async def run_tool(self, rollout_id: str, tool_name: str, **tool_args) -> Any:
         """Execute named tool in rollout context with given arguments"""
         pass
     @abstractmethod
-    def init_rollout(self, rollout_id: str, **rollout_args) -> None:
+    async def init_rollout(self, rollout_id: str, **rollout_args) -> None:
         """Initialize resources for a new rollout"""
         pass
     @abstractmethod
-    def cleanup_rollout(self, rollout_id: str) -> None:
-        """Clean up resources for a rollout"""
+    async def copy_to_workspace(
+        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
+    ) -> None:
+        """Copy a file to the workspace for a specific rollout. If dst_filename is None, use the original filename."""
         pass
     @abstractmethod
-    def get_rollout_workspace(self, rollout_id: str) -> Path:
-        """Get the workspace path for a specific rollout"""
+    async def copy_from_workspace(
+        self, rollout_id: str, src_filename: str, dst_path: Path
+    ) -> None:
+        """Copy a file from the workspace for a specific rollout"""
         pass
-    def compute_reward(
+    @abstractmethod
+    async def compute_reward(
         self,
         rollout_id: str,
-        prompt: str,
         completion: str,
         ground_truth: Any,
         **kwargs: Any
@@ -95,32 +99,11 @@ class BaseEnv(ABC):
         Returns dict mapping reward function names to their computed scores.
         """
-        workspace = self.get_rollout_workspace(rollout_id)
-        if workspace is None:
-            raise ValueError(f"No workspace found for rollout {rollout_id}")
-        results: Dict[str, float] = {}
-        for func in self.reward_funcs:
-            try:
-                # Get function name, falling back to string representation if not available
-                func_name = getattr(func, "__name__", str(func))
-                results[func_name] = func(
-                    prompt=prompt,
-                    completion=completion,
-                    ground_truth=ground_truth,
-                    workspace=workspace,
-                    **kwargs
-                )
-            except Exception as e:
-                # Use same function name resolution
-                func_name = getattr(func, "__name__", str(func))
-                results[func_name] = float('nan')
-                print(f"[WARN] reward {func_name} failed: {e}")
-        return results
+        pass
-    def get_system_prompt(self, add_tool_defs: bool = False) -> str:
+    async def get_system_prompt(self, add_tool_defs: bool = False) -> str:
         """Get system prompt. To add tool definitions, set add_tool_defs to True."""
         if add_tool_defs:
-            return render_tools_prompt(self.list_tools(), self.system_prompt or "")
+            return render_tools_prompt(await self.list_tools(), self.system_prompt or "")
         else:
             return self.system_prompt

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/excel/excel_env.py RENAMED Viewed

@@ -138,16 +138,7 @@ class ExcelEnv(LocalMCPEnv):
         answer_spreadsheet_path = rollout_args["answer_spreadsheet_path"]
         super().init_rollout(rollout_id, **rollout_args)
-        workspace = self.get_rollout_workspace(rollout_id)
-        def _copy_to_workspace(src_path: Path):
-            """
-            Copy the spreadsheet file to the workspace if it doesn't already exist.
-            """
-            dest_path = workspace / src_path.name
-            if not dest_path.exists():
-                dest_path.write_bytes(src_path.read_bytes())
-        # Copy the spreadsheet to the workspace
-        _copy_to_workspace(Path(spreadsheet_path))
-        _copy_to_workspace(Path(answer_spreadsheet_path))
+        self.copy_to_workspace(rollout_id, Path(spreadsheet_path))
+        self.copy_to_workspace(rollout_id, Path(answer_spreadsheet_path))

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/excel/excel_utils.py RENAMED Viewed

@@ -16,7 +16,6 @@ WHITE_LIKE_COLORS = [
 ]
 def evaluate_excel(excel_path: str):
-    import xlwings
     """
     Evaluate Python code that manipulates an Excel file using xlwings.
     """
@@ -24,11 +23,13 @@ def evaluate_excel(excel_path: str):
     if platform.system() == "Linux":
         evaluate_excel_libre(excel_path)
         return
-    excel_app = xlwings.App(visible=False)
-    excel_book = excel_app.books.open(excel_path)
-    excel_book.save()
-    excel_book.close()
-    excel_app.quit()
+    else:
+        import xlwings
+        excel_app = xlwings.App(visible=False)
+        excel_book = excel_app.books.open(excel_path)
+        excel_book.save()
+        excel_book.close()
+        excel_app.quit()
 def evaluate_excel_libre(excel_path: str) -> None:
     """
@@ -120,7 +121,7 @@ def excel_to_str_repr(excel_path: str, evaluate_formulas = False) -> str:
                     result.append(f"{coords}: null [{', '.join(style)}]")
                     is_row_empty = False
                 elif display_value:
-                    style_str = f" [{", ".join(style)}]" if style else ""
+                    style_str = f" [{', '.join(style)}]" if style else ""
                     result.append(f"{coords}: {display_value}{style_str}")
                     is_row_empty = False
             if not is_row_empty:
@@ -212,4 +213,4 @@ def compare_excel_cells(ground_truth_path: str, output_path: str, answer_positio
                     return False, f"Fill color mismatch at {cell_name}"
                 if not compare_font_color(cell_gt.font, cell_out.font):
                     return False, f"Font color mismatch at {cell_name}"
-    return True, "All comparisons passed."
+    return True, "All comparisons passed."

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/local_mcp_env.py RENAMED Viewed

@@ -228,6 +228,34 @@ class LocalMCPEnv(BaseEnv):
             raise ValueError(f"No active client found for rollout {rollout_id}")
         else:
             return Path()
+    def copy_to_workspace(
+        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
+    ) -> None:
+        """Write the bytes of ``src_path`` into the workspace of ``rollout_id``.
+        Args:
+            rollout_id: Rollout whose entry in ``_active_clients`` supplies the workspace.
+            src_path: Existing file to read.
+            dst_filename: Name of the file inside the workspace; falls back to ``src_path.name``.
+        Raises:
+            ValueError: ``rollout_id`` has no active client.
+            FileNotFoundError: ``src_path`` does not exist.
+        """
+        clients = self._active_clients
+        if rollout_id not in clients:
+            raise ValueError(f"No active client found for rollout {rollout_id}")
+        if not src_path.exists():
+            raise FileNotFoundError(f"Source file {src_path} does not exist")
+        target_name = dst_filename if dst_filename else src_path.name
+        target = clients[rollout_id].workspace / target_name
+        # An existing file with the same name is overwritten.
+        target.write_bytes(src_path.read_bytes())
+    def copy_from_workspace(
+        self, rollout_id: str, src_filename: str, dst_path: Path
+    ) -> None:
+        """Write the bytes of a workspace file of ``rollout_id`` to ``dst_path``.
+        Args:
+            rollout_id: Rollout whose entry in ``_active_clients`` supplies the workspace.
+            src_filename: File name relative to that workspace.
+            dst_path: Destination file; overwritten if it already exists.
+        Raises:
+            ValueError: ``rollout_id`` has no active client.
+            FileNotFoundError: The file is not present in the workspace.
+        """
+        clients = self._active_clients
+        if rollout_id not in clients:
+            raise ValueError(f"No active client found for rollout {rollout_id}")
+        workspace = clients[rollout_id].workspace
+        source = workspace / src_filename
+        if not source.exists():
+            raise FileNotFoundError(f"File {src_filename} not found in workspace {workspace}")
+        dst_path.write_bytes(source.read_bytes())
     # ---- Private Helper Methods ----

benchmax-0.1.1.dev6/benchmax/envs/skypilot/proxy_server.py ADDED Viewed

@@ -0,0 +1,167 @@
+import os
+import sys
+import shutil
+import uuid
+import yaml
+import asyncio
+from pathlib import Path
+from functools import wraps
+from fastmcp import FastMCP, Client
+from starlette.requests import Request
+from starlette.responses import PlainTextResponse, FileResponse, JSONResponse
+from starlette.datastructures import UploadFile
+from reward_func import reward_functions  # your reward functions
+# ---------------- Utility Functions ---------------- #
+def setup_workspace(base_dir: Path) -> Path:
+    """Create a fresh directory under ``base_dir`` and return its resolved path.
+    The directory is named after a random UUID4 hex string.
+    """
+    workspace_dir = base_dir.joinpath(uuid.uuid4().hex).resolve()
+    workspace_dir.mkdir(parents=True, exist_ok=True)
+    return workspace_dir
+def load_config(config_path: Path, workspace: Path) -> dict:
+    """Load the MCP YAML config and point every configured server at ``workspace``.
+    Each ``${{ sync_workdir }}`` placeholder in the file is replaced with the directory
+    containing this module before the YAML is parsed. Every entry under ``mcpServers``
+    then gets its ``cwd`` set to ``workspace``.
+    Args:
+        config_path: Path of the YAML config file.
+        workspace: Directory the MCP servers should run in.
+    Returns:
+        The parsed configuration mapping.
+    Raises:
+        ValueError: The file does not contain a YAML mapping (for example, it is empty).
+    """
+    sync_workdir = str(Path(__file__).resolve().parent)
+    # Read as UTF-8 explicitly so parsing does not depend on the machine's locale.
+    with open(config_path, "r", encoding="utf-8") as f:
+        content = f.read().replace("${{ sync_workdir }}", sync_workdir)
+    config = yaml.safe_load(content)
+    # safe_load returns None for an empty document; fail with a clear message instead of
+    # an opaque TypeError from the membership test below.
+    if not isinstance(config, dict):
+        raise ValueError(f"MCP config {config_path} must contain a YAML mapping")
+    for server in (config.get("mcpServers") or {}).values():
+        server["cwd"] = str(workspace)
+    return config
+# ---------------- Auth Decorator ---------------- #
+def require_auth(func):
+    """Reject requests whose ``Authorization`` header does not match the ``API_TOKEN`` env var.
+    The wrapped handler may be a plain ``handler(request)`` coroutine or a method
+    ``handler(self, request)``; the request may also be passed as the ``request`` keyword.
+    A mismatch yields a 401 ``PlainTextResponse`` and the handler is not called.
+    """
+    import hmac
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        # The request is the last positional argument in both supported call shapes.
+        if "request" in kwargs:
+            request = kwargs["request"]
+        else:
+            request = args[-1] if args else None
+        headers = getattr(request, "headers", None)
+        token = headers.get("Authorization") if headers is not None else None
+        # NOTE(review): when API_TOKEN is unset the well-known fallback below is accepted;
+        # deployments should always export API_TOKEN.
+        expected = os.getenv("API_TOKEN", "default-secret-token")
+        # Constant-time comparison so response timing does not leak the token.
+        authorized = isinstance(token, str) and hmac.compare_digest(
+            token.encode("utf-8"), expected.encode("utf-8")
+        )
+        if not authorized:
+            return PlainTextResponse("Unauthorized", status_code=401)
+        return await func(*args, **kwargs)
+    return wrapper
+# ---------------- Proxy Server ---------------- #
+class ProxyServer:
+    """Serve the configured MCP servers through one FastMCP proxy plus a few HTTP helper routes.
+    ``_setup`` creates a fresh workspace, connects a FastMCP ``Client`` built from
+    ``mcp_config.yaml`` and registers ``/health``, ``/upload``, ``/download``,
+    ``/compute_reward`` and ``/reset``. Every helper route except ``/health`` is guarded by
+    ``require_auth``.
+    """
+    def __init__(self, base_dir="workspace", host="0.0.0.0", port=8080):
+        """Record the listen address and create ``base_dir``; everything else is built by ``_setup``.
+        Args:
+            base_dir: Directory under which the per-process workspace is created.
+            host: Interface handed to ``run_async``.
+            port: TCP port handed to ``run_async``.
+        """
+        self.base_dir = Path(base_dir)
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+        self.host = host
+        self.port = port
+        self.workspace: Path | None = None
+        self.client: Client | None = None
+        self.proxy: FastMCP | None = None
+        self.config_path = Path(__file__).parent / "mcp_config.yaml"
+        # Reference to the pending restart task so it cannot be garbage-collected before it runs.
+        self._reset_task: asyncio.Task | None = None
+    async def _setup(self):
+        """Initialize workspace, MCP client, and proxy server."""
+        self.workspace = setup_workspace(self.base_dir)
+        config = load_config(self.config_path, self.workspace)
+        self.client = Client(config)
+        # NOTE(review): _connect() is a private FastMCP API; kept as in the original.
+        await self.client._connect()
+        self.proxy = FastMCP.as_proxy(self.client, name="proxy")
+        # Register endpoints
+        self.proxy.custom_route("/health", methods=["GET"])(self._health)
+        self.proxy.custom_route("/upload", methods=["POST"])(self._upload)
+        self.proxy.custom_route("/download", methods=["GET"])(self._download)
+        self.proxy.custom_route("/compute_reward", methods=["POST"])(self._compute_reward)
+        self.proxy.custom_route("/reset", methods=["POST"])(self._reset)
+    # ---------------- Endpoints ---------------- #
+    async def _health(self, request: Request):
+        """Unauthenticated liveness probe; always answers ``OK``."""
+        return PlainTextResponse("OK")
+    @require_auth
+    async def _upload(self, request: Request):
+        """Store every file of a multipart form in the workspace root and list the stored names.
+        Answers 500 without a workspace and 400 when no usable file was supplied.
+        """
+        if not self.workspace:
+            return PlainTextResponse("No workspace available", 500)
+        form = await request.form()
+        uploaded = []
+        for file in form.values():
+            if not (isinstance(file, UploadFile) and file.filename):
+                continue
+            # The filename is client-controlled. Keep only its last path component so that
+            # absolute paths or "../" segments cannot place the file outside the workspace.
+            safe_name = Path(file.filename).name
+            if safe_name in ("", ".", ".."):
+                continue
+            with open(self.workspace / safe_name, "wb") as f:
+                f.write(await file.read())
+            uploaded.append(safe_name)
+        if not uploaded:
+            return PlainTextResponse("No files uploaded", 400)
+        return PlainTextResponse(f"Uploaded: {', '.join(uploaded)}")
+    @require_auth
+    async def _download(self, request: Request):
+        """Send back the workspace file named by the ``file_path`` query parameter.
+        Answers 500 without a workspace, 400 for a missing or escaping ``file_path`` and
+        404 when the file does not exist.
+        """
+        if not self.workspace:
+            return PlainTextResponse("No workspace", 500)
+        file_path = request.query_params.get("file_path")
+        if not file_path:
+            return PlainTextResponse("file_path required", 400)
+        # Resolve before checking containment so "../" segments, absolute paths and
+        # symlinks cannot be used to read files outside the workspace.
+        root = self.workspace.resolve()
+        full_path = (root / file_path).resolve()
+        if not full_path.is_relative_to(root):
+            return PlainTextResponse("Invalid file_path", 400)
+        if not full_path.exists() or not full_path.is_file():
+            return PlainTextResponse("File not found", 404)
+        return FileResponse(str(full_path), filename=full_path.name)
+    @require_auth
+    async def _compute_reward(self, request: Request):
+        """Run every reward function on the posted rollout and return ``{name: score}``.
+        The JSON body must be an object with ``completion`` and ``ground_truth``; all other
+        keys are forwarded to the reward functions as keyword arguments. A reward function
+        that raises is reported as NaN.
+        """
+        import json
+        try:
+            data = await request.json()
+        except Exception:
+            return PlainTextResponse("Invalid JSON", 400)
+        if not isinstance(data, dict):
+            return PlainTextResponse("JSON body must be an object", 400)
+        completion = data.get("completion")
+        ground_truth = data.get("ground_truth")
+        if completion is None or ground_truth is None:
+            return PlainTextResponse("completion and ground_truth required", 400)
+        extra_kwargs = {
+            k: v for k, v in data.items() if k not in ("completion", "ground_truth")
+        }
+        results = {}
+        for func in reward_functions or []:
+            name = getattr(func, "__name__", str(func))
+            try:
+                results[name] = func(
+                    completion=completion,
+                    ground_truth=ground_truth,
+                    workspace=self.workspace,
+                    mcp_client=self.client,
+                    **extra_kwargs,
+                )
+            except Exception as e:
+                results[name] = float("nan")
+                print(f"[WARN] reward {name} failed: {e}")
+        # JSONResponse encodes with allow_nan=False and would raise on the NaN marker above,
+        # so encode with the stdlib default (which emits NaN) and set the media type by hand.
+        return PlainTextResponse(json.dumps(results), media_type="application/json")
+    @require_auth
+    async def _reset(self, request: Request):
+        """Reset server: clean workspace and restart process."""
+        async def do_reset():
+            # Short delay so the HTTP response can be sent before the process is replaced.
+            await asyncio.sleep(0.1)
+            print("[INFO] Resetting server...")
+            sys.stdout.flush()
+            os.execv(sys.executable, [sys.executable] + sys.argv)
+        # Clean up workspace
+        self.cleanup_workspace()
+        self._reset_task = asyncio.create_task(do_reset())
+        return PlainTextResponse("Server reset scheduled")
+    # ---------------- Public API ---------------- #
+    def cleanup_workspace(self):
+        """Delete the current workspace directory if one exists."""
+        if self.workspace and self.workspace.exists():
+            shutil.rmtree(self.workspace)
+    async def start(self):
+        """Run ``_setup`` and then serve the proxy over HTTP on ``host``:``port``."""
+        await self._setup()
+        if self.proxy:
+            await self.proxy.run_async(transport="http", host=self.host, port=self.port)
+# ---------------- Main ---------------- #
+if __name__ == "__main__":
+    # "../workspace" is relative to the directory the process is started from.
+    proxy_server = ProxyServer(base_dir="../workspace")
+    try:
+        asyncio.run(proxy_server.start())
+    except KeyboardInterrupt:
+        # Ctrl-C: remove the workspace that was created for this run.
+        print("\nShutting down gracefully...")
+        proxy_server.cleanup_workspace()

benchmax-0.1.1.dev6/benchmax/envs/skypilot/remote_skypilot_mcp_server.py ADDED Viewed

@@ -0,0 +1,694 @@
+import asyncio
+import datetime
+import aiohttp
+import uuid
+import tempfile
+import shutil
+from typing import Callable, List, Any, Optional, Dict
+from pathlib import Path
+from fastmcp import Client as FastMCPClient
+from mcp.types import TextContent
+from fastmcp.exceptions import ToolError
+from mcp import Tool
+import sky
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(name)s:%(message)s')
+logging.getLogger('httpx').setLevel(logging.CRITICAL)
+logging.getLogger('aiohttp').setLevel(logging.CRITICAL)
+logging.getLogger('mcp.client.streamable_http').setLevel(logging.CRITICAL)
+logging.getLogger('urllib3').setLevel(logging.CRITICAL)
+logging.getLogger('requests').setLevel(logging.CRITICAL)
+from benchmax.envs.base_env import BaseEnv
+from benchmax.envs.types import ToolDefinition
+logger = logging.getLogger(__name__)
+class RemoteSkypilotMcpEnv(BaseEnv):
+    """Remote MCP Environment for managing tool execution and rollouts with a remote MCP server.
+    Currently only supports running on Skypilot containers.
+    """
+    def __init__(
+        self,
+        workdir_path: str,
+        num_nodes: int = 1,
+        allowed_tools: Optional[List[str]] = None,
+        output_parsers: Optional[Dict[str, Callable[[str], Any]]] = None,
+        cluster_name: str = "benchmax-env-cluster",
+        health_check_timeout: int = 300,  # 5 minutes
+        health_check_interval: int = 5,  # 5 seconds
+        launch_workers_on_init: bool = True,
+        # NOTE(review): this default is evaluated once, when the class body is executed,
+        # so sky.Azure() is constructed at import time and shared by all instances.
+        cloud: Optional[Any] = sky.Azure(),  # sky.Cloud instance
+        cpus: str = "2+",
+    ) -> None:
+        """Initialize the environment with configuration and pool settings.
+        Args:
+            workdir_path: Directory whose contents are copied into the sync directory.
+            num_nodes: Number of nodes requested for the SkyPilot task.
+            allowed_tools: Tool names to expose; an empty or missing list disables filtering.
+            output_parsers: Mapping of name to parser callable, stored as given.
+            cluster_name: Prefix of the cluster name; a random 8-hex-character suffix is appended.
+            health_check_timeout: Seconds to wait for a worker to become healthy.
+            health_check_interval: Seconds to sleep between health probes.
+            launch_workers_on_init: When true, ``launch_workers()`` is called from the constructor.
+            cloud: Value passed as ``cloud`` to ``sky.Resources``.
+            cpus: Value passed as ``cpus`` to ``sky.Resources``.
+        """
+        super().__init__()
+        self._workdir_path = Path(workdir_path)
+        self._num_nodes = num_nodes
+        self._allowed_tools = allowed_tools or []
+        self._output_parsers: Dict[str, Callable[[str], Any]] = output_parsers or {}
+        self._cluster_name = cluster_name
+        self._health_check_timeout = health_check_timeout
+        self._health_check_interval = health_check_interval
+        self._cloud = cloud
+        self._cpus = cpus
+        self._ports = ["8080"]  # MCP server port
+        # Generate API token for worker authentication
+        self._api_token = uuid.uuid4().hex
+        # Generate unique cluster name with suffix
+        unique_suffix = uuid.uuid4().hex[:8]
+        self._full_cluster_name = f"{self._cluster_name}-{unique_suffix}"
+        # Sync directory management
+        self._sync_dir: Optional[Path] = None
+        # Worker management
+        self._client_pool: Dict[str, FastMCPClient] = {}
+        self._available_workers: asyncio.Queue[str] = asyncio.Queue()
+        self._rollout_to_worker: Dict[str, str] = {}
+        self._worker_init_tasks: List[asyncio.Task] = []
+        # HTTP session
+        # NOTE(review): the session is created in a synchronous constructor; confirm that
+        # instances are always built while an event loop is running.
+        self._http_session = aiohttp.ClientSession()
+        # Cached tool definitions
+        self._tool_definitions: Optional[List[ToolDefinition]] = None
+        # Launch workers and start initialization
+        self.launch_workers_started = False
+        if launch_workers_on_init:
+            # launch_workers() calls asyncio.create_task, which needs a running event loop.
+            self.launch_workers()
+    def _setup_sync_directory(self) -> Path:
+        """Build the temporary directory that is synced to the workers.
+        The directory receives ``proxy_server.py`` (taken from next to this module) followed
+        by every top-level file and directory of ``workdir_path``. It must end up containing
+        ``reward_func.py``, ``setup.sh`` and ``mcp_config.yaml``. On any failure the
+        directory is removed again and ``self._sync_dir`` is reset to ``None``.
+        Returns:
+            Path of the populated sync directory (also stored in ``self._sync_dir``).
+        Raises:
+            FileNotFoundError: ``proxy_server.py``, the workdir or a required file is missing.
+            ValueError: ``workdir_path`` is not a directory.
+        """
+        # Create temporary directory
+        self._sync_dir = Path(tempfile.mkdtemp(prefix="benchmax_skypilot_"))
+        logger.info(f"Created sync directory: {self._sync_dir}")
+        try:
+            # proxy_server.py ships in the same directory as this module
+            proxy_server_path = Path(__file__).parent / "proxy_server.py"
+            if not proxy_server_path.exists():
+                raise FileNotFoundError(f"proxy_server.py not found at {proxy_server_path}")
+            shutil.copy2(proxy_server_path, self._sync_dir / "proxy_server.py")
+            logger.debug("Copied proxy_server.py to sync directory")
+            # Copy all contents from workdir_path
+            if not self._workdir_path.exists():
+                raise FileNotFoundError(f"Workdir path does not exist: {self._workdir_path}")
+            if not self._workdir_path.is_dir():
+                raise ValueError(f"Workdir path is not a directory: {self._workdir_path}")
+            for item in self._workdir_path.iterdir():
+                if item.is_file():
+                    shutil.copy2(item, self._sync_dir / item.name)
+                    logger.debug(f"Copied file {item.name} to sync directory")
+                elif item.is_dir():
+                    shutil.copytree(item, self._sync_dir / item.name)
+                    logger.debug(f"Copied directory {item.name} to sync directory")
+            # Validate required files exist (checked in this order)
+            for required_name in ("reward_func.py", "setup.sh", "mcp_config.yaml"):
+                if not (self._sync_dir / required_name).exists():
+                    raise FileNotFoundError(
+                        f"{required_name} not found in workdir: {self._workdir_path}"
+                    )
+            logger.info("Validated required files in sync directory")
+            return self._sync_dir
+        except Exception:
+            # Clean up sync directory if setup fails, then re-raise with the original traceback
+            if self._sync_dir and self._sync_dir.exists():
+                shutil.rmtree(self._sync_dir, ignore_errors=True)
+                self._sync_dir = None
+            raise
+    def _cleanup_sync_directory(self) -> None:
+        """Remove the temporary sync directory, if one exists, and forget its path.
+        A removal failure is logged as a warning rather than raised.
+        """
+        sync_dir = self._sync_dir
+        if not (sync_dir and sync_dir.exists()):
+            return
+        try:
+            shutil.rmtree(sync_dir)
+            logger.info(f"Cleaned up sync directory: {sync_dir}")
+        except Exception as exc:
+            logger.warning(f"Failed to clean up sync directory {sync_dir}: {exc}")
+        finally:
+            self._sync_dir = None
+    async def _init_worker(self, worker_ip: str) -> None:
+        """Bring one worker into service.
+        Waits for the worker's health endpoint, connects a FastMCP client to its ``/mcp/``
+        endpoint, stores the client in ``_client_pool`` and queues the worker as available.
+        Any failure is logged and swallowed so that other workers are unaffected.
+        """
+        try:
+            await self._wait_for_worker_health(worker_ip)
+            endpoint = f"http://{worker_ip}:8080/mcp/"
+            mcp_client = FastMCPClient(endpoint)
+            await mcp_client._connect()
+            self._client_pool[worker_ip] = mcp_client
+            # Only a fully connected worker is handed out to rollouts.
+            await self._available_workers.put(worker_ip)
+            logger.debug(f"Worker {worker_ip} initialized and added to pool")
+        except Exception as exc:
+            # Deliberately not re-raised - let other workers continue
+            logger.error(f"Failed to initialize worker {worker_ip}: {exc}")
+    async def _wait_for_worker_health(self, worker_ip: str) -> None:
+        """Poll the worker's ``/health`` endpoint until it answers with status 200.
+        Probes are ``_health_check_interval`` seconds apart and each request is limited to
+        five seconds in total.
+        Raises:
+            TimeoutError: More than ``_health_check_timeout`` seconds elapsed without a 200.
+        """
+        health_url = f"http://{worker_ip}:8080/health"
+        # get_running_loop() is the documented way to reach the loop from inside a coroutine.
+        loop = asyncio.get_running_loop()
+        deadline = loop.time() + self._health_check_timeout
+        # Loop-invariant, so built once instead of on every probe.
+        request_timeout = aiohttp.ClientTimeout(total=5)
+        while True:
+            if loop.time() > deadline:
+                raise TimeoutError(f"Health check timeout for worker {worker_ip}")
+            try:
+                async with self._http_session.get(health_url, timeout=request_timeout) as response:
+                    if response.status == 200:
+                        logger.debug(f"Worker {worker_ip} is healthy")
+                        return
+                    else:
+                        logger.debug(f"Worker {worker_ip} health check returned {response.status}")
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                # The worker may simply not be listening yet; keep polling.
+                logger.debug(f"Health check failed for {worker_ip}: {e}")
+            await asyncio.sleep(self._health_check_interval)
+    async def _get_available_worker(self) -> str:
+        """Take the next worker IP from the availability queue, waiting until one is queued."""
+        worker_ip = await self._available_workers.get()
+        return worker_ip
+    async def _release_worker(self, worker_ip: str) -> None:
+        """Queue ``worker_ip`` as available again."""
+        pool = self._available_workers
+        await pool.put(worker_ip)
+    async def _call_worker_reset(self, worker_ip: str) -> None:
+        """POST to the worker's ``/reset`` endpoint, authenticating with the API token.
+        Raises:
+            RuntimeError: The worker answered with a non-200 status, or the request itself
+                failed (the underlying ``aiohttp.ClientError`` is attached as the cause).
+        """
+        reset_url = f"http://{worker_ip}:8080/reset"
+        headers = {"Authorization": self._api_token}
+        try:
+            async with self._http_session.post(reset_url, headers=headers) as response:
+                if response.status == 200:
+                    logger.info(f"Reset successful for worker {worker_ip}")
+                else:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Reset failed for worker {worker_ip}: {response.status} - {error_text}")
+        except aiohttp.ClientError as e:
+            # Chain explicitly so the transport error stays visible as __cause__.
+            raise RuntimeError(f"Reset request failed for worker {worker_ip}: {e}") from e
+    async def add_worker_back_once_available(self, worker_ip: str) -> None:
+        """Re-initialise a worker after a reset and report whether it rejoined the pool.
+        Never raises for initialisation failures; a worker that fails simply stays out of
+        the pool.
+        """
+        await asyncio.sleep(1)  # brief delay before starting health checks
+        try:
+            await self._init_worker(worker_ip)
+        except Exception as e:
+            logger.error(f"Failed to add worker {worker_ip} back to pool: {e}")
+            # Don't re-raise - worker remains out of pool
+            return
+        # _init_worker logs and swallows its own errors, so returning normally does not
+        # prove success. It registers the client in _client_pool only once the connection
+        # succeeded, so that entry is the success signal.
+        if worker_ip in self._client_pool:
+            logger.info(f"Worker {worker_ip} added back to available pool after reset")
+        else:
+            logger.error(f"Failed to add worker {worker_ip} back to pool: initialization did not complete")
+    # Function is expected to be called at the end of compute_reward
+    async def _cleanup_rollout(self, rollout_id: str) -> None:
+        """Release a rollout's worker: reset it, drop its client and re-queue it when healthy.
+        Reset and disconnect failures are logged, not raised, so the worker is always
+        scheduled for re-initialisation.
+        Raises:
+            ValueError: ``rollout_id`` has no worker assigned.
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized")
+        worker_ip = self._rollout_to_worker.pop(rollout_id)
+        logger.debug(f"Cleaning up rollout {rollout_id} on worker {worker_ip}")
+        try:
+            # Call reset endpoint
+            await self._call_worker_reset(worker_ip)
+        except Exception as e:
+            logger.error(f"Failed to reset worker {worker_ip} for rollout {rollout_id}: {e}")
+        # Remove the client from the pool first so it is gone even if disconnecting fails.
+        client = self._client_pool.pop(worker_ip, None)
+        if client:
+            try:
+                await client._disconnect()
+            except Exception as e:
+                # The worker may already be restarting; a failed disconnect must not stop
+                # the worker from being re-queued below.
+                logger.warning(f"Failed to disconnect client for worker {worker_ip}: {e}")
+        # Keep a reference to the background task so it is not garbage-collected while
+        # pending, and drop references to tasks that have already finished.
+        self._worker_init_tasks = [t for t in self._worker_init_tasks if not t.done()]
+        self._worker_init_tasks.append(
+            asyncio.create_task(self.add_worker_back_once_available(worker_ip))
+        )
+    def _convert_and_filter_tools(self, tools: List[Tool]) -> List[ToolDefinition]:
+        """Turn MCP ``Tool`` objects into ``ToolDefinition`` objects, keeping only allowed ones.
+        Every tool is converted first (a missing description becomes ``""``). When
+        ``_allowed_tools`` is empty all definitions are returned, otherwise only those whose
+        name is listed.
+        """
+        converted: List[ToolDefinition] = []
+        for tool in tools:
+            converted.append(
+                ToolDefinition(
+                    name=tool.name,
+                    description=tool.description or "",
+                    input_schema=tool.inputSchema
+                )
+            )
+        if self._allowed_tools:
+            return [definition for definition in converted if definition.name in self._allowed_tools]
+        return converted
+    # ---- Public API Methods ----
+    def launch_workers(self) -> None:
+        """Launch the SkyPilot cluster and start initialising every worker in the background.
+        Builds the sync directory, creates a ``sky.Task`` that installs dependencies, runs
+        ``setup.sh`` and starts ``proxy_server.py``, launches it under the unique cluster
+        name, then schedules ``_init_worker`` for each external IP of the cluster.
+        The method is synchronous but calls ``asyncio.create_task``, so it must be invoked
+        while an event loop is running.
+        Raises:
+            RuntimeError: Workers were already launched, or the launch returned no handle.
+        """
+        if self.launch_workers_started:
+            raise RuntimeError("Workers have already been launched.")
+        self.launch_workers_started = True
+        try:
+            # Setup sync directory and copy files
+            sync_dir = self._setup_sync_directory()
+            # Create the task programmatically
+            sky_task = sky.Task(
+                name='fastmcp',
+                setup='pip install fastmcp~=2.10.0\npip install pyyaml\nsh setup.sh',
+                run='python proxy_server.py',
+                workdir=str(sync_dir),
+                num_nodes=self._num_nodes
+            )
+            # Set the resources
+            sky_task.set_resources(
+                sky.Resources(
+                    cloud=self._cloud,
+                    cpus=self._cpus,
+                    ports=self._ports
+                )
+            )
+            # The proxy server reads API_TOKEN to authenticate incoming requests
+            sky_task.update_envs({"API_TOKEN": self._api_token})
+            # Launch the cluster
+            _, handle = sky.launch(
+                task=sky_task,
+                cluster_name=self._full_cluster_name,
+                detach_run=True,
+                detach_setup=True,
+                retry_until_up=True
+            )
+            if handle is None:
+                raise RuntimeError("Failed to launch SkyPilot task.")
+            worker_ips = [
+                external_ip for _, external_ip in handle.stable_internal_external_ips
+            ]
+            logger.info(f"Launched workers with IPs: {worker_ips}")
+            # Start background initialization for each worker; kept under a separate name so
+            # the SkyPilot task above is not shadowed.
+            for worker_ip in worker_ips:
+                init_task = asyncio.create_task(self._init_worker(worker_ip))
+                self._worker_init_tasks.append(init_task)
+        except Exception:
+            # Clean up sync directory if launch fails, then re-raise with the original traceback
+            self._cleanup_sync_directory()
+            raise
+    async def shutdown(self) -> None:
+        """Stop background work, close clients and tear down the cluster.
+        Steps, in order: cancel and await the worker-init tasks, close every
+        pooled FastMCP client, close the shared HTTP session, and call
+        ``sky.down`` on the cluster. Errors in each step are logged so that a
+        failure in one step does not skip the cluster teardown. The sync
+        directory is always cleaned up at the end.
+        """
+        try:
+            # Cancel worker initialization tasks
+            for task in self._worker_init_tasks:
+                if not task.done():
+                    task.cancel()
+            if self._worker_init_tasks:
+                results = await asyncio.gather(*self._worker_init_tasks, return_exceptions=True)
+                for i, result in enumerate(results):
+                    if isinstance(result, Exception) and not isinstance(result, asyncio.CancelledError):
+                        logger.error(f"Error in worker init task {i}: {result}")
+            # Close FastMCP clients. Snapshot the pool first: it can change while
+            # we await, and each result must be reported against the right worker.
+            pooled_clients = list(self._client_pool.items())
+            if pooled_clients:
+                results = await asyncio.gather(
+                    *(client.close() for _, client in pooled_clients),
+                    return_exceptions=True
+                )
+                for (worker_ip, _), result in zip(pooled_clients, results):
+                    if isinstance(result, Exception):
+                        logger.error(f"Error closing FastMCP client for {worker_ip}: {result}")
+            # Close HTTP session; a failure here must not prevent cluster teardown.
+            try:
+                await self._http_session.close()
+            except Exception as e:
+                logger.error(f"Error closing HTTP session: {e}")
+            # Tear down SkyPilot cluster
+            try:
+                sky.down(cluster_name=self._full_cluster_name)
+            except Exception as e:
+                logger.error(f"Error tearing down SkyPilot cluster: {e}")
+        finally:
+            # Always clean up sync directory
+            self._cleanup_sync_directory()
+    async def list_tools(self) -> List[ToolDefinition]:
+        """Return the tool definitions, fetching them from a worker on first use.
+        The converted and filtered list is cached in ``self._tool_definitions``
+        and later calls return that cache. The worker borrowed for the fetch is
+        released again whether or not the fetch succeeds.
+        """
+        cached = self._tool_definitions
+        if cached is not None:
+            return cached
+        # Borrow any available worker for the lookup
+        worker_ip = await self._get_available_worker()
+        try:
+            raw_tools = await self._client_pool[worker_ip].list_tools()
+            self._tool_definitions = self._convert_and_filter_tools(raw_tools)
+            return self._tool_definitions
+        finally:
+            await self._release_worker(worker_ip)
+    async def init_rollout(self, rollout_id: str, **rollout_args) -> None:
+        """Reserve a worker for ``rollout_id``.
+        Waits on ``_get_available_worker`` and records the returned worker in
+        ``_rollout_to_worker``. ``rollout_args`` is accepted but not used here.
+        Raises:
+            ValueError: If ``rollout_id`` already has a worker assigned.
+        """
+        already_assigned = rollout_id in self._rollout_to_worker
+        if already_assigned:
+            raise ValueError(f"Rollout {rollout_id} is already initialized")
+        # Blocks until a worker can be handed out
+        assigned_ip = await self._get_available_worker()
+        self._rollout_to_worker[rollout_id] = assigned_ip
+        logger.info(f"Rollout {rollout_id} assigned to worker {assigned_ip}")
+    async def run_tool(self, rollout_id: str, tool_name: str, **tool_args) -> Optional[str]:
+        """Call ``tool_name`` on the worker assigned to ``rollout_id``.
+        Only ``TextContent`` items of the tool result are used; their text is
+        joined with newlines. If an output parser is registered for the tool, its
+        result is returned instead of the raw text. The call uses a 30 second
+        timeout.
+        Returns:
+            The (optionally parsed) text, or ``None`` when the tool call or the
+            parser raised; the error is logged in that case.
+        Raises:
+            ValueError: If the rollout has not been initialized.
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized. Call init_rollout() first.")
+        client = self._client_pool[self._rollout_to_worker[rollout_id]]
+        try:
+            call_result = await client.call_tool(
+                tool_name, tool_args, timeout=datetime.timedelta(seconds=30)
+            )
+            # Only text content is processed for now; other content types are skipped
+            combined_text = "\n".join(
+                item.text for item in call_result.content if isinstance(item, TextContent)
+            )
+            # Apply output parser if available
+            if tool_name in self._output_parsers:
+                return self._output_parsers[tool_name](combined_text)
+            return combined_text
+        except ToolError as e:
+            logger.error(f"[ERROR] Tool call returned error: {str(e)}")
+            return None
+        except Exception as e:
+            logger.error(f"[ERROR] Tool call failed: {str(e)}")
+            return None
+    async def copy_to_workspace(
+        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
+    ) -> None:
+        """Upload a local file to the worker serving ``rollout_id``.
+        The file is posted as multipart form data to the worker's ``/upload``
+        endpoint on port 8080. It is stored under ``dst_filename`` when given,
+        otherwise under the source file's own name.
+        Raises:
+            ValueError: If the rollout has not been initialized.
+            RuntimeError: If the worker answers with a status other than 200.
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized")
+        worker_ip = self._rollout_to_worker[rollout_id]
+        upload_url = f"http://{worker_ip}:8080/upload"
+        headers = {"Authorization": self._api_token}
+        filename = dst_filename or src_path.name
+        try:
+            with src_path.open('rb') as source_file:
+                form = aiohttp.FormData()
+                form.add_field('file', source_file, filename=filename)
+                async with self._http_session.post(upload_url, headers=headers, data=form) as response:
+                    if response.status != 200:
+                        error_text = await response.text()
+                        raise RuntimeError(f"Upload failed: {response.status} - {error_text}")
+                    logger.info(f"File {src_path} uploaded as {filename} for rollout {rollout_id}")
+        except Exception as e:
+            # Log with rollout context, then let the caller see the original error
+            logger.error(f"Failed to copy {src_path} to workspace for rollout {rollout_id}: {e}")
+            raise
+    async def copy_content_to_workspace(
+        self, rollout_id: str, src_content: str | bytes, dst_filename: str, encoding: str = "utf-8"
+    ) -> None:
+        """Upload in-memory content as a file on the worker serving ``rollout_id``.
+        Strings are encoded with ``encoding`` and sent as ``text/plain``; bytes are
+        sent unchanged as ``application/octet-stream``.
+        Args:
+            rollout_id: The rollout identifier.
+            src_content: The content to upload (str or bytes).
+            dst_filename: The filename to assign in the workspace.
+            encoding: Encoding to use if src_content is str. Defaults to UTF-8.
+        Raises:
+            ValueError: If the rollout has not been initialized.
+            RuntimeError: If the worker answers with a status other than 200.
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized")
+        worker_ip = self._rollout_to_worker[rollout_id]
+        upload_url = f"http://{worker_ip}:8080/upload"
+        headers = {"Authorization": self._api_token}
+        try:
+            if isinstance(src_content, str):
+                payload_bytes, content_type = src_content.encode(encoding), "text/plain"
+            else:
+                payload_bytes, content_type = src_content, "application/octet-stream"
+            form = aiohttp.FormData()
+            form.add_field("file", payload_bytes, filename=dst_filename, content_type=content_type)
+            async with self._http_session.post(upload_url, headers=headers, data=form) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Upload failed: {response.status} - {error_text}")
+                logger.info(f"Content uploaded as {dst_filename} for rollout {rollout_id}")
+        except Exception as e:
+            # Log with rollout context, then let the caller see the original error
+            logger.error(f"Failed to upload content to workspace for rollout {rollout_id}: {e}")
+            raise
+    async def copy_from_workspace(
+        self, rollout_id: str, src_filename: str, dst_path: Path
+    ) -> None:
+        """Download ``src_filename`` from the rollout's worker into ``dst_path``.
+        The file is requested from the worker's ``/download`` endpoint on port
+        8080 and streamed to disk in 8192-byte chunks. Missing parent directories
+        of ``dst_path`` are created.
+        Raises:
+            ValueError: If the rollout has not been initialized.
+            RuntimeError: If the worker answers with a status other than 200.
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized")
+        worker_ip = self._rollout_to_worker[rollout_id]
+        download_url = f"http://{worker_ip}:8080/download"
+        headers = {"Authorization": self._api_token}
+        params = {"file_path": src_filename}
+        try:
+            async with self._http_session.get(download_url, headers=headers, params=params) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Download failed: {response.status} - {error_text}")
+                # Make sure the target directory exists before writing
+                dst_path.parent.mkdir(parents=True, exist_ok=True)
+                with dst_path.open('wb') as target_file:
+                    async for chunk in response.content.iter_chunked(8192):
+                        target_file.write(chunk)
+                logger.info(f"File {src_filename} downloaded from rollout {rollout_id} to {dst_path}")
+        except Exception as e:
+            # Log with rollout context, then let the caller see the original error
+            logger.error(f"Failed to copy {src_filename} from workspace for rollout {rollout_id}: {e}")
+            raise
+    async def compute_reward(
+            self,
+            rollout_id: str,
+            completion: str,
+            ground_truth: Any,
+            **kwargs: Any
+    ) -> Dict[str, float]:
+        """Ask the rollout's worker to score a completion, then release the rollout.
+        Posts ``completion``, ``ground_truth`` and any extra keyword arguments as
+        JSON to the worker's ``/compute_reward`` endpoint on port 8080. Once the
+        request has been attempted the rollout is cleaned up, whether it succeeded
+        or not, so ``rollout_id`` needs a new ``init_rollout`` before reuse.
+        Returns:
+            The decoded JSON response: a dict mapping reward function names to
+            their computed scores.
+        Raises:
+            ValueError: If the rollout has not been initialized (no cleanup runs).
+            RuntimeError: If the worker answers with a status other than 200, or
+                if the HTTP request fails (chained from the aiohttp error).
+        """
+        if rollout_id not in self._rollout_to_worker:
+            raise ValueError(f"Rollout {rollout_id} is not initialized")
+        worker_ip = self._rollout_to_worker[rollout_id]
+        compute_reward_url = f"http://{worker_ip}:8080/compute_reward"
+        headers = {
+            "Authorization": self._api_token,
+            "Content-Type": "application/json"
+        }
+        # Prepare request payload
+        payload = {
+            "completion": completion,
+            "ground_truth": ground_truth,
+            **kwargs
+        }
+        try:
+            async with self._http_session.post(
+                compute_reward_url,
+                headers=headers,
+                json=payload
+            ) as response:
+                if response.status == 200:
+                    result = await response.json()
+                    logger.debug(f"Reward computed successfully for rollout {rollout_id}")
+                    return result
+                else:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Reward computation failed: {response.status} - {error_text}")
+        except aiohttp.ClientError as e:
+            logger.error(f"Failed to compute reward for rollout {rollout_id}: {e}")
+            # Chain explicitly so the transport error is kept as the cause.
+            raise RuntimeError(f"Reward computation request failed: {e}") from e
+        except Exception as e:
+            logger.error(f"Unexpected error computing reward for rollout {rollout_id}: {e}")
+            raise
+        finally:
+            # The rollout ends here: reset the worker and return it to the pool.
+            await self._cleanup_rollout(rollout_id)
+async def run_single_rollout(env: RemoteSkypilotMcpEnv, rollout_id: str, expression: str, expected: str, tmp_root: Path):
+    """Run a complete rollout: init -> upload -> download+verify -> tool -> reward -> cleanup.
+    Args:
+        env: Environment that owns the workers.
+        rollout_id: Identifier for this rollout; also names its scratch directory.
+        expression: Expression passed to the ``calculate`` tool.
+        expected: Ground truth handed to the reward computation.
+        tmp_root: Directory under which the rollout's local scratch files are kept.
+    Returns:
+        A ``(rollout_id, reward)`` tuple.
+    """
+    import json  # local import: only this demo helper needs it
+    print(f"Starting rollout: {rollout_id}")
+    # Create rollout-specific tmp dir
+    rollout_tmp = tmp_root / rollout_id
+    rollout_tmp.mkdir(parents=True, exist_ok=True)
+    # Stage 1: Initialize rollout
+    await env.init_rollout(rollout_id)
+    print(f"Initialized rollout: {rollout_id}")
+    # Stage 1.5a: Upload various content types
+    test_contents = {
+        "utf8_text.txt": f"# UTF-8 text for {rollout_id}\nExpression: {expression}\n",
+        "latin1_text.txt": "Café Münster".encode("latin-1"),
+        # json.dumps keeps the file valid JSON; the expression is text such as "1 + 1"
+        "json_data.json": json.dumps({"rollout": rollout_id, "value": expression}),
+        "binary_data.bin": b"\x00\x01\x02\x03\xFF",
+        "unicode_text.txt": "你好, мир, hello 🌍",
+    }
+    for filename, content in test_contents.items():
+        await env.copy_content_to_workspace(rollout_id, content, filename)
+        print(f"Uploaded {filename} for {rollout_id}")
+    # Stage 1.5b: Test file-based copy
+    tmp_path = rollout_tmp / "local_file.txt"
+    tmp_path.write_text(f"Temporary file for {rollout_id}, expression={expression}\n", encoding="utf-8")
+    await env.copy_to_workspace(rollout_id, tmp_path, dst_filename=f"copied_{rollout_id}.txt")
+    print(f"Copied file {tmp_path} to workspace for {rollout_id}")
+    # Stage 1.6: Download and verify content
+    for filename, original_content in test_contents.items():
+        download_path = rollout_tmp / f"dl_{filename}"
+        await env.copy_from_workspace(rollout_id, filename, download_path)
+        downloaded_bytes = download_path.read_bytes()
+        # Strings were uploaded with the default UTF-8 encoding; bytes went up unchanged
+        if isinstance(original_content, str):
+            original_bytes = original_content.encode("utf-8")
+        else:
+            original_bytes = original_content
+        if downloaded_bytes == original_bytes:
+            print(f"Verified {filename} ✅")
+        else:
+            print(f"Mismatch in {filename}! ❌")
+    # Verify copied file
+    print(f"Verifying copied file for {rollout_id}")
+    copied_dl = rollout_tmp / f"dl_copied_{rollout_id}.txt"
+    await env.copy_from_workspace(rollout_id, f"copied_{rollout_id}.txt", copied_dl)
+    if copied_dl.read_text(encoding="utf-8") == tmp_path.read_text(encoding="utf-8"):
+        print("Verified copied file ✅")
+    else:
+        print("Mismatch in copied file ❌")
+    # Stage 2: Run tool
+    tool_result = await env.run_tool(rollout_id, "calculate", expression=expression)
+    print(f"Tool result for {rollout_id}: {tool_result}")
+    # Stage 3: Compute reward (this call also cleans up the rollout)
+    reward = await env.compute_reward(rollout_id, completion=str(tool_result), ground_truth=expected)
+    print(f"Computed reward for {rollout_id}: {reward}")
+    return rollout_id, reward
+async def main():
+    """Demo driver: list the tools, run three rollouts concurrently, then shut down.
+    The environment is shut down and the local ``./tmp`` scratch directory is
+    removed even when listing tools or a rollout fails.
+    """
+    env = RemoteSkypilotMcpEnv(
+        workdir_path="benchmax/envs/skypilot/workdir",
+        num_nodes=2,
+        cluster_name="test-cluster",
+        cloud=sky.Azure(),
+        cpus="2+",
+    )
+    tmp_root = Path("./tmp")
+    tmp_root.mkdir(exist_ok=True)
+    try:
+        tools = await env.list_tools()
+        print(f"Available tools: {[tool.name for tool in tools]}")
+        # fewer for debugging; adjust as needed
+        rollout_tasks = [
+            run_single_rollout(
+                env,
+                f"test-rollout-{index:03d}",
+                f"{operand} + {operand}",
+                str(operand + operand),
+                tmp_root,
+            )
+            for index, operand in enumerate(range(1, 4))
+        ]
+        print("Starting concurrent rollouts...")
+        # Exceptions are collected as results so one failed rollout does not abort the rest
+        results = await asyncio.gather(*rollout_tasks, return_exceptions=True)
+        print("Rollout results:")
+        for outcome in results:
+            print(outcome)
+    finally:
+        await env.shutdown()
+        # Cleanup tmp dir at the very end
+        shutil.rmtree(tmp_root, ignore_errors=True)
+        print("Cleaned up temporary files.")
+# Script entry point: run main() on a fresh event loop when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())

benchmax-0.1.1.dev6/benchmax/envs/skypilot/workdir/mcp_config.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+mcpServers:
+  server-name:
+    command: uvx
+    args:
+      - mcp-server-calculator

benchmax-0.1.1.dev6/benchmax/envs/skypilot/workdir/reward_func.py ADDED Viewed

@@ -0,0 +1,16 @@
+from typing import Any
+from fastmcp import Client
+def reward_function(
+    completion: str,
+    ground_truth: Any,
+    workspace: str,
+    mcp_client: Client,
+    **kwargs: Any
+) -> float:
+    """Compute the reward for a given model completion.
+    Args:
+        completion: The model's output.
+        ground_truth: Expected answer; compared by its string form.
+        workspace: Workspace of the rollout (only printed here).
+        mcp_client: MCP client, unused by this reward.
+        **kwargs: Extra context, ignored.
+    Returns:
+        1.0 when the completion equals the ground truth after stripping
+        surrounding whitespace from both, otherwise 0.0.
+    """
+    print(f"Workspace for reward function: {workspace}")
+    # ground_truth is typed Any: coerce to str so non-string values (e.g. ints)
+    # are compared by text instead of raising AttributeError on .strip().
+    return 1.0 if completion.strip() == str(ground_truth).strip() else 0.0
+# Module-level list of the reward callables defined in this file
+# (presumably what the worker-side loader reads — confirm against proxy_server.py).
+reward_functions = [reward_function]

benchmax-0.1.1.dev6/benchmax/envs/skypilot/workdir/setup.sh ADDED Viewed

@@ -0,0 +1,3 @@
+#!/bin/bash
+# Install uv (pinned to 0.8.14) with its official install script; mcp_config.yaml
+# starts the calculator MCP server through `uvx`, which ships with uv.
+# NOTE: the SkyPilot task setup runs this file as `sh setup.sh`, so the bash
+# shebang is not honored there; keep the script POSIX-sh compatible.
+curl -LsSf https://astral.sh/uv/0.8.14/install.sh | sh

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/types.py RENAMED Viewed

@@ -7,7 +7,6 @@ class StandardizedExample(TypedDict):
     ground_truth: Any
     init_rollout_args: Optional[Dict[str, Any]]
 @dataclass
 class ToolDefinition:
     """Definition of a tool's interface"""
@@ -19,10 +18,9 @@ class RewardFunction(Protocol):
     """Function that evaluates model interactions"""
     def __call__(
         self,
-        prompt: str,         # Input prompt given to the model
         completion: str,     # Model's generated completion/response
         ground_truth: Any,   # Expected/correct output to compare against
-        workspace: Path,     # Path to rollout's workspace with tool outputs
+        workspace: str,      # Current workspace of the rollout
         **kwargs: Any        # Additional context for reward computation
     ) -> float:             # Reward score (typically in range [0, 1])
         ...

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/benchmax/envs/wikipedia/wiki_env.py RENAMED Viewed

@@ -220,6 +220,18 @@ class WikipediaEnv(BaseEnv):
     def get_rollout_workspace(self, rollout_id: str) -> Path:
         return super().get_rollout_workspace(rollout_id)
+    def copy_to_workspace(
+        self, rollout_id: str, src_path: Path, dst_filename: Optional[str] = None
+    ) -> None:
+        """No-op: this environment does not copy files into a rollout workspace.
+        The arguments are accepted and ignored.
+        """
+        return None
+    def copy_from_workspace(
+        self, rollout_id: str, src_filename: str, dst_path: Path
+    ) -> None:
+        """No-op: this environment does not copy files out of a rollout workspace.
+        The arguments are accepted and ignored; nothing is written to ``dst_path``.
+        """
+        return None
 if __name__ == "__main__":
     # Example usage

{benchmax-0.1.1.dev4 → benchmax-0.1.1.dev6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "benchmax"
-version = "0.1.1.dev4"
+version = "0.1.1.dev6"
 description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
 authors = ["cgft.io"]
 readme = "README.md"
@@ -12,6 +12,8 @@ packages = [
 python = ">=3.11,<3.13"
 fastmcp = "~2.10.0"
+skypilot = { version = "0.8.1", optional = true }
 verl-cgft-fork = { version = "0.5.0.dev2", optional = true }
 sglang = { version = "0.4.9", optional = true, extras = ["all"] }
 verifiers = { version = "^0.1.1", optional = true, extras = ["train"]  }
@@ -30,6 +32,9 @@ pytest = "^8.4.1"
 verifiers = ["verifiers"]
 verl = ["verl-cgft-fork", "sglang"]
+# Hosting-specific
+skypilot = ["skypilot"]
 # Environment-specific
 excel-linux = ["openpyxl"]
 excel = ["openpyxl", "xlwings"]