PyPI - benchmax - Versions diffs - 0.1.1.dev0__tar.gz → 0.1.1.dev1__tar.gz - Mend

benchmax 0.1.1.dev0__tar.gz → 0.1.1.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: benchmax
-Version: 0.1.1.dev0
+Version: 0.1.1.dev1
 Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
 Author: cgft.io
 Requires-Python: >=3.11,<3.13
@@ -10,17 +10,16 @@ Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: crm
 Provides-Extra: excel
 Provides-Extra: excel-linux
-Provides-Extra: full
 Provides-Extra: verifiers
 Provides-Extra: verl
-Provides-Extra: wikipedia
-Requires-Dist: fastmcp (>=2.10.0,<2.11.0) ; extra == "excel-linux" or extra == "excel" or extra == "crm" or extra == "wikipedia" or extra == "full"
-Requires-Dist: openpyxl (==3.1.5) ; extra == "excel-linux" or extra == "excel" or extra == "full"
-Requires-Dist: python-dateutil (>=2.9.0,<2.10.0) ; extra == "crm" or extra == "full"
-Requires-Dist: simple-salesforce (>=1.12.3) ; extra == "crm" or extra == "full"
-Requires-Dist: verifiers[train] (>=0.1.1,<0.2.0) ; extra == "verifiers" or extra == "full"
-Requires-Dist: verl-cgft (==0.4.1.dev0) ; extra == "verl" or extra == "full"
-Requires-Dist: xlwings (==0.33.15) ; extra == "excel" or extra == "full"
+Requires-Dist: fastmcp (>=2.10.0,<2.11.0)
+Requires-Dist: openpyxl (==3.1.5) ; extra == "excel-linux" or extra == "excel"
+Requires-Dist: python-dateutil (>=2.9.0,<2.10.0) ; extra == "crm"
+Requires-Dist: sglang (==0.4.9) ; extra == "verl"
+Requires-Dist: simple-salesforce (>=1.12.3) ; extra == "crm"
+Requires-Dist: verifiers[train] (>=0.1.1,<0.2.0) ; extra == "verifiers"
+Requires-Dist: verl-cgft-fork (==0.4.1.dev1) ; extra == "verl"
+Requires-Dist: xlwings (==0.33.15) ; extra == "excel"
 Description-Content-Type: text/markdown
 <picture>

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/adapters/verl/benchmax_data_process.py RENAMED Viewed

@@ -136,12 +136,13 @@ if __name__ == "__main__":
                     "init_rollout_args"
                 ]
             }
-            if example.get("init_rollout_args", None):
-                extra_info["tools_kwargs"] = {
-                    tool_name: {
-                        "create_kwargs": {**example.get("init_rollout_args", {})},
-                    } for tool_name in tool_names
-                }
+            create_args = example.get("init_rollout_args", {}) or {"dummy": "dummy"}
+            extra_info["tools_kwargs"] = {
+                tool_name: {
+                    "create_kwargs": {**create_args},
+                } for tool_name in tool_names
+            }
             example.pop("init_rollout_args")
             # This extra_info is used to pass addition info during reward computation
             example["extra_info"] = extra_info

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/base_env.py RENAMED Viewed

@@ -24,11 +24,16 @@ class BaseEnv(ABC):
         - "ground_truth": Any
         - "init_rollout_args": Optional[Dict[str, Any]]
         """
-        return {
-            "prompt": example.get("prompt", ""),
-            "ground_truth": example.get("ground_truth", None),
-            "init_rollout_args": example.get("init_rollout_args", {})
-        }
+        prompt = example.pop("prompt", "")
+        ground_truth = example.pop("ground_truth", "")
+        init_rollout_args = example.pop("init_rollout_args", "")
+        return StandardizedExample(
+            prompt=prompt,
+            ground_truth=ground_truth,
+            init_rollout_args=init_rollout_args,
+            **example,
+        )
     @classmethod
     def load_dataset(
         cls, dataset_name: str, **kwargs

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/crm/crm_env.py RENAMED Viewed

@@ -245,8 +245,9 @@ class CRMEnv(LocalMCPEnv):
         if metadata and "required" in metadata:
             required_metadata = metadata["required"]
             prompt = f"{persona}\n{task}\n{required_metadata}\n{query}"
-        return {
-            "prompt": prompt,
-            "ground_truth": answer,
-            "init_rollout_args": None
-        }
+        return StandardizedExample(
+            prompt=prompt,
+            ground_truth=answer,
+            init_rollout_args={}
+        )

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/excel/README.md RENAMED Viewed

@@ -8,6 +8,9 @@ This is based off the [SpreadsheetBench Benchmark](https://spreadsheetbench.gith
 **Important**: Before using this environment, ensure you have the appropriate spreadsheet application installed:
 - **Linux**: LibreOffice must be installed
+```bash
+sudo apt install libreoffice
+```
 - **Windows/macOS**: Microsoft Excel must be installed
 ## Installation

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/excel/excel_env.py RENAMED Viewed

@@ -121,14 +121,15 @@ class ExcelEnv(LocalMCPEnv):
         Output Path: {target_output_path}
         """
-        return {
-            "prompt": prompt.strip(),
-            "ground_truth": "",
-            "init_rollout_args": {
+        return StandardizedExample(
+            prompt=prompt.strip(),
+            ground_truth="",
+            init_rollout_args={
                 "spreadsheet_path": str(source_input_path),
                 "answer_spreadsheet_path": str(Path(spreadsheet_path) / target_answer_path),
-            }
-        }
+            },
+            **example
+        )
     def init_rollout(self, rollout_id: str, **rollout_args):
         if "spreadsheet_path" not in rollout_args:

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/local_mcp_env.py RENAMED Viewed

@@ -4,6 +4,8 @@ from pathlib import Path
 import asyncio
 import json
 from threading import Thread
+import uuid
 from fastmcp import Client as FastMCPClient
 from fastmcp.exceptions import ToolError
 from mcp import Tool
@@ -58,8 +60,7 @@ class LocalMCPEnv(BaseEnv):
         self._output_parsers: Dict[str, Callable[[str], Any]] = {}
         self._workspace_dir = workspace_dir or Path("workspaces")
         self._workspace_dir.mkdir(parents=True, exist_ok=True)
-        self._counter = 0  # Counter for workspace naming
         self._pre_warmed_pool: List[ClientWorkspacePair] = []  # Available pre-initialized pairs
         self._active_clients: BoundedDict[str, ClientWorkspacePair] = BoundedDict(10000)  # rollout_id -> pair mapping
         self._tool_definitions: Optional[List[ToolDefinition]] = None
@@ -219,12 +220,14 @@ class LocalMCPEnv(BaseEnv):
             else:
                 self._active_clients.pop(rollout_id)
-    def get_rollout_workspace(self, rollout_id: str) -> Path:
+    def get_rollout_workspace(self, rollout_id: str, strict_check: bool = False) -> Path:
         """Get dedicated workspace path for a rollout"""
         if rollout_id in self._active_clients:
             return self._active_clients[rollout_id].workspace
-        else:
+        if strict_check:
             raise ValueError(f"No active client found for rollout {rollout_id}")
+        else:
+            return Path()
     # ---- Private Helper Methods ----
@@ -267,8 +270,7 @@ class LocalMCPEnv(BaseEnv):
     async def _create_client_workspace(self) -> ClientWorkspacePair:
         """Create a new FastMCP client with a unique workspace"""
-        workspace = Path(self._workspace_dir) / f"{self._counter}"
-        self._counter += 1
+        workspace = self._workspace_dir / uuid.uuid4().hex
         workspace.mkdir(parents=True, exist_ok=True)
         config = self._prepare_config(workspace)

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/benchmax/envs/wikipedia/wiki_env.py RENAMED Viewed

@@ -206,11 +206,11 @@ class WikipediaEnv(BaseEnv):
         return tool_function(**tool_args)
     def dataset_preprocess(self, example: Any) -> StandardizedExample:
-        return {
-            "prompt": example.get("Question", ""),
-            "ground_truth": example.get("Answer", None),
-            "init_rollout_args": {}
-        }
+        return StandardizedExample(
+            prompt=example.get("Question", ""),
+            ground_truth=example.get("Answer", None),
+            init_rollout_args={}
+        )
     def init_rollout(self, rollout_id: str, **rollout_args) -> None:
         return super().init_rollout(rollout_id, **rollout_args)

{benchmax-0.1.1.dev0 → benchmax-0.1.1.dev1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "benchmax"
-version = "0.1.1.dev0"
+version = "0.1.1.dev1"
 description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
 authors = ["cgft.io"]
 readme = "README.md"
@@ -10,12 +10,12 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.11,<3.13"
+fastmcp = "~2.10.0"
-verl-cgft      = { version = "0.4.1.dev0", optional = true }
+verl-cgft-fork = { version = "0.4.1.dev1", optional = true }
+sglang = { version = "0.4.9", optional = true }
 verifiers = { version = "^0.1.1", optional = true, extras = ["train"]  }
-fastmcp = { version = "~2.10.0", optional = true }
 openpyxl = { version = "3.1.5", optional = true }
 xlwings = { version = "0.33.15", optional = true }
@@ -28,24 +28,12 @@ pytest = "^8.4.1"
 [tool.poetry.extras]
 # Independent feature extras
 verifiers = ["verifiers"]
-verl = ["verl-cgft"]
+verl = ["verl-cgft-fork", "sglang"]
 # Environment-specific
-excel-linux = ["openpyxl", "fastmcp"]
-excel = ["openpyxl", "xlwings", "fastmcp"]
-crm = ["simple-salesforce", "python-dateutil", "fastmcp"]
-wikipedia = ["fastmcp"]
-# Everything
-full = [
-  "verl-cgft",
-  "verifiers",
-  "openpyxl",
-  "xlwings",
-  "simple-salesforce",
-  "python-dateutil",
-  "fastmcp"
-]
+excel-linux = ["openpyxl"]
+excel = ["openpyxl", "xlwings"]
+crm = ["simple-salesforce", "python-dateutil"]
 [build-system]
 requires = ["poetry-core"]

benchmax-0.1.1.dev0/benchmax/adapters/verifiers/examples/verifiers_crm_example.py DELETED Viewed

@@ -1,54 +0,0 @@
-import verifiers as vf
-from benchmax.adapters.verifiers.verifiers_adapters import get_verifiers_environment
-from benchmax.envs.crm.crm_env import CRMEnv
-"""
-Multi-GPU training (single node, 3 training + 1 inference)
-CUDA_VISIBLE_DEVICES=0 poetry run vf-vllm --model willcb/Qwen3-4B
-CUDA_VISIBLE_DEVICES=1,2,3 accelerate launch benchmax/adapters/verifiers/examples/verifiers_crm_example.py
-"""
-dataset, _ = CRMEnv.load_dataset("Salesforce/CRMArenaPro", name="CRMArenaPro", split="b2b")
-benchmax_env = CRMEnv(pool_size=3)
-dataset = dataset.map(
-    lambda example: benchmax_env.dataset_preprocess(example),
-)
-splits = dataset.train_test_split(test_size=0.1, seed=42)
-train_ds = splits["train"]
-vf_env = get_verifiers_environment(
-    benchmax_env,
-    max_concurrent=3,
-    max_turns=3,
-    dataset=train_ds,
-)
-model_name = "willcb/Qwen3-4B"
-model, tokenizer = vf.get_model_and_tokenizer(model_name)
-run_name = "verifiers-excel" + model_name.split("/")[-1].lower()
-training_args=vf.grpo_defaults(run_name=run_name)
-training_args.per_device_train_batch_size=2
-training_args.num_generations=12
-training_args.gradient_accumulation_steps=2
-training_args.num_iterations=1
-training_args.num_train_epochs=5
-training_args.max_prompt_length=10000
-training_args.max_completion_length=4096
-training_args.max_steps=500
-training_args.save_steps=100
-training_args.report_to = "none"
-training_args.log_completions = False
-trainer = vf.GRPOTrainer(
-    model=model,
-    processing_class=tokenizer,
-    env=vf_env,
-    args=training_args,
-)
-trainer.train()

benchmax-0.1.1.dev0/benchmax/adapters/verifiers/examples/verifiers_excel_example.py DELETED Viewed

@@ -1,54 +0,0 @@
-import verifiers as vf
-from benchmax.adapters.verifiers.verifiers_adapters import get_verifiers_environment
-from benchmax.envs.excel.excel_env import ExcelEnv
-"""
-Multi-GPU training (single node, 3 training + 1 inference)
-CUDA_VISIBLE_DEVICES=0 poetry run vf-vllm --model willcb/Qwen3-4B
-CUDA_VISIBLE_DEVICES=1,2,3 accelerate launch benchmax/adapters/verifiers/examples/verifiers_excel_example.py
-"""
-dataset, dataset_path = ExcelEnv.load_dataset()
-mcp_benchmax_env = ExcelEnv(dataset_path=dataset_path, pool_size=3)
-dataset = dataset.map(
-    lambda example: mcp_benchmax_env.dataset_preprocess(example),
-)
-splits = dataset.train_test_split(test_size=0.1, seed=42)
-train_ds = splits["train"]
-vf_env = get_verifiers_environment(
-    mcp_benchmax_env,
-    max_concurrent=3,
-    max_turns=3,
-    dataset=train_ds,
-)
-model_name = "willcb/Qwen3-4B"
-model, tokenizer = vf.get_model_and_tokenizer(model_name)
-run_name = "verifiers-excel" + model_name.split("/")[-1].lower()
-training_args=vf.grpo_defaults(run_name=run_name)
-training_args.per_device_train_batch_size=2
-training_args.num_generations=12
-training_args.gradient_accumulation_steps=2
-training_args.num_iterations=1
-training_args.num_train_epochs=5
-training_args.max_prompt_length=10000
-training_args.max_completion_length=4096
-training_args.max_steps=500
-training_args.save_steps=100
-training_args.report_to = "none"
-training_args.log_completions = False
-trainer = vf.GRPOTrainer(
-    model=model,
-    processing_class=tokenizer,
-    env=vf_env,
-    args=training_args,
-)
-trainer.train()

benchmax-0.1.1.dev0/benchmax/adapters/verifiers/examples/verifiers_math_example.py DELETED Viewed

@@ -1,54 +0,0 @@
-import verifiers as vf
-from datasets import load_dataset
-from benchmax.adapters.verifiers.verifiers_adapters import get_verifiers_environment
-from benchmax.envs.math.math_env import MathEnv
-"""
-Multi-GPU training (single node, 3 training + 1 inference)
-CUDA_VISIBLE_DEVICES=0 poetry run vf-vllm --model willcb/Qwen3-4B
-CUDA_VISIBLE_DEVICES=1,2,3 accelerate launch benchmax/adapters/verifiers/examples/verifiers_math_example.py
-"""
-math_env = MathEnv()
-dataset, _ = MathEnv.load_dataset("dawidmt/arithmetic50", split="test")
-dataset = dataset.map(
-    lambda example: math_env.dataset_preprocess(example),
-)
-splits = dataset.train_test_split(test_size=0.1, seed=42)
-train_ds = splits["train"]
-vf_env = get_verifiers_environment(
-    math_env,
-    max_concurrent=3,
-    max_turns=3,
-    dataset=train_ds,
-)
-model_name = "willcb/Qwen3-4B"
-model, tokenizer = vf.get_model_and_tokenizer(model_name)
-run_name = "math-grpo" + model_name.split("/")[-1].lower()
-training_args=vf.grpo_defaults(run_name=run_name)
-training_args.per_device_train_batch_size=6
-training_args.num_generations=12
-training_args.gradient_accumulation_steps=2
-training_args.num_iterations=1
-training_args.num_train_epochs=5
-training_args.max_prompt_length=1024
-training_args.max_completion_length=4096
-training_args.max_steps=500
-training_args.save_steps=100
-training_args.report_to = "none"
-trainer = vf.GRPOTrainer(
-    model=model,
-    processing_class=tokenizer,
-    env=vf_env,
-    args=training_args,
-)
-trainer.train()

benchmax-0.1.1.dev0/benchmax/adapters/verl/examples/config/benchmax_multiturn_grpo.yaml DELETED Viewed

@@ -1,21 +0,0 @@
-hydra:
-  searchpath:
-    - pkg://verl.trainer.config
-defaults:
-  - ppo_trainer
-  - _self_
-data:
-  max_prompt_length: 1024
-  max_response_length: 1024
-  train_batch_size: 256
-  return_raw_chat: True
-actor_rollout_ref:
-  hybrid_engine: True
-  rollout:
-    name: sglang
-    multi_turn:
-      enable: True
-      max_assistant_turns: 5

benchmax-0.1.1.dev0/benchmax/adapters/verl/examples/config/tool_config/benchmax_math_tool_config.yaml DELETED Viewed

@@ -1,7 +0,0 @@
-# Example Benchmax Tool Config
-tools:
-  # Class name points to benchmax class. This is expected to be a subclass of benchmax.envs.BaseEnv
-  - class_name: benchmax.envs.math.math_env.MathEnv
-    config:
-      type: benchmax
-      # Specify initialization args for Sandbox here e.g. api_keys

benchmax-0.1.1.dev0/benchmax/adapters/verl/examples/run_qwen2.5-3b_benchmax_math.sh DELETED Viewed

@@ -1,69 +0,0 @@
-# make sure your current working directory is the root of the project
-# Specifically note the last 3 lines
-# The first line points to tool config, which is necessary for initializing tools from the benchmax environment
-# The second and third lines point to the relevant benchmax environment to initialize the rewards from
-set -x
-ulimit -n 65535
-PROJECT_DIR="$(pwd)"
-CONFIG_PATH="$PROJECT_DIR/benchmax/adapters/verl/examples/config"
-CONFIG_NAME="benchmax_multiturn_grpo"
-TRAIN_DATA="~/data/math/train.parquet"
-VAL_DATA="~/data/math/test.parquet"
-TOOL_CONFIG="$CONFIG_PATH/tool_config/benchmax_math_tool_config.yaml"
-BENCHMAX_CLASS_NAME="benchmax.envs.math.math_env.MathEnv"
-PYTHONPATH="$PYTHONPATH:$(pwd)" python -m verl.trainer.main_ppo \
-    --config-path="$CONFIG_PATH" \
-    --config-name="$CONFIG_NAME" \
-    algorithm.adv_estimator=grpo \
-    data.train_batch_size=4 \
-    data.val_batch_size=4 \
-    data.max_prompt_length=4096 \
-    data.max_response_length=3000 \
-    data.filter_overlong_prompts=True \
-    data.truncation='error' \
-    data.return_raw_chat=True \
-    actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
-    actor_rollout_ref.model.use_remove_padding=True \
-    actor_rollout_ref.actor.ppo_mini_batch_size=4 \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
-    actor_rollout_ref.actor.use_kl_loss=True \
-    actor_rollout_ref.actor.kl_loss_coef=0.001 \
-    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
-    actor_rollout_ref.actor.entropy_coeff=0 \
-    actor_rollout_ref.model.enable_gradient_checkpointing=True \
-    actor_rollout_ref.actor.fsdp_config.param_offload=False \
-    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.max_model_len=15000 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
-    actor_rollout_ref.rollout.name=sglang \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
-    actor_rollout_ref.rollout.n=5 \
-    actor_rollout_ref.rollout.multi_turn.max_assistant_turns=5 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
-    actor_rollout_ref.ref.fsdp_config.param_offload=True \
-    algorithm.use_kl_in_reward=False \
-    trainer.critic_warmup=0 \
-    trainer.val_before_train=False \
-    trainer.logger=['console','wandb'] \
-    trainer.project_name='wiki_search' \
-    trainer.experiment_name='qwen2.5-3b-instruct_wiki_search' \
-    trainer.n_gpus_per_node=4 \
-    trainer.nnodes=1 \
-    trainer.save_freq=100 \
-    trainer.test_freq=50 \
-    data.train_files="$TRAIN_DATA" \
-    data.val_files="$VAL_DATA"  \
-    trainer.total_epochs=1 $@ \
-    actor_rollout_ref.rollout.multi_turn.tool_config_path="$TOOL_CONFIG" \
-    reward_model.reward_manager=benchmax \
-    +reward_model.reward_kwargs.benchmax_cls_name="$BENCHMAX_CLASS_NAME"