PyPI - verifiers - Versions diffs - 0.1.13.dev7__tar.gz → 0.1.14__tar.gz - Mend

verifiers 0.1.13.dev7__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (296) hide show

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/.gitignore RENAMED Viewed

@@ -4,7 +4,6 @@ venv/
 env/
 .env
 .env.local
-uv.lock
 .claude/
 .cursorrules
 .ropeproject/
@@ -22,6 +21,7 @@ _build/
 docs/build/
 *.egg-info/
 __pycache__/
+environments/**/uv.lock
 .pytest_cache/
 .ruff_cache/

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.13.dev7
+Version: 0.1.14
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
 Requires-Dist: anthropic>=0.78.0
 Requires-Dist: datasets<4.7.0,>=3.0.0
 Requires-Dist: gepa
+Requires-Dist: httpx>=0.27.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: math-verify>=0.8.0
 Requires-Dist: mcp>=1.14.1
@@ -53,6 +54,8 @@ Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
 Requires-Dist: stagehand>=3.0.0; extra == 'browser'
 Provides-Extra: openenv
 Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
+Provides-Extra: renderers
+Requires-Dist: renderers>=0.1.6; extra == 'renderers'
 Provides-Extra: rg
 Requires-Dist: reasoning-gym; extra == 'rg'
 Provides-Extra: rl
@@ -197,11 +200,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
     async def correct_answer(completion, answer) -> float:
         completion_ans = completion[-1]['content']
         return 1.0 if completion_ans == answer else 0.0
-    rubric = Rubric(funcs=[correct_answer])
+    rubric = vf.Rubric(funcs=[correct_answer])
     env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
     return env
 ```
+For composable environments with reusable tasksets, toolsets, custom programs,
+or custom harnesses, use the v1 BYO Harness path:
+```python
+# my_env.py
+import verifiers.v1 as vf
+def source():
+    yield {
+        "prompt": [{"role": "user", "content": "Reverse abc."}],
+        "answer": "cba",
+        "max_turns": 1,
+    }
+@vf.reward(weight=1.0)
+async def contains_answer(task, state) -> float:
+    return float(task["answer"] in str(state.get("completion") or ""))
+def load_taskset(config: vf.TasksetConfig | None = None):
+    return vf.Taskset(source=source, rewards=[contains_answer], config=config)
+def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
+    config = config or vf.EnvConfig()
+    return vf.Env(taskset=load_taskset(config=config.taskset))
+```
+If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
+**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
+Reusable taskset and harness packages live under `verifiers.v1.packages` while
+the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
+For example, Harbor task directories can run through the bundled OpenCode CLI
+harness with:
+```python
+env = vf.Env(
+    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    harness=vf.OpenCode(),
+)
+```
+The same environment package is the unit used by evals and `prime-rl`. The
+trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
+and harness options stay under `env.taskset` and `env.harness`:
+```toml
+# configs/rl/my-v1-env.toml
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+max_steps = 100
+batch_size = 256
+rollouts_per_example = 8
+[sampling]
+max_tokens = 4096
+[[env]]
+id = "my-env"
+[env.args]
+arg1 = "non-th-arg"
+[env.harness]
+max_turns = 1
+[env.taskset.scoring.contains_answer]
+weight = 1.0
+```
+```bash
+prime env install my-env
+uv run prime-rl configs/rl/my-v1-env.toml
+```
 To install the environment module into your project, do:
 ```bash
 prime env install my-env # installs from ./environments/my_env
@@ -237,6 +310,8 @@ prime eval run primeintellect/math-python
 **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
+**[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
 **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
 **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/README.md RENAMED Viewed

@@ -124,11 +124,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
     async def correct_answer(completion, answer) -> float:
         completion_ans = completion[-1]['content']
         return 1.0 if completion_ans == answer else 0.0
-    rubric = Rubric(funcs=[correct_answer])
+    rubric = vf.Rubric(funcs=[correct_answer])
     env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
     return env
 ```
+For composable environments with reusable tasksets, toolsets, custom programs,
+or custom harnesses, use the v1 BYO Harness path:
+```python
+# my_env.py
+import verifiers.v1 as vf
+def source():
+    yield {
+        "prompt": [{"role": "user", "content": "Reverse abc."}],
+        "answer": "cba",
+        "max_turns": 1,
+    }
+@vf.reward(weight=1.0)
+async def contains_answer(task, state) -> float:
+    return float(task["answer"] in str(state.get("completion") or ""))
+def load_taskset(config: vf.TasksetConfig | None = None):
+    return vf.Taskset(source=source, rewards=[contains_answer], config=config)
+def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
+    config = config or vf.EnvConfig()
+    return vf.Env(taskset=load_taskset(config=config.taskset))
+```
+If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
+**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
+Reusable taskset and harness packages live under `verifiers.v1.packages` while
+the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
+For example, Harbor task directories can run through the bundled OpenCode CLI
+harness with:
+```python
+env = vf.Env(
+    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    harness=vf.OpenCode(),
+)
+```
+The same environment package is the unit used by evals and `prime-rl`. The
+trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
+and harness options stay under `env.taskset` and `env.harness`:
+```toml
+# configs/rl/my-v1-env.toml
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+max_steps = 100
+batch_size = 256
+rollouts_per_example = 8
+[sampling]
+max_tokens = 4096
+[[env]]
+id = "my-env"
+[env.args]
+arg1 = "non-th-arg"
+[env.harness]
+max_turns = 1
+[env.taskset.scoring.contains_answer]
+weight = 1.0
+```
+```bash
+prime env install my-env
+uv run prime-rl configs/rl/my-v1-env.toml
+```
 To install the environment module into your project, do:
 ```bash
 prime env install my-env # installs from ./environments/my_env
@@ -164,6 +234,8 @@ prime eval run primeintellect/math-python
 **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
+**[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
 **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
 **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/pyproject.toml RENAMED Viewed

@@ -52,7 +52,8 @@ dependencies = [
     "msgpack>=1.1.2",
     "aiolimiter>=1.2.1",
     "setproctitle>=1.3.0",
-    "regex<2026.4.4",  # 2026.4.4 missing cp312/cp313 wheels
+    "regex<2026.4.4",
+    "httpx>=0.27.0",
 ]
 [dependency-groups]
@@ -73,6 +74,7 @@ dev = [
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
     "nltk",
+    "renderers>=0.1.6",
 ]
 [project.optional-dependencies]
@@ -91,6 +93,9 @@ browser = [
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
 ]
+renderers = [
+    "renderers>=0.1.6",
+]
 rl = [
     "torch>=2.8.0,<2.9.0",
     "transformers>=4.56.2",
@@ -108,6 +113,24 @@ rl = [
 preview = true
 required-version = ">=0.11.1"
+[[tool.uv.index]]
+name = "pypi"
+url = "https://pypi.org/simple"
+default = true
+exclude-newer = "7 days"
+[tool.uv.exclude-newer-package]
+# PrimeIntellect-published on PyPI (trusted publisher)
+prime-tunnel = false
+prime-sandboxes = false
+renderers = false
+[tool.uv.sources]
+# Pinned to renderers main until the next PyPI release lands; drop after.
+# fe67f9f = renderers main: PR #4 squash-merge — construction-time
+# preserve_*_thinking flags on create_renderer / create_renderer_pool.
+renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
 [tool.uv.extra-build-dependencies]
 flash-attn = [{ requirement = "torch", match-runtime = true }]
@@ -170,6 +193,7 @@ addopts = [
 markers = [
     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
     "integration: marks tests as integration tests",
+    "prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
     "unit: marks tests as unit tests",
     "asyncio: marks tests as async tests",
     "parsers: marks tests for parser components",
@@ -195,7 +219,7 @@ unknown-argument = "warn"
 redundant-cast = "ignore"
 [tool.ty.src]
-exclude = ["environments"]
+exclude = ["environments", "verifiers/v1/sketch.py"]
 [[tool.ty.overrides]]
 include = ["verifiers/envs/experimental/composable/tasksets/**"]

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/conftest.py RENAMED Viewed

@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
         super().__init__(tools=[offset_tool], **kwargs)
     async def setup_state(self, state, **kwargs):
-        state = await super().setup_state(state, **kwargs)
+        await super().setup_state(state, **kwargs)
         state["offset"] = 3
         state["update_calls"] = 0
-        return state
     def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
         state["update_calls"] += 1
@@ -458,13 +457,15 @@ def make_input() -> Callable[..., RolloutInput]:
     def _make_input(
         example_id: int = 0,
-        task: str = "default",
         prompt: Messages = DEFAULT_PROMPT,
         info: Info = {},
         answer: str = "4",
     ) -> RolloutInput:
         return RolloutInput(
-            example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
+            example_id=example_id,
+            prompt=prompt,
+            answer=answer,
+            info=info,
         )
     return _make_input
@@ -476,7 +477,6 @@ def make_state() -> Callable[..., State]:
     def _make_state(
         example_id: int = 0,
-        task: str = "default",
         prompt: Messages = DEFAULT_PROMPT,
         answer: str = "4",
         info: Info = {},
@@ -488,17 +488,12 @@ def make_state() -> Callable[..., State]:
         stop_condition: str | None = "max_turns_reached",
         tool_defs: list[Tool] | None = None,
         trajectory: list[TrajectoryStep] = [],
-        timing=RolloutTiming(
-            generation_ms=0.0,
-            scoring_ms=0.0,
-            total_ms=0.0,
-        ),
+        timing=RolloutTiming(),
         foo: str = "bar",  # custom field
         **kwargs,
     ) -> State:
         return State(
             example_id=example_id,
-            task=task,
             prompt=prompt,
             answer=answer,
             info=info,
@@ -551,7 +546,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
         rollouts_per_example: int = 1,
         sampling_args: SamplingArgs = {},
         date: str = "1970-01-01",
-        time_ms: float = 0.0,
+        time: float = 0.0,
         avg_reward: float = 0.0,
         avg_metrics: dict[str, float] = {},
         pass_at_k: dict[str, float] = {},
@@ -579,7 +574,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
             rollouts_per_example=rollouts_per_example,
             sampling_args=sampling_args,
             date=date,
-            time_ms=time_ms,
+            time=time,
             avg_reward=avg_reward,
             avg_metrics=avg_metrics,
             pass_at_k=pass_at_k,

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_cli_agent_env.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Tests for CliAgentEnv and HarborEnv."""
+import asyncio
 import tempfile
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock, patch
@@ -8,6 +9,7 @@ import pytest
 from datasets import Dataset
 import verifiers as vf
+from verifiers.utils.interception_utils import serialize_intercept_response
 @pytest.fixture
@@ -62,7 +64,8 @@ class TestCliAgentEnv:
         assert env.run_command == "python agent.py"
         assert env.docker_image == "python:3.11-slim"
         assert env.interception_port == 8765
-        assert env.timeout_seconds == 3600.0
+        assert env.timeout_seconds is None
+        assert env.sandbox_timeout_minutes is None
     def test_init_custom_config(self, sample_dataset):
         """Test initialization with custom configuration."""
@@ -130,22 +133,34 @@ class TestCliAgentEnv:
         state = {"agent_completed": True}
         assert await env.agent_completed(state) is True
-    @pytest.mark.asyncio
-    async def test_timeout_reached_stop_condition(self, sample_dataset):
-        """Test the timeout_reached stop condition."""
+    @pytest.mark.parametrize(
+        "timeout_seconds,expected_minutes",
+        [
+            (None, 24 * 60),  # no rollout cap → SDK ceiling
+            (600.0, 10 + 60),  # finite → ceil + scoring buffer
+            (24 * 3600.0, 24 * 60),  # buffer would overflow → clamped to ceiling
+        ],
+    )
+    def test_sandbox_timeout_auto_derived(
+        self, sample_dataset, timeout_seconds, expected_minutes
+    ):
         env = vf.CliAgentEnv(
             run_command="python agent.py",
             dataset=sample_dataset,
             rubric=vf.Rubric(),
-            timeout_seconds=10.0,
+            timeout_seconds=timeout_seconds,
         )
-        import time
-        state = {"timing": {"start_time": time.time()}}
-        assert await env.timeout_reached(state) is False
+        assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
-        state = {"timing": {"start_time": time.time() - 20}}
-        assert await env.timeout_reached(state) is True
+    def test_sandbox_timeout_explicit_override(self, sample_dataset):
+        env = vf.CliAgentEnv(
+            run_command="python agent.py",
+            dataset=sample_dataset,
+            rubric=vf.Rubric(),
+            timeout_seconds=600.0,
+            sandbox_timeout_minutes=30,
+        )
+        assert env.get_sandbox_resources({})["timeout_minutes"] == 30
     @pytest.mark.asyncio
     async def test_env_response_returns_empty(self, sample_dataset):
@@ -204,6 +219,152 @@ class TestCliAgentEnv:
         assert kwargs["tools"][0].name == "echo"
+@pytest.mark.asyncio
+async def test_cli_agent_env_delivers_intercepted_tool_call_response(
+    sample_dataset, mock_client
+):
+    env = vf.CliAgentEnv(
+        run_command="python agent.py",
+        dataset=sample_dataset,
+        rubric=vf.Rubric(),
+    )
+    prompt = sample_dataset[0]["prompt"]
+    tool_call = {
+        "id": "call_echo",
+        "type": "function",
+        "function": {"name": "echo", "arguments": '{"text": "hello"}'},
+    }
+    mock_client.add_response(
+        prompt,
+        "",
+        finish_reason="tool_calls",
+        tool_calls=[tool_call],
+    )
+    state = await env.init_state(
+        input=sample_dataset[0],
+        client=mock_client,
+        model="test-model",
+    )
+    response_future = asyncio.Future()
+    request_id = "req-tool-call"
+    state["current_request_id"] = request_id
+    env._interception_server.intercepts[request_id] = {
+        "stream": False,
+        "tools": [
+            {
+                "type": "function",
+                "function": {
+                    "name": "echo",
+                    "description": "Return the provided text.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"text": {"type": "string"}},
+                    },
+                },
+            }
+        ],
+        "response_future": response_future,
+    }
+    response = await env.get_model_response(
+        state=state,
+        prompt=prompt,
+        client=mock_client,
+        model="test-model",
+    )
+    assert response_future.done()
+    assert response_future.result() is response
+    assert state["current_request_id"] is None
+    payload = serialize_intercept_response(response_future.result())
+    choice = payload["choices"][0]
+    assert choice["finish_reason"] == "tool_calls"
+    assert choice["message"]["tool_calls"] == [tool_call]
+    assert mock_client.last_call_kwargs["tools"][0].name == "echo"
+@pytest.mark.asyncio
+async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
+    sample_dataset, mock_client
+):
+    env = vf.CliAgentEnv(
+        run_command="python agent.py",
+        dataset=sample_dataset,
+        rubric=vf.Rubric(),
+    )
+    prompt = sample_dataset[0]["prompt"]
+    tool_call = {
+        "id": "call_echo",
+        "type": "function",
+        "function": {"name": "echo", "arguments": '{"text": "hello"}'},
+    }
+    mock_client.add_response(
+        prompt,
+        "",
+        finish_reason="tool_calls",
+        tool_calls=[tool_call],
+    )
+    state = await env.init_state(
+        input=sample_dataset[0],
+        client=mock_client,
+        model="test-model",
+    )
+    chunk_queue = asyncio.Queue()
+    response_future = asyncio.Future()
+    request_id = "req-stream-tool-call"
+    state["current_request_id"] = request_id
+    env._interception_server.intercepts[request_id] = {
+        "stream": True,
+        "tools": [
+            {
+                "type": "function",
+                "function": {
+                    "name": "echo",
+                    "description": "Return the provided text.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"text": {"type": "string"}},
+                    },
+                },
+            }
+        ],
+        "chunk_queue": chunk_queue,
+        "response_future": response_future,
+    }
+    response = await env.get_model_response(
+        state=state,
+        prompt=prompt,
+        client=mock_client,
+        model="test-model",
+    )
+    chunks = []
+    while True:
+        chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
+        if chunk is None:
+            break
+        chunks.append(chunk)
+    assert response_future.done()
+    assert response_future.result() is response
+    assert state["current_request_id"] is None
+    assert chunks[0]["object"] == "chat.completion.chunk"
+    assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
+    assert (
+        chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
+    )
+    assert (
+        chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
+        == '{"text": "hello"}'
+    )
+    assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
 class TestHarborEnv:
     """Tests for HarborEnv."""
@@ -231,7 +392,7 @@ class TestHarborEnv:
             dataset_path=harbor_task_dir,
         )
         assert len(env.dataset) == 1
-        assert env.dataset[0]["task"] == "test_task"
+        assert env.dataset[0]["info"]["task_name"] == "test_task"
     def test_init_filters_tasks(self, harbor_task_dir):
         """Test that HarborEnv can filter tasks by name."""
@@ -247,7 +408,7 @@ class TestHarborEnv:
             tasks=["test_task"],
         )
         assert len(env.dataset) == 1
-        assert env.dataset[0]["task"] == "test_task"
+        assert env.dataset[0]["info"]["task_name"] == "test_task"
     def test_init_raises_on_empty_dataset(self):
         """Test that HarborEnv raises when no valid tasks found."""
@@ -301,7 +462,7 @@ class TestHarborEnv:
         )
         state = {
             "interception_base_url": "https://test.trycloudflare.com/v1",
-            "task": "my_task",
+            "info": {"task_name": "my_task"},
         }
         env_vars = await env.build_env_vars(state)

{verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_composable_env.py RENAMED Viewed

@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
         teardown=lambda: None,
     )
-    state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}
+    state = {"sandbox_id": "sbx", "timing": {"total": 0}}
     await env.post_rollout(state)
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }

verifiers 0.1.13.dev7__tar.gz → 0.1.14__tar.gz

verifiers 0.1.13.dev7__tar.gz → 0.1.14__tar.gz