PyPI - verifiers - Versions diffs - 0.1.13.dev6__tar.gz → 0.1.13.dev8__tar.gz - Mend

verifiers 0.1.13.dev6.tar.gz → 0.1.13.dev8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234) hide show

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.13.dev6
+Version: 0.1.13.dev8
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/conftest.py RENAMED Viewed

@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
         super().__init__(tools=[offset_tool], **kwargs)
     async def setup_state(self, state, **kwargs):
-        state = await super().setup_state(state, **kwargs)
+        await super().setup_state(state, **kwargs)
         state["offset"] = 3
         state["update_calls"] = 0
-        return state
     def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
         state["update_calls"] += 1

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_cli_agent_env.py RENAMED Viewed

@@ -62,7 +62,8 @@ class TestCliAgentEnv:
         assert env.run_command == "python agent.py"
         assert env.docker_image == "python:3.11-slim"
         assert env.interception_port == 8765
-        assert env.timeout_seconds == 3600.0
+        assert env.timeout_seconds is None
+        assert env.sandbox_timeout_minutes is None
     def test_init_custom_config(self, sample_dataset):
         """Test initialization with custom configuration."""
@@ -130,22 +131,34 @@ class TestCliAgentEnv:
         state = {"agent_completed": True}
         assert await env.agent_completed(state) is True
-    @pytest.mark.asyncio
-    async def test_timeout_reached_stop_condition(self, sample_dataset):
-        """Test the timeout_reached stop condition."""
+    @pytest.mark.parametrize(
+        "timeout_seconds,expected_minutes",
+        [
+            (None, 24 * 60),  # no rollout cap → SDK ceiling
+            (600.0, 10 + 60),  # finite → ceil + scoring buffer
+            (24 * 3600.0, 24 * 60),  # buffer would overflow → clamped to ceiling
+        ],
+    )
+    def test_sandbox_timeout_auto_derived(
+        self, sample_dataset, timeout_seconds, expected_minutes
+    ):
         env = vf.CliAgentEnv(
             run_command="python agent.py",
             dataset=sample_dataset,
             rubric=vf.Rubric(),
-            timeout_seconds=10.0,
+            timeout_seconds=timeout_seconds,
         )
-        import time
+        assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
-        state = {"timing": {"start_time": time.time()}}
-        assert await env.timeout_reached(state) is False
-        state = {"timing": {"start_time": time.time() - 20}}
-        assert await env.timeout_reached(state) is True
+    def test_sandbox_timeout_explicit_override(self, sample_dataset):
+        env = vf.CliAgentEnv(
+            run_command="python agent.py",
+            dataset=sample_dataset,
+            rubric=vf.Rubric(),
+            timeout_seconds=600.0,
+            sandbox_timeout_minutes=30,
+        )
+        assert env.get_sandbox_resources({})["timeout_minutes"] == 30
     @pytest.mark.asyncio
     async def test_env_response_returns_empty(self, sample_dataset):

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment.py RENAMED Viewed

@@ -26,7 +26,6 @@ class SimpleEnvironment(Environment):
     async def setup_state(self, state):
         """Setup state for SimpleEnvironment."""
-        return state
     async def rollout(
         self,
@@ -38,7 +37,7 @@ class SimpleEnvironment(Environment):
         """Simple test rollout implementation."""
         state = await self.init_state(input, client=client, model=model)
         try:
-            state = await self.setup_state(state)
+            await self.setup_state(state)
             prompt_messages = state["prompt"]
             response = await self.get_model_response(state, prompt_messages)
@@ -551,8 +550,6 @@ class RetryCounterEnv(SimpleEnvironment):
                 f"Simulated failure {self.call_counts[example_id]}/{self.fail_count}"
             )
-        return state
 class TestMaybeRetry:
     """Test cases for maybe_retry functionality in Environment.generate()."""

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment_extra.py RENAMED Viewed

@@ -40,7 +40,7 @@ from verifiers.utils.save_utils import state_to_output
 # Local simple concrete Environment for testing
 class DummyEnvironment(Environment):
     async def setup_state(self, state):
-        return state
+        pass
     async def rollout(
         self,
@@ -52,7 +52,7 @@ class DummyEnvironment(Environment):
         state = await self.init_state(
             input, client=client, model=model, sampling_args=sampling_args
         )
-        state = await self.setup_state(state)
+        await self.setup_state(state)
         prompt_messages = state["prompt"]
         response = await self.get_model_response(state=state, prompt=prompt_messages)

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_cli.py RENAMED Viewed

@@ -232,6 +232,36 @@ def test_cli_temperature_not_added_when_none(monkeypatch, run_cli):
     assert "temperature" not in sa
+def test_cli_extra_env_kwargs_support_timeout_seconds(monkeypatch, run_cli):
+    captured = run_cli(
+        monkeypatch,
+        {
+            "extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
+        },
+    )
+    assert captured["configs"][0].extra_env_kwargs == {
+        "timeout_seconds": 30,
+        "foo": "bar",
+    }
+def test_cli_timeout_flag_overrides_extra_env_kwargs(monkeypatch, run_cli):
+    """--timeout wins over timeout_seconds in --extra-env-kwargs."""
+    captured = run_cli(
+        monkeypatch,
+        {
+            "extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
+            "timeout": 600,
+        },
+    )
+    assert captured["configs"][0].extra_env_kwargs == {
+        "timeout_seconds": 600,
+        "foo": "bar",
+    }
 def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
     captured = run_cli(
         monkeypatch,
@@ -874,6 +904,27 @@ def test_load_toml_config_global_values_with_per_eval_override():
     assert result[1]["num_examples"] == 50  # per-eval override
+def test_load_toml_config_with_extra_env_kwargs():
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
+        f.write(
+            '[[eval]]\nenv_id = "env1"\n[eval.extra_env_kwargs]\ntimeout_seconds = 600\n'
+        )
+        f.flush()
+        result = load_toml_config(Path(f.name))
+    assert result[0]["extra_env_kwargs"] == {"timeout_seconds": 600}
+def test_load_toml_config_with_top_level_timeout():
+    """Top-level `timeout` is a recognized field on [[eval]] tables."""
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
+        f.write('[[eval]]\nenv_id = "env1"\ntimeout = 600\n')
+        f.flush()
+        result = load_toml_config(Path(f.name))
+    assert result[0]["timeout"] == 600
 def test_load_toml_config_invalid_global_field():
     """Invalid global field raises ValueError."""
     with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_math_rubric.py RENAMED Viewed

@@ -1,8 +1,11 @@
 """Tests for the MathRubric class."""
+import asyncio
 import pytest
 import verifiers as vf
+from verifiers.rubrics import math_rubric
 class TestMathRubric:
@@ -127,3 +130,85 @@ class TestMathRubric:
             assert state["metrics"]["correct_answer"] == 1.0
         else:
             assert state["metrics"]["correct_answer"] == 0.0
+class TestVerifyResponseExceptionHandling:
+    """Regression tests for the exception handling in verify_response.
+    See commit narrowing ``except BaseException`` to
+    ``except (Exception, MathVerifyTimeout)`` so that ``CancelledError``,
+    ``KeyboardInterrupt``, and ``SystemExit`` propagate instead of being
+    silently reported as a 0.0 score.
+    """
+    def test_cancellederror_propagates(self, monkeypatch):
+        """CancelledError raised during math_verify must propagate, not
+        get swallowed and reported as a score of 0.0."""
+        def raise_cancelled(*args, **kwargs):
+            raise asyncio.CancelledError
+        monkeypatch.setattr(math_rubric, "parse", raise_cancelled)
+        with pytest.raises(asyncio.CancelledError):
+            math_rubric.verify_response(
+                response="\\boxed{1}",
+                answer="1",
+                max_verify_chars=50_000,
+                timeout_seconds=5,
+            )
+    def test_keyboardinterrupt_propagates(self, monkeypatch):
+        """KeyboardInterrupt must propagate so Ctrl-C still works during
+        scoring."""
+        def raise_kbd(*args, **kwargs):
+            raise KeyboardInterrupt
+        monkeypatch.setattr(math_rubric, "parse", raise_kbd)
+        with pytest.raises(KeyboardInterrupt):
+            math_rubric.verify_response(
+                response="\\boxed{1}",
+                answer="1",
+                max_verify_chars=50_000,
+                timeout_seconds=5,
+            )
+    def test_math_verify_timeout_returns_zero(self, monkeypatch):
+        """A real math_verify.errors.TimeoutException (which inherits from
+        BaseException, not Exception) must still be caught and reported as
+        a 0.0 score — that's why the catch is wider than just Exception."""
+        from math_verify.errors import TimeoutException
+        def raise_timeout(*args, **kwargs):
+            raise TimeoutException("simulated math_verify timeout")
+        monkeypatch.setattr(math_rubric, "parse", raise_timeout)
+        score, elapsed = math_rubric.verify_response(
+            response="\\boxed{1}",
+            answer="1",
+            max_verify_chars=50_000,
+            timeout_seconds=5,
+        )
+        assert score == 0.0
+        assert elapsed >= 0.0
+    def test_regular_exception_returns_zero(self, monkeypatch):
+        """A regular Exception from math_verify should continue to be
+        swallowed and reported as 0.0 (library-raised something weird)."""
+        def raise_exc(*args, **kwargs):
+            raise ValueError("simulated parse failure")
+        monkeypatch.setattr(math_rubric, "parse", raise_exc)
+        score, elapsed = math_rubric.verify_response(
+            response="\\boxed{1}",
+            answer="1",
+            max_verify_chars=50_000,
+            timeout_seconds=5,
+        )
+        assert score == 0.0
+        assert elapsed >= 0.0

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_multiturn_env.py RENAMED Viewed

@@ -1,5 +1,7 @@
 """Tests for the MultiTurnEnv class."""
+import asyncio
 import pytest
 from datasets import Dataset
@@ -12,6 +14,7 @@ class TestMultiTurnEnv:
     def test_multiturn_env_initialization(self, mock_multiturn_env):
         """Test MultiTurnEnv initialization."""
         assert mock_multiturn_env.max_turns == 3
+        assert mock_multiturn_env.timeout_seconds is None
         assert mock_multiturn_env.message_type == "chat"  # Default from parent
     def test_multiturn_env_default_max_turns(self, mock_client, sample_chat_dataset):
@@ -26,6 +29,7 @@ class TestMultiTurnEnv:
             rubric=Rubric(),
         )
         assert env.max_turns == -1  # Default value
+        assert env.timeout_seconds is None
     @pytest.mark.asyncio
     async def test_basic_multiturn_rollout(self, mock_multiturn_env, make_input):
@@ -103,6 +107,83 @@ class TestMultiTurnEnv:
         assert completion[1]["role"] == "user"
         assert completion[2]["role"] == "assistant"
+    @pytest.mark.asyncio
+    async def test_timeout_seconds_limits_rollout(
+        self, mock_client, sample_chat_dataset, make_input
+    ):
+        """Test that rollout stops when the wall-clock timeout is reached."""
+        class SlowMultiTurnEnv(MultiTurnEnv):
+            async def env_response(self, messages, state, **kwargs):  # type: ignore[override]
+                return [{"role": "user", "content": "Continue"}]
+            async def add_model_response(self, state, prompt_messages, response):  # type: ignore[override]
+                await super().add_model_response(state, prompt_messages, response)
+                await asyncio.sleep(0.05)
+        env = SlowMultiTurnEnv(
+            client=mock_client,
+            model="test-model",
+            dataset=sample_chat_dataset,
+            parser=Parser(),
+            rubric=Rubric(),
+            timeout_seconds=0.01,
+        )
+        mock_client.set_default_response("Still going")
+        prompt = [{"role": "user", "content": "Start conversation"}]
+        state = await env.rollout(
+            input=make_input(prompt=prompt, answer="target_answer"),
+            client=mock_client,
+            model="test-model",
+        )
+        assert len(state["trajectory"]) == 1
+        assert state["timed_out"] is True
+        assert state["is_completed"] is True
+        assert state["stop_condition"] == "timeout_reached"
+        completion = state["completion"]
+        assert len(completion) == 1
+        assert completion[0]["role"] == "assistant"
+        assert completion[0]["content"] == "Still going"
+    @pytest.mark.asyncio
+    async def test_timeout_seconds_limits_setup(
+        self, mock_client, sample_chat_dataset, make_input
+    ):
+        """Test that the rollout timeout applies while setup is in flight."""
+        class SlowSetupEnv(MultiTurnEnv):
+            async def setup_state(self, state):  # type: ignore[override]
+                await asyncio.sleep(1)
+            async def env_response(self, messages, state, **kwargs):  # type: ignore[override]
+                return [{"role": "user", "content": "Continue"}]
+        env = SlowSetupEnv(
+            client=mock_client,
+            model="test-model",
+            dataset=sample_chat_dataset,
+            parser=Parser(),
+            rubric=Rubric(),
+            timeout_seconds=0.01,
+        )
+        state = await env.rollout(
+            input=make_input(
+                prompt=[{"role": "user", "content": "Start conversation"}],
+                answer="target_answer",
+            ),
+            client=mock_client,
+            model="test-model",
+        )
+        assert state["timed_out"] is True
+        assert state["is_completed"] is True
+        assert state["stop_condition"] == "timeout_reached"
+        assert state["trajectory"] == []
+        assert state["completion"] == []
     @pytest.mark.asyncio
     async def test_override_is_completed_respects_max_turns(
         self, mock_client, sample_chat_dataset, make_input

{verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_opencode_rlm_env.py RENAMED Viewed

@@ -289,13 +289,12 @@ class TestSetupState:
             OpenCodeRLMEnv.__bases__[0],
             "setup_state",
             new_callable=AsyncMock,
-            return_value=state,
         ):
-            result = await env.setup_state(state)
-        assert result["sub_llm_turns"] == 0
-        assert result["sub_llm_prompt_tokens"] == 0
-        assert result["sub_llm_completion_tokens"] == 0
-        assert result["_sub_llm_tasks"] == set()
+            await env.setup_state(state)
+        assert state["sub_llm_turns"] == 0
+        assert state["sub_llm_prompt_tokens"] == 0
+        assert state["sub_llm_completion_tokens"] == 0
+        assert state["_sub_llm_tasks"] == set()
     @pytest.mark.asyncio
     async def test_preserves_existing_sub_metrics(self):
@@ -305,10 +304,9 @@ class TestSetupState:
             OpenCodeRLMEnv.__bases__[0],
             "setup_state",
             new_callable=AsyncMock,
-            return_value=state,
         ):
-            result = await env.setup_state(state)
-        assert result["sub_llm_turns"] == 3
+            await env.setup_state(state)
+        assert state["sub_llm_turns"] == 3
 # =============================================================================

verifiers 0.1.13.dev6__tar.gz → 0.1.13.dev8__tar.gz

verifiers 0.1.13.dev6.tar.gz → 0.1.13.dev8.tar.gz