PyPI - verifiers - Versions diffs - 0.1.15.dev8__tar.gz → 0.1.15.dev10__tar.gz - Mend

verifiers 0.1.15.dev8__tar.gz → 0.1.15.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (326)

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev8
+Version: 0.1.15.dev10
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -48,6 +48,7 @@ Requires-Dist: tenacity>=8.5.0
 Requires-Dist: textual
 Requires-Dist: tomli; python_version < '3.11'
 Requires-Dist: typing-extensions; python_version < '3.12'
+Requires-Dist: uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'
 Provides-Extra: browser
 Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
 Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
@@ -227,7 +228,8 @@ class MyTasksetConfig(vf.TasksetConfig):
     split: str = "train"
-class MyTaskset(vf.Taskset[MyTasksetConfig]):
+class MyTaskset(vf.Taskset):
+    config: MyTasksetConfig
     _default_rewards = (contains_answer,)
     def rows(self) -> list[dict[str, object]]:
@@ -242,12 +244,15 @@ class MyTaskset(vf.Taskset[MyTasksetConfig]):
         return [row for row in rows if row["split"] == self.config.split]
-class MyEnvConfig(vf.EnvConfig):
-    taskset: MyTasksetConfig = MyTasksetConfig()
+def load_taskset(config: MyTasksetConfig) -> MyTaskset:
+    assert isinstance(config, MyTasksetConfig)
+    return MyTaskset(config=config)
-def load_environment(config: MyEnvConfig) -> vf.Env:
-    return vf.Env(taskset=MyTaskset(config=config.taskset))
+def load_environment(config: vf.EnvConfig) -> vf.Env:
+    taskset_config = config.taskset
+    assert isinstance(taskset_config, MyTasksetConfig)
+    return vf.Env(taskset=load_taskset(taskset_config))
 ```
 If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
 **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/README.md RENAMED Viewed

@@ -151,7 +151,8 @@ class MyTasksetConfig(vf.TasksetConfig):
     split: str = "train"
-class MyTaskset(vf.Taskset[MyTasksetConfig]):
+class MyTaskset(vf.Taskset):
+    config: MyTasksetConfig
     _default_rewards = (contains_answer,)
     def rows(self) -> list[dict[str, object]]:
@@ -166,12 +167,15 @@ class MyTaskset(vf.Taskset[MyTasksetConfig]):
         return [row for row in rows if row["split"] == self.config.split]
-class MyEnvConfig(vf.EnvConfig):
-    taskset: MyTasksetConfig = MyTasksetConfig()
+def load_taskset(config: MyTasksetConfig) -> MyTaskset:
+    assert isinstance(config, MyTasksetConfig)
+    return MyTaskset(config=config)
-def load_environment(config: MyEnvConfig) -> vf.Env:
-    return vf.Env(taskset=MyTaskset(config=config.taskset))
+def load_environment(config: vf.EnvConfig) -> vf.Env:
+    taskset_config = config.taskset
+    assert isinstance(taskset_config, MyTasksetConfig)
+    return vf.Env(taskset=load_taskset(taskset_config))
 ```
 If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
 **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/pyproject.toml RENAMED Viewed

@@ -54,6 +54,7 @@ dependencies = [
     "regex<2026.4.4",
     "httpx>=0.27.0",
     "prime-pydantic-config[toml]",
+    "uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'",
 ]
 [dependency-groups]

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_client_multimodal_types.py RENAMED Viewed

@@ -98,6 +98,31 @@ async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     ]
+@pytest.mark.asyncio
+async def test_anthropic_to_native_prompt_marks_unsupported_images_in_mixed_content():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+    client = AnthropicMessagesClient(object())
+    messages = [
+        UserMessage(
+            content=[
+                TextContentPart(text="describe this"),
+                ImageUrlContentPart(
+                    image_url=ImageUrlSource(url="https://example.com/image.png")
+                ),
+            ]
+        )
+    ]
+    prompt, kwargs = await client.to_native_prompt(messages)
+    assert kwargs["system"] == ""
+    assert prompt[0]["content"] == [
+        {"type": "text", "text": "describe this"},
+        {"type": "text", "text": "[image]"},
+    ]
 @pytest.mark.asyncio
 async def test_anthropic_assistant_tool_calls_use_text_chunks_not_model_repr():
     pytest.importorskip("anthropic")

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_envs.py RENAMED Viewed

@@ -216,10 +216,14 @@ def help_test_can_load_env(tmp_venv_dir: Path, env_dir: Path):
 def help_test_can_eval_env(tmp_venv_dir: Path, env_dir: Path):
     """Test that the environment can be run via vf-eval."""
-    if os.getenv("OPENAI_API_KEY"):
-        model_flags = "-m gpt-4.1-mini -b https://api.openai.com/v1 -k OPENAI_API_KEY"
-    elif os.getenv("PRIME_API_KEY"):
+    if env_dir.name == "tau2_bench_v1" and not os.getenv("PRIME_API_KEY"):
+        pytest.skip(
+            "Skipping tau2 default eval because PRIME_API_KEY is not configured"
+        )
+    if os.getenv("PRIME_API_KEY"):
         model_flags = "-m openai/gpt-4.1-mini -b https://api.pinference.ai/api/v1 -k PRIME_API_KEY"
+    elif os.getenv("OPENAI_API_KEY"):
+        model_flags = "-m gpt-4.1-mini -b https://api.openai.com/v1 -k OPENAI_API_KEY"
     else:
         pytest.skip("Skipping vf-eval smoke test because no API key is configured")

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_eval_cli.py RENAMED Viewed

@@ -288,6 +288,25 @@ def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
     }
+def test_cli_defaults_session_header_to_trajectory_id(monkeypatch, run_cli):
+    captured = run_cli(monkeypatch, {})
+    assert captured["configs"][0].client_config.extra_headers_from_state == {
+        "X-Session-ID": "trajectory_id"
+    }
+def test_cli_header_from_state_overrides_default_session_header(monkeypatch, run_cli):
+    captured = run_cli(
+        monkeypatch,
+        {"header_from_state": ["X-Session-ID: example_id"]},
+    )
+    assert captured["configs"][0].client_config.extra_headers_from_state == {
+        "X-Session-ID": "example_id"
+    }
 def test_cli_registry_headers_merged_with_eval_toml(tmp_path, monkeypatch, run_cli):
     cfg = tmp_path / "eval.toml"
     cfg.write_text(

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_harbor_env_mcp.py RENAMED Viewed

@@ -239,83 +239,20 @@ class TestLaunchCommandResolution:
         )
-class TestStartStopCommands:
-    def test_start_cmd_tracks_process_group_leader_pid(self):
-        """Start command must capture `$!` (the backgrounded pgroup leader),
-        not `$$` (the outer shell), and must end with `wait` so the recorded
-        exit code reflects the launched daemon's.
-        """
-        cmd = _DummyEnv()._mcp_start_cmd("svc", "python -u /opt/x/server.py")
-        assert "echo $!" in cmd
-        assert "echo $$" not in cmd
-        assert cmd.rstrip().endswith("wait")
-        assert "/tmp/harbor-mcp-svc.pid" in cmd
-        assert "python -u /opt/x/server.py" in cmd
-    def test_start_cmd_wraps_in_setsid_for_process_group_semantics(self):
-        """Wrapping the user's command in `setsid sh -c ...` is what makes
-        `$!` a process-group leader, so `kill -9 -$PID` can reap the whole
-        daemon tree on stop. Compound commands (e.g. `cd /x && python y.py`)
-        must be preserved verbatim inside the sh -c payload so their own
-        semantics are unchanged."""
-        cmd = _DummyEnv()._mcp_start_cmd("svc", "cd /opt && python server.py")
-        assert "setsid sh -c " in cmd
-        assert "'cd /opt && python server.py'" in cmd
-    def test_stop_cmd_is_one_line_sigkill_plus_rm(self):
-        """Default: one SIGKILL to the process group, then unlink the
-        pidfile — no poll/sleep loop."""
-        cmd = _DummyEnv()._mcp_stop_cmd("svc")
-        assert "kill -9" in cmd
-        assert "rm -f" in cmd
-        assert "/tmp/harbor-mcp-svc.pid" in cmd
-        assert "kill -0" not in cmd
-        assert "sleep" not in cmd
-        assert "\n" not in cmd
-        assert len(cmd) < 120
-    def test_stop_cmd_targets_process_group_not_single_pid(self):
-        """The `-` prefix on the `$(cat …)` expansion is what turns kill(1)
-        into a process-group kill — without it, SIGKILL only lands on the
-        wrapping shell and e.g. a `python` child spawned via `cd && python`
-        leaks as an orphan."""
-        cmd = _DummyEnv()._mcp_stop_cmd("svc")
-        assert 'kill -9 -"$(cat' in cmd
-    def test_server_name_with_shell_metachars_is_quoted(self):
-        """Server name is task-author-controlled; every pidfile reference
-        must appear only inside single-quoted spans."""
-        env = _DummyEnv()
-        unquoted = "/tmp/harbor-mcp-evil$(whoami).pid"
-        quoted = f"'{unquoted}'"
-        for cmd in (
-            env._mcp_start_cmd("evil$(whoami)", "x"),
-            env._mcp_stop_cmd("evil$(whoami)"),
-        ):
-            assert quoted in cmd
-            # Every raw occurrence must be inside an already-quoted span.
-            assert cmd.count(unquoted) == cmd.count(quoted)
-    def test_launch_command_with_shell_metachars_is_quoted(self):
-        """Same for the user's launch command: it's task-author-controlled,
-        must land inside a single-quoted span once wrapped in `sh -c`."""
-        env = _DummyEnv()
-        evil_cmd = "python -c 'print(1)' && touch /pwned"
-        quoted = f"'{evil_cmd}'".replace("'", "'\"'\"'")
-        # shlex-quoted output contains the evil string only inside quotes.
-        cmd = env._mcp_start_cmd("svc", evil_cmd)
-        assert "setsid sh -c " in cmd
-        # No unquoted `&& touch /pwned` outside a single-quoted span.
-        assert cmd.count(evil_cmd) == 0 or quoted in cmd
 class TestLifecycle:
     @pytest.mark.asyncio
     async def test_starts_server_with_registered_launch_command(self):
-        env = _DummyEnv(mcp_launch_commands={"svc": "python server.py"})
+        env = _DummyEnv(mcp_launch_commands={"svc": "cd /opt && python server.py"})
         state: dict[str, Any] = {}
         await env.start_mcp_servers("sbx", _config_with_server(), state)
         assert set(state["harbor_mcp_jobs"].keys()) == {"svc"}
+        _, start_cmd = env.started_jobs[0]
+        assert "echo $!" in start_cmd
+        assert "echo $$" not in start_cmd
+        assert start_cmd.rstrip().endswith("wait")
+        assert "/tmp/harbor-mcp-svc.pid" in start_cmd
+        assert "setsid sh -c " in start_cmd
+        assert "'cd /opt && python server.py'" in start_cmd
     @pytest.mark.asyncio
     async def test_externally_managed_server_is_skipped(self):
@@ -342,9 +279,38 @@ class TestLifecycle:
             if "kill -9" in c.args[1]
         ]
         assert len(stop_calls) == 1
-        assert "harbor-mcp-svc.pid" in stop_calls[0]
+        stop_cmd = stop_calls[0]
+        assert "harbor-mcp-svc.pid" in stop_cmd
+        assert 'kill -9 -"$(cat' in stop_cmd
+        assert "rm -f" in stop_cmd
+        assert "kill -0" not in stop_cmd
+        assert "sleep" not in stop_cmd
+        assert "\n" not in stop_cmd
+        assert len(stop_cmd) < 120
         assert state["harbor_mcp_jobs"] == {}
+    @pytest.mark.asyncio
+    async def test_launch_and_stop_commands_quote_task_authored_shell_text(self):
+        env = _DummyEnv(
+            mcp_launch_commands={
+                "evil$(whoami)": "python -c 'print(1)' && touch /pwned"
+            }
+        )
+        state: dict[str, Any] = {"sandbox_id": "sbx"}
+        await env.start_mcp_servers(
+            "sbx", _config_with_server(name="evil$(whoami)"), state
+        )
+        _, start_cmd = env.started_jobs[0]
+        quoted_pidfile = "'/tmp/harbor-mcp-evil$(whoami).pid'"
+        assert quoted_pidfile in start_cmd
+        assert "setsid sh -c " in start_cmd
+        assert "'\"'\"'print(1)'\"'\"'" in start_cmd
+        env.sandbox_client.execute_command.reset_mock()
+        await env.stop_mcp_servers(state)
+        stop_cmd = env.sandbox_client.execute_command.call_args.args[1]
+        assert quoted_pidfile in stop_cmd
     @pytest.mark.asyncio
     async def test_stop_without_sandbox_id_is_a_noop(self):
         env = _DummyEnv()
@@ -530,22 +496,6 @@ class TestBackgroundJob:
 class TestHealthCheck:
     """Readiness probing — default `/proc/net/tcp` + user override."""
-    def test_default_probe_shape(self):
-        """Portable awk on /proc/net/tcp{,6}, matching LISTEN state only,
-        with no bash-ism dependency like /dev/tcp."""
-        cmd = HarborMCPMixin._default_mcp_health_cmd(8000)
-        assert "bash" not in cmd and "/dev/tcp" not in cmd
-        assert "/proc/net/tcp" in cmd and "/proc/net/tcp6" in cmd
-        assert '$4 == "0A"' in cmd  # LISTEN state
-    @pytest.mark.parametrize(
-        "port,hex_expected",
-        [(80, "0050"), (8000, "1F40"), (65535, "FFFF"), (1, "0001")],
-    )
-    def test_default_probe_encodes_port_as_uppercase_hex(self, port, hex_expected):
-        cmd = HarborMCPMixin._default_mcp_health_cmd(port)
-        assert f":{hex_expected}$" in cmd
     @pytest.mark.asyncio
     async def test_custom_healthcheck_command_templated_with_port(self):
         env = _DummyEnv(mcp_launch_commands={"svc": "python x"})
@@ -580,7 +530,11 @@ class TestHealthCheck:
             if "/proc/net/tcp" in c.args[1]
         ]
         assert len(health_calls) == 1
-        assert ":1F40$" in health_calls[0]
+        health_cmd = health_calls[0]
+        assert "bash" not in health_cmd and "/dev/tcp" not in health_cmd
+        assert "/proc/net/tcp6" in health_cmd
+        assert '$4 == "0A"' in health_cmd
+        assert ":1F40$" in health_cmd
     @pytest.mark.asyncio
     async def test_probe_timeout_is_respected(self):

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_imports.py RENAMED Viewed

@@ -1,6 +1,26 @@
+import importlib
+import sys
 import verifiers
+def test_v1_taskset_imports_do_not_import_textarena():
+    textarena_module = "verifiers.v1.packages.tasksets.textarena"
+    sys.modules.pop(textarena_module, None)
+    tasksets = importlib.import_module("verifiers.v1.packages.tasksets")
+    tasksets.__dict__.pop("TextArenaTaskset", None)
+    tasksets.__dict__.pop("TextArenaTasksetConfig", None)
+    importlib.reload(tasksets)
+    assert textarena_module not in sys.modules
+    v1 = importlib.import_module("verifiers.v1")
+    v1.__dict__.pop("TextArenaTaskset", None)
+    v1.__dict__.pop("TextArenaTasksetConfig", None)
+    importlib.reload(v1)
+    assert textarena_module not in sys.modules
 class TestImports:
     """Test that all public API imports work correctly.
     This was inspired by issue #349.

verifiers-0.1.15.dev10/tests/test_init_script.py ADDED Viewed

@@ -0,0 +1,80 @@
+from pathlib import Path
+import verifiers as vf
+from verifiers.scripts.init import init_environment
+def read_env_file(root: Path, env_id: str) -> str:
+    module_name = env_id.replace("-", "_")
+    return (root / module_name / f"{module_name}.py").read_text()
+def test_init_default_writes_v0_stub(tmp_path: Path) -> None:
+    root = init_environment("foo", path=str(tmp_path))
+    content = read_env_file(tmp_path, "foo")
+    assert root == tmp_path / "foo"
+    assert "def load_environment(**kwargs) -> vf.Environment:" in content
+    assert "NotImplementedError" in content
+    assert "load_taskset" not in content
+    assert "EnvTaskset" not in content
+def test_init_v1_writes_thin_taskset_template(tmp_path: Path) -> None:
+    init_environment("bar", path=str(tmp_path), v1=True)
+    content = read_env_file(tmp_path, "bar")
+    assert 'ENV_ID = "bar"' in content
+    assert "def load_tasks():" in content
+    assert "class EnvTasksetConfig(vf.TasksetConfig):" in content
+    assert 'source: str = "bar:load_tasks"' in content
+    assert 'rewards: list[str] = ["bar:exact_answer"]' in content
+    assert "def load_taskset(config: EnvTasksetConfig) -> vf.Taskset:" in content
+    assert "vf.load_taskset(ENV_ID, config=config.taskset)" in content
+    assert "class EnvTaskset(" not in content
+    assert "_default_" not in content
+    assert "assert isinstance" not in content
+def test_init_v1_template_loads_with_vf_load_environment(
+    tmp_path: Path, monkeypatch
+) -> None:
+    init_environment("loadable-v1", path=str(tmp_path), v1=True)
+    monkeypatch.syspath_prepend(str(tmp_path / "loadable_v1"))
+    env = vf.load_environment("loadable-v1")
+    assert isinstance(env, vf.Env)
+    assert env.taskset.rows()[0]["answer"] == "cba"
+    assert env.taskset.rewards[0].__name__ == "exact_answer"
+def test_init_v1_with_harness_writes_harness_stub(tmp_path: Path) -> None:
+    init_environment("baz", path=str(tmp_path), v1=True, with_harness=True)
+    content = read_env_file(tmp_path, "baz")
+    assert "class EnvHarnessConfig(vf.HarnessConfig):" in content
+    assert "class EnvHarness(vf.Harness):" in content
+    assert "def load_harness(config: EnvHarnessConfig) -> EnvHarness:" in content
+    assert "vf.load_harness(ENV_ID, config=config.harness)" in content
+def test_init_with_harness_without_v1_warns_and_uses_v0(tmp_path: Path, capsys) -> None:
+    init_environment("plain", path=str(tmp_path), with_harness=True)
+    content = read_env_file(tmp_path, "plain")
+    captured = capsys.readouterr()
+    assert "--with-harness only applies with --v1; ignoring." in captured.out
+    assert "def load_environment(**kwargs) -> vf.Environment:" in content
+    assert "load_harness" not in content
+def test_init_v1_multifile_exports_component_loaders(tmp_path: Path) -> None:
+    init_environment("pkg-env", path=str(tmp_path), v1=True, multi_file=True)
+    package_dir = tmp_path / "pkg_env" / "pkg_env"
+    init_content = (package_dir / "__init__.py").read_text()
+    env_content = (package_dir / "pkg_env.py").read_text()
+    assert "from .pkg_env import load_environment, load_taskset" in init_content
+    assert "__all__ = ['load_environment', 'load_taskset']" in init_content
+    assert 'source: str = "pkg_env.pkg_env:load_tasks"' in env_content

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_lean_task.py RENAMED Viewed

@@ -9,10 +9,8 @@ from verifiers.envs.experimental.composable.tasksets.lean.lean_task import (
     LEAN_GUARD_END_MARKER,
     LeanRubric,
     _build_starter_file,
-    _expected_protected_region,
     _extract_protected_region,
     _normalize_signature,
-    _wrap_with_lean_guard,
 )
@@ -80,11 +78,13 @@ class TestNormalizeSignature:
         )
-class TestWrapWithLeanGuard:
+class TestBuildStarterFileLeanGuardLayout:
     def test_marker_layout(self) -> None:
         signature = "theorem foo (x : ℝ) : x = x := by"
-        wrapped = _wrap_with_lean_guard(signature)
-        assert wrapped == (
+        starter = _build_starter_file(
+            {"formal_statement": signature, "header": "", "imports": ""}
+        )
+        assert starter == (
             "-- lean-guard: begin protected\n"
             "theorem foo (x : ℝ) : x = x := by\n"
             "-- lean-guard: end protected\n"
@@ -93,8 +93,10 @@ class TestWrapWithLeanGuard:
     def test_round_trip_via_extract(self) -> None:
         signature = "theorem foo : True := by"
-        wrapped = _wrap_with_lean_guard(signature)
-        region = _extract_protected_region(wrapped)
+        starter = _build_starter_file(
+            {"formal_statement": signature, "header": "", "imports": ""}
+        )
+        region = _extract_protected_region(starter)
         assert region is not None
         assert LEAN_GUARD_BEGIN_MARKER in region
         assert LEAN_GUARD_END_MARKER in region
@@ -212,7 +214,7 @@ class TestBuildStarterFile:
             "header": "import Mathlib",
         }
         starter = _build_starter_file(info)
-        expected = _expected_protected_region(info)
+        expected = _extract_protected_region(_build_starter_file(info)) or ""
         actual = _extract_protected_region(starter)
         assert expected == actual
         assert expected != ""

{verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_opencode_rlm_env.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Tests for the OpenCodeRLMEnv class."""
+import asyncio
 import json
 import subprocess
 from unittest.mock import AsyncMock, MagicMock, patch
@@ -7,6 +8,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from datasets import Dataset
+import verifiers as vf
 from verifiers.envs.experimental.opencode_rlm_env import (
     OpenCodeRLMEnv,
     OpenCodeRLMMonitorRubric,
@@ -239,45 +241,6 @@ class TestBuildEnvVars:
         assert "RLM_SUB_MODEL_ID" not in env_vars
-# =============================================================================
-# Sub-LLM detection (header-based)
-# =============================================================================
-class TestIsSubLLMRequest:
-    def test_detects_sub_header(self):
-        assert (
-            OpenCodeRLMEnv._is_sub_llm_request({"headers": {"x-rlm-role": "sub"}})
-            is True
-        )
-    def test_rejects_no_headers(self):
-        assert OpenCodeRLMEnv._is_sub_llm_request({}) is False
-    def test_rejects_empty_headers(self):
-        assert OpenCodeRLMEnv._is_sub_llm_request({"headers": {}}) is False
-    def test_rejects_wrong_value(self):
-        assert (
-            OpenCodeRLMEnv._is_sub_llm_request({"headers": {"x-rlm-role": "main"}})
-            is False
-        )
-    def test_ignores_model_field(self):
-        """Model name should NOT be used for detection."""
-        assert (
-            OpenCodeRLMEnv._is_sub_llm_request({"model": "sub", "headers": {}}) is False
-        )
-    def test_header_takes_precedence(self):
-        assert (
-            OpenCodeRLMEnv._is_sub_llm_request(
-                {"model": "openai/gpt-5-mini", "headers": {"x-rlm-role": "sub"}}
-            )
-            is True
-        )
 # =============================================================================
 # State setup
 # =============================================================================
@@ -330,17 +293,45 @@ class TestMetrics:
         response = MagicMock(spec=[])  # no usage attr
         assert OpenCodeRLMEnv._extract_token_counts(response) == (0, 0)
-    def test_update_sub_metrics(self):
+    @pytest.mark.asyncio
+    async def test_handle_sub_llm_request_updates_sub_metrics(self):
         env = build_env()
         state = {
+            "trajectory": [],
+            "model": "main-model",
             "sub_llm_turns": 0,
             "sub_llm_prompt_tokens": 0,
             "sub_llm_completion_tokens": 0,
         }
-        response = MagicMock()
-        response.usage.prompt_tokens = 50
-        response.usage.completion_tokens = 20
-        env._update_sub_metrics(state, response)
+        response = vf.Response(
+            id="resp",
+            created=0,
+            model="sub-model",
+            message=vf.ResponseMessage(
+                content="ok", finish_reason="stop", is_truncated=False
+            ),
+            usage=vf.Usage(
+                prompt_tokens=50,
+                completion_tokens=20,
+                reasoning_tokens=0,
+                total_tokens=70,
+            ),
+        )
+        future = asyncio.get_running_loop().create_future()
+        intercept = {
+            "messages": [{"role": "user", "content": "hello"}],
+            "headers": {"x-rlm-role": "sub"},
+            "response_future": future,
+        }
+        env._require_interception_server().intercepts["req"] = intercept
+        with patch.object(
+            vf.Environment,
+            "get_model_response",
+            new=AsyncMock(return_value=response),
+        ):
+            await env._handle_sub_llm_request(state, "req", intercept)
+        assert future.result() is response
         assert state["sub_llm_turns"] == 1
         assert state["sub_llm_prompt_tokens"] == 50
         assert state["sub_llm_completion_tokens"] == 20

verifiers 0.1.15.dev8__tar.gz → 0.1.15.dev10__tar.gz

verifiers 0.1.15.dev8__tar.gz → 0.1.15.dev10__tar.gz