PyPI - verifiers - Versions diffs - 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz - Mend

verifiers 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (303) hide show

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/.gitignore RENAMED Viewed

@@ -4,7 +4,6 @@ venv/
 env/
 .env
 .env.local
-uv.lock
 .claude/
 .cursorrules
 .ropeproject/
@@ -22,6 +21,7 @@ _build/
 docs/build/
 *.egg-info/
 __pycache__/
+environments/**/uv.lock
 .pytest_cache/
 .ruff_cache/

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.13.dev8
+Version: 0.1.15.dev0
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
 Requires-Dist: anthropic>=0.78.0
 Requires-Dist: datasets<4.7.0,>=3.0.0
 Requires-Dist: gepa
+Requires-Dist: httpx>=0.27.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: math-verify>=0.8.0
 Requires-Dist: mcp>=1.14.1
@@ -34,7 +35,7 @@ Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: numpy
 Requires-Dist: openai-agents>=0.0.7
 Requires-Dist: openai>=1.108.1
-Requires-Dist: prime-sandboxes>=0.2.21
+Requires-Dist: prime-sandboxes>=0.2.25
 Requires-Dist: prime-tunnel>=0.1.6
 Requires-Dist: pydantic>=2.11.9
 Requires-Dist: pyzmq>=27.1.0
@@ -46,13 +47,14 @@ Requires-Dist: tenacity>=8.5.0
 Requires-Dist: textual
 Requires-Dist: tomli; python_version < '3.11'
 Requires-Dist: typing-extensions; python_version < '3.12'
-Requires-Dist: wget>=3.2
 Provides-Extra: browser
 Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
 Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
 Requires-Dist: stagehand>=3.0.0; extra == 'browser'
 Provides-Extra: openenv
 Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
+Provides-Extra: renderers
+Requires-Dist: renderers>=0.1.6; extra == 'renderers'
 Provides-Extra: rg
 Requires-Dist: reasoning-gym; extra == 'rg'
 Provides-Extra: rl
@@ -107,7 +109,9 @@ Verifiers: Environments for LLM Reinforcement Learning
 ## News & Updates
-- [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
+- [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
+- [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
+- [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
 - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
 - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
 - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -197,11 +201,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
     async def correct_answer(completion, answer) -> float:
         completion_ans = completion[-1]['content']
         return 1.0 if completion_ans == answer else 0.0
-    rubric = Rubric(funcs=[correct_answer])
+    rubric = vf.Rubric(funcs=[correct_answer])
     env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
     return env
 ```
+For new environments with reusable tasksets, toolsets, custom programs, or
+custom harnesses, use the v1 Taskset/Harness path:
+```python
+# my_env.py
+import verifiers.v1 as vf
+def source():
+    yield {
+        "prompt": [{"role": "user", "content": "Reverse abc."}],
+        "answer": "cba",
+        "max_turns": 1,
+    }
+@vf.reward(weight=1.0)
+async def contains_answer(task, state) -> float:
+    return float(task["answer"] in str(state.get("completion") or ""))
+def load_taskset(config: vf.TasksetConfig | None = None):
+    return vf.Taskset(source=source, rewards=[contains_answer], config=config)
+def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
+    config = config or vf.EnvConfig()
+    return vf.Env(taskset=load_taskset(config=config.taskset))
+```
+If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
+**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
+Reusable taskset and harness packages live under `verifiers.v1.packages` while
+the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
+For example, Harbor task directories can run through the bundled OpenCode CLI
+harness with:
+```python
+env = vf.Env(
+    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    harness=vf.OpenCode(),
+)
+```
+The same environment package is the unit used by evals and `prime-rl`. The
+trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
+and harness options stay under `env.taskset` and `env.harness`:
+```toml
+# configs/rl/my-v1-env.toml
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+max_steps = 100
+batch_size = 256
+rollouts_per_example = 8
+[sampling]
+max_tokens = 4096
+[[env]]
+id = "my-env"
+[env.args]
+arg1 = "non-th-arg"
+[env.harness]
+max_turns = 1
+[env.taskset.scoring.contains_answer]
+weight = 1.0
+```
+```bash
+prime env install my-env
+```
+For self-managed training launch commands, use the `prime-rl` documentation.
 To install the environment module into your project, do:
 ```bash
 prime env install my-env # installs from ./environments/my_env
@@ -237,6 +312,8 @@ prime eval run primeintellect/math-python
 **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
+**[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
 **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
 **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/README.md RENAMED Viewed

@@ -34,7 +34,9 @@ Verifiers: Environments for LLM Reinforcement Learning
 ## News & Updates
-- [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
+- [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
+- [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
+- [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
 - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
 - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
 - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -124,11 +126,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
     async def correct_answer(completion, answer) -> float:
         completion_ans = completion[-1]['content']
         return 1.0 if completion_ans == answer else 0.0
-    rubric = Rubric(funcs=[correct_answer])
+    rubric = vf.Rubric(funcs=[correct_answer])
     env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
     return env
 ```
+For new environments with reusable tasksets, toolsets, custom programs, or
+custom harnesses, use the v1 Taskset/Harness path:
+```python
+# my_env.py
+import verifiers.v1 as vf
+def source():
+    yield {
+        "prompt": [{"role": "user", "content": "Reverse abc."}],
+        "answer": "cba",
+        "max_turns": 1,
+    }
+@vf.reward(weight=1.0)
+async def contains_answer(task, state) -> float:
+    return float(task["answer"] in str(state.get("completion") or ""))
+def load_taskset(config: vf.TasksetConfig | None = None):
+    return vf.Taskset(source=source, rewards=[contains_answer], config=config)
+def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
+    config = config or vf.EnvConfig()
+    return vf.Env(taskset=load_taskset(config=config.taskset))
+```
+If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
+**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
+Reusable taskset and harness packages live under `verifiers.v1.packages` while
+the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
+For example, Harbor task directories can run through the bundled OpenCode CLI
+harness with:
+```python
+env = vf.Env(
+    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    harness=vf.OpenCode(),
+)
+```
+The same environment package is the unit used by evals and `prime-rl`. The
+trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
+and harness options stay under `env.taskset` and `env.harness`:
+```toml
+# configs/rl/my-v1-env.toml
+model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+max_steps = 100
+batch_size = 256
+rollouts_per_example = 8
+[sampling]
+max_tokens = 4096
+[[env]]
+id = "my-env"
+[env.args]
+arg1 = "non-th-arg"
+[env.harness]
+max_turns = 1
+[env.taskset.scoring.contains_answer]
+weight = 1.0
+```
+```bash
+prime env install my-env
+```
+For self-managed training launch commands, use the `prime-rl` documentation.
 To install the environment module into your project, do:
 ```bash
 prime env install my-env # installs from ./environments/my_env
@@ -164,6 +237,8 @@ prime eval run primeintellect/math-python
 **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
+**[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
 **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
 **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/pyproject.toml RENAMED Viewed

@@ -38,7 +38,7 @@ dependencies = [
     "openai>=1.108.1",
     "openai-agents>=0.0.7",
     "prime-tunnel>=0.1.6",
-    "prime-sandboxes>=0.2.21",
+    "prime-sandboxes>=0.2.25",
     "pydantic>=2.11.9",
     "requests",
     "rich",
@@ -46,13 +46,13 @@ dependencies = [
     "textual",
     "tomli; python_version < '3.11'",
     "typing_extensions; python_version < '3.12'",
-    "wget>=3.2",
     "gepa",
     "pyzmq>=27.1.0",
     "msgpack>=1.1.2",
     "aiolimiter>=1.2.1",
     "setproctitle>=1.3.0",
-    "regex<2026.4.4",  # 2026.4.4 missing cp312/cp313 wheels
+    "regex<2026.4.4",
+    "httpx>=0.27.0",
 ]
 [dependency-groups]
@@ -73,6 +73,7 @@ dev = [
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
     "nltk",
+    "renderers>=0.1.6",
 ]
 [project.optional-dependencies]
@@ -91,6 +92,9 @@ browser = [
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
 ]
+renderers = [
+    "renderers>=0.1.6",
+]
 rl = [
     "torch>=2.8.0,<2.9.0",
     "transformers>=4.56.2",
@@ -108,6 +112,24 @@ rl = [
 preview = true
 required-version = ">=0.11.1"
+[[tool.uv.index]]
+name = "pypi"
+url = "https://pypi.org/simple"
+default = true
+exclude-newer = "7 days"
+[tool.uv.exclude-newer-package]
+# PrimeIntellect-published on PyPI (trusted publisher)
+prime-tunnel = false
+prime-sandboxes = false
+renderers = false
+[tool.uv.sources]
+# Pinned to renderers main until the next PyPI release lands; drop after.
+# fe67f9f = renderers main: PR #4 squash-merge — construction-time
+# preserve_*_thinking flags on create_renderer / create_renderer_pool.
+renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
 [tool.uv.extra-build-dependencies]
 flash-attn = [{ requirement = "torch", match-runtime = true }]
@@ -125,7 +147,6 @@ vf-rl = "verifiers.scripts.rl:main"
 vf-train = "verifiers.scripts.train:main"
 vf-tui = "verifiers.scripts.tui:main"
 vf-vllm = "verifiers.scripts.vllm:main"
-prime-rl = "verifiers.scripts.prime_rl:main"
 # hatchling configuration
 [tool.hatch.version]
@@ -170,6 +191,7 @@ addopts = [
 markers = [
     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
     "integration: marks tests as integration tests",
+    "prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
     "unit: marks tests as unit tests",
     "asyncio: marks tests as async tests",
     "parsers: marks tests for parser components",
@@ -195,7 +217,7 @@ unknown-argument = "warn"
 redundant-cast = "ignore"
 [tool.ty.src]
-exclude = ["environments"]
+exclude = ["environments", "verifiers/v1/sketch.py"]
 [[tool.ty.overrides]]
 include = ["verifiers/envs/experimental/composable/tasksets/**"]

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/conftest.py RENAMED Viewed

@@ -425,9 +425,10 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
         super().__init__(tools=[offset_tool], **kwargs)
     async def setup_state(self, state, **kwargs):
-        await super().setup_state(state, **kwargs)
+        state = await super().setup_state(state, **kwargs)
         state["offset"] = 3
         state["update_calls"] = 0
+        return state
     def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
         state["update_calls"] += 1
@@ -457,13 +458,15 @@ def make_input() -> Callable[..., RolloutInput]:
     def _make_input(
         example_id: int = 0,
-        task: str = "default",
         prompt: Messages = DEFAULT_PROMPT,
         info: Info = {},
         answer: str = "4",
     ) -> RolloutInput:
         return RolloutInput(
-            example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
+            example_id=example_id,
+            prompt=prompt,
+            answer=answer,
+            info=info,
         )
     return _make_input
@@ -475,7 +478,6 @@ def make_state() -> Callable[..., State]:
     def _make_state(
         example_id: int = 0,
-        task: str = "default",
         prompt: Messages = DEFAULT_PROMPT,
         answer: str = "4",
         info: Info = {},
@@ -487,17 +489,12 @@ def make_state() -> Callable[..., State]:
         stop_condition: str | None = "max_turns_reached",
         tool_defs: list[Tool] | None = None,
         trajectory: list[TrajectoryStep] = [],
-        timing=RolloutTiming(
-            generation_ms=0.0,
-            scoring_ms=0.0,
-            total_ms=0.0,
-        ),
+        timing=RolloutTiming(),
         foo: str = "bar",  # custom field
         **kwargs,
     ) -> State:
         return State(
             example_id=example_id,
-            task=task,
             prompt=prompt,
             answer=answer,
             info=info,
@@ -550,7 +547,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
         rollouts_per_example: int = 1,
         sampling_args: SamplingArgs = {},
         date: str = "1970-01-01",
-        time_ms: float = 0.0,
+        time: float = 0.0,
         avg_reward: float = 0.0,
         avg_metrics: dict[str, float] = {},
         pass_at_k: dict[str, float] = {},
@@ -578,7 +575,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
             rollouts_per_example=rollouts_per_example,
             sampling_args=sampling_args,
             date=date,
-            time_ms=time_ms,
+            time=time,
             avg_reward=avg_reward,
             avg_metrics=avg_metrics,
             pass_at_k=pass_at_k,

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_cli_agent_env.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Tests for CliAgentEnv and HarborEnv."""
+import asyncio
 import tempfile
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock, patch
@@ -8,6 +9,7 @@ import pytest
 from datasets import Dataset
 import verifiers as vf
+from verifiers.utils.interception_utils import serialize_intercept_response
 @pytest.fixture
@@ -100,6 +102,11 @@ class TestCliAgentEnv:
         env_vars = await env.build_env_vars(state)
         assert env_vars["OPENAI_BASE_URL"] == "https://test.trycloudflare.com/v1"
+        assert env_vars["OPENAI_API_KEY"] == env._require_interception_server().secret
+        assert env_vars["ANTHROPIC_BASE_URL"] == "https://test.trycloudflare.com"
+        assert (
+            env_vars["ANTHROPIC_API_KEY"] == env._require_interception_server().secret
+        )
         assert env_vars["OPENAI_MODEL"] == "gpt-4"
         assert env_vars["CUSTOM_VAR"] == "value"
@@ -217,6 +224,152 @@ class TestCliAgentEnv:
         assert kwargs["tools"][0].name == "echo"
+@pytest.mark.asyncio
+async def test_cli_agent_env_delivers_intercepted_tool_call_response(
+    sample_dataset, mock_client
+):
+    env = vf.CliAgentEnv(
+        run_command="python agent.py",
+        dataset=sample_dataset,
+        rubric=vf.Rubric(),
+    )
+    prompt = sample_dataset[0]["prompt"]
+    tool_call = {
+        "id": "call_echo",
+        "type": "function",
+        "function": {"name": "echo", "arguments": '{"text": "hello"}'},
+    }
+    mock_client.add_response(
+        prompt,
+        "",
+        finish_reason="tool_calls",
+        tool_calls=[tool_call],
+    )
+    state = await env.init_state(
+        input=sample_dataset[0],
+        client=mock_client,
+        model="test-model",
+    )
+    response_future = asyncio.Future()
+    request_id = "req-tool-call"
+    state["current_request_id"] = request_id
+    env._interception_server.intercepts[request_id] = {
+        "stream": False,
+        "tools": [
+            {
+                "type": "function",
+                "function": {
+                    "name": "echo",
+                    "description": "Return the provided text.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"text": {"type": "string"}},
+                    },
+                },
+            }
+        ],
+        "response_future": response_future,
+    }
+    response = await env.get_model_response(
+        state=state,
+        prompt=prompt,
+        client=mock_client,
+        model="test-model",
+    )
+    assert response_future.done()
+    assert response_future.result() is response
+    assert state["current_request_id"] is None
+    payload = serialize_intercept_response(response_future.result())
+    choice = payload["choices"][0]
+    assert choice["finish_reason"] == "tool_calls"
+    assert choice["message"]["tool_calls"] == [tool_call]
+    assert mock_client.last_call_kwargs["tools"][0].name == "echo"
+@pytest.mark.asyncio
+async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
+    sample_dataset, mock_client
+):
+    env = vf.CliAgentEnv(
+        run_command="python agent.py",
+        dataset=sample_dataset,
+        rubric=vf.Rubric(),
+    )
+    prompt = sample_dataset[0]["prompt"]
+    tool_call = {
+        "id": "call_echo",
+        "type": "function",
+        "function": {"name": "echo", "arguments": '{"text": "hello"}'},
+    }
+    mock_client.add_response(
+        prompt,
+        "",
+        finish_reason="tool_calls",
+        tool_calls=[tool_call],
+    )
+    state = await env.init_state(
+        input=sample_dataset[0],
+        client=mock_client,
+        model="test-model",
+    )
+    chunk_queue = asyncio.Queue()
+    response_future = asyncio.Future()
+    request_id = "req-stream-tool-call"
+    state["current_request_id"] = request_id
+    env._interception_server.intercepts[request_id] = {
+        "stream": True,
+        "tools": [
+            {
+                "type": "function",
+                "function": {
+                    "name": "echo",
+                    "description": "Return the provided text.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"text": {"type": "string"}},
+                    },
+                },
+            }
+        ],
+        "chunk_queue": chunk_queue,
+        "response_future": response_future,
+    }
+    response = await env.get_model_response(
+        state=state,
+        prompt=prompt,
+        client=mock_client,
+        model="test-model",
+    )
+    chunks = []
+    while True:
+        chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
+        if chunk is None:
+            break
+        chunks.append(chunk)
+    assert response_future.done()
+    assert response_future.result() is response
+    assert state["current_request_id"] is None
+    assert chunks[0]["object"] == "chat.completion.chunk"
+    assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
+    assert (
+        chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
+    )
+    assert (
+        chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
+        == '{"text": "hello"}'
+    )
+    assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
 class TestHarborEnv:
     """Tests for HarborEnv."""
@@ -244,7 +397,7 @@ class TestHarborEnv:
             dataset_path=harbor_task_dir,
         )
         assert len(env.dataset) == 1
-        assert env.dataset[0]["task"] == "test_task"
+        assert env.dataset[0]["info"]["task_name"] == "test_task"
     def test_init_filters_tasks(self, harbor_task_dir):
         """Test that HarborEnv can filter tasks by name."""
@@ -260,7 +413,7 @@ class TestHarborEnv:
             tasks=["test_task"],
         )
         assert len(env.dataset) == 1
-        assert env.dataset[0]["task"] == "test_task"
+        assert env.dataset[0]["info"]["task_name"] == "test_task"
     def test_init_raises_on_empty_dataset(self):
         """Test that HarborEnv raises when no valid tasks found."""
@@ -314,7 +467,7 @@ class TestHarborEnv:
         )
         state = {
             "interception_base_url": "https://test.trycloudflare.com/v1",
-            "task": "my_task",
+            "info": {"task_name": "my_task"},
         }
         env_vars = await env.build_env_vars(state)

{verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_composable_env.py RENAMED Viewed

@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
         teardown=lambda: None,
     )
-    state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}
+    state = {"sandbox_id": "sbx", "timing": {"total": 0}}
     await env.post_rollout(state)
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
     state = {
         "sandbox_id": "sbx",
         "info": {"id": 0},
-        "timing": {"total_ms": 0},
+        "timing": {"total": 0},
         "trajectory": [],
     }

verifiers 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz

verifiers 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz