PyPI - verifiers - Versions diffs - 0.1.15.dev2__tar.gz → 0.1.15.dev4__tar.gz - Mend

verifiers 0.1.15.dev2__tar.gz → 0.1.15.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (320) hide show

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev2
+Version: 0.1.15.dev4
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -52,7 +52,7 @@ Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
 Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
 Requires-Dist: stagehand>=3.0.0; extra == 'browser'
 Provides-Extra: openenv
-Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
+Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
 Provides-Extra: renderers
 Requires-Dist: renderers>=0.1.8.dev0; extra == 'renderers'
 Provides-Extra: rg
@@ -210,7 +210,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
 custom harnesses, use the v1 Taskset/Harness path:
 ```python
 # my_env.py
-import verifiers.v1 as vf
+import verifiers as vf
 def source():
     yield {
@@ -226,8 +226,7 @@ async def contains_answer(task, state) -> float:
 def load_taskset(config: vf.TasksetConfig | None = None):
     return vf.Taskset(source=source, rewards=[contains_answer], config=config)
-def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
-    config = config or vf.EnvConfig()
+def load_environment(config: vf.EnvConfig) -> vf.Env:
     return vf.Env(taskset=load_taskset(config=config.taskset))
 ```
 If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
@@ -239,7 +238,7 @@ harness with:
 ```python
 env = vf.Env(
-    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    taskset=vf.HarborTaskset(),
     harness=vf.OpenCode(),
 )
 ```

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/README.md RENAMED Viewed

@@ -135,7 +135,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
 custom harnesses, use the v1 Taskset/Harness path:
 ```python
 # my_env.py
-import verifiers.v1 as vf
+import verifiers as vf
 def source():
     yield {
@@ -151,8 +151,7 @@ async def contains_answer(task, state) -> float:
 def load_taskset(config: vf.TasksetConfig | None = None):
     return vf.Taskset(source=source, rewards=[contains_answer], config=config)
-def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
-    config = config or vf.EnvConfig()
+def load_environment(config: vf.EnvConfig) -> vf.Env:
     return vf.Env(taskset=load_taskset(config=config.taskset))
 ```
 If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
@@ -164,7 +163,7 @@ harness with:
 ```python
 env = vf.Env(
-    taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
+    taskset=vf.HarborTaskset(),
     harness=vf.OpenCode(),
 )
 ```

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/pyproject.toml RENAMED Viewed

@@ -68,13 +68,15 @@ dev = [
     "ipywidgets",
     "reasoning-gym",
     "textarena",
-    "openenv-core[core]==0.2.1",
     "stagehand>=3.0.0",
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
     "nltk",
     "renderers>=0.1.8.dev0",
 ]
+policy = [
+    "semgrep>=1.150.0",
+]
 [project.optional-dependencies]
 rg = [
@@ -84,14 +86,14 @@ ta = [
     "textarena",
     "nltk",
 ]
-openenv = [
-    "openenv-core[core]==0.2.1",
-]
 browser = [
     "stagehand>=3.0.0",
     "aiohttp>=3.9.0",
     "python-dotenv>=1.0.0",
 ]
+openenv = [
+    "openenv-core>=0.3.0",
+]
 renderers = [
     "renderers>=0.1.8.dev0",
 ]
@@ -111,7 +113,12 @@ rl = [
 [tool.uv]
 preview = true
 required-version = ">=0.11.1"
+conflicts = [
+    [
+        { extra = "openenv" },
+        { group = "policy" },
+    ],
+]
 [[tool.uv.index]]
 name = "pypi"
 url = "https://pypi.org/simple"
@@ -123,6 +130,7 @@ exclude-newer = "7 days"
 prime-tunnel = false
 prime-sandboxes = false
 renderers = false
+openenv-core = false
 [tool.uv.extra-build-dependencies]
 flash-attn = [{ requirement = "torch", match-runtime = true }]
@@ -130,6 +138,11 @@ flash-attn = [{ requirement = "torch", match-runtime = true }]
 [tool.uv.extra-build-variables]
 flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
+[tool.ruff]
+exclude = [
+    ".semgrep",
+]
 [project.scripts]
 vf-eval = "verifiers.scripts.eval:main"
 vf-gepa = "verifiers.scripts.gepa:main"

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_context_token_metrics.py RENAMED Viewed

@@ -5,10 +5,9 @@ Tests the trajectory-based context token computation
 using the last trajectory step.
 """
-from unittest.mock import MagicMock
 import pytest
+from verifiers.types import Response, ResponseMessage, Usage
 from verifiers.utils.usage_utils import compute_context_token_metrics
@@ -20,12 +19,39 @@ SYS = {"role": "system", "content": "You are helpful"}
 USER = {"role": "user", "content": "hi"}
-def _make_response(prompt_tokens: int, completion_tokens: int) -> MagicMock:
-    response = MagicMock()
-    response.usage = MagicMock(
-        prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
+def _make_response(prompt_tokens: int, completion_tokens: int) -> Response:
+    return Response(
+        id="test",
+        created=0,
+        model="test",
+        usage=Usage(
+            prompt_tokens=prompt_tokens,
+            reasoning_tokens=0,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        ),
+        message=ResponseMessage(
+            role="assistant",
+            content="",
+            finish_reason="stop",
+            is_truncated=False,
+        ),
+    )
+def _make_response_without_usage() -> Response:
+    return Response(
+        id="test",
+        created=0,
+        model="test",
+        usage=None,
+        message=ResponseMessage(
+            role="assistant",
+            content="",
+            finish_reason="stop",
+            is_truncated=False,
+        ),
     )
-    return response
 def _asst(i: int) -> dict:
@@ -115,13 +141,11 @@ class TestContextMetrics:
         assert metrics["final_input_tokens"] == 230 - 50
     def test_skips_responses_without_usage(self):
-        """Responses with no .usage attribute are skipped entirely."""
-        no_usage = MagicMock()
-        no_usage.usage = None
+        """Responses with usage=None are skipped entirely."""
         trajectory = [
             {"response": _make_response(100, 20)},
             {"response": _make_response(200, 30)},
-            {"response": no_usage},  # last step, but no usage
+            {"response": _make_response_without_usage()},
         ]
         metrics = compute_context_token_metrics(trajectory)
         # Should use step 1 (last with usage): total = 230
@@ -130,11 +154,9 @@ class TestContextMetrics:
     def test_all_responses_lack_usage(self):
         """If no response has usage data, return zeros."""
-        no_usage = MagicMock()
-        no_usage.usage = None
         trajectory = [
-            {"response": no_usage},
-            {"response": no_usage},
+            {"response": _make_response_without_usage()},
+            {"response": _make_response_without_usage()},
         ]
         metrics = compute_context_token_metrics(trajectory)
         assert metrics["final_output_tokens"] == 0

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_environment_extra.py RENAMED Viewed

@@ -9,8 +9,6 @@ Covers:
 - make_dataset tool call sanitization
 """
-from __future__ import annotations
 import asyncio
 import json
 from unittest.mock import AsyncMock

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_eval_display.py RENAMED Viewed

@@ -1,3 +1,5 @@
+from rich.console import Console
 from verifiers.types import ClientConfig, EvalConfig
 from verifiers.utils.eval_display import EvalDisplay
@@ -78,3 +80,44 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
     )
     assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
+def render_plain(renderable) -> str:
+    console = Console(width=100, record=True)
+    console.print(renderable)
+    return console.export_text()
+def test_tokens_row_omits_cost_when_unavailable() -> None:
+    display = EvalDisplay([make_config(max_concurrent=1)])
+    rendered = render_plain(
+        display._make_tokens_row({"input_tokens": 12.0, "output_tokens": 7.0})
+    )
+    assert "input 12" in rendered
+    assert "output 7" in rendered
+    assert "cost" not in rendered
+def test_tokens_row_includes_cost_when_available() -> None:
+    display = EvalDisplay([make_config(max_concurrent=1)])
+    rendered = render_plain(
+        display._make_tokens_row(
+            {
+                "input_tokens": 12.0,
+                "output_tokens": 7.0,
+                "final_input_tokens": 10.0,
+                "final_output_tokens": 5.0,
+            },
+            {"input_usd": 0.005, "output_usd": 0.0073, "total_usd": 0.0123},
+        )
+    )
+    assert "input 12" in rendered
+    assert "output 7" in rendered
+    assert "final_input 10" in rendered
+    assert "final_output 5" in rendered
+    assert "cost (all) $0.0123" in rendered
+    assert rendered.index("final_output 5") < rendered.index("cost (all) $0.0123")

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_eval_utils.py RENAMED Viewed

@@ -4,6 +4,8 @@ Covers:
 - print_results indexing with multiple rollouts per example
 """
+import pytest
 from verifiers.types import GenerateOutputs
 from verifiers.utils.save_utils import states_to_outputs
@@ -138,6 +140,55 @@ def test_print_results_includes_usage(capsys, make_metadata, make_output):
     assert "output_tokens (avg): 3.000" in captured.out
+def test_attach_metadata_cost_uses_total_output_usage(make_metadata, make_output):
+    from verifiers.utils.eval_utils import _attach_metadata_cost
+    outputs = [
+        make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
+        make_output(example_id=1, reward=0.0, metrics={"test_metric": 2.0}),
+    ]
+    outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
+    outputs[1]["token_usage"] = {"input_tokens": 6.0, "output_tokens": 2.0}
+    metadata = make_metadata(
+        num_examples=2,
+        rollouts_per_example=1,
+        usage={"input_tokens": 8.0, "output_tokens": 3.0},
+    )
+    cost = _attach_metadata_cost(
+        metadata,
+        {"input_usd_per_mtok": 1.0, "output_usd_per_mtok": 5.0},
+        outputs,
+    )
+    assert cost == {
+        "input_usd": pytest.approx(0.000016),
+        "output_usd": pytest.approx(0.000030),
+        "total_usd": pytest.approx(0.000046),
+    }
+    assert metadata["cost"] == cost
+def test_print_results_labels_cost_as_all(capsys, make_metadata, make_output):
+    from verifiers.utils.eval_utils import print_results
+    outputs = [
+        make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
+    ]
+    outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
+    metadata = make_metadata(num_examples=1, rollouts_per_example=1, usage=None)
+    metadata["cost"] = {
+        "input_usd": 0.005,
+        "output_usd": 0.0073,
+        "total_usd": 0.0123,
+    }
+    print_results(GenerateOutputs(outputs=outputs, metadata=metadata))
+    captured = capsys.readouterr()
+    assert "cost (all): $0.0123" in captured.out
 def test_print_results_handles_heterogeneous_metrics(
     capsys, make_metadata, make_output
 ):

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_gym_env.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from __future__ import annotations
 import re
 from typing import Any

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_harbor_env_mcp.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from __future__ import annotations
 import asyncio
 import time
 from typing import Any

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_interception_utils.py RENAMED Viewed

@@ -261,7 +261,7 @@ async def test_keepalive_write_failure_surfaces_to_state(monkeypatch):
     assert isinstance(state["error"], StreamInterrupted)
     msg = str(state["error"])
-    assert "keepalive write failed" in msg
+    assert "Keepalive write failed" in msg
     assert "ConnectionResetError" in msg
@@ -306,6 +306,6 @@ async def test_non_streaming_response_future_failure_surfaces_to_state(monkeypat
         f"expected InterceptionError, got {type(state.get('error'))}"
     )
     msg = str(state["error"])
-    assert "intercepted request failed" in msg
+    assert "Intercepted request failed" in msg
     assert "RuntimeError" in msg
     assert "vLLM raised" in msg

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_langchain_deep_agents_wikispeedia.py RENAMED Viewed

@@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
 ) -> None:
     module = load_module(monkeypatch)
-    env = module.load_environment(train_size=1, eval_size=1)
+    env = module.load_environment(config=vf.EnvConfig(), train_size=1, eval_size=1)
     assert isinstance(env, vf.Env)
     assert isinstance(env.taskset, vf.Taskset)
@@ -157,6 +157,7 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
     wiki = make_small_wiki(module)
     monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
     env = module.load_environment(
+        config=vf.EnvConfig(),
         train_size=2,
         eval_size=1,
         min_path_length=1,
@@ -248,6 +249,12 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
         async def ainvoke(self, payload, config=None):
             raise GraphRecursionError("recursion limit")
+    created_system_prompts = []
+    def fake_create_deep_agent(**kwargs):
+        created_system_prompts.append(kwargs["system_prompt"])
+        return FakeAgent()
     fake_deepagents = types.ModuleType("deepagents")
     fake_langchain_openai = types.ModuleType("langchain_openai")
     fake_langgraph = types.ModuleType("langgraph")
@@ -255,7 +262,7 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
     fake_langchain_core = types.ModuleType("langchain_core")
     fake_tools_module = types.ModuleType("langchain_core.tools")
-    fake_deepagents.create_deep_agent = lambda **kwargs: FakeAgent()
+    fake_deepagents.create_deep_agent = fake_create_deep_agent
     fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
     fake_langgraph_errors.GraphRecursionError = GraphRecursionError
     fake_langgraph.errors = fake_langgraph_errors
@@ -276,12 +283,16 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
         {
             "info": {"source": "A"},
             "prompt": [{"role": "user", "content": "start"}],
-            "system_prompt": [{"role": "system", "content": "prompt"}],
+            "system_prompt": [
+                {"role": "user", "content": "first prompt chunk"},
+                {"role": "system", "content": "second prompt chunk"},
+            ],
         }
     )
     result = await program({}, state)
+    assert created_system_prompts == ["first prompt chunk\n\nsecond prompt chunk"]
     assert result["agent_timeout"] is True
     assert result["stop_reason"] == "agent_recursion_limit"
     assert result["agent_completion"] == []
@@ -298,11 +309,10 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(
         {
             "role": "assistant",
             "content": "",
-            "tool_calls": [{"id": "call_1", "name": "click_link"}],
+            "tool_calls": [{"id": "call_1", "name": "click_link", "arguments": "{}"}],
         },
         {
             "role": "tool",
-            "name": "click_link",
             "tool_call_id": "call_1",
             "content": "'C' is not a valid link from 'A'.",
         },

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_lean_task.py RENAMED Viewed

@@ -1,7 +1,5 @@
 """Tests for ``LeanTaskSet`` lean-guard wrapping and reward enforcement."""
-from __future__ import annotations
 from dataclasses import dataclass
 import pytest

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_mcp_search_env.py RENAMED Viewed

@@ -1,10 +1,9 @@
-from __future__ import annotations
 import importlib.util
 import inspect
 from pathlib import Path
 from typing import Any
+import pytest
 import verifiers.v1 as vf
@@ -27,7 +26,7 @@ def _load_mcp_search_module() -> Any:
 def test_mcp_search_env_is_v1_only() -> None:
     module = _load_mcp_search_module()
-    env = module.load_environment(max_turns=4)
+    env = module.load_environment(config=vf.EnvConfig(), max_turns=4)
     assert isinstance(env, vf.Env)
     assert isinstance(env.taskset, vf.Taskset)
@@ -59,3 +58,18 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
     assert env.taskset.config.max_turns == 3
     assert all(row["max_turns"] == 3 for row in rows)
+@pytest.mark.asyncio
+async def test_mcp_search_reward_handles_missing_assistant() -> None:
+    module = _load_mcp_search_module()
+    task = vf.Task({"answer": "expected"})
+    assert await module.exact_title_reward(task, vf.State({"completion": []})) == 0.0
+    assert (
+        await module.exact_title_reward(
+            task,
+            vf.State({"completion": [{"role": "user", "content": "expected"}]}),
+        )
+        == 0.0
+    )

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_message_utils.py RENAMED Viewed

@@ -1,5 +1,9 @@
-from verifiers.types import AssistantMessage
-from verifiers.utils.message_utils import from_raw_message, normalize_messages
+from verifiers.types import AssistantMessage, UserMessage
+from verifiers.utils.message_utils import (
+    from_raw_message,
+    get_messages,
+    normalize_messages,
+)
 def test_from_raw_message_normalizes_oai_tool_calls():
@@ -55,3 +59,30 @@ def test_normalize_messages_accepts_oai_tool_call_dicts():
     assert assistant.tool_calls[0].id == "call_2"
     assert assistant.tool_calls[0].name == "lookup"
     assert assistant.tool_calls[0].arguments == '{"q": "hello"}'
+def test_get_messages_returns_typed_messages():
+    messages = get_messages(
+        [
+            {"role": "user", "content": "question"},
+            {"role": "assistant", "content": "answer"},
+        ]
+    )
+    assert isinstance(messages[0], UserMessage)
+    assert isinstance(messages[1], AssistantMessage)
+    assert messages[-1].content == "answer"
+def test_get_messages_filters_by_role_with_typed_return():
+    messages = get_messages(
+        [
+            {"role": "user", "content": "question"},
+            {"role": "assistant", "content": "answer"},
+        ],
+        role="assistant",
+    )
+    assert len(messages) == 1
+    assert isinstance(messages[0], AssistantMessage)
+    assert messages[0].content == "answer"

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_nemorl_client.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from __future__ import annotations
 from unittest.mock import AsyncMock, patch
 import pytest

{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_opencode_harbor.py RENAMED Viewed

@@ -1,11 +1,8 @@
-from __future__ import annotations
 import importlib.util
+import sys
 from pathlib import Path
 from typing import Any, cast
-import pytest
 import verifiers.v1 as vf
@@ -23,6 +20,7 @@ def _load_opencode_module() -> Any:
     assert spec.loader is not None
     module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
     spec.loader.exec_module(module)
     return module
@@ -30,33 +28,32 @@ def _load_opencode_module() -> Any:
 def test_load_environment_uses_v1_taskset_and_harness() -> None:
     module = _load_opencode_module()
-    env = module.load_environment()
+    env = module.load_environment(config=vf.EnvConfig())
     assert isinstance(env, vf.Env)
     assert isinstance(env.taskset, vf.HarborTaskset)
     assert isinstance(env.harness, vf.OpenCode)
     assert isinstance(env.harness.config, vf.OpenCodeConfig)
     assert not hasattr(module, "OpenCodeHarborHarnessConfig")
-    assert Path(env.taskset.tasks) == Path(module.__file__).parent / "tasks"
+    assert not hasattr(module, "TERMINAL_BENCH_SAMPLE_TASKS")
+    assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
     assert env.harness.config.max_turns == 4
-    assert env.harness.config.disabled_tools == ["webfetch", "question"]
+    assert env.harness.config.disabled_tools == vf.OpenCodeConfig().disabled_tools
+    assert "webfetch" in env.harness.config.disabled_tools
+    assert "question" in env.harness.config.disabled_tools
     program = cast(dict[str, object], env.harness.program)
-    mcp_setup = cast(dict[str, object], program["tools"])["mcp"]
+    mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
     assert '"webfetch": false' in cast(str, mcp_setup)
     assert '"question": false' in cast(str, mcp_setup)
-    assert '"read": false' not in cast(str, mcp_setup)
-def test_load_environment_accepts_v1_taskset_and_harness_config(
-    tmp_path: Path,
-) -> None:
+def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
     module = _load_opencode_module()
     env = module.load_environment(
         config=vf.EnvConfig(
             taskset={
-                "tasks": str(tmp_path),
                 "task_names": ["task-a"],
                 "cpu_cores": 1.5,
             },
@@ -68,7 +65,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
         )
     )
-    assert Path(env.taskset.tasks) == tmp_path
+    assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
     assert env.taskset.task_names == ["task-a"]
     assert env.taskset.cpu_cores == 1.5
     assert env.harness.config.agent_workdir == "/workspace"
@@ -76,25 +73,14 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
     program = cast(dict[str, object], env.harness.program)
     command = cast(list[object], program["command"])
-    mcp_setup = cast(dict[str, object], program["tools"])["mcp"]
+    mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
     assert "/workspace" in cast(str, command[2])
     assert '"webfetch": false' in cast(str, mcp_setup)
     assert '"question": false' not in cast(str, mcp_setup)
-def test_dataset_shortcuts_select_task_names() -> None:
-    module = _load_opencode_module()
-    env = module.load_environment(dataset="terminal-bench-sample")
-    assert env.taskset.task_names == module.TERMINAL_BENCH_SAMPLE_TASKS
-def test_dataset_rejects_explicit_task_names() -> None:
+def test_pyproject_does_not_define_unsupported_harness_defaults() -> None:
     module = _load_opencode_module()
+    pyproject = Path(module.__file__).parent / "pyproject.toml"
-    with pytest.raises(ValueError, match="dataset.*task_names"):
-        module.load_environment(
-            dataset="terminal-bench-sample",
-            task_names=["hello-world"],
-        )
+    assert "[tool.verifiers.harness]" not in pyproject.read_text()

verifiers 0.1.15.dev2__tar.gz → 0.1.15.dev4__tar.gz

verifiers 0.1.15.dev2__tar.gz → 0.1.15.dev4__tar.gz