PyPI - verifiers - Versions diffs - 0.1.15.dev167__tar.gz → 0.1.15.dev169__tar.gz - Mend

verifiers 0.1.15.dev167__tar.gz → 0.1.15.dev169__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (346) hide show

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev167
+Version: 0.1.15.dev169
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -22,8 +22,10 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: <3.14,>=3.10
+Requires-Dist: aiohttp>=3.9.0
 Requires-Dist: aiolimiter>=1.2.1
 Requires-Dist: anthropic>=0.78.0
+Requires-Dist: certifi
 Requires-Dist: datasets<4.7.0,>=3.0.0
 Requires-Dist: gepa
 Requires-Dist: httpx>=0.27.0
@@ -35,10 +37,12 @@ Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: numpy
 Requires-Dist: openai-agents>=0.0.7
 Requires-Dist: openai>=1.108.1
+Requires-Dist: pillow
 Requires-Dist: prime-pydantic-config[toml]
 Requires-Dist: prime-sandboxes>=0.2.25
 Requires-Dist: prime-tunnel>=0.1.6
 Requires-Dist: pydantic>=2.11.9
+Requires-Dist: pymupdf
 Requires-Dist: pyzmq>=27.1.0
 Requires-Dist: regex<2026.4.4
 Requires-Dist: requests

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/pyproject.toml RENAMED Viewed

@@ -53,6 +53,10 @@ dependencies = [
     "setproctitle>=1.3.0",
     "regex<2026.4.4",
     "httpx>=0.27.0",
+    "aiohttp>=3.9.0",
+    "pymupdf",
+    "pillow",
+    "certifi",
     "prime-pydantic-config[toml]",
     "uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'",
 ]

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_environment.py RENAMED Viewed

@@ -697,6 +697,30 @@ class TestMaybeRetry:
         error_data = rollout_outputs[0]["error"]
         assert "InfraError" == error_data["error"]
+    @pytest.mark.asyncio
+    async def test_retries_serialized_infra_error_subclass(self):
+        """A serialized InfraError subclass (e.g. SandboxError) in returned state
+        must trigger retry.
+        The v1 harness serializes state["error"] to ErrorData before maybe_retry
+        inspects it, so matching must be subclass-aware (rebuild concrete error +
+        isinstance) — base-name substring matching missed SandboxError, which is
+        an InfraError and should be retried.
+        """
+        from verifiers.utils.async_utils import maybe_retry
+        from verifiers.utils.error_utils import error_data
+        serialized = error_data(vf.SandboxError("Program file upload failed"))
+        calls = {"n": 0}
+        async def attempt():
+            calls["n"] += 1
+            return {"error": serialized}
+        result = await maybe_retry(attempt, max_retries=2, initial=0.0, max_wait=0.0)()
+        assert calls["n"] == 3  # 1 initial + 2 retries (InfraError is retryable)
+        assert result["error"] == serialized  # last result returned after exhaustion
 class TestEmptyModelResponseErrors:
     """Test cases for empty and invalid model response error handling."""

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_trajectory_processing.py RENAMED Viewed

@@ -282,6 +282,49 @@ async def test_parsed_prompt_attribution_survives_v1_assert_serializable():
     State({"trajectory": [step]}).assert_serializable()
+def test_assert_serializable_accepts_msgpack_sidecars_rejects_unknown():
+    """The ``assert_serializable`` json.dumps gate must accept exactly what the
+    trainer transport (msgpack) accepts, while staying strict otherwise.
+    Trajectory token steps carry sidecars that are non-JSON by design and reach
+    the trainer via msgpack, not JSON: the renderer ``MultiModalData`` (a
+    dataclass holding numpy pixel arrays) and ``routed_experts`` (a raw
+    ``memoryview`` buffer). Both must clear the gate; any other
+    non-serializable object must still raise.
+    """
+    import dataclasses
+    import numpy as np
+    @dataclasses.dataclass
+    class _FakeMultiModalData:
+        mm_hashes: dict
+        mm_items: dict
+        mm_placeholders: dict
+    mm = _FakeMultiModalData(
+        mm_hashes={"image": ["h1"]},
+        mm_items={"image": [np.zeros((2, 2), dtype=np.uint8)]},
+        mm_placeholders={"image": [{"offset": 0, "length": 4}]},
+    )
+    step = {
+        "tokens": {
+            "prompt_ids": [1, 2],
+            "multi_modal_data": mm,
+            "routed_experts": {"data": memoryview(b"abc"), "shape": [3], "start": 0},
+        }
+    }
+    # Must not raise: both sidecars are msgpack-transported, not JSON.
+    State({"trajectory": [step]}).assert_serializable()
+    # A genuinely non-serializable object must still be rejected.
+    class _Unknown:
+        pass
+    with pytest.raises(TypeError):
+        State({"trajectory": [{"tokens": _Unknown()}]}).assert_serializable()
 def test_process_trajectory_steps_for_training(make_input):
     """Test processing trajectory steps into training examples."""
     state1 = State(

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_runtime_lifecycle.py RENAMED Viewed

@@ -110,6 +110,15 @@ class BlockingModelClient(CapturingModelClient):
         return await super().get_response(**kwargs)
+class RaisingModelClient:
+    def __init__(self, error: vf.Error):
+        self.error = error
+    async def get_response(self, **kwargs: object) -> Response:
+        _ = kwargs
+        raise self.error
 class FakeCreateSandboxRequest:
     def __init__(self, **kwargs: object):
         self.kwargs = kwargs
@@ -517,6 +526,31 @@ async def endpoint_program(task, state):
     }
+async def endpoint_model_error_program(task, state):
+    _ = task
+    root = state["endpoint_root_url"].rstrip("/")
+    endpoint_client = cast(OpenAI, state.get_client(api="chat", sync=True))
+    auth_headers = {"Authorization": f"Bearer {endpoint_client.api_key}"}
+    endpoint_client.close()
+    def post_model() -> None:
+        request = urllib.request.Request(
+            f"{root}/vf/model",
+            data=json.dumps(
+                {"messages": [{"role": "user", "content": "too long"}]}
+            ).encode(),
+            headers={"content-type": "application/json", **auth_headers},
+        )
+        with urllib.request.urlopen(request):
+            pass
+    try:
+        await asyncio.to_thread(post_model)
+    except Exception as exc:
+        raise vf.SandboxError("Sandbox command failed") from exc
+    raise AssertionError("Expected /vf/model to fail")
 async def endpoint_trajectory_program(task, state):
     _ = task
     root = state["endpoint_root_url"].rstrip("/")
@@ -725,6 +759,7 @@ for _name, _value in {
     "initialize_from_taskset": initialize_from_taskset,
     "child_reads_program_sandbox": child_reads_program_sandbox,
     "endpoint_program": endpoint_program,
+    "endpoint_model_error_program": endpoint_model_error_program,
     "endpoint_trajectory_program": endpoint_trajectory_program,
     "concurrent_endpoint_program": concurrent_endpoint_program,
     "mcp_proxy_program": mcp_proxy_program,
@@ -827,6 +862,41 @@ async def test_endpoint_exposes_tool_user_and_stop_surfaces() -> None:
     assert "endpoint_root_url" not in state
+@pytest.mark.asyncio
+async def test_vf_model_bridge_preserves_overlong_prompt_error() -> None:
+    harness = make_harness(
+        program={"fn": program_ref("endpoint_model_error_program")},
+        model="test-model",
+        client=RaisingModelClient(vf.OverlongPromptError("too long")),
+    )
+    task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
+    state = await harness.run(task)
+    await harness.teardown()
+    assert state["prompt_too_long"] is True
+    assert state["is_truncated"] is True
+    assert state["stop_condition"] == "prompt_too_long"
+    assert state.get("error") is None
+@pytest.mark.asyncio
+async def test_vf_model_bridge_preserves_model_error() -> None:
+    harness = make_harness(
+        program={"fn": program_ref("endpoint_model_error_program")},
+        model="test-model",
+        client=RaisingModelClient(vf.ModelError("model failed")),
+    )
+    task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
+    state = await harness.run(task)
+    await harness.teardown()
+    assert state["stop_condition"] == "has_error"
+    assert state["error"]["error"] == "ModelError"
+    assert "SandboxError" not in state["error"]["error_chain_str"]
 @pytest.mark.asyncio
 async def test_endpoint_request_can_hide_internal_model_call_from_trajectory() -> None:
     client = FakeModelClient([fake_response("hidden"), fake_response("shown")])
@@ -1462,6 +1532,70 @@ async def test_create_sandbox_cleans_up_wait_failure_with_retry(
     assert client.delete_calls == 2
+@pytest.mark.asyncio
+async def test_upload_program_files_retries_transient_transfer_error(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    install_fake_sandboxes(monkeypatch)
+    disable_sandbox_retry_sleep(monkeypatch)
+    class FlakyUploadClient:
+        calls = 0
+        async def upload_bytes(self, *args: object, **kwargs: object) -> None:
+            _ = args, kwargs
+            self.calls += 1
+            if self.calls == 1:
+                raise FakeAPIError("Upload failed: ")
+    client = FlakyUploadClient()
+    task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
+    state = vf.State.for_task(task)
+    await sandbox_utils.upload_program_files(
+        cast(sandbox_utils.SandboxClient, client),
+        "sbx-upload",
+        {"files": {"/tmp/file.txt": "content"}},
+        task,
+        state,
+        Runtime(),
+    )
+    assert client.calls == 2
+@pytest.mark.asyncio
+async def test_upload_program_files_does_not_retry_non_transient_api_error(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    install_fake_sandboxes(monkeypatch)
+    disable_sandbox_retry_sleep(monkeypatch)
+    class FailingUploadClient:
+        calls = 0
+        async def upload_bytes(self, *args: object, **kwargs: object) -> None:
+            _ = args, kwargs
+            self.calls += 1
+            raise FakeAPIError("Upload failed: HTTP 400: bad request")
+    client = FailingUploadClient()
+    task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
+    state = vf.State.for_task(task)
+    with pytest.raises(vf.SandboxError, match="HTTP 400"):
+        await sandbox_utils.upload_program_files(
+            cast(sandbox_utils.SandboxClient, client),
+            "sbx-upload",
+            {"files": {"/tmp/file.txt": "content"}},
+            task,
+            state,
+            Runtime(),
+        )
+    assert client.calls == 1
 @pytest.mark.asyncio
 async def test_create_sandbox_cancellation_deletes_late_provider_result(
     monkeypatch: pytest.MonkeyPatch,
@@ -1603,8 +1737,8 @@ async def test_sandbox_base_program_max_turns_zero_is_unbounded(
     config_path.write_text(json.dumps({"max_turns": 0}))
     namespace["RUNNER_CONFIG_PATH"] = str(config_path)
-    async def create_model_message(state, messages, client):
-        _ = state, messages, client
+    async def create_model_message(state, messages):
+        _ = state, messages
         return {"role": "assistant", "content": "done"}
     async def call_user(state, messages):
@@ -1621,12 +1755,55 @@ async def test_sandbox_base_program_max_turns_zero_is_unbounded(
     state = {"prompt": [{"role": "user", "content": "hi"}], "runtime": {}}
     run_base = cast(Any, namespace["run_base"])
-    result = await run_base({}, state, object())
+    result = await run_base({}, state)
     assert result["completion"] == [{"role": "assistant", "content": "done"}]
     assert result["stop_condition"] == "no_tools"
+@pytest.mark.asyncio
+async def test_sandbox_base_program_model_call_uses_vf_model_bridge() -> None:
+    namespace: dict[str, object] = {}
+    source = runner_source().rsplit("asyncio.run(main())", 1)[0]
+    exec(source, namespace)
+    posted: list[tuple[str, Any, object]] = []
+    async def vf_post(state, path, payload, timeout=None):
+        _ = state
+        posted.append((path, payload, timeout))
+        return {"message": {"role": "assistant", "content": "ok"}}
+    namespace["vf_post"] = vf_post
+    create_model_message = cast(Any, namespace["create_model_message"])
+    # Canonical Messages (incl. an image content part) are sent unchanged over the
+    # /vf/model bridge; the host owns client resolution + tokenization and returns
+    # the assistant message.
+    messages = [
+        {"role": "user", "content": "hi"},
+        {
+            "role": "tool",
+            "tool_call_id": "call_1",
+            "content": [
+                {"type": "text", "text": "shot"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "data:image/png;base64,AAA"},
+                },
+            ],
+        },
+    ]
+    message = await create_model_message({"runtime": {}}, messages)
+    assert message == {"role": "assistant", "content": "ok"}
+    assert len(posted) == 1
+    path, payload, timeout = posted[0]
+    assert path == "model"
+    assert payload["messages"] == messages  # image part preserved verbatim
+    assert timeout is None
 def test_sandbox_program_patch_cannot_set_lifecycle_fields() -> None:
     state = vf.State.for_task(vf.Task({"prompt": []}).freeze())

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/anthropic_messages_client.py RENAMED Viewed

@@ -214,15 +214,15 @@ class AnthropicMessagesClient(
             return {}
         def build_tool_result_block(message: ToolMessage) -> ToolResultBlockParam:
+            if isinstance(message.content, str):
+                result_content: Any = message.content
+            else:
+                # Keep images: image_url parts -> Anthropic image blocks (not "[image]" text).
+                result_content = normalize_anthropic_content(message.content)
             return ToolResultBlockParam(
                 type="tool_result",
                 tool_use_id=message.tool_call_id,
-                content=cast(
-                    Any,
-                    message.content
-                    if isinstance(message.content, str)
-                    else " ".join(content_to_text_chunks(message.content)),
-                ),
+                content=cast(Any, result_content),
             )
         def from_chat_message(message: Message) -> AnthropicMessageParam | None:

{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_responses_client.py RENAMED Viewed

@@ -156,8 +156,8 @@ class OpenAIResponsesClient(
             if isinstance(message, ToolMessage):
                 output = message.content
                 if not isinstance(output, str):
-                    text = content_to_text(output)
-                    output = text if text else str(output)
+                    # Keep images: image_url parts -> Responses input_image (not text).
+                    output = normalize_message_content(output)
                 return [
                     {
                         "type": "function_call_output",

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# Search Tasksets
+Composable search/research tasksets for agents that solve live information-seeking tasks in a sandbox.
+The search family is intentionally backend-oriented, mirroring the SWE taskset pattern while keeping the task contract research-centric: each task expects a single final answer rather than a code patch. Agents may use web/search tools, browser helpers, or other sandbox resources provided by the paired environment.
+## Backends
+| Backend | Source | Default dataset | Status |
+|---|---|---|---|
+| `quest` | [OSU-NLP-Group/QUEST](https://github.com/OSU-NLP-Group/QUEST) | [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data) | Objective tasks supported |
+## Usage
+```python
+from verifiers.envs.experimental.composable.tasksets.search import make_search_taskset
+taskset = make_search_taskset(backend="quest", category="objective")
+```
+`make_search_taskset()` dispatches by backend name. Unknown backends raise `ValueError` with the available backend list.
+## Output Contract
+Search tasksets should define their own output contract. The initial `quest` backend expects the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
+## Error Handling
+Search tasksets should use the framework error taxonomy for infrastructure failures:
+- `vf.SandboxError` for sandbox setup, command, or lifecycle failures.
+- `vf.ModelError` for judge/model provider failures.
+- `vf.InfraError` for dataset, evaluator, or external runtime failures.
+Incorrect answers should not set `state["error"]`; they should score normally, often as `0.0`.

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Composable search/research tasksets."""
+from .search_tasksets import make_quest_taskset, make_search_taskset
+__all__ = ["make_quest_taskset", "make_search_taskset"]

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/README.md ADDED Viewed

@@ -0,0 +1,52 @@
+# QUEST Search Taskset
+Objective QUEST tasks ported into the composable search taskset framework.
+## Source
+- Dataset: [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data)
+- Upstream project: [`OSU-NLP-Group/QUEST`](https://github.com/OSU-NLP-Group/QUEST)
+The taskset loads the Hugging Face dataset, filters to `rl_task_category == "objective"` by default, and uses the dataset-provided generated evaluation scripts under `eval_scripts/*.py`.
+## Task Contract
+Each example is a live research question. The agent should produce one final answer in `/task/answer.txt`.
+The paired `rlm_search` environment prompts RLM to write this file and provides web search/open-page skills. The rubric can fall back to the final assistant text if the answer file is empty, but agents should still write the file directly.
+## Scoring
+`QuestRubric` loads the generated eval script for the example's `task_id` and calls its async `evaluate_answer(...)` entrypoint using the vendored minimal `obj_task_eval` runtime. The rollout reward is `summary["final_score"]`, clipped to `[0.0, 1.0]`.
+Generated scripts may request URL-backed verification. PDF URLs are detected and parsed with the upstream QUEST PDF parser path before falling back to generic webpage retrieval.
+This port intentionally preserves upstream QUEST behavior for URL-backed verification semantics. The upstream verifier generally treats invalid, irrelevant, or inaccessible cited webpages as unsupported claims, which can assign `0.0` to the affected verification node even when the immediate cause is source access such as a bot challenge, rate limit, timeout, or parser failure. Future work should consider a finer-grained source-access taxonomy so verifier infrastructure limitations can be distinguished from model-provided bad URLs or unsupported claims.
+A reward of `0.0` with no `state["error"]` means the QUEST evaluator ran and judged the answer incorrect under the upstream-compatible scoring path. Infrastructure and evaluator failures outside normal QUEST source verification are represented with `vf.Error` subclasses instead of ad hoc success metrics.
+## Error Handling
+QUEST uses Verifiers' framework-managed error field for non-answer failures when the failure comes from external runtime systems:
+- Missing live sandbox or answer-file read failure: `vf.SandboxError`.
+- Transient judge provider/network/rate-limit/server failures: retryable `vf.InfraError`.
+- Empty or invalid judge responses: retryable `vf.InvalidModelResponseError` / `vf.EmptyModelResponseError`.
+- Judge auth, model-not-found, content-filter, or invalid request failures: non-retryable `vf.ModelError`.
+- QUEST eval-script download/cache resolution failure: `vf.InfraError`.
+Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
+## Common Arguments
+| Argument | Default | Description |
+|---|---:|---|
+| `dataset_name` | `osunlp/QUEST-RL-Data` | Hugging Face dataset name. |
+| `split` | `train` | Dataset split. |
+| `category` | `objective` | Initial implementation supports objective tasks only. |
+| `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
+| `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for QUEST verifier calls. |
+| `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
+| `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
+| `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py`. |
+| `quest_cache_dir` | `~/.cache/verifiers/quest` | Host cache for QUEST verifier state. |

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""QUEST search taskset."""
+from .taskset import QuestRubric, QuestTaskSet
+__all__ = ["QuestRubric", "QuestTaskSet"]

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""Vendored QUEST objective evaluation runtime."""
+from .eval_toolkit import BinaryEvalResult, Extractor, Verifier, create_evaluator
+from .evaluator import Evaluator
+from .utils import CacheFileSys
+from .verification_tree import AggregationStrategy, VerificationNode
+__all__ = [
+    "AggregationStrategy",
+    "BinaryEvalResult",
+    "CacheFileSys",
+    "Evaluator",
+    "Extractor",
+    "Verifier",
+    "VerificationNode",
+    "create_evaluator",
+]

verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Vendored QUEST API tool shims."""
+from .tool_pdf import PDFParser
+__all__ = ["PDFParser"]

verifiers 0.1.15.dev167__tar.gz → 0.1.15.dev169__tar.gz

verifiers 0.1.15.dev167__tar.gz → 0.1.15.dev169__tar.gz