PyPI - verifiers - Versions diffs - 0.1.13.dev4__tar.gz → 0.1.13.dev6__tar.gz - Mend

verifiers 0.1.13.dev4__tar.gz → 0.1.13.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233) hide show

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.13.dev4
+Version: 0.1.13.dev6
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_eval_cli.py RENAMED Viewed

@@ -41,6 +41,8 @@ def run_cli(make_metadata, make_state, make_input):
             "api_base_url": "https://api.openai.com/v1",
             "header": None,
             "headers": None,
+            "header_from_state": None,
+            "headers_from_state": None,
             "num_examples": 1,
             "rollouts_per_example": 1,
             "max_concurrent": 1,

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_interception_utils.py RENAMED Viewed

@@ -131,3 +131,76 @@ async def test_streaming_write_failure_surfaces_to_state(monkeypatch):
     assert isinstance(state["error"], StreamInterrupted)
     assert "ConnectionResetError" in str(state["error"])
+async def test_keepalive_emitted_during_idle(monkeypatch):
+    """During the idle window (no chunks on chunk_queue) the handler must
+    emit SSE keepalive comments so upstream idle-timeouts don't fire."""
+    monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)  # 50 ms interval, so the 0.2 s sleep below spans several of them
+    server = InterceptionServer(port=0)
+    state: dict = {}
+    server.register_rollout("r1", state=state)
+    writes: list[bytes] = []  # every payload handed to response.write, in call order
+    async def fake_write(data: bytes) -> None:
+        writes.append(data)
+    fake_response = MagicMock()
+    fake_response.prepare = AsyncMock()
+    fake_response.write = AsyncMock(side_effect=fake_write)
+    fake_response.write_eof = AsyncMock()
+    monkeypatch.setattr(
+        interception_utils.web, "StreamResponse", lambda **_: fake_response
+    )  # any web.StreamResponse(**kwargs) call in interception_utils now returns fake_response
+    chunk_queue: asyncio.Queue = asyncio.Queue()  # starts empty
+    response_future: asyncio.Future = asyncio.Future()
+    intercept = {
+        "chunk_queue": chunk_queue,
+        "response_future": response_future,
+    }
+    task = asyncio.create_task(
+        server._handle_streaming_response(MagicMock(), "r1", intercept)
+    )  # scheduled as a task so the test itself can end the stream after the idle period
+    await asyncio.sleep(0.2)  # enough for a few keepalive cycles
+    # Close the loop cleanly: EOF sentinel + resolved future → handler returns.
+    response_future.set_result(None)
+    await chunk_queue.put(None)
+    await task
+    assert any(w == b": keepalive\n\n" for w in writes), (
+        f"expected at least one keepalive write, got writes={writes}"
+    )
+async def test_keepalive_write_failure_surfaces_to_state(monkeypatch):
+    """A failed keepalive write (upstream already cut the TCP connection)
+    must funnel into ``state["error"]`` with elapsed-time instrumentation."""
+    monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)  # 50 ms interval keeps the test fast
+    server = InterceptionServer(port=0)
+    state: dict = {}  # the handler is expected to store the failure under state["error"]
+    server.register_rollout("r1", state=state)
+    fake_response = MagicMock()
+    fake_response.prepare = AsyncMock()
+    fake_response.write = AsyncMock(side_effect=ConnectionResetError("tunnel died"))  # every write raises, so any keepalive attempt fails
+    fake_response.write_eof = AsyncMock()
+    monkeypatch.setattr(
+        interception_utils.web, "StreamResponse", lambda **_: fake_response
+    )  # any web.StreamResponse(**kwargs) call in interception_utils now returns fake_response
+    chunk_queue: asyncio.Queue = asyncio.Queue()  # never produces
+    intercept = {
+        "chunk_queue": chunk_queue,
+        "response_future": asyncio.Future(),
+    }
+    await server._handle_streaming_response(MagicMock(), "r1", intercept)  # awaited directly with no EOF queued: the test relies on the handler returning by itself after the failed write
+    assert isinstance(state["error"], StreamInterrupted)
+    msg = str(state["error"])
+    assert "keepalive write failed" in msg
+    assert "ConnectionResetError" in msg  # NOTE(review): the elapsed-time detail named in the docstring is not asserted here

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.1.13.dev4"
+__version__ = "0.1.13.dev6"
 import importlib
 import os

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/README.md RENAMED Viewed

@@ -2,6 +2,10 @@
 Newer and more experimental environment classes that may have some sharper edges + change more frequently.
+## SandboxMixin
+`SandboxMixin` works with both container and VM sandboxes. If your environment needs a VM, pass `CreateSandboxRequest(..., vm=True)` to `create_sandbox`. For a GPU VM, also set `gpu_count` and `gpu_type`. Everyday sandbox operations like file upload, file reads, background jobs, and cleanup work the same way. Port exposure and SSH are currently container-only.
 ## GymEnv
 Universal runner for Gym-compatible environments. Wraps any environment that implements `reset(seed)` and `step(action)` methods (following the OpenAI Gym / Gymnasium API). Supports both old-style 4-tuple and new-style 5-tuple step returns.

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/__init__.py RENAMED Viewed

@@ -1,5 +1,7 @@
 from verifiers.envs.experimental.composable.harnesses.rlm import (
+    DEFAULT_RLM_EXEC_TIMEOUT,
     DEFAULT_RLM_MAX_TURNS,
+    DEFAULT_RLM_MAX_TURNS_IN_CONTEXT,
     DEFAULT_RLM_REF,
     DEFAULT_RLM_REPO_URL,
     build_install_script as build_rlm_install_script,
@@ -16,6 +18,13 @@ from verifiers.envs.experimental.composable.harnesses.opencode import (
     build_opencode_run_command,
     opencode_harness,
 )
+from verifiers.envs.experimental.composable.harnesses.mini_swe_agent import (
+    MINI_SWE_AGENT_CONFIG,
+    MINI_SWE_AGENT_INSTALL_SCRIPT,
+    build_mini_swe_agent_install_script,
+    build_mini_swe_agent_run_command,
+    mini_swe_agent_harness,
+)
 __all__ = [
     "rlm_harness",
@@ -24,6 +33,8 @@ __all__ = [
     "DEFAULT_RLM_REF",
     "DEFAULT_RLM_REPO_URL",
     "DEFAULT_RLM_MAX_TURNS",
+    "DEFAULT_RLM_MAX_TURNS_IN_CONTEXT",
+    "DEFAULT_RLM_EXEC_TIMEOUT",
     "opencode_harness",
     "build_opencode_install_script",
     "build_opencode_config",
@@ -32,4 +43,9 @@ __all__ = [
     "DEFAULT_DISABLED_TOOLS",
     "DEFAULT_RELEASE_SHA256",
     "DEFAULT_SYSTEM_PROMPT",
+    "mini_swe_agent_harness",
+    "build_mini_swe_agent_install_script",
+    "build_mini_swe_agent_run_command",
+    "MINI_SWE_AGENT_INSTALL_SCRIPT",
+    "MINI_SWE_AGENT_CONFIG",
 ]

verifiers-0.1.13.dev6/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""mini-SWE-agent harness configuration."""
+from __future__ import annotations
+from pathlib import PurePosixPath
+import shlex
+DEFAULT_INSTALL_DIR = "/opt/mini-swe-agent"
+DEFAULT_PREFIX_DIR = f"{DEFAULT_INSTALL_DIR}/prefix"  # default prefix_dir; the install script removes and recreates it
+DEFAULT_SITE_PACKAGES_DIR = f"{DEFAULT_PREFIX_DIR}/site-packages"  # prepended to PYTHONPATH by the run command
+DEFAULT_UV_SITE_PACKAGES_DIR = f"{DEFAULT_INSTALL_DIR}/uv-site-packages"  # pip --target directory for uv, separate from the agent's own packages
+DEFAULT_MINI_BINARY = f"{DEFAULT_PREFIX_DIR}/bin/mini"  # wrapper script written by the install script
+MINI_SWE_AGENT_CLI_PACKAGE = "mini-swe-agent"
+MINI_SWE_AGENT_CLI_VERSION = "2.2.8"
+MINI_SWE_AGENT_CLI_SHA256 = (
+    "694df4de1337e665e3cd82e99f93374f573bf52b8e7c362ac5d8045ad9f7c37c"
+)  # default digest the install script feeds to `sha256sum -c` for the downloaded wheel
+MINI_SWE_AGENT_PYTHON_VERSION = "3.11"  # interpreter installed through uv when the system python3 is older than 3.10
+UV_PACKAGE_VERSION = "0.11.7"  # pinned uv release, installed only for that fallback
+DEFAULT_PACKAGE_VERSION = MINI_SWE_AGENT_CLI_VERSION
+DEFAULT_PACKAGE_SHA256 = MINI_SWE_AGENT_CLI_SHA256
+DEFAULT_INSTRUCTION_PATH = "/mini-swe-agent/prompt.txt"  # file whose contents become the --task argument
+DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"  # used as agent.system_template only when the file is non-empty
+DEFAULT_LOG_DIR = "/logs/agent"
+DEFAULT_LOG_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.log"
+DEFAULT_TRAJECTORY_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.traj.json"
+DEFAULT_AGENT_WORKDIR = "${AGENT_WORKDIR:-/app}"  # shell parameter expansion; the run command emits it unquoted so bash resolves it at run time
+DEFAULT_CONFIG_SPEC = "mini_textbased"
+DEFAULT_MODEL_CLASS = "litellm_textbased"
+DEFAULT_ENVIRONMENT_TIMEOUT = 120  # passed as environment.timeout; unit not visible here — presumably seconds, confirm
+def build_mini_swe_agent_install_script(
+    package_version: str = DEFAULT_PACKAGE_VERSION,
+    package_sha256: str = DEFAULT_PACKAGE_SHA256,  # must match the wheel for package_version, otherwise `sha256sum -c` fails and `set -e` aborts the script
+    prefix_dir: str = DEFAULT_PREFIX_DIR,
+    install_python: bool = True,  # when true, prepend the apt-get bootstrap for python3/pip
+) -> str:
+    """Build the shell script that installs mini-SWE-agent."""
+    install_tools = ""
+    if install_python:
+        install_tools = """\
+export DEBIAN_FRONTEND=noninteractive
+if ! command -v python3 >/dev/null 2>&1 || ! python3 -m pip --version >/dev/null 2>&1; then
+  apt-get update -qq
+  apt-get install -y -qq python3 python3-pip ca-certificates
+fi
+"""  # apt-get only runs when python3 or its pip module is missing
+    quoted_prefix_dir = shlex.quote(prefix_dir)
+    site_packages_dir = f"{prefix_dir}/site-packages"  # follows prefix_dir, unlike the module-level DEFAULT_SITE_PACKAGES_DIR
+    wheel_filename = f"mini_swe_agent-{package_version}-py3-none-any.whl"  # interpolated into the script without shell quoting
+    wheel_url = (
+        f"https://files.pythonhosted.org/packages/py3/m/mini-swe-agent/{wheel_filename}"
+    )  # fetched with urllib inside the script, then hash-checked before installation
+    quoted_site_packages_dir = shlex.quote(site_packages_dir)
+    quoted_install_dir = shlex.quote(DEFAULT_INSTALL_DIR)  # always the default install dir; not derived from prefix_dir
+    quoted_uv_site_packages_dir = shlex.quote(DEFAULT_UV_SITE_PACKAGES_DIR)
+    return f"""\
+set -e
+{install_tools}
+rm -rf {quoted_prefix_dir}
+mkdir -p {quoted_install_dir} {quoted_prefix_dir}/bin {quoted_site_packages_dir} {quoted_uv_site_packages_dir} {shlex.quote(DEFAULT_LOG_DIR)} /mini-swe-agent
+export PIP_CONFIG_FILE=/dev/null
+export PIP_INDEX_URL=https://pypi.org/simple
+export PIP_BREAK_SYSTEM_PACKAGES=1
+unset PIP_EXTRA_INDEX_URL
+PYTHON_BIN="$(command -v python3)"
+MINI_SWE_AGENT_PYTHON="$PYTHON_BIN"
+if ! "$PYTHON_BIN" -c 'import sys; raise SystemExit(sys.version_info < (3, 10))'; then
+  "$PYTHON_BIN" -m pip install --quiet --target {quoted_uv_site_packages_dir} uv=={UV_PACKAGE_VERSION}
+  env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python install {MINI_SWE_AGENT_PYTHON_VERSION}
+  MINI_SWE_AGENT_PYTHON="$(env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python find {MINI_SWE_AGENT_PYTHON_VERSION})"
+fi
+MINI_SWE_AGENT_WHEEL_DIR="$(mktemp -d)"
+trap 'rm -rf "$MINI_SWE_AGENT_WHEEL_DIR"' EXIT
+MINI_SWE_AGENT_WHEEL="$MINI_SWE_AGENT_WHEEL_DIR/{wheel_filename}"
+MINI_SWE_AGENT_WHEEL_URL={shlex.quote(wheel_url)}
+export MINI_SWE_AGENT_WHEEL MINI_SWE_AGENT_WHEEL_URL
+"$PYTHON_BIN" -c 'import os, urllib.request; urllib.request.urlretrieve(os.environ["MINI_SWE_AGENT_WHEEL_URL"], os.environ["MINI_SWE_AGENT_WHEEL"])'
+echo "{package_sha256}  $MINI_SWE_AGENT_WHEEL" | sha256sum -c -
+if [ "$MINI_SWE_AGENT_PYTHON" = "$PYTHON_BIN" ]; then
+  "$PYTHON_BIN" -m pip install --quiet --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
+else
+  env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv pip install --python "$MINI_SWE_AGENT_PYTHON" --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
+fi
+echo "$MINI_SWE_AGENT_PYTHON" > {quoted_prefix_dir}/python
+cat > {quoted_prefix_dir}/bin/mini <<'EOF'
+#!/usr/bin/env sh
+export PYTHONPATH={shlex.quote(site_packages_dir)}:${{PYTHONPATH:-}}
+exec "$(cat {quoted_prefix_dir}/python)" -m minisweagent.run.mini "$@"
+EOF
+chmod +x {quoted_prefix_dir}/bin/mini
+test -x {quoted_prefix_dir}/bin/mini
+"""  # the heredoc delimiter is quoted ('EOF'), so ${{PYTHONPATH:-}} and $(cat ...) in the wrapper are evaluated when bin/mini runs, not at install time
+def build_mini_swe_agent_run_command(
+    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
+    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
+    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
+    log_path: str = DEFAULT_LOG_PATH,
+    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
+    mini_binary: str = DEFAULT_MINI_BINARY,
+    config_spec: str = DEFAULT_CONFIG_SPEC,
+    model_class: str = DEFAULT_MODEL_CLASS,
+    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
+    extra_config_specs: list[str] | None = None,  # None is treated as an empty list
+) -> str:
+    """Build the shell command that configures and runs mini-SWE-agent.
+    Config specs layer the cwd, timeout, LiteLLM model class, optional system
+    prompt template, and any caller-provided overrides before writing the
+    trajectory and teeing logs.
+    """
+    # Keep the default workdir shell-expanded for env-level overrides, mirroring
+    # the other harnesses.
+    if agent_workdir == DEFAULT_AGENT_WORKDIR:
+        workdir_assignment = f"MINI_SWE_AGENT_WORKDIR={DEFAULT_AGENT_WORKDIR}"  # left unquoted so bash expands ${AGENT_WORKDIR:-/app}
+    else:
+        workdir_assignment = f"MINI_SWE_AGENT_WORKDIR={shlex.quote(agent_workdir)}"  # caller-supplied paths are taken literally
+    config_args = [  # pre-quoted shell words, joined into the CONFIG_ARGS bash array literal below
+        "-c",
+        shlex.quote(config_spec),
+        "-c",
+        "agent.cost_limit=0",  # presumably disables the cost cap — confirm against mini-swe-agent's config docs
+        "-c",
+        f"environment.timeout={environment_timeout}",
+        "-c",
+        f"model.model_class={shlex.quote(model_class)}",
+        "-c",
+        "model.cost_tracking=ignore_errors",
+        "-c",
+        "model.model_kwargs.custom_llm_provider=openai",
+    ]
+    # Config specs are the mini CLI's native override format; use them for cwd,
+    # timeout, model class, and optional system prompt wiring.
+    for spec in extra_config_specs or []:  # appended after the fixed specs above; environment.cwd and agent.system_template are added later, inside the script
+        config_args.extend(["-c", shlex.quote(spec)])
+    log_dir = str(PurePosixPath(log_path).parent)  # PurePosixPath: POSIX path rules regardless of the host OS
+    trajectory_dir = str(PurePosixPath(trajectory_path).parent)
+    script = f"""\
+set -eo pipefail
+export PATH={shlex.quote(DEFAULT_PREFIX_DIR)}/bin:"$PATH"
+export PYTHONPATH={shlex.quote(DEFAULT_SITE_PACKAGES_DIR)}:"${{PYTHONPATH:-}}"
+export MSWEA_CONFIGURED=true
+export MSWEA_SILENT_STARTUP=true
+export MSWEA_GLOBAL_CONFIG_DIR=/tmp/mini-swe-agent-config
+export OPENAI_API_KEY="${{OPENAI_API_KEY:-intercepted}}"
+{workdir_assignment}
+mkdir -p {shlex.quote(log_dir)} {shlex.quote(trajectory_dir)} "$MINI_SWE_AGENT_WORKDIR" "$MSWEA_GLOBAL_CONFIG_DIR"
+MINI_SWE_AGENT_TASK="$(cat {shlex.quote(instruction_path)})"
+CONFIG_ARGS=({" ".join(config_args)})
+CONFIG_ARGS+=(-c "environment.cwd=$MINI_SWE_AGENT_WORKDIR")
+if [ -s {shlex.quote(system_prompt_path)} ]; then
+  CONFIG_ARGS+=(-c "agent.system_template=$(cat {shlex.quote(system_prompt_path)})")
+fi
+cd "$MINI_SWE_AGENT_WORKDIR"
+timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" {shlex.quote(mini_binary)} \\
+  --model "$OPENAI_MODEL" \\
+  --task "$MINI_SWE_AGENT_TASK" \\
+  --output {shlex.quote(trajectory_path)} \\
+  --exit-immediately \\
+  --yolo \\
+  "${{CONFIG_ARGS[@]}}" 2>&1 | tee -a {shlex.quote(log_path)}
+"""  # pipefail makes the pipeline status reflect a failing timeout/mini run even though tee is the last command
+    return f"bash -lc {shlex.quote(script)}"  # the whole script travels as one quoted argument to a login bash
+MINI_SWE_AGENT_INSTALL_SCRIPT = build_mini_swe_agent_install_script()  # all-defaults install script, built once at import time
+MINI_SWE_AGENT_CONFIG = {
+    "install_script": MINI_SWE_AGENT_INSTALL_SCRIPT,
+    "cli_package": MINI_SWE_AGENT_CLI_PACKAGE,
+    "cli_version": MINI_SWE_AGENT_CLI_VERSION,
+    "cli_sha256": MINI_SWE_AGENT_CLI_SHA256,
+}  # bundles the default install script with the pinned package name, version and digest
+def mini_swe_agent_harness(
+    system_prompt: str | None = None,
+    task_system_prompt: str | None = None,  # appended to system_prompt after a newline, or used alone when system_prompt is falsy
+    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
+    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
+    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
+    log_path: str = DEFAULT_LOG_PATH,
+    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
+    package_version: str = DEFAULT_PACKAGE_VERSION,
+    package_sha256: str = DEFAULT_PACKAGE_SHA256,
+    config_spec: str = DEFAULT_CONFIG_SPEC,
+    model_class: str = DEFAULT_MODEL_CLASS,
+    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
+    extra_config_specs: list[str] | None = None,
+):
+    """Create a Harness configured for mini-SWE-agent."""
+    from verifiers.envs.experimental.composable import Harness  # function-scope import — presumably to avoid a circular import; confirm
+    if task_system_prompt:  # None and "" both skip the merge
+        if system_prompt:
+            system_prompt = system_prompt + "\n" + task_system_prompt
+        else:
+            system_prompt = task_system_prompt
+    # The system prompt is passed through ComposableEnv as a file and injected
+    # into mini's agent.system_template at runtime.
+    return Harness(
+        install_script=build_mini_swe_agent_install_script(
+            package_version=package_version,
+            package_sha256=package_sha256,
+        ),  # prefix_dir and install_python stay at their defaults
+        run_command=build_mini_swe_agent_run_command(
+            agent_workdir=agent_workdir,
+            instruction_path=instruction_path,
+            system_prompt_path=system_prompt_path,
+            log_path=log_path,
+            trajectory_path=trajectory_path,
+            config_spec=config_spec,
+            model_class=model_class,
+            environment_timeout=environment_timeout,
+            extra_config_specs=extra_config_specs,
+        ),  # mini_binary stays at DEFAULT_MINI_BINARY
+        system_prompt=system_prompt,
+        instruction_path=instruction_path,
+        system_prompt_path=system_prompt_path,
+        log_path=log_path,  # trajectory_path is only baked into run_command; it is not passed to Harness
+    )

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/rlm.py RENAMED Viewed

@@ -15,6 +15,8 @@ from verifiers.envs.experimental.utils.git_checkout_cache import (
 DEFAULT_RLM_REPO_URL = "github.com/PrimeIntellect-ai/rlm.git"
 DEFAULT_RLM_REF = "main"
 DEFAULT_RLM_MAX_TURNS = 100
+DEFAULT_RLM_MAX_TURNS_IN_CONTEXT = -1
+DEFAULT_RLM_EXEC_TIMEOUT = 300
 DEFAULT_APPEND_TO_SYSTEM_PROMPT_PATH = "/task/append_to_system_prompt.txt"
 DEFAULT_RLM_CHECKOUT_PATH = "/tmp/rlm-checkout"
 DEFAULT_RLM_CHECKOUT_UPLOAD_NAME = "rlm_checkout"
@@ -98,6 +100,9 @@ def rlm_harness(
     instruction_path: str = "/task/instruction.md",
     rlm_repo_url: str = DEFAULT_RLM_REPO_URL,
     rlm_ref: str = DEFAULT_RLM_REF,
+    rlm_max_turns: int = DEFAULT_RLM_MAX_TURNS,
+    rlm_max_turns_in_context: int = DEFAULT_RLM_MAX_TURNS_IN_CONTEXT,
+    rlm_exec_timeout: int = DEFAULT_RLM_EXEC_TIMEOUT,
     append_to_system_prompt: str | None = None,
     local_checkout: str | Path | None = None,
     gh_token: str | None = None,
@@ -106,13 +111,20 @@ def rlm_harness(
 ) -> Harness:
     """Build an RLM harness.
-    ``rlm_tools`` is the single source of truth for which builtin tools are
-    active. The same list drives both ``Harness.tool_names`` (so
-    ``ToolMonitorRubric`` tracks exactly the active tools) and
-    ``Harness.environment_vars["RLM_TOOLS"]`` (so the RLM sandbox advertises
-    the same set to the model). Callers do not need to — and should not —
-    add ``RLM_TOOLS`` to ``ComposableEnv(environment_vars=...)`` themselves;
-    the harness owns it.
+    The harness is the single source of truth for every ``RLM_*`` sandbox
+    env var the RLM subprocess reads. Kwargs map 1:1 onto env vars written
+    to ``Harness.environment_vars`` and merged into the sandbox by
+    ``ComposableEnv`` (harness-wins):
+    - ``rlm_tools`` → ``RLM_TOOLS`` (also drives ``Harness.tool_names`` so
+      ``ToolMonitorRubric`` tracks exactly the active tools)
+    - ``rlm_max_turns`` → ``RLM_MAX_TURNS``
+    - ``rlm_max_turns_in_context`` → ``RLM_MAX_TURNS_IN_CONTEXT``
+    - ``rlm_exec_timeout`` → ``RLM_EXEC_TIMEOUT``
+    Callers do not need to — and should not — add these keys to
+    ``ComposableEnv(environment_vars=...)`` themselves; pass the kwargs
+    here and the harness owns the env var plumbing.
     ``allow_git`` defaults to False, mirroring opencode's bash tool. When
     False, a ``/usr/local/bin/git`` shim is uploaded that refuses on any
@@ -163,7 +175,12 @@ def rlm_harness(
         metrics_key="metrics",
         metrics_prefix="rlm_",
         tool_names=tool_names,
-        environment_vars={"RLM_TOOLS": ",".join(tool_names)},
+        environment_vars={
+            "RLM_TOOLS": ",".join(tool_names),
+            "RLM_MAX_TURNS": str(rlm_max_turns),
+            "RLM_MAX_TURNS_IN_CONTEXT": str(rlm_max_turns_in_context),
+            "RLM_EXEC_TIMEOUT": str(rlm_exec_timeout),
+        },
         post_install_uploads=post_install_uploads,
         post_install_script=post_install_script,
     )

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/task.py RENAMED Viewed

@@ -32,7 +32,7 @@ from dataclasses import dataclass
 from importlib.abc import Traversable
 from pathlib import Path
 from types import ModuleType
-from typing import Any, Callable
+from typing import Any, Callable, Self
 from verifiers.envs.experimental.composable._filter import _resolve_filter_fn
 from verifiers.types import Messages, State
@@ -279,13 +279,13 @@ class TaskSet:
     # -- Combinators ---------------------------------------------------------
-    def filter(self, predicate: Callable[[dict], bool]) -> TaskSet:
+    def filter(self, predicate: Callable[[dict], bool]) -> Self:
         clone = object.__new__(type(self))
         clone.__dict__.update(self.__dict__)
         clone._dataset = self._dataset.filter(predicate)
         return clone
-    def take(self, n: int) -> TaskSet:
+    def take(self, n: int) -> Self:
         clone = object.__new__(type(self))
         clone.__dict__.update(self.__dict__)
         clone._dataset = self._dataset.select(range(min(n, len(self._dataset))))

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/eval.py RENAMED Viewed

@@ -142,6 +142,47 @@ def build_extra_headers(raw: dict[str, Any]) -> dict[str, str]:
     return {**eval_headers_table, **eval_headers_from_list}
+def build_extra_headers_from_state(raw: dict[str, Any]) -> dict[str, str]:
+    """Build the header-name → state-key map for `ClientConfig.extra_headers_from_state`.
+    Reads a TOML table (`headers_from_state = { "X-Session-ID" = "trajectory_id" }`)
+    and/or a repeatable list (`--header-from-state "X-Session-ID: trajectory_id"`).
+    The CLI list wins on key collisions with the table.
+    """
+    table: dict[str, str] = {}
+    raw_table = raw.get("headers_from_state")
+    if raw_table is not None:
+        table = _validate_extra_headers_value(raw_table)  # validation rules live in that helper, which is not visible in this hunk
+    raw_list = raw.get("header_from_state")
+    if raw_list is None:  # key absent or explicitly None (the argparse default) → no list entries
+        raw_list = []
+    if not isinstance(raw_list, list):
+        raise ValueError(
+            "'header_from_state' must be a list of 'Name: state_key' strings"
+        )
+    from_list: dict[str, str] = {}
+    for entry in raw_list:
+        if not isinstance(entry, str):
+            raise ValueError(
+                f"Each 'header_from_state' entry must be a string 'Name: state_key', got: {entry!r}"
+            )
+        if ":" not in entry:
+            raise ValueError(
+                f"--header-from-state must be 'Name: state_key', got: {entry!r}"
+            )
+        key, value = entry.split(":", 1)  # split on the first colon only; later colons stay in the state key
+        key, value = key.strip(), value.strip()
+        if not key:
+            raise ValueError("--header-from-state name cannot be empty")
+        if not value:
+            raise ValueError("--header-from-state state_key cannot be empty")
+        from_list[key] = value  # a repeated header name keeps the last entry
+    return {**table, **from_list}  # later unpacking wins: list entries override table entries
 def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
     """Get eval config defaults from the environment module's pyproject.toml.
@@ -279,6 +320,16 @@ def build_parser() -> argparse.ArgumentParser:
         default=None,
         help="Extra HTTP header to pass to inference API. 'Name: Value'. Repeatable.",
     )
+    parser.add_argument(
+        "--header-from-state",
+        action="append",
+        default=None,
+        help=(
+            "Per-request HTTP header whose value is read from the rollout state. "
+            "'Name: state_key' (e.g. 'X-Session-ID: trajectory_id'). Repeatable. "
+            "Defaults to X-Session-ID=example_id if unset."
+        ),
+    )
     parser.add_argument(
         "--num-examples",
         "-n",
@@ -639,6 +690,12 @@ def main(argv: list[str] | None = None):
         )
         # Build headers: registry < [[eval]] headers table < header list / --header
         eval_headers_merged = build_extra_headers(raw)
+        # Default X-Session-ID → example_id for sticky DP-aware routing;
+        # user-supplied headers_from_state / --header-from-state override.
+        eval_headers_from_state = {
+            "X-Session-ID": "example_id",
+            **build_extra_headers_from_state(raw),
+        }
         registry_headers_base: dict[str, str] = {}
         if endpoint_group is not None:
@@ -683,7 +740,7 @@ def main(argv: list[str] | None = None):
             api_base_url=primary_api_base_url,
             endpoint_configs=endpoint_configs,
             extra_headers=merged_headers,
-            extra_headers_from_state={"X-Session-ID": "example_id"},
+            extra_headers_from_state=eval_headers_from_state,
         )
         # Backward-compatible TOML field: resume_path

{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/eval_utils.py RENAMED Viewed

@@ -439,6 +439,8 @@ def load_toml_config(
         "api_base_url",
         "header",
         "headers",
+        "header_from_state",
+        "headers_from_state",
         # sampling
         "sampling_args",
         "max_tokens",

verifiers 0.1.13.dev4__tar.gz → 0.1.13.dev6__tar.gz

verifiers 0.1.13.dev4__tar.gz → 0.1.13.dev6__tar.gz