PyPI - benchflow - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

benchflow 0.2.2.tar.gz → 0.2.3.tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (140) hide show

{benchflow-0.2.2 → benchflow-0.2.3}/CHANGELOG.md RENAMED Viewed

@@ -2,6 +2,26 @@
 ## [Unreleased]
+## 0.2.3 — 2026-04-15
+### Added
+- `benchmarks/tb2_multiturn-claude-haiku45.yaml` — shipped config for the README's TB2 multi-turn Claude result.
+- Daytona resource clamping via `BENCHFLOW_DAYTONA_MAX_CPUS` / `MAX_MEMORY_MB`.
+### Changed
+- Renamed `skillsbench-claude-glm5.yaml` → `skillsbench-claude-glm51.yaml` to match the model ID.
+- `codex --login` correction in `docs/getting-started.md`.
+- Restricted sdist build to `src/`, `tests/`, and metadata.
+### Fixed
+- Verifier sandbox hardening follow-ups across several base-image and tooling edge cases.
+- Preserve trusted verifier path entries and workspace answer files.
+- Redirect oracle output to container log.
+- Align YAML path resolution to config file location.
 ## 0.2.2 — 2026-04-13
 ### Added

{benchflow-0.2.2 → benchflow-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchflow
-Version: 0.2.2
+Version: 0.2.3
 Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
 Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
 Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -28,7 +28,7 @@ Requires-Dist: typer>=0.9
 Provides-Extra: dev
 Requires-Dist: pre-commit>=3.7; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
-Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: pytest>=9.0.3; extra == 'dev'
 Requires-Dist: ruff>=0.7.0; extra == 'dev'
 Requires-Dist: ty>=0.0.1a1; extra == 'dev'
 Description-Content-Type: text/markdown
@@ -163,7 +163,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
 **SkillsBench** (86 tasks — tool use, file editing, API calls):
 ```bash
-python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml   # Claude
+python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml   # Claude
 python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml   # Codex
 ```

{benchflow-0.2.2 → benchflow-0.2.3}/README.md RENAMED Viewed

@@ -128,7 +128,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
 **SkillsBench** (86 tasks — tool use, file editing, API calls):
 ```bash
-python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml   # Claude
+python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml   # Claude
 python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml   # Codex
 ```

{benchflow-0.2.2 → benchflow-0.2.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "benchflow"
-version = "0.2.2"
+version = "0.2.3"
 description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
 readme = "README.md"
 requires-python = ">=3.12"
@@ -37,7 +37,7 @@ classifiers = [
 [project.optional-dependencies]
 dev = [
     "pre-commit>=3.7",
-    "pytest>=8.0",
+    "pytest>=9.0.3",
     "pytest-asyncio>=0.24.0",
     "ruff>=0.7.0",
     "ty>=0.0.1a1",
@@ -58,20 +58,20 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.sdist]
-exclude = [
-    ".venv*",
-    ".ref",
-    "jobs",
-    "dist",
-    ".claude",
-    ".dev-docs",
-    ".pytest_cache",
-    "__pycache__",
+# Allowlist: only ship what the installed package needs.
+only-include = [
+    "src",
+    "tests",
+    "README.md",
+    "CHANGELOG.md",
+    "LICENSE",
+    "pyproject.toml",
 ]
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 addopts = "-m 'not live'"
+testpaths = ["tests"]
 markers = [
     "live: requires real Anthropic API and Docker daemon (run with -m live)",
 ]

{benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_env_setup.py RENAMED Viewed

@@ -2,6 +2,7 @@
 import json
 import logging
+import os
 import re
 import shutil
 from pathlib import Path
@@ -14,6 +15,12 @@ from benchflow.agents.registry import AGENTS
 logger = logging.getLogger(__name__)
+# Daytona's per-sandbox cap on the default tier is 4 CPU / 8 GB. Tasks declaring
+# more fail at sandbox creation. Clamp here so tasks degrade gracefully (slower
+# build) instead of erroring out. Override via env if running on a paid tier.
+_DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4"))
+_DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192"))
 # Directories to ignore when copying deps
 _IGNORE_DIRS = {
     ".venv",
@@ -253,12 +260,28 @@ def _create_environment(
     elif environment_type == "daytona":
         from harbor.environments.daytona import DaytonaEnvironment
+        env_config = task.config.environment
+        if env_config.cpus > _DAYTONA_MAX_CPUS:
+            logger.warning(
+                "Clamping cpus %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_CPUS)",
+                env_config.cpus,
+                _DAYTONA_MAX_CPUS,
+            )
+            env_config.cpus = _DAYTONA_MAX_CPUS
+        if env_config.memory_mb > _DAYTONA_MAX_MEMORY_MB:
+            logger.warning(
+                "Clamping memory_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_MEMORY_MB)",
+                env_config.memory_mb,
+                _DAYTONA_MAX_MEMORY_MB,
+            )
+            env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB
         return DaytonaEnvironment(
             environment_dir=task.paths.environment_dir,
             environment_name=task_path.name,
             session_id=trial_name,
             trial_paths=trial_paths,
-            task_env_config=task.config.environment,
+            task_env_config=env_config,
             auto_stop_interval_mins=1440,
             auto_delete_interval_mins=1440,
         )

{benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_sandbox.py RENAMED Viewed

@@ -13,8 +13,11 @@ Does not own:
 import json as _json
 import logging
+import os
 import re
 import shlex
+import tomllib
+from pathlib import Path
 from typing import TYPE_CHECKING
 from benchflow.agents.registry import get_sandbox_home_dirs
@@ -99,7 +102,9 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str:
     )
-async def setup_sandbox_user(env, sandbox_user: str, workspace: str) -> str:
+async def setup_sandbox_user(
+    env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120
+) -> str:
     """Create non-root sandbox user, grant workspace access. Return agent_cwd."""
     if not re.match(r"^[a-z_][a-z0-9_-]*$", sandbox_user):
         raise ValueError(
@@ -119,7 +124,7 @@ async def setup_sandbox_user(env, sandbox_user: str, workspace: str) -> str:
         f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && "
         f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && "
         f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}",
-        timeout_sec=30,
+        timeout_sec=timeout_sec,
     )
     logger.info(f"Sandbox user {sandbox_user} ready (workspace={workspace})")
     return workspace
@@ -314,8 +319,15 @@ VERIFIER_ENV: dict[str, str] = {
     "PYTHONNOUSERSITE": "1",
     "PIP_USER": "0",
     "PIP_NO_USER_CONFIG": "1",
-    # Force HOME to a non-existent path so pip cannot read any pre-staged pip.conf.
-    "HOME": "/nonexistent",
+    # PEP-668 base images (Fedora, recent Debian) refuse pip installs into
+    # system-site without this flag. Verifier runs as root and system-site is
+    # root-owned, so allowing it is safe; without it, tasks that pip-install
+    # pytest in test.sh either fail outright or fall back to a user-site path
+    # that PYTHONNOUSERSITE=1 hides at import time.
+    "PIP_BREAK_SYSTEM_PACKAGES": "1",
+    # /root is root-owned; sandbox_user cannot pre-stage caches there. Pip
+    # config is already blocked by the PIP_* / PYTHONNOUSERSITE vars above.
+    "HOME": "/root",
     # Disable breakpoint() — any other value imports an arbitrary callable.
     "PYTHONBREAKPOINT": "0",
     # Prevent coverage.py from importing a config file as Python on startup.
@@ -325,6 +337,258 @@ VERIFIER_ENV: dict[str, str] = {
     "CELERY_CONFIG_MODULE": "",
 }
+_SAFE_VERIFIER_PATH = VERIFIER_ENV["PATH"]
+_SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":"))
+_RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed")
+# pytest plugin names are not always the same as the PyPI distribution name
+# or the option they register. These aliases cover the common benchmark
+# verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1.
+_PYTEST_PLUGIN_ALIASES = {
+    "ctrf": "ctrf",
+    "pytest-json-ctrf": "ctrf",
+    "pytest_json_ctrf": "ctrf",
+    "pytest_json_ctrf.plugin": "ctrf",
+    "pytest-json-report": "pytest_jsonreport",
+    "pytest_json_report": "pytest_jsonreport",
+    "pytest_jsonreport": "pytest_jsonreport",
+    "pytest_jsonreport.plugin": "pytest_jsonreport",
+}
+_PYTEST_OPTION_PLUGINS = {
+    "--ctrf": "ctrf",
+    "--json-report": "pytest_jsonreport",
+    "--json-report-file": "pytest_jsonreport",
+}
+# Pytest plugins worth auto-loading when test.sh pip-installs them but the
+# task author forgot to declare pytest_plugins in task.toml. Map distribution
+# name (as it appears in `pip install pytest-foo`) to importable plugin name.
+_PYTEST_INSTALLED_PLUGINS = {
+    "pytest-asyncio": "pytest_asyncio",
+    "pytest-anyio": "anyio.pytest_plugin",
+    "pytest-trio": "pytest_trio",
+}
+_PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE)
+def _under_path(path: str, prefix: str) -> bool:
+    prefix = prefix.rstrip("/")
+    return path == prefix or path.startswith(f"{prefix}/")
+def _blocked_verifier_path_prefixes(
+    sandbox_user: str | None, workspace: str | None
+) -> tuple[str, ...]:
+    """Paths that must never be preserved as verifier PATH extras."""
+    prefixes = list(_RUNTIME_PATH_PREFIXES)
+    if workspace:
+        prefixes.append(workspace)
+    if sandbox_user:
+        prefixes.append(f"/home/{sandbox_user}")
+    return tuple(dict.fromkeys(prefixes))
+def _merge_trusted_verifier_path(extras: list[str]) -> str:
+    """Prepend validated image PATH entries to the verifier allowlist."""
+    kept: list[str] = []
+    seen: set[str] = set(_SAFE_VERIFIER_PATH_PARTS)
+    for entry in extras:
+        if entry and entry not in seen:
+            seen.add(entry)
+            kept.append(entry)
+    return ":".join([*kept, *_SAFE_VERIFIER_PATH_PARTS])
+_TRUSTED_PATH_EXTRAS_SCRIPT = r"""
+import json
+import os
+import stat
+import sys
+raw_path = json.loads(sys.argv[1])
+safe_parts = set(json.loads(sys.argv[2]))
+blocked_prefixes = tuple(json.loads(sys.argv[3]))
+def under_path(path, prefix):
+    prefix = prefix.rstrip("/")
+    return path == prefix or path.startswith(prefix + "/")
+trusted = []
+seen = set(safe_parts)
+for entry in raw_path.split(":"):
+    entry = entry.strip()
+    if (
+        not entry
+        or entry in seen
+        or not entry.startswith("/")
+        or "\x00" in entry
+        or "\n" in entry
+    ):
+        continue
+    seen.add(entry)
+    try:
+        real = os.path.realpath(entry)
+        st = os.stat(real)
+    except OSError:
+        continue
+    if not stat.S_ISDIR(st.st_mode):
+        continue
+    if any(under_path(real, prefix) for prefix in blocked_prefixes):
+        continue
+    if st.st_uid != 0:
+        continue
+    if st.st_mode & (stat.S_IWGRP | stat.S_IWOTH):
+        continue
+    trusted.append(entry)
+print(json.dumps(trusted))
+""".strip()
+def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) -> str:
+    """Build the container-side command that validates verifier PATH extras."""
+    return (
+        f"python3 -c {shlex.quote(_TRUSTED_PATH_EXTRAS_SCRIPT)} "
+        f"{shlex.quote(_json.dumps(raw_path))} "
+        f"{shlex.quote(_json.dumps(_SAFE_VERIFIER_PATH_PARTS))} "
+        f"{shlex.quote(_json.dumps(blocked_prefixes))}"
+    )
+def _normalize_pytest_plugin(name: object) -> str | None:
+    """Return the importable pytest plugin name for a task declaration."""
+    if not isinstance(name, str):
+        return None
+    clean = name.strip()
+    if not clean:
+        return None
+    return _PYTEST_PLUGIN_ALIASES.get(clean, clean)
+def _plugins_from_verifier_script(task: "Task") -> list[str]:
+    """Infer known pytest plugins needed by legacy verifier scripts.
+    Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and
+    call options such as --ctrf directly from tests/test.sh. With pytest entry
+    point autoload disabled, those options must be backed by explicit -p flags.
+    """
+    task_dir = getattr(task, "task_dir", None)
+    if not isinstance(task_dir, (str, os.PathLike)):
+        return []
+    test_sh = Path(task_dir) / "tests" / "test.sh"
+    try:
+        content = test_sh.read_text()
+    except OSError:
+        return []
+    plugins: list[str] = []
+    for option, plugin in _PYTEST_OPTION_PLUGINS.items():
+        if option in content and plugin not in plugins:
+            plugins.append(plugin)
+    # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1
+    # doesn't silently drop them. Only matches the exact installer line so
+    # arbitrary text mentioning the plugin name is ignored.
+    for match in _PIP_INSTALL_RE.findall(content):
+        for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items():
+            if dist in match and plugin not in plugins:
+                plugins.append(plugin)
+    return plugins
+def _declared_pytest_plugins(task: "Task") -> list[object]:
+    """Return pytest_plugins from the model, falling back to raw task.toml."""
+    declared = getattr(task.config.verifier, "pytest_plugins", None)
+    if declared:
+        return list(declared)
+    task_dir = getattr(task, "task_dir", None)
+    if not isinstance(task_dir, (str, os.PathLike)):
+        return []
+    config_path = Path(task_dir) / "task.toml"
+    try:
+        data = tomllib.loads(config_path.read_text())
+    except (OSError, tomllib.TOMLDecodeError):
+        return []
+    plugins = data.get("verifier", {}).get("pytest_plugins", [])
+    if isinstance(plugins, list):
+        return plugins
+    return []
+def _pytest_plugin_flags(task: "Task") -> str:
+    """Build deterministic -p flags for inferred and declared pytest plugins."""
+    plugins: list[str] = []
+    for plugin in _plugins_from_verifier_script(task):
+        if plugin not in plugins:
+            plugins.append(plugin)
+    for plugin in _declared_pytest_plugins(task):
+        normalized = _normalize_pytest_plugin(plugin)
+        if normalized and normalized not in plugins:
+            plugins.append(normalized)
+    return " ".join(f"-p {shlex.quote(p)}" for p in plugins)
+_FEDORA_LIKE = ("fedora", "rhel", "centos", "rocky", "alma")
+async def _distro_pip_env(env) -> dict[str, str]:
+    """Distro-conditional pip env to neutralize Fedora's user-install fallback.
+    Fedora's downstream pip patch routes root pip-installs to ~/.local/lib
+    even with PIP_USER=0 + PIP_BREAK_SYSTEM_PACKAGES=1. PYTHONNOUSERSITE=1 then
+    hides those installs from python3 at import time. Pinning PIP_PREFIX on
+    Fedora-likes only writes them to /usr/local where python3 can find them.
+    Setting PIP_PREFIX on Debian/Ubuntu would double-prefix (their downstream
+    pip already injects --prefix=/usr/local for root), creating
+    /usr/local/usr/local/bin/pytest. So this is conditional on the image distro.
+    """
+    try:
+        result = await env.exec(
+            "cat /etc/os-release 2>/dev/null || true", user="root", timeout_sec=5
+        )
+    except Exception:
+        return {}
+    text = (result.stdout or "").lower()
+    ids: list[str] = []
+    for line in text.splitlines():
+        if line.startswith("id=") or line.startswith("id_like="):
+            value = line.split("=", 1)[1].strip().strip('"').strip("'")
+            ids.extend(value.split())
+    if any(d in ids for d in _FEDORA_LIKE):
+        return {"PIP_PREFIX": "/usr/local"}
+    return {}
+async def _trusted_verifier_path(
+    env, sandbox_user: str | None, workspace: str | None
+) -> str:
+    """Return verifier PATH with trusted image extras preserved.
+    Dockerfile PATH additions are accepted only after container-side stat
+    checks prove they are root-owned directories and not group/world writable.
+    Runtime locations and sandbox-user writable locations stay excluded.
+    """
+    path_result = await env.exec("printenv PATH", user="root", timeout_sec=10)
+    raw_path = path_result.stdout or ""
+    if not raw_path.strip():
+        return _SAFE_VERIFIER_PATH
+    cmd = _trusted_path_extras_cmd(
+        raw_path, _blocked_verifier_path_prefixes(sandbox_user, workspace)
+    )
+    result = await env.exec(cmd, user="root", timeout_sec=10)
+    try:
+        extras = _json.loads(result.stdout or "[]")
+    except _json.JSONDecodeError:
+        logger.warning("Could not parse trusted verifier PATH extras; using safe PATH")
+        extras = []
+    if not isinstance(extras, list):
+        logger.warning("Invalid trusted verifier PATH extras; using safe PATH")
+        extras = []
+    return _merge_trusted_verifier_path([e for e in extras if isinstance(e, str)])
 # Wipe and recreate /logs/verifier/ before the verifier runs.
 # rm -rf severs hardlinks, removes symlink replacements, and eliminates
 # variant filenames/subdirs the agent may have pre-staged.
@@ -355,18 +619,24 @@ CLEANUP_CMD = (
 async def harden_before_verify(
-    env, task: "Task", sandbox_user: str | None, workspace: str | None = None
+    env,
+    task: "Task",
+    sandbox_user: str | None,
+    workspace: str | None = None,
+    # Default false because SkillsBench/TB2-style answers often are workspace
+    # edits. Going forward, enforce true only via an explicit task/benchmark
+    # contract, e.g. task.toml [verifier] restore_workspace = true after an
+    # oracle/diff audit proves the answer is not stored in the workspace.
+    restore_workspace: bool = False,
 ) -> None:
     """Neutralize agent tampering before running the verifier.
     1. Kill sandbox-user processes (prevent concurrent writes during teardown).
     2. Assert all sandbox-user processes are dead, then wipe/recreate
        /logs/verifier/ with a clean root-owned directory.
-    3. Restore build-config files to pre-agent state (if workspace provided).
-    4. Sync restored build-config files into /testbed_verify (if workspace provided).
-    4b.Full workspace restore from /testbed_verify — resets ALL source files to
-       pre-agent canonical state, not just the build-config subset.
-    4c.Purge symlinks and __pycache__ trees from workspace.
+    3. Optionally restore the workspace from the pre-agent snapshot. This is
+       destructive to legitimate workspace-edit answers, so it is opt-in.
+    4. Purge symlinks and __pycache__ trees from workspace.
     5. chown workspace to root (belt-and-suspenders against zombie sandbox writes).
     6. Remove injected conftest.py, sitecustomize.py, .pth files.
     7. Merge trusted env vars into task.config.verifier.env.
@@ -386,7 +656,7 @@ async def harden_before_verify(
         )
     # Wipe and recreate /logs/verifier/ with a clean root-owned directory.
     await env.exec(_CLEAR_VERIFIER_DIR_CMD, user="root")
-    if workspace:
+    if workspace and restore_workspace:
         await _restore_build_config(env, workspace)
         await _refresh_verifier_workspace(env, workspace)
         # Full workspace restore from /testbed_verify (the pre-agent canonical
@@ -402,18 +672,41 @@ async def harden_before_verify(
             f"fi",
             user="root",
         )
-        # Purge symlinks so an agent-planted symlink pointing to a writable
-        # external target (e.g. utils.py -> /tmp/evil.py) cannot be followed
-        # during import after the workspace is frozen.
+    if workspace:
+        # Purge symlinks whose realpath escapes the workspace (e.g. agent-planted
+        # utils.py -> /tmp/evil.py). In-tree symlinks are preserved — some
+        # source trees (OTP, lake-packages, etc.) ship them as fixtures.
+        # Residual risk: agent plants utils.py -> ./evil.py inside workspace.
+        # Mitigated by the workspace chown below + CLEANUP_CMD path scrubbing.
+        purge_external_symlinks = (
+            "import os, sys, pathlib;"
+            "root = pathlib.Path(sys.argv[1]).resolve();"
+            "root_str = str(root) + os.sep;"
+            "[p.unlink() for p in root.rglob('*')"
+            " if p.is_symlink()"
+            " and not (str(p.resolve()) + os.sep).startswith(root_str)"
+            " and str(p.resolve()) != str(root)]"
+        )
         await env.exec(
-            f"find {shlex.quote(workspace)} -type l -delete 2>/dev/null; true",
+            f"python3 -c {shlex.quote(purge_external_symlinks)} "
+            f"{shlex.quote(workspace)} 2>/dev/null; true",
             user="root",
         )
-        # Purge __pycache__ trees so pre-compiled .pyc bytecode cannot execute
-        # even if PYTHONPYCACHEPREFIX is bypassed (defense-in-depth).
+        # Purge __pycache__ trees that did not exist in the pre-agent baseline,
+        # so agent-planted .pyc bytecode cannot execute even if
+        # PYTHONPYCACHEPREFIX is bypassed. Baseline-present caches are kept so
+        # tasks whose tests diff workspace against /testbed_verify don't break.
         await env.exec(
-            f"find {shlex.quote(workspace)} -type d -name '__pycache__'"
-            f" -exec rm -rf {{}} + 2>/dev/null; true",
+            f"if [ -d /testbed_verify ]; then "
+            f"  find {shlex.quote(workspace)} -type d -name __pycache__ -print0 "
+            f"  | while IFS= read -r -d '' d; do "
+            f"      rel=${{d#{shlex.quote(workspace)}/}}; "
+            f'      [ -d "/testbed_verify/$rel" ] || rm -rf "$d"; '
+            f"  done; "
+            f"else "
+            f"  find {shlex.quote(workspace)} -type d -name '__pycache__'"
+            f" -exec rm -rf {{}} + 2>/dev/null; "
+            f"fi; true",
             user="root",
         )
         # chown workspace to root: belt-and-suspenders against any zombie
@@ -424,24 +717,29 @@ async def harden_before_verify(
         )
     await env.exec(CLEANUP_CMD, user="root", timeout_sec=10)
+    hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace)
+    distro_env = await _distro_pip_env(env)
     verifier_env = dict(VERIFIER_ENV)
+    verifier_env.update(distro_env)
     if task.config.verifier.env:
         verifier_env.update(task.config.verifier.env)
     # Hard security invariants — re-pin after task-env merge so a task cannot
-    # strip -c /dev/null / --confcutdir, re-enable entry-point plugin loading,
-    # or inject code via breakpoint()/coverage/Django/Celery startup hooks.
+    # replace PATH, strip -c /dev/null / --confcutdir, re-enable entry-point
+    # plugin loading, or inject code via breakpoint()/coverage/Django/Celery
+    # startup hooks.
+    verifier_env["PATH"] = hardened_path
     verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1"
     verifier_env["PYTHONBREAKPOINT"] = "0"
     verifier_env["COVERAGE_PROCESS_START"] = ""
     verifier_env["DJANGO_SETTINGS_MODULE"] = ""
     verifier_env["CELERY_CONFIG_MODULE"] = ""
-    # Re-enable explicitly declared plugins by appending -p flags to the
-    # hardened base — never to a task-supplied PYTEST_ADDOPTS.
-    # getattr: field absent in older harbor deployments; bare access was a live crash.
-    allowed_plugins = getattr(task.config.verifier, "pytest_plugins", None) or []
+    # Re-enable known verifier plugins by appending -p flags to the hardened
+    # base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are
+    # inferred from tests/test.sh; newer tasks may declare pytest_plugins.
     base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"]
-    if allowed_plugins:
-        flags = " ".join(f"-p {shlex.quote(p)}" for p in allowed_plugins)
+    flags = _pytest_plugin_flags(task)
+    if flags:
         verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}"
     else:
         verifier_env["PYTEST_ADDOPTS"] = base_addopts

{benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/job.py RENAMED Viewed

@@ -243,14 +243,14 @@ class Job:
         # Detect format: Harbor uses "agents" + "datasets", benchflow uses "agent"
         if "agents" in raw or "datasets" in raw:
-            return cls._from_harbor_yaml(raw, path.parent, **kwargs)
-        return cls._from_native_yaml(raw, path.parent, **kwargs)
+            return cls._from_harbor_yaml(raw, **kwargs)
+        return cls._from_native_yaml(raw, **kwargs)
     @classmethod
-    def _from_native_yaml(cls, raw: dict, base_dir: Path, **kwargs) -> "Job":
+    def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
         """Parse benchflow-native YAML."""
-        tasks_dir = base_dir / raw["tasks_dir"]
-        jobs_dir = base_dir / raw.get("jobs_dir", "jobs")
+        tasks_dir = Path(raw["tasks_dir"])
+        jobs_dir = Path(raw.get("jobs_dir", "jobs"))
         # Parse prompts — YAML null becomes Python None
         prompts = raw.get("prompts")
@@ -268,9 +268,7 @@ class Job:
             prompts=prompts,
             agent_env=agent_env_raw,
             retry=RetryConfig(max_retries=raw.get("max_retries", 2)),
-            skills_dir=str(base_dir / raw["skills_dir"])
-            if raw.get("skills_dir")
-            else None,
+            skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None,
             sandbox_user=sandbox_user,
             sandbox_locked_paths=sandbox_locked_paths,
             exclude_tasks=exclude,
@@ -278,7 +276,7 @@ class Job:
         return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
     @classmethod
-    def _from_harbor_yaml(cls, raw: dict, base_dir: Path, **kwargs) -> "Job":
+    def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
         """Parse Harbor-compatible YAML."""
         # Agent
         agents = raw.get("agents", [{}])
@@ -306,20 +304,20 @@ class Job:
         # Datasets
         datasets = raw.get("datasets", [{}])
-        tasks_dir = base_dir / datasets[0].get("path", "tasks")
+        tasks_dir = Path(datasets[0].get("path", "tasks"))
         # Orchestrator
         orch = raw.get("orchestrator", {})
         concurrency = orch.get("n_concurrent_trials", 4)
-        jobs_dir = base_dir / raw.get("jobs_dir", "jobs")
+        jobs_dir = Path(raw.get("jobs_dir", "jobs"))
         max_retries = (
             raw.get("n_attempts", 1) - 1
         )  # Harbor n_attempts includes first try
         # Skills dir (shared with benchflow-native format)
         skills_dir_raw = raw.get("skills_dir")
-        skills_dir = str(base_dir / skills_dir_raw) if skills_dir_raw else None
+        skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None
         sandbox_user = raw.get("sandbox_user", "agent")
         sandbox_locked_paths = raw.get("sandbox_locked_paths")

benchflow 0.2.2__tar.gz → 0.2.3__tar.gz

benchflow 0.2.2.tar.gz → 0.2.3.tar.gz