benchflow 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.2.2 → benchflow-0.2.3}/CHANGELOG.md +20 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/PKG-INFO +3 -3
- {benchflow-0.2.2 → benchflow-0.2.3}/README.md +1 -1
- {benchflow-0.2.2 → benchflow-0.2.3}/pyproject.toml +11 -11
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_env_setup.py +24 -1
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_sandbox.py +325 -27
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/job.py +10 -12
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/sdk.py +36 -11
- benchflow-0.2.3/tests/test_oracle.py +63 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox_hardening.py +299 -27
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox_verifier_workspace.py +7 -1
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_yaml_config.py +48 -4
- benchflow-0.2.2/.devcontainer/Dockerfile +0 -88
- benchflow-0.2.2/.devcontainer/devcontainer.json +0 -37
- benchflow-0.2.2/.env.sample +0 -27
- benchflow-0.2.2/.git +0 -1
- benchflow-0.2.2/.github/workflows/test.yml +0 -38
- benchflow-0.2.2/.pre-commit-config.yaml +0 -22
- benchflow-0.2.2/.python-version +0 -2
- benchflow-0.2.2/CLAUDE.md +0 -31
- benchflow-0.2.2/benchmarks/run_skillsbench.py +0 -25
- benchflow-0.2.2/benchmarks/run_tb2.py +0 -30
- benchflow-0.2.2/benchmarks/skillsbench-claude-glm5.yaml +0 -10
- benchflow-0.2.2/benchmarks/skillsbench-codex-gpt54.yaml +0 -10
- benchflow-0.2.2/benchmarks/tb2_multiturn-codex-gpt54.yaml +0 -10
- benchflow-0.2.2/benchmarks/tb2_single-codex-gpt54.yaml +0 -7
- benchflow-0.2.2/docs/architecture.md +0 -265
- benchflow-0.2.2/docs/cli-reference.md +0 -283
- benchflow-0.2.2/docs/getting-started.md +0 -295
- benchflow-0.2.2/docs/labs.md +0 -88
- benchflow-0.2.2/docs/task-authoring.md +0 -219
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/README.md +0 -153
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/_attack_runner.py +0 -74
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/comparison.ipynb +0 -172
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/Dockerfile +0 -9
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/conftest_exploit.py +0 -18
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/instruction.md +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/solution/solve.sh +0 -26
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/task.toml +0 -17
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test.sh +0 -19
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +0 -13
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/environment/Dockerfile +0 -3
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/instruction.md +0 -3
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/answer.txt +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/solve.sh +0 -15
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/task.toml +0 -17
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/ground_truth.txt +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/test.sh +0 -10
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/environment/Dockerfile +0 -4
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/instruction.md +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/solution/solve.sh +0 -28
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/task.toml +0 -17
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test.sh +0 -10
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +0 -12
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/run_comparison.py +0 -201
- benchflow-0.2.2/labs/reward-hack-matrix/.gitignore +0 -6
- benchflow-0.2.2/labs/reward-hack-matrix/README.md +0 -119
- benchflow-0.2.2/labs/reward-hack-matrix/_runner.py +0 -82
- benchflow-0.2.2/labs/reward-hack-matrix/_worker.py +0 -179
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_hook.sh +0 -87
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_payload.py +0 -33
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/path_trojan.sh +0 -56
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/pth_injection.sh +0 -122
- benchflow-0.2.2/labs/reward-hack-matrix/fetch_corpora.sh +0 -53
- benchflow-0.2.2/labs/reward-hack-matrix/run_matrix.py +0 -758
- benchflow-0.2.2/labs/reward-hack-matrix/sweep_0.2.0_vs_0.2.2.json +0 -7994
- benchflow-0.2.2/uv.lock +0 -3302
- {benchflow-0.2.2 → benchflow-0.2.3}/.gitignore +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/LICENSE +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_acp_run.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_agent_env.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_agent_setup.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_credentials.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_scoring.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/registry.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/user_agent.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/environments.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/metrics.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/models.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/process.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/py.typed +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/skills.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/task_download.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/tasks.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/atif.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/claude_code.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/proxy.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/viewer.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/conftest.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_acp.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_agent_registry.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_atif_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_env_setup.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_job.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_metrics.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_process.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_providers.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_reexport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_scoring.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_skills.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_smoke.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_tasks.py +0 -0
- {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_verify.py +0 -0
|
@@ -2,6 +2,26 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## 0.2.3 — 2026-04-15
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- `benchmarks/tb2_multiturn-claude-haiku45.yaml` — shipped config for the README's TB2 multi-turn Claude result.
|
|
10
|
+
- Daytona resource clamping via `BENCHFLOW_DAYTONA_MAX_CPUS` / `MAX_MEMORY_MB`.
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Renamed `skillsbench-claude-glm5.yaml` → `skillsbench-claude-glm51.yaml` to match the model ID.
|
|
15
|
+
- `codex --login` correction in `docs/getting-started.md`.
|
|
16
|
+
- Restricted sdist build to `src/`, `tests/`, and metadata.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Verifier sandbox hardening follow-ups across several base-image and tooling edge cases.
|
|
21
|
+
- Preserve trusted verifier path entries and workspace answer files.
|
|
22
|
+
- Redirect oracle output to container log.
|
|
23
|
+
- Align YAML path resolution to config file location.
|
|
24
|
+
|
|
5
25
|
## 0.2.2 — 2026-04-13
|
|
6
26
|
|
|
7
27
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -28,7 +28,7 @@ Requires-Dist: typer>=0.9
|
|
|
28
28
|
Provides-Extra: dev
|
|
29
29
|
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
30
30
|
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
31
|
-
Requires-Dist: pytest>=
|
|
31
|
+
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
32
32
|
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
33
33
|
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
34
34
|
Description-Content-Type: text/markdown
|
|
@@ -163,7 +163,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
|
|
|
163
163
|
**SkillsBench** (86 tasks — tool use, file editing, API calls):
|
|
164
164
|
|
|
165
165
|
```bash
|
|
166
|
-
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
|
|
166
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml # Claude
|
|
167
167
|
python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
|
|
168
168
|
```
|
|
169
169
|
|
|
@@ -128,7 +128,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
|
|
|
128
128
|
**SkillsBench** (86 tasks — tool use, file editing, API calls):
|
|
129
129
|
|
|
130
130
|
```bash
|
|
131
|
-
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
|
|
131
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml # Claude
|
|
132
132
|
python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
|
|
133
133
|
```
|
|
134
134
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "benchflow"
|
|
3
|
-
version = "0.2.2"
|
|
3
|
+
version = "0.2.3"
|
|
4
4
|
description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -37,7 +37,7 @@ classifiers = [
|
|
|
37
37
|
[project.optional-dependencies]
|
|
38
38
|
dev = [
|
|
39
39
|
"pre-commit>=3.7",
|
|
40
|
-
"pytest>=
|
|
40
|
+
"pytest>=9.0.3",
|
|
41
41
|
"pytest-asyncio>=0.24.0",
|
|
42
42
|
"ruff>=0.7.0",
|
|
43
43
|
"ty>=0.0.1a1",
|
|
@@ -58,20 +58,20 @@ requires = ["hatchling"]
|
|
|
58
58
|
build-backend = "hatchling.build"
|
|
59
59
|
|
|
60
60
|
[tool.hatch.build.targets.sdist]
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
"
|
|
66
|
-
".
|
|
67
|
-
"
|
|
68
|
-
".
|
|
69
|
-
"__pycache__",
|
|
61
|
+
# Allowlist: only ship what the installed package needs.
|
|
62
|
+
only-include = [
|
|
63
|
+
"src",
|
|
64
|
+
"tests",
|
|
65
|
+
"README.md",
|
|
66
|
+
"CHANGELOG.md",
|
|
67
|
+
"LICENSE",
|
|
68
|
+
"pyproject.toml",
|
|
70
69
|
]
|
|
71
70
|
|
|
72
71
|
[tool.pytest.ini_options]
|
|
73
72
|
asyncio_mode = "auto"
|
|
74
73
|
addopts = "-m 'not live'"
|
|
74
|
+
testpaths = ["tests"]
|
|
75
75
|
markers = [
|
|
76
76
|
"live: requires real Anthropic API and Docker daemon (run with -m live)",
|
|
77
77
|
]
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
+
import os
|
|
5
6
|
import re
|
|
6
7
|
import shutil
|
|
7
8
|
from pathlib import Path
|
|
@@ -14,6 +15,12 @@ from benchflow.agents.registry import AGENTS
|
|
|
14
15
|
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
16
17
|
|
|
18
|
+
# Daytona's per-sandbox cap on the default tier is 4 CPU / 8 GB. Tasks declaring
|
|
19
|
+
# more fail at sandbox creation. Clamp here so tasks degrade gracefully (slower
|
|
20
|
+
# build) instead of erroring out. Override via env if running on a paid tier.
|
|
21
|
+
_DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4"))
|
|
22
|
+
_DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192"))
|
|
23
|
+
|
|
17
24
|
# Directories to ignore when copying deps
|
|
18
25
|
_IGNORE_DIRS = {
|
|
19
26
|
".venv",
|
|
@@ -253,12 +260,28 @@ def _create_environment(
|
|
|
253
260
|
elif environment_type == "daytona":
|
|
254
261
|
from harbor.environments.daytona import DaytonaEnvironment
|
|
255
262
|
|
|
263
|
+
env_config = task.config.environment
|
|
264
|
+
if env_config.cpus > _DAYTONA_MAX_CPUS:
|
|
265
|
+
logger.warning(
|
|
266
|
+
"Clamping cpus %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_CPUS)",
|
|
267
|
+
env_config.cpus,
|
|
268
|
+
_DAYTONA_MAX_CPUS,
|
|
269
|
+
)
|
|
270
|
+
env_config.cpus = _DAYTONA_MAX_CPUS
|
|
271
|
+
if env_config.memory_mb > _DAYTONA_MAX_MEMORY_MB:
|
|
272
|
+
logger.warning(
|
|
273
|
+
"Clamping memory_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_MEMORY_MB)",
|
|
274
|
+
env_config.memory_mb,
|
|
275
|
+
_DAYTONA_MAX_MEMORY_MB,
|
|
276
|
+
)
|
|
277
|
+
env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB
|
|
278
|
+
|
|
256
279
|
return DaytonaEnvironment(
|
|
257
280
|
environment_dir=task.paths.environment_dir,
|
|
258
281
|
environment_name=task_path.name,
|
|
259
282
|
session_id=trial_name,
|
|
260
283
|
trial_paths=trial_paths,
|
|
261
|
-
task_env_config=
|
|
284
|
+
task_env_config=env_config,
|
|
262
285
|
auto_stop_interval_mins=1440,
|
|
263
286
|
auto_delete_interval_mins=1440,
|
|
264
287
|
)
|
|
@@ -13,8 +13,11 @@ Does not own:
|
|
|
13
13
|
|
|
14
14
|
import json as _json
|
|
15
15
|
import logging
|
|
16
|
+
import os
|
|
16
17
|
import re
|
|
17
18
|
import shlex
|
|
19
|
+
import tomllib
|
|
20
|
+
from pathlib import Path
|
|
18
21
|
from typing import TYPE_CHECKING
|
|
19
22
|
|
|
20
23
|
from benchflow.agents.registry import get_sandbox_home_dirs
|
|
@@ -99,7 +102,9 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str:
|
|
|
99
102
|
)
|
|
100
103
|
|
|
101
104
|
|
|
102
|
-
async def setup_sandbox_user(
|
|
105
|
+
async def setup_sandbox_user(
|
|
106
|
+
env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120
|
|
107
|
+
) -> str:
|
|
103
108
|
"""Create non-root sandbox user, grant workspace access. Return agent_cwd."""
|
|
104
109
|
if not re.match(r"^[a-z_][a-z0-9_-]*$", sandbox_user):
|
|
105
110
|
raise ValueError(
|
|
@@ -119,7 +124,7 @@ async def setup_sandbox_user(env, sandbox_user: str, workspace: str) -> str:
|
|
|
119
124
|
f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && "
|
|
120
125
|
f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && "
|
|
121
126
|
f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}",
|
|
122
|
-
timeout_sec=
|
|
127
|
+
timeout_sec=timeout_sec,
|
|
123
128
|
)
|
|
124
129
|
logger.info(f"Sandbox user {sandbox_user} ready (workspace={workspace})")
|
|
125
130
|
return workspace
|
|
@@ -314,8 +319,15 @@ VERIFIER_ENV: dict[str, str] = {
|
|
|
314
319
|
"PYTHONNOUSERSITE": "1",
|
|
315
320
|
"PIP_USER": "0",
|
|
316
321
|
"PIP_NO_USER_CONFIG": "1",
|
|
317
|
-
#
|
|
318
|
-
|
|
322
|
+
# PEP-668 base images (Fedora, recent Debian) refuse pip installs into
|
|
323
|
+
# system-site without this flag. Verifier runs as root and system-site is
|
|
324
|
+
# root-owned, so allowing it is safe; without it, tasks that pip-install
|
|
325
|
+
# pytest in test.sh either fail outright or fall back to a user-site path
|
|
326
|
+
# that PYTHONNOUSERSITE=1 hides at import time.
|
|
327
|
+
"PIP_BREAK_SYSTEM_PACKAGES": "1",
|
|
328
|
+
# /root is root-owned; sandbox_user cannot pre-stage caches there. Pip
|
|
329
|
+
# config is already blocked by the PIP_* / PYTHONNOUSERSITE vars above.
|
|
330
|
+
"HOME": "/root",
|
|
319
331
|
# Disable breakpoint() — any other value imports an arbitrary callable.
|
|
320
332
|
"PYTHONBREAKPOINT": "0",
|
|
321
333
|
# Prevent coverage.py from importing a config file as Python on startup.
|
|
@@ -325,6 +337,258 @@ VERIFIER_ENV: dict[str, str] = {
|
|
|
325
337
|
"CELERY_CONFIG_MODULE": "",
|
|
326
338
|
}
|
|
327
339
|
|
|
340
|
+
_SAFE_VERIFIER_PATH = VERIFIER_ENV["PATH"]
|
|
341
|
+
_SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":"))
|
|
342
|
+
_RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed")
|
|
343
|
+
|
|
344
|
+
# pytest plugin names are not always the same as the PyPI distribution name
|
|
345
|
+
# or the option they register. These aliases cover the common benchmark
|
|
346
|
+
# verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1.
|
|
347
|
+
_PYTEST_PLUGIN_ALIASES = {
|
|
348
|
+
"ctrf": "ctrf",
|
|
349
|
+
"pytest-json-ctrf": "ctrf",
|
|
350
|
+
"pytest_json_ctrf": "ctrf",
|
|
351
|
+
"pytest_json_ctrf.plugin": "ctrf",
|
|
352
|
+
"pytest-json-report": "pytest_jsonreport",
|
|
353
|
+
"pytest_json_report": "pytest_jsonreport",
|
|
354
|
+
"pytest_jsonreport": "pytest_jsonreport",
|
|
355
|
+
"pytest_jsonreport.plugin": "pytest_jsonreport",
|
|
356
|
+
}
|
|
357
|
+
_PYTEST_OPTION_PLUGINS = {
|
|
358
|
+
"--ctrf": "ctrf",
|
|
359
|
+
"--json-report": "pytest_jsonreport",
|
|
360
|
+
"--json-report-file": "pytest_jsonreport",
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
# Pytest plugins worth auto-loading when test.sh pip-installs them but the
|
|
364
|
+
# task author forgot to declare pytest_plugins in task.toml. Map distribution
|
|
365
|
+
# name (as it appears in `pip install pytest-foo`) to importable plugin name.
|
|
366
|
+
_PYTEST_INSTALLED_PLUGINS = {
|
|
367
|
+
"pytest-asyncio": "pytest_asyncio",
|
|
368
|
+
"pytest-anyio": "anyio.pytest_plugin",
|
|
369
|
+
"pytest-trio": "pytest_trio",
|
|
370
|
+
}
|
|
371
|
+
_PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _under_path(path: str, prefix: str) -> bool:
    """Report whether *path* is *prefix* itself or lies beneath it.

    Trailing slashes on *prefix* are ignored and the match is made on whole
    path components, so ``/tmpfoo`` does not count as being under ``/tmp``.
    """
    base = prefix.rstrip("/")
    if path == base:
        return True
    return path.startswith(base + "/")
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _blocked_verifier_path_prefixes(
    sandbox_user: str | None, workspace: str | None
) -> tuple[str, ...]:
    """Collect the path prefixes that may never be kept as verifier PATH extras.

    The result starts with the fixed runtime prefixes, followed by the
    workspace (when given) and the sandbox user's home directory (when a
    sandbox user is given). Duplicates are dropped, keeping first-seen order.
    """
    optional: list[str] = []
    if workspace:
        optional.append(workspace)
    if sandbox_user:
        optional.append(f"/home/{sandbox_user}")
    # dict keys preserve insertion order, giving an order-stable dedup.
    ordered = dict.fromkeys([*_RUNTIME_PATH_PREFIXES, *optional])
    return tuple(ordered)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _merge_trusted_verifier_path(extras: list[str]) -> str:
    """Build the verifier PATH: validated image extras first, then the allowlist.

    Empty entries, entries already on the safe allowlist, and repeated
    entries are skipped; the remaining extras keep their original order.
    """
    safe_parts = _SAFE_VERIFIER_PATH_PARTS
    # dict.fromkeys dedups while preserving the order of first occurrence.
    unique_extras = dict.fromkeys(
        entry for entry in extras if entry and entry not in safe_parts
    )
    return ":".join([*unique_extras, *safe_parts])
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
_TRUSTED_PATH_EXTRAS_SCRIPT = r"""
|
|
403
|
+
import json
|
|
404
|
+
import os
|
|
405
|
+
import stat
|
|
406
|
+
import sys
|
|
407
|
+
|
|
408
|
+
raw_path = json.loads(sys.argv[1])
|
|
409
|
+
safe_parts = set(json.loads(sys.argv[2]))
|
|
410
|
+
blocked_prefixes = tuple(json.loads(sys.argv[3]))
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def under_path(path, prefix):
|
|
414
|
+
prefix = prefix.rstrip("/")
|
|
415
|
+
return path == prefix or path.startswith(prefix + "/")
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
trusted = []
|
|
419
|
+
seen = set(safe_parts)
|
|
420
|
+
for entry in raw_path.split(":"):
|
|
421
|
+
entry = entry.strip()
|
|
422
|
+
if (
|
|
423
|
+
not entry
|
|
424
|
+
or entry in seen
|
|
425
|
+
or not entry.startswith("/")
|
|
426
|
+
or "\x00" in entry
|
|
427
|
+
or "\n" in entry
|
|
428
|
+
):
|
|
429
|
+
continue
|
|
430
|
+
seen.add(entry)
|
|
431
|
+
try:
|
|
432
|
+
real = os.path.realpath(entry)
|
|
433
|
+
st = os.stat(real)
|
|
434
|
+
except OSError:
|
|
435
|
+
continue
|
|
436
|
+
if not stat.S_ISDIR(st.st_mode):
|
|
437
|
+
continue
|
|
438
|
+
if any(under_path(real, prefix) for prefix in blocked_prefixes):
|
|
439
|
+
continue
|
|
440
|
+
if st.st_uid != 0:
|
|
441
|
+
continue
|
|
442
|
+
if st.st_mode & (stat.S_IWGRP | stat.S_IWOTH):
|
|
443
|
+
continue
|
|
444
|
+
trusted.append(entry)
|
|
445
|
+
print(json.dumps(trusted))
|
|
446
|
+
""".strip()
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) -> str:
    """Assemble the shell command that validates PATH extras inside the container.

    The validation script is passed to ``python3 -c`` and receives three
    JSON-encoded arguments: the raw PATH string, the safe allowlist parts,
    and the blocked prefixes. Every argument is shell-quoted.
    """
    script_args = [
        _TRUSTED_PATH_EXTRAS_SCRIPT,
        _json.dumps(raw_path),
        _json.dumps(_SAFE_VERIFIER_PATH_PARTS),
        _json.dumps(blocked_prefixes),
    ]
    return "python3 -c " + " ".join(shlex.quote(arg) for arg in script_args)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _normalize_pytest_plugin(name: object) -> str | None:
    """Map a task-declared plugin name to its importable pytest plugin name.

    Returns ``None`` for non-string or blank declarations. Known aliases are
    translated through ``_PYTEST_PLUGIN_ALIASES``; any other name is returned
    stripped but otherwise unchanged.
    """
    stripped = name.strip() if isinstance(name, str) else ""
    if not stripped:
        return None
    return _PYTEST_PLUGIN_ALIASES.get(stripped, stripped)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _plugins_from_verifier_script(task: "Task") -> list[str]:
    """Infer known pytest plugins needed by legacy verifier scripts.

    Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and
    call options such as --ctrf directly from tests/test.sh. With pytest entry
    point autoload disabled, those options must be backed by explicit -p flags.

    Returns the inferred plugin names in discovery order (option-based matches
    first, then pip-installed plugins), or an empty list when the task has no
    usable ``task_dir`` or ``tests/test.sh`` cannot be read.
    """
    task_dir = getattr(task, "task_dir", None)
    if not isinstance(task_dir, (str, os.PathLike)):
        return []
    test_sh = Path(task_dir) / "tests" / "test.sh"
    try:
        # Decode leniently: this is a best-effort scan for ASCII option and
        # package names, so stray non-UTF-8 bytes in the script must not raise
        # UnicodeDecodeError (which the OSError handler would not catch).
        content = test_sh.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []

    plugins: list[str] = []
    for option, plugin in _PYTEST_OPTION_PLUGINS.items():
        if option in content and plugin not in plugins:
            plugins.append(plugin)
    # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1
    # doesn't silently drop them. Only the text of a `pip install ...` command
    # is searched, so arbitrary text mentioning the plugin name is ignored.
    for install_line in _PIP_INSTALL_RE.findall(content):
        for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items():
            if dist in install_line and plugin not in plugins:
                plugins.append(plugin)
    return plugins
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def _declared_pytest_plugins(task: "Task") -> list[object]:
|
|
500
|
+
"""Return pytest_plugins from the model, falling back to raw task.toml."""
|
|
501
|
+
declared = getattr(task.config.verifier, "pytest_plugins", None)
|
|
502
|
+
if declared:
|
|
503
|
+
return list(declared)
|
|
504
|
+
|
|
505
|
+
task_dir = getattr(task, "task_dir", None)
|
|
506
|
+
if not isinstance(task_dir, (str, os.PathLike)):
|
|
507
|
+
return []
|
|
508
|
+
config_path = Path(task_dir) / "task.toml"
|
|
509
|
+
try:
|
|
510
|
+
data = tomllib.loads(config_path.read_text())
|
|
511
|
+
except (OSError, tomllib.TOMLDecodeError):
|
|
512
|
+
return []
|
|
513
|
+
plugins = data.get("verifier", {}).get("pytest_plugins", [])
|
|
514
|
+
if isinstance(plugins, list):
|
|
515
|
+
return plugins
|
|
516
|
+
return []
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def _pytest_plugin_flags(task: "Task") -> str:
    """Render the ``-p <plugin>`` flags for a task in a stable order.

    Plugins inferred from the verifier script come first, followed by the
    normalized plugins declared for the task. Each plugin appears once and
    every name is shell-quoted. Returns an empty string when there are none.
    """
    # A dict doubles as an insertion-ordered set of plugin names.
    ordered: dict[str, None] = dict.fromkeys(_plugins_from_verifier_script(task))
    for raw_name in _declared_pytest_plugins(task):
        resolved = _normalize_pytest_plugin(raw_name)
        if resolved:
            ordered.setdefault(resolved, None)
    return " ".join(f"-p {shlex.quote(plugin)}" for plugin in ordered)
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
_FEDORA_LIKE = ("fedora", "rhel", "centos", "rocky", "alma")
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
async def _distro_pip_env(env) -> dict[str, str]:
    """Return extra pip env vars needed for the image's distro family.

    Fedora's downstream pip patch routes root pip-installs to ~/.local/lib
    even with PIP_USER=0 + PIP_BREAK_SYSTEM_PACKAGES=1, and PYTHONNOUSERSITE=1
    then hides those installs at import time. On Fedora-like images this
    returns ``{"PIP_PREFIX": "/usr/local"}`` so installs land where python3
    can find them.

    PIP_PREFIX is deliberately not set elsewhere: Debian/Ubuntu's downstream
    pip already injects --prefix=/usr/local for root, so setting it again
    would double-prefix (creating /usr/local/usr/local/bin/pytest).

    The distro is read from the ID= and ID_LIKE= lines of /etc/os-release.
    Any failure to run the probe yields an empty dict.
    """
    try:
        probe = await env.exec(
            "cat /etc/os-release 2>/dev/null || true", user="root", timeout_sec=5
        )
    except Exception:
        # Best-effort probe: an unreachable container just means "no override".
        return {}

    distro_ids: set[str] = set()
    for raw_line in (probe.stdout or "").lower().splitlines():
        if not raw_line.startswith(("id=", "id_like=")):
            continue
        _, _, value = raw_line.partition("=")
        # Values may be quoted and ID_LIKE may list several space-separated ids.
        distro_ids.update(value.strip().strip('"').strip("'").split())

    if distro_ids.isdisjoint(_FEDORA_LIKE):
        return {}
    return {"PIP_PREFIX": "/usr/local"}
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
async def _trusted_verifier_path(
    env, sandbox_user: str | None, workspace: str | None
) -> str:
    """Compute the verifier PATH, keeping only trusted extras from the image.

    The container's PATH is read as root and handed to a container-side
    validation script. That script accepts an entry only when stat checks
    show a root-owned directory that is not group/world writable and does not
    resolve under a blocked prefix (runtime locations, the workspace, the
    sandbox user's home). Accepted entries are prepended to the safe
    allowlist.

    Falls back to the plain safe PATH when the container reports an empty
    PATH or when the validator's output is not a JSON list.
    """
    printenv = await env.exec("printenv PATH", user="root", timeout_sec=10)
    image_path = printenv.stdout or ""
    if not image_path.strip():
        return _SAFE_VERIFIER_PATH

    blocked = _blocked_verifier_path_prefixes(sandbox_user, workspace)
    validation = await env.exec(
        _trusted_path_extras_cmd(image_path, blocked), user="root", timeout_sec=10
    )
    try:
        parsed = _json.loads(validation.stdout or "[]")
    except _json.JSONDecodeError:
        logger.warning("Could not parse trusted verifier PATH extras; using safe PATH")
        parsed = []
    if not isinstance(parsed, list):
        logger.warning("Invalid trusted verifier PATH extras; using safe PATH")
        parsed = []
    # Ignore any non-string items the validator might have emitted.
    string_extras = [item for item in parsed if isinstance(item, str)]
    return _merge_trusted_verifier_path(string_extras)
|
|
590
|
+
|
|
591
|
+
|
|
328
592
|
# Wipe and recreate /logs/verifier/ before the verifier runs.
|
|
329
593
|
# rm -rf severs hardlinks, removes symlink replacements, and eliminates
|
|
330
594
|
# variant filenames/subdirs the agent may have pre-staged.
|
|
@@ -355,18 +619,24 @@ CLEANUP_CMD = (
|
|
|
355
619
|
|
|
356
620
|
|
|
357
621
|
async def harden_before_verify(
|
|
358
|
-
env,
|
|
622
|
+
env,
|
|
623
|
+
task: "Task",
|
|
624
|
+
sandbox_user: str | None,
|
|
625
|
+
workspace: str | None = None,
|
|
626
|
+
# Default false because SkillsBench/TB2-style answers often are workspace
|
|
627
|
+
# edits. Going forward, enforce true only via an explicit task/benchmark
|
|
628
|
+
# contract, e.g. task.toml [verifier] restore_workspace = true after an
|
|
629
|
+
# oracle/diff audit proves the answer is not stored in the workspace.
|
|
630
|
+
restore_workspace: bool = False,
|
|
359
631
|
) -> None:
|
|
360
632
|
"""Neutralize agent tampering before running the verifier.
|
|
361
633
|
|
|
362
634
|
1. Kill sandbox-user processes (prevent concurrent writes during teardown).
|
|
363
635
|
2. Assert all sandbox-user processes are dead, then wipe/recreate
|
|
364
636
|
/logs/verifier/ with a clean root-owned directory.
|
|
365
|
-
3.
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
pre-agent canonical state, not just the build-config subset.
|
|
369
|
-
4c.Purge symlinks and __pycache__ trees from workspace.
|
|
637
|
+
3. Optionally restore the workspace from the pre-agent snapshot. This is
|
|
638
|
+
destructive to legitimate workspace-edit answers, so it is opt-in.
|
|
639
|
+
4. Purge symlinks and __pycache__ trees from workspace.
|
|
370
640
|
5. chown workspace to root (belt-and-suspenders against zombie sandbox writes).
|
|
371
641
|
6. Remove injected conftest.py, sitecustomize.py, .pth files.
|
|
372
642
|
7. Merge trusted env vars into task.config.verifier.env.
|
|
@@ -386,7 +656,7 @@ async def harden_before_verify(
|
|
|
386
656
|
)
|
|
387
657
|
# Wipe and recreate /logs/verifier/ with a clean root-owned directory.
|
|
388
658
|
await env.exec(_CLEAR_VERIFIER_DIR_CMD, user="root")
|
|
389
|
-
if workspace:
|
|
659
|
+
if workspace and restore_workspace:
|
|
390
660
|
await _restore_build_config(env, workspace)
|
|
391
661
|
await _refresh_verifier_workspace(env, workspace)
|
|
392
662
|
# Full workspace restore from /testbed_verify (the pre-agent canonical
|
|
@@ -402,18 +672,41 @@ async def harden_before_verify(
|
|
|
402
672
|
f"fi",
|
|
403
673
|
user="root",
|
|
404
674
|
)
|
|
405
|
-
|
|
406
|
-
#
|
|
407
|
-
#
|
|
675
|
+
if workspace:
|
|
676
|
+
# Purge symlinks whose realpath escapes the workspace (e.g. agent-planted
|
|
677
|
+
# utils.py -> /tmp/evil.py). In-tree symlinks are preserved — some
|
|
678
|
+
# source trees (OTP, lake-packages, etc.) ship them as fixtures.
|
|
679
|
+
# Residual risk: agent plants utils.py -> ./evil.py inside workspace.
|
|
680
|
+
# Mitigated by the workspace chown below + CLEANUP_CMD path scrubbing.
|
|
681
|
+
purge_external_symlinks = (
|
|
682
|
+
"import os, sys, pathlib;"
|
|
683
|
+
"root = pathlib.Path(sys.argv[1]).resolve();"
|
|
684
|
+
"root_str = str(root) + os.sep;"
|
|
685
|
+
"[p.unlink() for p in root.rglob('*')"
|
|
686
|
+
" if p.is_symlink()"
|
|
687
|
+
" and not (str(p.resolve()) + os.sep).startswith(root_str)"
|
|
688
|
+
" and str(p.resolve()) != str(root)]"
|
|
689
|
+
)
|
|
408
690
|
await env.exec(
|
|
409
|
-
f"
|
|
691
|
+
f"python3 -c {shlex.quote(purge_external_symlinks)} "
|
|
692
|
+
f"{shlex.quote(workspace)} 2>/dev/null; true",
|
|
410
693
|
user="root",
|
|
411
694
|
)
|
|
412
|
-
# Purge __pycache__ trees
|
|
413
|
-
#
|
|
695
|
+
# Purge __pycache__ trees that did not exist in the pre-agent baseline,
|
|
696
|
+
# so agent-planted .pyc bytecode cannot execute even if
|
|
697
|
+
# PYTHONPYCACHEPREFIX is bypassed. Baseline-present caches are kept so
|
|
698
|
+
# tasks whose tests diff workspace against /testbed_verify don't break.
|
|
414
699
|
await env.exec(
|
|
415
|
-
f"
|
|
416
|
-
f" -
|
|
700
|
+
f"if [ -d /testbed_verify ]; then "
|
|
701
|
+
f" find {shlex.quote(workspace)} -type d -name __pycache__ -print0 "
|
|
702
|
+
f" | while IFS= read -r -d '' d; do "
|
|
703
|
+
f" rel=${{d#{shlex.quote(workspace)}/}}; "
|
|
704
|
+
f' [ -d "/testbed_verify/$rel" ] || rm -rf "$d"; '
|
|
705
|
+
f" done; "
|
|
706
|
+
f"else "
|
|
707
|
+
f" find {shlex.quote(workspace)} -type d -name '__pycache__'"
|
|
708
|
+
f" -exec rm -rf {{}} + 2>/dev/null; "
|
|
709
|
+
f"fi; true",
|
|
417
710
|
user="root",
|
|
418
711
|
)
|
|
419
712
|
# chown workspace to root: belt-and-suspenders against any zombie
|
|
@@ -424,24 +717,29 @@ async def harden_before_verify(
|
|
|
424
717
|
)
|
|
425
718
|
await env.exec(CLEANUP_CMD, user="root", timeout_sec=10)
|
|
426
719
|
|
|
720
|
+
hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace)
|
|
721
|
+
distro_env = await _distro_pip_env(env)
|
|
722
|
+
|
|
427
723
|
verifier_env = dict(VERIFIER_ENV)
|
|
724
|
+
verifier_env.update(distro_env)
|
|
428
725
|
if task.config.verifier.env:
|
|
429
726
|
verifier_env.update(task.config.verifier.env)
|
|
430
727
|
# Hard security invariants — re-pin after task-env merge so a task cannot
|
|
431
|
-
# strip -c /dev/null / --confcutdir, re-enable entry-point
|
|
432
|
-
# or inject code via breakpoint()/coverage/Django/Celery
|
|
728
|
+
# replace PATH, strip -c /dev/null / --confcutdir, re-enable entry-point
|
|
729
|
+
# plugin loading, or inject code via breakpoint()/coverage/Django/Celery
|
|
730
|
+
# startup hooks.
|
|
731
|
+
verifier_env["PATH"] = hardened_path
|
|
433
732
|
verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1"
|
|
434
733
|
verifier_env["PYTHONBREAKPOINT"] = "0"
|
|
435
734
|
verifier_env["COVERAGE_PROCESS_START"] = ""
|
|
436
735
|
verifier_env["DJANGO_SETTINGS_MODULE"] = ""
|
|
437
736
|
verifier_env["CELERY_CONFIG_MODULE"] = ""
|
|
438
|
-
# Re-enable
|
|
439
|
-
#
|
|
440
|
-
#
|
|
441
|
-
allowed_plugins = getattr(task.config.verifier, "pytest_plugins", None) or []
|
|
737
|
+
# Re-enable known verifier plugins by appending -p flags to the hardened
|
|
738
|
+
# base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are
|
|
739
|
+
# inferred from tests/test.sh; newer tasks may declare pytest_plugins.
|
|
442
740
|
base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"]
|
|
443
|
-
|
|
444
|
-
|
|
741
|
+
flags = _pytest_plugin_flags(task)
|
|
742
|
+
if flags:
|
|
445
743
|
verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}"
|
|
446
744
|
else:
|
|
447
745
|
verifier_env["PYTEST_ADDOPTS"] = base_addopts
|
|
@@ -243,14 +243,14 @@ class Job:
|
|
|
243
243
|
|
|
244
244
|
# Detect format: Harbor uses "agents" + "datasets", benchflow uses "agent"
|
|
245
245
|
if "agents" in raw or "datasets" in raw:
|
|
246
|
-
return cls._from_harbor_yaml(raw,
|
|
247
|
-
return cls._from_native_yaml(raw,
|
|
246
|
+
return cls._from_harbor_yaml(raw, **kwargs)
|
|
247
|
+
return cls._from_native_yaml(raw, **kwargs)
|
|
248
248
|
|
|
249
249
|
@classmethod
|
|
250
|
-
def _from_native_yaml(cls, raw: dict,
|
|
250
|
+
def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
|
|
251
251
|
"""Parse benchflow-native YAML."""
|
|
252
|
-
tasks_dir =
|
|
253
|
-
jobs_dir =
|
|
252
|
+
tasks_dir = Path(raw["tasks_dir"])
|
|
253
|
+
jobs_dir = Path(raw.get("jobs_dir", "jobs"))
|
|
254
254
|
|
|
255
255
|
# Parse prompts — YAML null becomes Python None
|
|
256
256
|
prompts = raw.get("prompts")
|
|
@@ -268,9 +268,7 @@ class Job:
|
|
|
268
268
|
prompts=prompts,
|
|
269
269
|
agent_env=agent_env_raw,
|
|
270
270
|
retry=RetryConfig(max_retries=raw.get("max_retries", 2)),
|
|
271
|
-
skills_dir=str(
|
|
272
|
-
if raw.get("skills_dir")
|
|
273
|
-
else None,
|
|
271
|
+
skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None,
|
|
274
272
|
sandbox_user=sandbox_user,
|
|
275
273
|
sandbox_locked_paths=sandbox_locked_paths,
|
|
276
274
|
exclude_tasks=exclude,
|
|
@@ -278,7 +276,7 @@ class Job:
|
|
|
278
276
|
return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
|
|
279
277
|
|
|
280
278
|
@classmethod
|
|
281
|
-
def _from_harbor_yaml(cls, raw: dict,
|
|
279
|
+
def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
|
|
282
280
|
"""Parse Harbor-compatible YAML."""
|
|
283
281
|
# Agent
|
|
284
282
|
agents = raw.get("agents", [{}])
|
|
@@ -306,20 +304,20 @@ class Job:
|
|
|
306
304
|
|
|
307
305
|
# Datasets
|
|
308
306
|
datasets = raw.get("datasets", [{}])
|
|
309
|
-
tasks_dir =
|
|
307
|
+
tasks_dir = Path(datasets[0].get("path", "tasks"))
|
|
310
308
|
|
|
311
309
|
# Orchestrator
|
|
312
310
|
orch = raw.get("orchestrator", {})
|
|
313
311
|
concurrency = orch.get("n_concurrent_trials", 4)
|
|
314
312
|
|
|
315
|
-
jobs_dir =
|
|
313
|
+
jobs_dir = Path(raw.get("jobs_dir", "jobs"))
|
|
316
314
|
max_retries = (
|
|
317
315
|
raw.get("n_attempts", 1) - 1
|
|
318
316
|
) # Harbor n_attempts includes first try
|
|
319
317
|
|
|
320
318
|
# Skills dir (shared with benchflow-native format)
|
|
321
319
|
skills_dir_raw = raw.get("skills_dir")
|
|
322
|
-
skills_dir = str(
|
|
320
|
+
skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None
|
|
323
321
|
sandbox_user = raw.get("sandbox_user", "agent")
|
|
324
322
|
sandbox_locked_paths = raw.get("sandbox_locked_paths")
|
|
325
323
|
|