benchflow 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. {benchflow-0.2.2 → benchflow-0.2.3}/CHANGELOG.md +20 -0
  2. {benchflow-0.2.2 → benchflow-0.2.3}/PKG-INFO +3 -3
  3. {benchflow-0.2.2 → benchflow-0.2.3}/README.md +1 -1
  4. {benchflow-0.2.2 → benchflow-0.2.3}/pyproject.toml +11 -11
  5. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_env_setup.py +24 -1
  6. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_sandbox.py +325 -27
  7. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/job.py +10 -12
  8. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/sdk.py +36 -11
  9. benchflow-0.2.3/tests/test_oracle.py +63 -0
  10. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox_hardening.py +299 -27
  11. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox_verifier_workspace.py +7 -1
  12. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_yaml_config.py +48 -4
  13. benchflow-0.2.2/.devcontainer/Dockerfile +0 -88
  14. benchflow-0.2.2/.devcontainer/devcontainer.json +0 -37
  15. benchflow-0.2.2/.env.sample +0 -27
  16. benchflow-0.2.2/.git +0 -1
  17. benchflow-0.2.2/.github/workflows/test.yml +0 -38
  18. benchflow-0.2.2/.pre-commit-config.yaml +0 -22
  19. benchflow-0.2.2/.python-version +0 -2
  20. benchflow-0.2.2/CLAUDE.md +0 -31
  21. benchflow-0.2.2/benchmarks/run_skillsbench.py +0 -25
  22. benchflow-0.2.2/benchmarks/run_tb2.py +0 -30
  23. benchflow-0.2.2/benchmarks/skillsbench-claude-glm5.yaml +0 -10
  24. benchflow-0.2.2/benchmarks/skillsbench-codex-gpt54.yaml +0 -10
  25. benchflow-0.2.2/benchmarks/tb2_multiturn-codex-gpt54.yaml +0 -10
  26. benchflow-0.2.2/benchmarks/tb2_single-codex-gpt54.yaml +0 -7
  27. benchflow-0.2.2/docs/architecture.md +0 -265
  28. benchflow-0.2.2/docs/cli-reference.md +0 -283
  29. benchflow-0.2.2/docs/getting-started.md +0 -295
  30. benchflow-0.2.2/docs/labs.md +0 -88
  31. benchflow-0.2.2/docs/task-authoring.md +0 -219
  32. benchflow-0.2.2/labs/benchjack-sandbox-hardening/README.md +0 -153
  33. benchflow-0.2.2/labs/benchjack-sandbox-hardening/_attack_runner.py +0 -74
  34. benchflow-0.2.2/labs/benchjack-sandbox-hardening/comparison.ipynb +0 -172
  35. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/Dockerfile +0 -9
  36. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/conftest_exploit.py +0 -18
  37. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/instruction.md +0 -1
  38. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/solution/solve.sh +0 -26
  39. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/task.toml +0 -17
  40. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test.sh +0 -19
  41. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +0 -13
  42. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/environment/Dockerfile +0 -3
  43. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/instruction.md +0 -3
  44. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/answer.txt +0 -1
  45. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/solve.sh +0 -15
  46. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/task.toml +0 -17
  47. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/ground_truth.txt +0 -1
  48. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/test.sh +0 -10
  49. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/environment/Dockerfile +0 -4
  50. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/instruction.md +0 -1
  51. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/solution/solve.sh +0 -28
  52. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/task.toml +0 -17
  53. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test.sh +0 -10
  54. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +0 -12
  55. benchflow-0.2.2/labs/benchjack-sandbox-hardening/run_comparison.py +0 -201
  56. benchflow-0.2.2/labs/reward-hack-matrix/.gitignore +0 -6
  57. benchflow-0.2.2/labs/reward-hack-matrix/README.md +0 -119
  58. benchflow-0.2.2/labs/reward-hack-matrix/_runner.py +0 -82
  59. benchflow-0.2.2/labs/reward-hack-matrix/_worker.py +0 -179
  60. benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_hook.sh +0 -87
  61. benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_payload.py +0 -33
  62. benchflow-0.2.2/labs/reward-hack-matrix/exploits/path_trojan.sh +0 -56
  63. benchflow-0.2.2/labs/reward-hack-matrix/exploits/pth_injection.sh +0 -122
  64. benchflow-0.2.2/labs/reward-hack-matrix/fetch_corpora.sh +0 -53
  65. benchflow-0.2.2/labs/reward-hack-matrix/run_matrix.py +0 -758
  66. benchflow-0.2.2/labs/reward-hack-matrix/sweep_0.2.0_vs_0.2.2.json +0 -7994
  67. benchflow-0.2.2/uv.lock +0 -3302
  68. {benchflow-0.2.2 → benchflow-0.2.3}/.gitignore +0 -0
  69. {benchflow-0.2.2 → benchflow-0.2.3}/LICENSE +0 -0
  70. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/__init__.py +0 -0
  71. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_acp_run.py +0 -0
  72. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_agent_env.py +0 -0
  73. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_agent_setup.py +0 -0
  74. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_credentials.py +0 -0
  75. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_scoring.py +0 -0
  76. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/_trajectory.py +0 -0
  77. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/__init__.py +0 -0
  78. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/client.py +0 -0
  79. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/container_transport.py +0 -0
  80. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/session.py +0 -0
  81. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/transport.py +0 -0
  82. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/acp/types.py +0 -0
  83. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/__init__.py +0 -0
  84. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  85. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/providers.py +0 -0
  86. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/registry.py +0 -0
  87. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/agents/user_agent.py +0 -0
  88. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/cli/__init__.py +0 -0
  89. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/cli/main.py +0 -0
  90. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/environments.py +0 -0
  91. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/metrics.py +0 -0
  92. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/models.py +0 -0
  93. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/process.py +0 -0
  94. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/py.typed +0 -0
  95. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/skills.py +0 -0
  96. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/task_download.py +0 -0
  97. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/tasks.py +0 -0
  98. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/__init__.py +0 -0
  99. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/atif.py +0 -0
  100. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/claude_code.py +0 -0
  101. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/otel.py +0 -0
  102. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/proxy.py +0 -0
  103. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/trajectories/types.py +0 -0
  104. {benchflow-0.2.2 → benchflow-0.2.3}/src/benchflow/viewer.py +0 -0
  105. {benchflow-0.2.2 → benchflow-0.2.3}/tests/__init__.py +0 -0
  106. {benchflow-0.2.2 → benchflow-0.2.3}/tests/conftest.py +0 -0
  107. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  108. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/instruction.md +0 -0
  109. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  110. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/task.toml +0 -0
  111. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/hello-world-task/tests/test.sh +0 -0
  112. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_claude.sh +0 -0
  113. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_codex.sh +0 -0
  114. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_gemini.sh +0 -0
  115. {benchflow-0.2.2 → benchflow-0.2.3}/tests/examples/test_openclaw.sh +0 -0
  116. {benchflow-0.2.2 → benchflow-0.2.3}/tests/fixtures/mock_acp_agent.py +0 -0
  117. {benchflow-0.2.2 → benchflow-0.2.3}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  118. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_acp.py +0 -0
  119. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_agent_model_decouple.py +0 -0
  120. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_agent_registry.py +0 -0
  121. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_atif_trajectory.py +0 -0
  122. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_capture_trajectory.py +0 -0
  123. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_env_setup.py +0 -0
  124. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_exclude_tasks.py +0 -0
  125. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_job.py +0 -0
  126. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_metrics.py +0 -0
  127. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_process.py +0 -0
  128. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_providers.py +0 -0
  129. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_reexport.py +0 -0
  130. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_registry_invariants.py +0 -0
  131. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_resolve_env_helpers.py +0 -0
  132. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sandbox.py +0 -0
  133. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_scoring.py +0 -0
  134. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sdk_internals.py +0 -0
  135. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_sdk_lockdown.py +0 -0
  136. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_skills.py +0 -0
  137. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_smoke.py +0 -0
  138. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_subscription_auth.py +0 -0
  139. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_tasks.py +0 -0
  140. {benchflow-0.2.2 → benchflow-0.2.3}/tests/test_verify.py +0 -0
@@ -2,6 +2,26 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## 0.2.3 — 2026-04-15
6
+
7
+ ### Added
8
+
9
+ - `benchmarks/tb2_multiturn-claude-haiku45.yaml` — shipped config for the README's TB2 multi-turn Claude result.
10
+ - Daytona resource clamping via `BENCHFLOW_DAYTONA_MAX_CPUS` / `MAX_MEMORY_MB`.
11
+
12
+ ### Changed
13
+
14
+ - Renamed `skillsbench-claude-glm5.yaml` → `skillsbench-claude-glm51.yaml` to match the model ID.
15
+ - `codex --login` correction in `docs/getting-started.md`.
16
+ - Restricted sdist build to `src/`, `tests/`, and metadata.
17
+
18
+ ### Fixed
19
+
20
+ - Verifier sandbox hardening follow-ups across several base-image and tooling edge cases.
21
+ - Preserve trusted verifier path entries and workspace answer files.
22
+ - Redirect oracle output to container log.
23
+ - Align YAML path resolution to config file location.
24
+
5
25
  ## 0.2.2 — 2026-04-13
6
26
 
7
27
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -28,7 +28,7 @@ Requires-Dist: typer>=0.9
28
28
  Provides-Extra: dev
29
29
  Requires-Dist: pre-commit>=3.7; extra == 'dev'
30
30
  Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
31
- Requires-Dist: pytest>=8.0; extra == 'dev'
31
+ Requires-Dist: pytest>=9.0.3; extra == 'dev'
32
32
  Requires-Dist: ruff>=0.7.0; extra == 'dev'
33
33
  Requires-Dist: ty>=0.0.1a1; extra == 'dev'
34
34
  Description-Content-Type: text/markdown
@@ -163,7 +163,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
163
163
  **SkillsBench** (86 tasks — tool use, file editing, API calls):
164
164
 
165
165
  ```bash
166
- python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
166
+ python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml # Claude
167
167
  python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
168
168
  ```
169
169
 
@@ -128,7 +128,7 @@ Tasks are auto-downloaded on first run (cloned into `.ref/`).
128
128
  **SkillsBench** (86 tasks — tool use, file editing, API calls):
129
129
 
130
130
  ```bash
131
- python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
131
+ python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm51.yaml # Claude
132
132
  python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
133
133
  ```
134
134
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -37,7 +37,7 @@ classifiers = [
37
37
  [project.optional-dependencies]
38
38
  dev = [
39
39
  "pre-commit>=3.7",
40
- "pytest>=8.0",
40
+ "pytest>=9.0.3",
41
41
  "pytest-asyncio>=0.24.0",
42
42
  "ruff>=0.7.0",
43
43
  "ty>=0.0.1a1",
@@ -58,20 +58,20 @@ requires = ["hatchling"]
58
58
  build-backend = "hatchling.build"
59
59
 
60
60
  [tool.hatch.build.targets.sdist]
61
- exclude = [
62
- ".venv*",
63
- ".ref",
64
- "jobs",
65
- "dist",
66
- ".claude",
67
- ".dev-docs",
68
- ".pytest_cache",
69
- "__pycache__",
61
+ # Allowlist: only ship what the installed package needs.
62
+ only-include = [
63
+ "src",
64
+ "tests",
65
+ "README.md",
66
+ "CHANGELOG.md",
67
+ "LICENSE",
68
+ "pyproject.toml",
70
69
  ]
71
70
 
72
71
  [tool.pytest.ini_options]
73
72
  asyncio_mode = "auto"
74
73
  addopts = "-m 'not live'"
74
+ testpaths = ["tests"]
75
75
  markers = [
76
76
  "live: requires real Anthropic API and Docker daemon (run with -m live)",
77
77
  ]
@@ -2,6 +2,7 @@
2
2
 
3
3
  import json
4
4
  import logging
5
+ import os
5
6
  import re
6
7
  import shutil
7
8
  from pathlib import Path
@@ -14,6 +15,12 @@ from benchflow.agents.registry import AGENTS
14
15
 
15
16
  logger = logging.getLogger(__name__)
16
17
 
18
+ # Daytona's per-sandbox cap on the default tier is 4 CPU / 8 GB. Tasks declaring
19
+ # more fail at sandbox creation. Clamp here so tasks degrade gracefully (slower
20
+ # build) instead of erroring out. Override via env if running on a paid tier.
21
+ _DAYTONA_MAX_CPUS = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_CPUS", "4"))
22
+ _DAYTONA_MAX_MEMORY_MB = int(os.environ.get("BENCHFLOW_DAYTONA_MAX_MEMORY_MB", "8192"))
23
+
17
24
  # Directories to ignore when copying deps
18
25
  _IGNORE_DIRS = {
19
26
  ".venv",
@@ -253,12 +260,28 @@ def _create_environment(
253
260
  elif environment_type == "daytona":
254
261
  from harbor.environments.daytona import DaytonaEnvironment
255
262
 
263
+ env_config = task.config.environment
264
+ if env_config.cpus > _DAYTONA_MAX_CPUS:
265
+ logger.warning(
266
+ "Clamping cpus %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_CPUS)",
267
+ env_config.cpus,
268
+ _DAYTONA_MAX_CPUS,
269
+ )
270
+ env_config.cpus = _DAYTONA_MAX_CPUS
271
+ if env_config.memory_mb > _DAYTONA_MAX_MEMORY_MB:
272
+ logger.warning(
273
+ "Clamping memory_mb %d -> %d for Daytona (override with BENCHFLOW_DAYTONA_MAX_MEMORY_MB)",
274
+ env_config.memory_mb,
275
+ _DAYTONA_MAX_MEMORY_MB,
276
+ )
277
+ env_config.memory_mb = _DAYTONA_MAX_MEMORY_MB
278
+
256
279
  return DaytonaEnvironment(
257
280
  environment_dir=task.paths.environment_dir,
258
281
  environment_name=task_path.name,
259
282
  session_id=trial_name,
260
283
  trial_paths=trial_paths,
261
- task_env_config=task.config.environment,
284
+ task_env_config=env_config,
262
285
  auto_stop_interval_mins=1440,
263
286
  auto_delete_interval_mins=1440,
264
287
  )
@@ -13,8 +13,11 @@ Does not own:
13
13
 
14
14
  import json as _json
15
15
  import logging
16
+ import os
16
17
  import re
17
18
  import shlex
19
+ import tomllib
20
+ from pathlib import Path
18
21
  from typing import TYPE_CHECKING
19
22
 
20
23
  from benchflow.agents.registry import get_sandbox_home_dirs
@@ -99,7 +102,9 @@ def build_priv_drop_cmd(agent_launch: str, sandbox_user: str) -> str:
99
102
  )
100
103
 
101
104
 
102
- async def setup_sandbox_user(env, sandbox_user: str, workspace: str) -> str:
105
+ async def setup_sandbox_user(
106
+ env, sandbox_user: str, workspace: str, *, timeout_sec: int = 120
107
+ ) -> str:
103
108
  """Create non-root sandbox user, grant workspace access. Return agent_cwd."""
104
109
  if not re.match(r"^[a-z_][a-z0-9_-]*$", sandbox_user):
105
110
  raise ValueError(
@@ -119,7 +124,7 @@ async def setup_sandbox_user(env, sandbox_user: str, workspace: str) -> str:
119
124
  f"cp -a /root/$d/. /home/{sandbox_user}/$d/ 2>/dev/null || true; fi; done && "
120
125
  f"chown -R {sandbox_user}:{sandbox_user} /home/{sandbox_user} && "
121
126
  f"chown -R {sandbox_user}:{sandbox_user} {shlex.quote(workspace)}",
122
- timeout_sec=30,
127
+ timeout_sec=timeout_sec,
123
128
  )
124
129
  logger.info(f"Sandbox user {sandbox_user} ready (workspace={workspace})")
125
130
  return workspace
@@ -314,8 +319,15 @@ VERIFIER_ENV: dict[str, str] = {
314
319
  "PYTHONNOUSERSITE": "1",
315
320
  "PIP_USER": "0",
316
321
  "PIP_NO_USER_CONFIG": "1",
317
- # Force HOME to a non-existent path so pip cannot read any pre-staged pip.conf.
318
- "HOME": "/nonexistent",
322
+ # PEP-668 base images (Fedora, recent Debian) refuse pip installs into
323
+ # system-site without this flag. Verifier runs as root and system-site is
324
+ # root-owned, so allowing it is safe; without it, tasks that pip-install
325
+ # pytest in test.sh either fail outright or fall back to a user-site path
326
+ # that PYTHONNOUSERSITE=1 hides at import time.
327
+ "PIP_BREAK_SYSTEM_PACKAGES": "1",
328
+ # /root is root-owned; sandbox_user cannot pre-stage caches there. Pip
329
+ # config is already blocked by the PIP_* / PYTHONNOUSERSITE vars above.
330
+ "HOME": "/root",
319
331
  # Disable breakpoint() — any other value imports an arbitrary callable.
320
332
  "PYTHONBREAKPOINT": "0",
321
333
  # Prevent coverage.py from importing a config file as Python on startup.
@@ -325,6 +337,258 @@ VERIFIER_ENV: dict[str, str] = {
325
337
  "CELERY_CONFIG_MODULE": "",
326
338
  }
327
339
 
340
+ _SAFE_VERIFIER_PATH = VERIFIER_ENV["PATH"]
341
+ _SAFE_VERIFIER_PATH_PARTS = tuple(_SAFE_VERIFIER_PATH.split(":"))
342
+ _RUNTIME_PATH_PREFIXES = ("/tmp", "/var/tmp", "/logs", "/testbed")
343
+
344
+ # pytest plugin names are not always the same as the PyPI distribution name
345
+ # or the option they register. These aliases cover the common benchmark
346
+ # verifier plugins while preserving PYTEST_DISABLE_PLUGIN_AUTOLOAD=1.
347
+ _PYTEST_PLUGIN_ALIASES = {
348
+ "ctrf": "ctrf",
349
+ "pytest-json-ctrf": "ctrf",
350
+ "pytest_json_ctrf": "ctrf",
351
+ "pytest_json_ctrf.plugin": "ctrf",
352
+ "pytest-json-report": "pytest_jsonreport",
353
+ "pytest_json_report": "pytest_jsonreport",
354
+ "pytest_jsonreport": "pytest_jsonreport",
355
+ "pytest_jsonreport.plugin": "pytest_jsonreport",
356
+ }
357
+ _PYTEST_OPTION_PLUGINS = {
358
+ "--ctrf": "ctrf",
359
+ "--json-report": "pytest_jsonreport",
360
+ "--json-report-file": "pytest_jsonreport",
361
+ }
362
+
363
+ # Pytest plugins worth auto-loading when test.sh pip-installs them but the
364
+ # task author forgot to declare pytest_plugins in task.toml. Map distribution
365
+ # name (as it appears in `pip install pytest-foo`) to importable plugin name.
366
+ _PYTEST_INSTALLED_PLUGINS = {
367
+ "pytest-asyncio": "pytest_asyncio",
368
+ "pytest-anyio": "anyio.pytest_plugin",
369
+ "pytest-trio": "pytest_trio",
370
+ }
371
+ _PIP_INSTALL_RE = re.compile(r"\bpip3?\s+install\b[^\n;|&]*", re.IGNORECASE)
372
+
373
+
374
+ def _under_path(path: str, prefix: str) -> bool:
375
+ prefix = prefix.rstrip("/")
376
+ return path == prefix or path.startswith(f"{prefix}/")
377
+
378
+
379
+ def _blocked_verifier_path_prefixes(
380
+ sandbox_user: str | None, workspace: str | None
381
+ ) -> tuple[str, ...]:
382
+ """Paths that must never be preserved as verifier PATH extras."""
383
+ prefixes = list(_RUNTIME_PATH_PREFIXES)
384
+ if workspace:
385
+ prefixes.append(workspace)
386
+ if sandbox_user:
387
+ prefixes.append(f"/home/{sandbox_user}")
388
+ return tuple(dict.fromkeys(prefixes))
389
+
390
+
391
+ def _merge_trusted_verifier_path(extras: list[str]) -> str:
392
+ """Prepend validated image PATH entries to the verifier allowlist."""
393
+ kept: list[str] = []
394
+ seen: set[str] = set(_SAFE_VERIFIER_PATH_PARTS)
395
+ for entry in extras:
396
+ if entry and entry not in seen:
397
+ seen.add(entry)
398
+ kept.append(entry)
399
+ return ":".join([*kept, *_SAFE_VERIFIER_PATH_PARTS])
400
+
401
+
402
+ _TRUSTED_PATH_EXTRAS_SCRIPT = r"""
403
+ import json
404
+ import os
405
+ import stat
406
+ import sys
407
+
408
+ raw_path = json.loads(sys.argv[1])
409
+ safe_parts = set(json.loads(sys.argv[2]))
410
+ blocked_prefixes = tuple(json.loads(sys.argv[3]))
411
+
412
+
413
+ def under_path(path, prefix):
414
+ prefix = prefix.rstrip("/")
415
+ return path == prefix or path.startswith(prefix + "/")
416
+
417
+
418
+ trusted = []
419
+ seen = set(safe_parts)
420
+ for entry in raw_path.split(":"):
421
+ entry = entry.strip()
422
+ if (
423
+ not entry
424
+ or entry in seen
425
+ or not entry.startswith("/")
426
+ or "\x00" in entry
427
+ or "\n" in entry
428
+ ):
429
+ continue
430
+ seen.add(entry)
431
+ try:
432
+ real = os.path.realpath(entry)
433
+ st = os.stat(real)
434
+ except OSError:
435
+ continue
436
+ if not stat.S_ISDIR(st.st_mode):
437
+ continue
438
+ if any(under_path(real, prefix) for prefix in blocked_prefixes):
439
+ continue
440
+ if st.st_uid != 0:
441
+ continue
442
+ if st.st_mode & (stat.S_IWGRP | stat.S_IWOTH):
443
+ continue
444
+ trusted.append(entry)
445
+ print(json.dumps(trusted))
446
+ """.strip()
447
+
448
+
449
+ def _trusted_path_extras_cmd(raw_path: str, blocked_prefixes: tuple[str, ...]) -> str:
450
+ """Build the container-side command that validates verifier PATH extras."""
451
+ return (
452
+ f"python3 -c {shlex.quote(_TRUSTED_PATH_EXTRAS_SCRIPT)} "
453
+ f"{shlex.quote(_json.dumps(raw_path))} "
454
+ f"{shlex.quote(_json.dumps(_SAFE_VERIFIER_PATH_PARTS))} "
455
+ f"{shlex.quote(_json.dumps(blocked_prefixes))}"
456
+ )
457
+
458
+
459
+ def _normalize_pytest_plugin(name: object) -> str | None:
460
+ """Return the importable pytest plugin name for a task declaration."""
461
+ if not isinstance(name, str):
462
+ return None
463
+ clean = name.strip()
464
+ if not clean:
465
+ return None
466
+ return _PYTEST_PLUGIN_ALIASES.get(clean, clean)
467
+
468
+
469
+ def _plugins_from_verifier_script(task: "Task") -> list[str]:
470
+ """Infer known pytest plugins needed by legacy verifier scripts.
471
+
472
+ Older SkillsBench/TB2 tasks predate task-level pytest plugin metadata and
473
+ call options such as --ctrf directly from tests/test.sh. With pytest entry
474
+ point autoload disabled, those options must be backed by explicit -p flags.
475
+ """
476
+ task_dir = getattr(task, "task_dir", None)
477
+ if not isinstance(task_dir, (str, os.PathLike)):
478
+ return []
479
+ test_sh = Path(task_dir) / "tests" / "test.sh"
480
+ try:
481
+ content = test_sh.read_text()
482
+ except OSError:
483
+ return []
484
+
485
+ plugins: list[str] = []
486
+ for option, plugin in _PYTEST_OPTION_PLUGINS.items():
487
+ if option in content and plugin not in plugins:
488
+ plugins.append(plugin)
489
+ # Detect pip-installed pytest plugins so PYTEST_DISABLE_PLUGIN_AUTOLOAD=1
490
+ # doesn't silently drop them. Only matches the exact installer line so
491
+ # arbitrary text mentioning the plugin name is ignored.
492
+ for match in _PIP_INSTALL_RE.findall(content):
493
+ for dist, plugin in _PYTEST_INSTALLED_PLUGINS.items():
494
+ if dist in match and plugin not in plugins:
495
+ plugins.append(plugin)
496
+ return plugins
497
+
498
+
499
+ def _declared_pytest_plugins(task: "Task") -> list[object]:
500
+ """Return pytest_plugins from the model, falling back to raw task.toml."""
501
+ declared = getattr(task.config.verifier, "pytest_plugins", None)
502
+ if declared:
503
+ return list(declared)
504
+
505
+ task_dir = getattr(task, "task_dir", None)
506
+ if not isinstance(task_dir, (str, os.PathLike)):
507
+ return []
508
+ config_path = Path(task_dir) / "task.toml"
509
+ try:
510
+ data = tomllib.loads(config_path.read_text())
511
+ except (OSError, tomllib.TOMLDecodeError):
512
+ return []
513
+ plugins = data.get("verifier", {}).get("pytest_plugins", [])
514
+ if isinstance(plugins, list):
515
+ return plugins
516
+ return []
517
+
518
+
519
+ def _pytest_plugin_flags(task: "Task") -> str:
520
+ """Build deterministic -p flags for inferred and declared pytest plugins."""
521
+ plugins: list[str] = []
522
+ for plugin in _plugins_from_verifier_script(task):
523
+ if plugin not in plugins:
524
+ plugins.append(plugin)
525
+ for plugin in _declared_pytest_plugins(task):
526
+ normalized = _normalize_pytest_plugin(plugin)
527
+ if normalized and normalized not in plugins:
528
+ plugins.append(normalized)
529
+ return " ".join(f"-p {shlex.quote(p)}" for p in plugins)
530
+
531
+
532
+ _FEDORA_LIKE = ("fedora", "rhel", "centos", "rocky", "alma")
533
+
534
+
535
+ async def _distro_pip_env(env) -> dict[str, str]:
536
+ """Distro-conditional pip env to neutralize Fedora's user-install fallback.
537
+
538
+ Fedora's downstream pip patch routes root pip-installs to ~/.local/lib
539
+ even with PIP_USER=0 + PIP_BREAK_SYSTEM_PACKAGES=1. PYTHONNOUSERSITE=1 then
540
+ hides those installs from python3 at import time. Pinning PIP_PREFIX on
541
+ Fedora-likes only writes them to /usr/local where python3 can find them.
542
+
543
+ Setting PIP_PREFIX on Debian/Ubuntu would double-prefix (their downstream
544
+ pip already injects --prefix=/usr/local for root), creating
545
+ /usr/local/usr/local/bin/pytest. So this is conditional on the image distro.
546
+ """
547
+ try:
548
+ result = await env.exec(
549
+ "cat /etc/os-release 2>/dev/null || true", user="root", timeout_sec=5
550
+ )
551
+ except Exception:
552
+ return {}
553
+ text = (result.stdout or "").lower()
554
+ ids: list[str] = []
555
+ for line in text.splitlines():
556
+ if line.startswith("id=") or line.startswith("id_like="):
557
+ value = line.split("=", 1)[1].strip().strip('"').strip("'")
558
+ ids.extend(value.split())
559
+ if any(d in ids for d in _FEDORA_LIKE):
560
+ return {"PIP_PREFIX": "/usr/local"}
561
+ return {}
562
+
563
+
564
+ async def _trusted_verifier_path(
565
+ env, sandbox_user: str | None, workspace: str | None
566
+ ) -> str:
567
+ """Return verifier PATH with trusted image extras preserved.
568
+
569
+ Dockerfile PATH additions are accepted only after container-side stat
570
+ checks prove they are root-owned directories and not group/world writable.
571
+ Runtime locations and sandbox-user writable locations stay excluded.
572
+ """
573
+ path_result = await env.exec("printenv PATH", user="root", timeout_sec=10)
574
+ raw_path = path_result.stdout or ""
575
+ if not raw_path.strip():
576
+ return _SAFE_VERIFIER_PATH
577
+ cmd = _trusted_path_extras_cmd(
578
+ raw_path, _blocked_verifier_path_prefixes(sandbox_user, workspace)
579
+ )
580
+ result = await env.exec(cmd, user="root", timeout_sec=10)
581
+ try:
582
+ extras = _json.loads(result.stdout or "[]")
583
+ except _json.JSONDecodeError:
584
+ logger.warning("Could not parse trusted verifier PATH extras; using safe PATH")
585
+ extras = []
586
+ if not isinstance(extras, list):
587
+ logger.warning("Invalid trusted verifier PATH extras; using safe PATH")
588
+ extras = []
589
+ return _merge_trusted_verifier_path([e for e in extras if isinstance(e, str)])
590
+
591
+
328
592
  # Wipe and recreate /logs/verifier/ before the verifier runs.
329
593
  # rm -rf severs hardlinks, removes symlink replacements, and eliminates
330
594
  # variant filenames/subdirs the agent may have pre-staged.
@@ -355,18 +619,24 @@ CLEANUP_CMD = (
355
619
 
356
620
 
357
621
  async def harden_before_verify(
358
- env, task: "Task", sandbox_user: str | None, workspace: str | None = None
622
+ env,
623
+ task: "Task",
624
+ sandbox_user: str | None,
625
+ workspace: str | None = None,
626
+ # Default false because SkillsBench/TB2-style answers often are workspace
627
+ # edits. Going forward, enforce true only via an explicit task/benchmark
628
+ # contract, e.g. task.toml [verifier] restore_workspace = true after an
629
+ # oracle/diff audit proves the answer is not stored in the workspace.
630
+ restore_workspace: bool = False,
359
631
  ) -> None:
360
632
  """Neutralize agent tampering before running the verifier.
361
633
 
362
634
  1. Kill sandbox-user processes (prevent concurrent writes during teardown).
363
635
  2. Assert all sandbox-user processes are dead, then wipe/recreate
364
636
  /logs/verifier/ with a clean root-owned directory.
365
- 3. Restore build-config files to pre-agent state (if workspace provided).
366
- 4. Sync restored build-config files into /testbed_verify (if workspace provided).
367
- 4b.Full workspace restore from /testbed_verify resets ALL source files to
368
- pre-agent canonical state, not just the build-config subset.
369
- 4c.Purge symlinks and __pycache__ trees from workspace.
637
+ 3. Optionally restore the workspace from the pre-agent snapshot. This is
638
+ destructive to legitimate workspace-edit answers, so it is opt-in.
639
+ 4. Purge symlinks and __pycache__ trees from workspace.
370
640
  5. chown workspace to root (belt-and-suspenders against zombie sandbox writes).
371
641
  6. Remove injected conftest.py, sitecustomize.py, .pth files.
372
642
  7. Merge trusted env vars into task.config.verifier.env.
@@ -386,7 +656,7 @@ async def harden_before_verify(
386
656
  )
387
657
  # Wipe and recreate /logs/verifier/ with a clean root-owned directory.
388
658
  await env.exec(_CLEAR_VERIFIER_DIR_CMD, user="root")
389
- if workspace:
659
+ if workspace and restore_workspace:
390
660
  await _restore_build_config(env, workspace)
391
661
  await _refresh_verifier_workspace(env, workspace)
392
662
  # Full workspace restore from /testbed_verify (the pre-agent canonical
@@ -402,18 +672,41 @@ async def harden_before_verify(
402
672
  f"fi",
403
673
  user="root",
404
674
  )
405
- # Purge symlinks so an agent-planted symlink pointing to a writable
406
- # external target (e.g. utils.py -> /tmp/evil.py) cannot be followed
407
- # during import after the workspace is frozen.
675
+ if workspace:
676
+ # Purge symlinks whose realpath escapes the workspace (e.g. agent-planted
677
+ # utils.py -> /tmp/evil.py). In-tree symlinks are preserved — some
678
+ # source trees (OTP, lake-packages, etc.) ship them as fixtures.
679
+ # Residual risk: agent plants utils.py -> ./evil.py inside workspace.
680
+ # Mitigated by the workspace chown below + CLEANUP_CMD path scrubbing.
681
+ purge_external_symlinks = (
682
+ "import os, sys, pathlib;"
683
+ "root = pathlib.Path(sys.argv[1]).resolve();"
684
+ "root_str = str(root) + os.sep;"
685
+ "[p.unlink() for p in root.rglob('*')"
686
+ " if p.is_symlink()"
687
+ " and not (str(p.resolve()) + os.sep).startswith(root_str)"
688
+ " and str(p.resolve()) != str(root)]"
689
+ )
408
690
  await env.exec(
409
- f"find {shlex.quote(workspace)} -type l -delete 2>/dev/null; true",
691
+ f"python3 -c {shlex.quote(purge_external_symlinks)} "
692
+ f"{shlex.quote(workspace)} 2>/dev/null; true",
410
693
  user="root",
411
694
  )
412
- # Purge __pycache__ trees so pre-compiled .pyc bytecode cannot execute
413
- # even if PYTHONPYCACHEPREFIX is bypassed (defense-in-depth).
695
+ # Purge __pycache__ trees that did not exist in the pre-agent baseline,
696
+ # so agent-planted .pyc bytecode cannot execute even if
697
+ # PYTHONPYCACHEPREFIX is bypassed. Baseline-present caches are kept so
698
+ # tasks whose tests diff workspace against /testbed_verify don't break.
414
699
  await env.exec(
415
- f"find {shlex.quote(workspace)} -type d -name '__pycache__'"
416
- f" -exec rm -rf {{}} + 2>/dev/null; true",
700
+ f"if [ -d /testbed_verify ]; then "
701
+ f" find {shlex.quote(workspace)} -type d -name __pycache__ -print0 "
702
+ f" | while IFS= read -r -d '' d; do "
703
+ f" rel=${{d#{shlex.quote(workspace)}/}}; "
704
+ f' [ -d "/testbed_verify/$rel" ] || rm -rf "$d"; '
705
+ f" done; "
706
+ f"else "
707
+ f" find {shlex.quote(workspace)} -type d -name '__pycache__'"
708
+ f" -exec rm -rf {{}} + 2>/dev/null; "
709
+ f"fi; true",
417
710
  user="root",
418
711
  )
419
712
  # chown workspace to root: belt-and-suspenders against any zombie
@@ -424,24 +717,29 @@ async def harden_before_verify(
424
717
  )
425
718
  await env.exec(CLEANUP_CMD, user="root", timeout_sec=10)
426
719
 
720
+ hardened_path = await _trusted_verifier_path(env, sandbox_user, workspace)
721
+ distro_env = await _distro_pip_env(env)
722
+
427
723
  verifier_env = dict(VERIFIER_ENV)
724
+ verifier_env.update(distro_env)
428
725
  if task.config.verifier.env:
429
726
  verifier_env.update(task.config.verifier.env)
430
727
  # Hard security invariants — re-pin after task-env merge so a task cannot
431
- # strip -c /dev/null / --confcutdir, re-enable entry-point plugin loading,
432
- # or inject code via breakpoint()/coverage/Django/Celery startup hooks.
728
+ # replace PATH, strip -c /dev/null / --confcutdir, re-enable entry-point
729
+ # plugin loading, or inject code via breakpoint()/coverage/Django/Celery
730
+ # startup hooks.
731
+ verifier_env["PATH"] = hardened_path
433
732
  verifier_env["PYTEST_DISABLE_PLUGIN_AUTOLOAD"] = "1"
434
733
  verifier_env["PYTHONBREAKPOINT"] = "0"
435
734
  verifier_env["COVERAGE_PROCESS_START"] = ""
436
735
  verifier_env["DJANGO_SETTINGS_MODULE"] = ""
437
736
  verifier_env["CELERY_CONFIG_MODULE"] = ""
438
- # Re-enable explicitly declared plugins by appending -p flags to the
439
- # hardened base — never to a task-supplied PYTEST_ADDOPTS.
440
- # getattr: field absent in older harbor deployments; bare access was a live crash.
441
- allowed_plugins = getattr(task.config.verifier, "pytest_plugins", None) or []
737
+ # Re-enable known verifier plugins by appending -p flags to the hardened
738
+ # base — never to a task-supplied PYTEST_ADDOPTS. Legacy task sets are
739
+ # inferred from tests/test.sh; newer tasks may declare pytest_plugins.
442
740
  base_addopts = VERIFIER_ENV["PYTEST_ADDOPTS"]
443
- if allowed_plugins:
444
- flags = " ".join(f"-p {shlex.quote(p)}" for p in allowed_plugins)
741
+ flags = _pytest_plugin_flags(task)
742
+ if flags:
445
743
  verifier_env["PYTEST_ADDOPTS"] = base_addopts + f" {flags}"
446
744
  else:
447
745
  verifier_env["PYTEST_ADDOPTS"] = base_addopts
@@ -243,14 +243,14 @@ class Job:
243
243
 
244
244
  # Detect format: Harbor uses "agents" + "datasets", benchflow uses "agent"
245
245
  if "agents" in raw or "datasets" in raw:
246
- return cls._from_harbor_yaml(raw, path.parent, **kwargs)
247
- return cls._from_native_yaml(raw, path.parent, **kwargs)
246
+ return cls._from_harbor_yaml(raw, **kwargs)
247
+ return cls._from_native_yaml(raw, **kwargs)
248
248
 
249
249
  @classmethod
250
- def _from_native_yaml(cls, raw: dict, base_dir: Path, **kwargs) -> "Job":
250
+ def _from_native_yaml(cls, raw: dict, **kwargs) -> "Job":
251
251
  """Parse benchflow-native YAML."""
252
- tasks_dir = base_dir / raw["tasks_dir"]
253
- jobs_dir = base_dir / raw.get("jobs_dir", "jobs")
252
+ tasks_dir = Path(raw["tasks_dir"])
253
+ jobs_dir = Path(raw.get("jobs_dir", "jobs"))
254
254
 
255
255
  # Parse prompts — YAML null becomes Python None
256
256
  prompts = raw.get("prompts")
@@ -268,9 +268,7 @@ class Job:
268
268
  prompts=prompts,
269
269
  agent_env=agent_env_raw,
270
270
  retry=RetryConfig(max_retries=raw.get("max_retries", 2)),
271
- skills_dir=str(base_dir / raw["skills_dir"])
272
- if raw.get("skills_dir")
273
- else None,
271
+ skills_dir=str(Path(raw["skills_dir"])) if raw.get("skills_dir") else None,
274
272
  sandbox_user=sandbox_user,
275
273
  sandbox_locked_paths=sandbox_locked_paths,
276
274
  exclude_tasks=exclude,
@@ -278,7 +276,7 @@ class Job:
278
276
  return cls(tasks_dir=tasks_dir, jobs_dir=jobs_dir, config=config, **kwargs)
279
277
 
280
278
  @classmethod
281
- def _from_harbor_yaml(cls, raw: dict, base_dir: Path, **kwargs) -> "Job":
279
+ def _from_harbor_yaml(cls, raw: dict, **kwargs) -> "Job":
282
280
  """Parse Harbor-compatible YAML."""
283
281
  # Agent
284
282
  agents = raw.get("agents", [{}])
@@ -306,20 +304,20 @@ class Job:
306
304
 
307
305
  # Datasets
308
306
  datasets = raw.get("datasets", [{}])
309
- tasks_dir = base_dir / datasets[0].get("path", "tasks")
307
+ tasks_dir = Path(datasets[0].get("path", "tasks"))
310
308
 
311
309
  # Orchestrator
312
310
  orch = raw.get("orchestrator", {})
313
311
  concurrency = orch.get("n_concurrent_trials", 4)
314
312
 
315
- jobs_dir = base_dir / raw.get("jobs_dir", "jobs")
313
+ jobs_dir = Path(raw.get("jobs_dir", "jobs"))
316
314
  max_retries = (
317
315
  raw.get("n_attempts", 1) - 1
318
316
  ) # Harbor n_attempts includes first try
319
317
 
320
318
  # Skills dir (shared with benchflow-native format)
321
319
  skills_dir_raw = raw.get("skills_dir")
322
- skills_dir = str(base_dir / skills_dir_raw) if skills_dir_raw else None
320
+ skills_dir = str(Path(skills_dir_raw)) if skills_dir_raw else None
323
321
  sandbox_user = raw.get("sandbox_user", "agent")
324
322
  sandbox_locked_paths = raw.get("sandbox_locked_paths")
325
323