PyPI - benchflow - Versions diffs - 0.3.4__tar.gz → 0.4.0__tar.gz - Mend

benchflow 0.3.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (227) hide show

{benchflow-0.3.4 → benchflow-0.4.0}/.gitignore RENAMED Viewed

@@ -184,3 +184,4 @@ tmp/
 tests/.smoke-jobs/
 context/
 tutorials/
+.playwright-mcp/

{benchflow-0.3.4 → benchflow-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchflow
-Version: 0.3.4
+Version: 0.4.0
 Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
 Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
 Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -11,7 +11,7 @@ Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@g
 Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
 License: Apache-2.0
 License-File: LICENSE
-Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
+Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.12
 Requires-Dist: anyio>=4.0
-Requires-Dist: harbor==0.3.0
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: pyyaml>=6.0
@@ -33,6 +32,12 @@ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
 Requires-Dist: pytest>=9.0.3; extra == 'dev'
 Requires-Dist: ruff>=0.7.0; extra == 'dev'
 Requires-Dist: ty>=0.0.1a1; extra == 'dev'
+Provides-Extra: sandbox-daytona
+Requires-Dist: daytona>=0.153.0; extra == 'sandbox-daytona'
+Requires-Dist: tenacity>=8.0; extra == 'sandbox-daytona'
+Provides-Extra: sandbox-modal
+Requires-Dist: modal>=0.73; extra == 'sandbox-modal'
+Requires-Dist: tenacity>=8.0; extra == 'sandbox-modal'
 Description-Content-Type: text/markdown
 <div align="center">
@@ -116,7 +121,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
 ## Featured
-- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
+- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
 ## Research artifacts
@@ -130,7 +135,7 @@ Two runnable labs validate the security story:
 - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
 - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
 - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
-- **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
+- **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
 ## Contributing

{benchflow-0.3.4 → benchflow-0.4.0}/README.md RENAMED Viewed

@@ -79,7 +79,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
 ## Featured
-- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
+- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
 ## Research artifacts
@@ -93,7 +93,7 @@ Two runnable labs validate the security story:
 - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
 - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
 - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
-- **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
+- **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
 ## Contributing

{benchflow-0.3.4 → benchflow-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,12 +1,11 @@
 [project]
 name = "benchflow"
-version = "0.3.4"
+version = "0.4.0"
 description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
 readme = "README.md"
 requires-python = ">=3.12"
-keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "terminal-bench", "skillsbench"]
+keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "skillsbench"]
 dependencies = [
-    "harbor==0.3.0",
     "httpx>=0.27.0",
     "anyio>=4.0",
     "pydantic>=2.0",
@@ -42,6 +41,14 @@ dev = [
     "ruff>=0.7.0",
     "ty>=0.0.1a1",
 ]
+sandbox-daytona = [
+    "daytona>=0.153.0",
+    "tenacity>=8.0",
+]
+sandbox-modal = [
+    "modal>=0.73",
+    "tenacity>=8.0",
+]
 bedrock = [
     "boto3>=1.40",
 ]
@@ -114,5 +121,24 @@ ignore = [
 [tool.ty.environment]
 python-version = "3.12"
+[tool.ty.rules]
+# Many modules lazily import optional deps (daytona, modal, openai, toml, …).
+# These are guarded by try/except at runtime but ty can't resolve them in CI.
+unresolved-import = "ignore"
 [tool.ty.src]
 include = ["src"]
+# Modules that heavily use optional-dep types (daytona, modal, openai, boto3, …)
+# produce cascading type errors when those packages aren't installed.
+exclude = [
+    "src/benchflow/sandbox/daytona.py",
+    "src/benchflow/sandbox/modal_impl.py",
+    "src/benchflow/sandbox/docker.py",
+    "src/benchflow/sandbox/_base.py",
+    "src/benchflow/_env_setup.py",
+    "src/benchflow/rewards/llm.py",
+    "src/benchflow/rewards/file_readers.py",
+    "src/benchflow/rewards/rubric_config.py",
+    "src/benchflow/providers/bedrock_runtime.py",
+    "src/benchflow/experimental/mcp/reviewer_server.py",
+]

{benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/__init__.py RENAMED Viewed

@@ -1,10 +1,12 @@
 """benchflow — ACP-native agent benchmarking framework.
-Re-exports environment APIs and adds:
+Public API surface:
+- Sandbox protocol for isolated execution environments
 - ACP client for multi-turn agent communication
 - Trajectory capture (HTTP proxy, OTel collector, ACP native)
 - Rollout lifecycle for single-task execution
 - Evaluation orchestration with retries and concurrency
+- Rewards protocol (composable Rubric + RewardFunc)
 - Metrics collection and aggregation
 """
@@ -12,23 +14,9 @@ from importlib.metadata import version as _version
 __version__ = _version("benchflow")
-# Re-export Harbor's core types for downstream task authors
-from harbor import (
-    BaseAgent,
-    BaseEnvironment,
-    ExecResult,
-    Task,
-    TaskConfig,
-    Verifier,
-    VerifierResult,
-)
-# benchflow's additions
-from benchflow._env_setup import stage_dockerfile_deps
-from benchflow._scene import MailboxTransport, Message, MessageTransport, SceneRole
-from benchflow._scene import Scene as SceneRuntime
-from benchflow._snapshot import list_snapshots, restore, snapshot
+# Core types
 from benchflow._types import Role, Scene, Turn
+from benchflow._utils.yaml_loader import rollout_config_from_yaml
 from benchflow.acp.client import ACPClient
 from benchflow.acp.session import ACPSession
 from benchflow.adapters import (
@@ -45,12 +33,6 @@ from benchflow.agents.registry import (
     list_agents,
     register_agent,
 )
-from benchflow.environments import (
-    SERVICES,
-    build_service_hooks,
-    detect_services_from_dockerfile,
-    register_service,
-)
 from benchflow.evaluation import (
     Evaluation,
     EvaluationConfig,
@@ -85,32 +67,39 @@ from benchflow.runtime import (
     RuntimeResult,
     run,
 )  # bf.run() — supports Agent, RolloutConfig, and str calling conventions
+from benchflow.sandbox import (
+    SERVICES,
+    ImageBuilder,
+    ImageConfig,
+    ImageRef,
+    Sandbox,
+    build_service_hooks,
+    detect_services_from_dockerfile,
+    register_service,
+)
-# Sandbox protocol (v0.4 — parallel types, Harbor not yet removed)
+# Sandbox protocol (v0.4)
 from benchflow.sandbox import ExecResult as SandboxExecResult
-from benchflow.sandbox import ImageBuilder, ImageConfig, ImageRef, Sandbox
+from benchflow.sandbox.protocol import ExecResult
+from benchflow.sandbox.setup import stage_dockerfile_deps
+from benchflow.sandbox.snapshot import list_snapshots, restore, snapshot
+from benchflow.sandbox.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
+from benchflow.scenes import MailboxTransport, Message, MessageTransport, SceneRole
+from benchflow.scenes import Scene as SceneRuntime
 from benchflow.sdk import SDK
 from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
+from benchflow.task import (
+    Task,
+    TaskConfig,
+    Verifier,
+    VerifierResult,
+)
 from benchflow.trajectories.otel import OTelCollector
 from benchflow.trajectories.proxy import TrajectoryProxy
 from benchflow.trajectories.types import Trajectory
-from benchflow.trial_yaml import trial_config_from_yaml
-from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
-# Backward-compat aliases
-Trial = Rollout
-TrialConfig = RolloutConfig
-TrialRole = Role
-TrialScene = Scene
-RunResult = RolloutResult
-Job = Evaluation
-JobConfig = EvaluationConfig
-JobResult = EvaluationResult
 # Public API surface. Anything not in this list is implementation detail and
-# may change without notice. Names are grouped by source module to match the
-# imports above and to make it obvious to a future agent which module owns
-# what.
+# may change without notice.
 __all__ = [
     "__version__",
     # Rewards protocol (v0.4)
@@ -122,21 +111,17 @@ __all__ = [
     "LLMJudgeRewardFunc",
     "StringMatchRewardFunc",
     "CodeExecRewardFunc",
-    # Rubric config (ENG-55)
     "Criterion",
     "JudgeConfig",
     "RubricConfig",
     "ScoringConfig",
     "load_rubric_toml",
-    # Sandbox protocol (v0.4)
+    # Sandbox protocol
     "Sandbox",
     "SandboxExecResult",
     "ImageBuilder",
     "ImageConfig",
     "ImageRef",
-    # Harbor re-exports
-    "BaseAgent",
-    "BaseEnvironment",
     "ExecResult",
     "Task",
     "TaskConfig",
@@ -152,15 +137,11 @@ __all__ = [
     "is_vertex_model",
     "list_agents",
     "register_agent",
-    # Evaluation orchestration (new names)
+    # Evaluation orchestration
     "Evaluation",
     "EvaluationConfig",
     "EvaluationResult",
     "RetryConfig",
-    # Backward-compat aliases for Job
-    "Job",
-    "JobConfig",
-    "JobResult",
     # Metrics
     "BenchmarkMetrics",
     "collect_metrics",
@@ -168,8 +149,7 @@ __all__ = [
     "AgentInstallError",
     "AgentTimeoutError",
     "RolloutResult",
-    "RunResult",
-    # Runtime (0.3 compat)
+    # Runtime
     "Agent",
     "Environment",
     "Runtime",
@@ -177,7 +157,7 @@ __all__ = [
     "RuntimeResult",
     # Single entry point
     "run",
-    # Canonical declarative types (_types.py — ENG-47)
+    # Declarative types
     "Role",
     "Scene",
     "Turn",
@@ -191,23 +171,18 @@ __all__ = [
     "snapshot",
     "restore",
     "list_snapshots",
-    # Rollout (single execution path — ENG-46)
+    # Rollout
     "Rollout",
     "RolloutConfig",
-    # Backward-compat aliases for Trial
-    "Trial",
-    "TrialConfig",
-    "TrialRole",
-    "TrialScene",
-    "trial_config_from_yaml",
+    "rollout_config_from_yaml",
     # User abstraction (progressive disclosure)
     "BaseUser",
     "FunctionUser",
     "PassthroughUser",
     "RoundResult",
-    # SDK (backwards compat)
+    # SDK
     "SDK",
-    # Environments / dep staging
+    # Sandbox services
     "SERVICES",
     "build_service_hooks",
     "detect_services_from_dockerfile",
@@ -222,7 +197,7 @@ __all__ = [
     "OTelCollector",
     "TrajectoryProxy",
     "Trajectory",
-    # External adapters (ENG-51)
+    # External adapters
     "InspectAdapter",
     "ORSAdapter",
     "to_inspect_task",
@@ -231,8 +206,7 @@ __all__ = [
 def __getattr__(name: str):
-    """Fall through to harbor for names not explicitly re-exported."""
-    # Let Python's normal submodule resolution handle subpackages first.
+    """Lazy submodule resolution."""
     import importlib
     try:
@@ -240,16 +214,4 @@ def __getattr__(name: str):
     except ModuleNotFoundError as e:
         if e.name != f"benchflow.{name}":
             raise
-    import harbor
-    if hasattr(harbor, name):
-        import warnings
-        warnings.warn(
-            f"'{name}' is not directly re-exported by benchflow. Use 'from harbor import {name}' instead.",
-            ImportWarning,
-            stacklevel=2,
-        )
-        return getattr(harbor, name)
     raise AttributeError(f"module 'benchflow' has no attribute {name!r}")

benchflow-0.4.0/src/benchflow/_dotenv.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Small `.env` reader shared by CLI/runtime env resolution."""
+from __future__ import annotations
+import os
+from pathlib import Path
+_DEFAULT_DOTENV_PATH = Path(".env")
+_DOTENV_PATH_ENV = "BENCHFLOW_DOTENV_PATH"
+def load_dotenv_env(path: str | Path | None = None) -> dict[str, str]:
+    """Read a local `.env` file into a plain dict.
+    Missing files are treated as empty input. `BENCHFLOW_DOTENV_PATH` lets tests
+    or callers override the implicit `.env` lookup without changing cwd.
+    """
+    if path is not None:
+        dotenv_path = Path(path)
+    else:
+        dotenv_path = Path(os.environ.get(_DOTENV_PATH_ENV, _DEFAULT_DOTENV_PATH))
+    if not dotenv_path.exists() or not dotenv_path.is_file():
+        return {}
+    parsed: dict[str, str] = {}
+    for raw_line in dotenv_path.read_text().splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        if line.startswith("export "):
+            line = line[len("export ") :].lstrip()
+        if "=" not in line:
+            continue
+        key, value = line.split("=", 1)
+        key = key.strip()
+        value = value.strip()
+        if not key:
+            continue
+        if value[:1] in {"'", '"'} and value[-1:] == value[:1]:
+            value = value[1:-1]
+        elif " #" in value:
+            value = value.split(" #", 1)[0].rstrip()
+        parsed[key] = value
+    return parsed

benchflow-0.4.0/src/benchflow/_utils/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""benchflow._utils — small periphery I/O glue, private.
+Holds small (<200 LOC) periphery modules that translate between external
+artifacts (YAML files, git repos, scaffolded task dirs) and benchflow
+shapes.
+Members:
+    yaml_loader      — YAML → RolloutConfig/EvaluationConfig
+    benchmark_repos  — clone benchmark repos
+    task_authoring   — init_task / check_task scaffolding
+"""

benchflow-0.3.4/src/benchflow/task_download.py → benchflow-0.4.0/src/benchflow/_utils/benchmark_repos.py RENAMED Viewed

@@ -138,7 +138,11 @@ def resolve_source(repo: str, path: str | None = None, ref: str | None = None) -
 # Format: (org/repo, ref, subpath)
 TASK_ALIASES: dict[str, tuple[str, str | None, str | None]] = {
     "skillsbench": ("benchflow-ai/skillsbench", "main", "tasks"),
-    "programbench": ("benchflow-ai/benchmarks", "main", "datasets/programbench/tasks"),
+    "programbench": (
+        "facebookresearch/programbench",
+        "main",
+        "src/programbench/data/tasks",
+    ),
     "harvey-lab": ("benchflow-ai/benchmarks", "main", "datasets/harvey-lab/tasks"),
 }

benchflow-0.4.0/src/benchflow/_utils/config.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""Shared configuration normalization helpers."""
+from __future__ import annotations
+from benchflow.agents.registry import parse_agent_spec
+def normalize_agent_name(agent: str) -> str:
+    """Return the canonical registry name for an ACP agent alias."""
+    protocol, canonical = parse_agent_spec(agent)
+    if protocol == "acp":
+        return canonical
+    return agent
+def normalize_sandbox_user(sandbox_user: str | None) -> str | None:
+    """Map text root-user sentinels to ``None``."""
+    if sandbox_user is None:
+        return None
+    if sandbox_user.lower() in {"none", "null"}:
+        return None
+    return sandbox_user

benchflow-0.3.4/src/benchflow/trial_yaml.py → benchflow-0.4.0/src/benchflow/_utils/yaml_loader.py RENAMED Viewed

@@ -1,6 +1,6 @@
-"""YAML trial config loader.
+"""YAML rollout config loader.
-Parses trial YAML files into TrialConfig with Scene support.
+Parses rollout YAML files into RolloutConfig with Scene support.
 Handles both new scene-based format and legacy flat format.
 New format::
@@ -44,13 +44,13 @@ from typing import Any
 import yaml
 from benchflow._types import Role, Scene, Turn
-from benchflow.trial import TrialConfig
+from benchflow.rollout import RolloutConfig
 logger = logging.getLogger(__name__)
-def load_trial_yaml(path: str | Path) -> dict:
-    """Load and normalize a trial YAML file."""
+def load_rollout_yaml(path: str | Path) -> dict:
+    """Load and normalize a rollout YAML file."""
     with open(path) as f:
         raw = yaml.safe_load(f)
     if not isinstance(raw, dict):
@@ -58,23 +58,23 @@ def load_trial_yaml(path: str | Path) -> dict:
     return raw
-def trial_config_from_yaml(
+def rollout_config_from_yaml(
     path: str | Path,
     task_path: Path | None = None,
-) -> TrialConfig:
-    """Parse a YAML file into a TrialConfig.
+) -> RolloutConfig:
+    """Parse a YAML file into a RolloutConfig.
     If task_path is provided, it overrides task_dir from the YAML.
     """
-    raw = load_trial_yaml(path)
-    return trial_config_from_dict(raw, task_path=task_path)
+    raw = load_rollout_yaml(path)
+    return rollout_config_from_dict(raw, task_path=task_path)
-def trial_config_from_dict(
+def rollout_config_from_dict(
     raw: dict[str, Any],
     task_path: Path | None = None,
-) -> TrialConfig:
-    """Convert a raw dict (from YAML or programmatic) into a TrialConfig."""
+) -> RolloutConfig:
+    """Convert a raw dict (from YAML or programmatic) into a RolloutConfig."""
     tp = task_path or Path(raw.get("task_dir", raw.get("task_path", ".")))
     # Scene-based format
@@ -105,7 +105,7 @@ def trial_config_from_dict(
     else:
         raise ValueError("YAML must have either 'scenes' or 'agent' at top level")
-    return TrialConfig(
+    return RolloutConfig(
         task_path=tp,
         scenes=scenes,
         environment=raw.get("environment", "docker"),
@@ -113,7 +113,7 @@ def trial_config_from_dict(
         sandbox_locked_paths=raw.get("sandbox_locked_paths"),
         sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120),
         job_name=raw.get("job_name"),
-        trial_name=raw.get("trial_name"),
+        rollout_name=raw.get("rollout_name"),
         jobs_dir=raw.get("jobs_dir", "jobs"),
         context_root=raw.get("context_root"),
         agent=raw.get("agent", "claude-agent-acp"),
@@ -165,9 +165,9 @@ def job_config_from_yaml(path: str | Path) -> dict:
     """Parse a YAML file and return both job-level and trial-level config.
     Returns a dict with keys: task_dir, concurrency, max_retries,
-    trial_config (TrialConfig), and any other job-level fields.
+    trial_config (RolloutConfig), and any other job-level fields.
     """
-    raw = load_trial_yaml(path)
+    raw = load_rollout_yaml(path)
     task_dir = Path(raw.get("task_dir", raw.get("tasks_dir", ".")))
     concurrency = raw.get("concurrency", 4)
     max_retries = raw.get("max_retries", 2)
@@ -176,6 +176,6 @@ def job_config_from_yaml(path: str | Path) -> dict:
         "task_dir": task_dir,
         "concurrency": concurrency,
         "max_retries": max_retries,
-        "trial_config": trial_config_from_dict(raw, task_path=task_dir),
+        "trial_config": rollout_config_from_dict(raw, task_path=task_dir),
         "raw": raw,
     }

{benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/container_transport.py RENAMED Viewed

@@ -5,7 +5,7 @@ import logging
 from pathlib import Path
 from typing import Any
-from benchflow.process import LiveProcess
+from benchflow.sandbox.process import LiveProcess
 from .transport import Transport, decode_json_rpc_message

benchflow-0.3.4/src/benchflow/_acp_run.py → benchflow-0.4.0/src/benchflow/acp/runtime.py RENAMED Viewed

@@ -7,7 +7,7 @@ Owns the live agent-side of a run:
       ACP-native trajectory, and report tool-call counts
 The one allowed horizontal phase import in this refactor lives here:
-``from benchflow._sandbox import build_priv_drop_cmd``. connect_acp wraps
+``from benchflow.sandbox.lockdown import build_priv_drop_cmd``. connect_acp wraps
 the agent launch command in the sandbox user's privilege-drop prefix
 before handing it to the transport. It is a single pure-function call
 with no shared state — not a coupling of concerns.
@@ -22,13 +22,13 @@ import contextlib
 import logging
 from pathlib import Path
-from benchflow._sandbox import build_priv_drop_cmd
-from benchflow._trajectory import _capture_session_trajectory
 from benchflow.acp.client import ACPClient
 from benchflow.acp.container_transport import ContainerTransport
 from benchflow.agents.providers import find_provider, strip_provider_prefix
 from benchflow.agents.registry import AGENTS
-from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess
+from benchflow.sandbox.lockdown import build_priv_drop_cmd
+from benchflow.sandbox.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess
+from benchflow.trajectories._capture import _capture_session_trajectory
 logger = logging.getLogger(__name__)
@@ -144,7 +144,7 @@ async def connect_acp(
     agent_env: dict,
     sandbox_user: str | None,
     model: str | None,
-    trial_dir: Path,
+    rollout_dir: Path,
     environment: str,
     agent_cwd: str,
 ) -> tuple[ACPClient, object, str]:
@@ -181,18 +181,18 @@ async def connect_acp(
         try:
             if environment == "docker":
-                live_proc = DockerProcess.from_harbor_env(env)
+                live_proc = DockerProcess.from_sandbox_env(env)
             else:
                 is_dind = hasattr(env, "_strategy") and hasattr(
                     env._strategy, "_compose_cmd"
                 )
                 if is_dind:
-                    live_proc = await DaytonaPtyProcess.from_harbor_env(env)
+                    live_proc = await DaytonaPtyProcess.from_sandbox_env(env)
                     logger.info("Using PTY transport for DinD compose task")
                 else:
-                    live_proc = await DaytonaProcess.from_harbor_env(env)
+                    live_proc = await DaytonaProcess.from_sandbox_env(env)
-            agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt"
+            agent_log = rollout_dir / "agent" / f"{agent.replace('-', '_')}.txt"
             transport = ContainerTransport(
                 container_process=live_proc,
                 command=agent_launch,

{benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/transport.py RENAMED Viewed

@@ -6,7 +6,7 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Any
-from benchflow.process import drain_oversized_line
+from benchflow.sandbox.process import drain_oversized_line
 logger = logging.getLogger(__name__)

benchflow 0.3.4__tar.gz → 0.4.0__tar.gz

benchflow 0.3.4__tar.gz → 0.4.0__tar.gz