PyPI - benchflow - Versions diffs - 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz - Mend

benchflow 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (412) hide show

{benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/CHANGELOG.md RENAMED Viewed

@@ -2,6 +2,16 @@
 ## [Unreleased]
+### Added
+- **`benchflow continue <run-folder>`** — resume a previous, unfinished
+  (timed-out) `openhands` run to completion. A standalone tool (it does not
+  touch the normal run path) that reconstructs the run's exact workspace and
+  agent memory from the recorded `llm_trajectory.jsonl` via record-replay,
+  then continues with the live model — no injected prompt — and writes a new
+  HF-compatible folder with `continued_from` provenance. See
+  [`docs/continue-runs.md`](docs/continue-runs.md).
 ### Changed
 - Document the public vs internal preview install/upgrade command matrix,

{benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchflow
-Version: 0.5.3.dev906
+Version: 0.5.3.dev908
 Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
 Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
 Project-URL: Repository, https://github.com/benchflow-ai/benchflow

{benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "benchflow"
-version = "0.5.3.dev906"
+version = "0.5.3.dev908"
 description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
 readme = "README.md"
 requires-python = ">=3.12"

benchflow-0.5.3.dev908/src/benchflow/cli/continue_cmd.py ADDED Viewed

@@ -0,0 +1,236 @@
+"""``benchflow continue`` — resume a timed-out run to completion.
+Standalone command (does not touch the normal eval/run path): reconstruct a
+previous unfinished ``openhands`` run's exact env + memory from its recorded
+trajectory via record-replay, continue it live, and write a new HF-compatible
+folder linked to the parent. See :mod:`benchflow.continue_run`.
+"""
+from __future__ import annotations
+import asyncio
+import logging
+import os
+from pathlib import Path
+from typing import Annotated
+import typer
+logger = logging.getLogger(__name__)
+def _load_env_defaults() -> None:
+    from benchflow._dotenv import load_dotenv_env
+    for key, value in load_dotenv_env().items():
+        os.environ.setdefault(key, value)
+def register_continue(app: typer.Typer) -> None:
+    """Attach the ``continue`` command to the top-level benchflow app."""
+    @app.command("continue")
+    def continue_cmd(
+        folder: Annotated[
+            Path,
+            typer.Argument(
+                help="Original run output folder (contains config.json + "
+                "trajectory/llm_trajectory.jsonl)."
+            ),
+        ],
+        tasks_dir: Annotated[
+            Path | None,
+            typer.Option(
+                "--tasks-dir",
+                help="Directory holding the task source (instruction + verifier). "
+                "Required unless the recorded task_path still exists on disk.",
+            ),
+        ] = None,
+        model: Annotated[
+            str | None,
+            typer.Option(
+                "--model",
+                help="Override the live-continuation model (default: the "
+                "original run's model). Tests use gemini-3.1-flash-lite-preview.",
+            ),
+        ] = None,
+        timeout: Annotated[
+            int | None,
+            typer.Option(
+                "--timeout",
+                help="Wall-clock budget for the continuation, in seconds "
+                "(default: the original run's timeout).",
+            ),
+        ] = None,
+        output: Annotated[
+            Path | None,
+            typer.Option(
+                "--output",
+                help="Output jobs dir for the new run (default: "
+                "<orig-parent>/continued).",
+            ),
+        ] = None,
+        require_timeout: Annotated[
+            bool,
+            typer.Option(
+                "--require-timeout/--no-require-timeout",
+                help="Refuse runs whose recorded status is not a timeout.",
+            ),
+        ] = False,
+        strict_divergence: Annotated[
+            bool,
+            typer.Option(
+                "--strict-divergence/--no-strict-divergence",
+                help="Abort if replay leaves the original rails (message-count "
+                "mismatch) instead of warning.",
+            ),
+        ] = False,
+        replay_only: Annotated[
+            bool,
+            typer.Option(
+                "--replay-only/--no-replay-only",
+                help="Rebuild the env via replay and stop at the cut-point "
+                "(no live model needed) — useful for testing.",
+            ),
+        ] = False,
+        proxy_mode: Annotated[
+            str,
+            typer.Option(
+                "--proxy-mode",
+                help=(
+                    "Replay proxy placement: auto, host, or sandbox. Auto uses "
+                    "sandbox-local replay for Daytona/Modal and host replay for Docker."
+                ),
+            ),
+        ] = "auto",
+    ) -> None:
+        """Resume a previous unfinished (timed-out) openhands run to completion."""
+        # Imported inside the command body, so these modules are loaded only
+        # when the command actually runs.
+        from benchflow.continue_run.orchestrator import continue_run
+        from benchflow.continue_run.run_folder import RunFolderError
+        # Seed os.environ from dotenv values; variables that are already set win.
+        _load_env_defaults()
+        try:
+            # The CLI flags are forwarded one-to-one; only ``--output`` is
+            # renamed, to the ``output_dir`` keyword.
+            result = asyncio.run(
+                continue_run(
+                    folder,
+                    tasks_dir=tasks_dir,
+                    model=model,
+                    timeout=timeout,
+                    output_dir=output,
+                    require_timeout=require_timeout,
+                    strict_divergence=strict_divergence,
+                    replay_only=replay_only,
+                    proxy_mode=proxy_mode,
+                )
+            )
+        except RunFolderError as exc:
+            # Only RunFolderError is turned into a red stderr message plus exit
+            # code 1; any other exception propagates unchanged.
+            typer.secho(f"benchflow continue: {exc}", fg=typer.colors.RED, err=True)
+            raise typer.Exit(1) from exc
+        # Success summary: output folder, then replay/live turn counts.
+        typer.secho(
+            f"\n✓ continued run written to {result.rollout_dir}", fg=typer.colors.GREEN
+        )
+        typer.echo(
+            f"  replayed {result.n_recorded} recorded turn(s); "
+            f"{result.n_live} live turn(s); {result.divergences} divergence(s)"
+        )
+        # Rewards are printed whenever they are not None (falsy values included).
+        if result.rewards is not None:
+            typer.echo(f"  rewards: {result.rewards}")
+        # An agent error is shown as a yellow warning; no non-zero exit is
+        # raised for it here.
+        if result.error:
+            typer.secho(f"  agent error: {result.error}", fg=typer.colors.YELLOW)
+    @app.command("continue-batch")
+    def continue_batch_cmd(
+        root: Annotated[
+            Path,
+            typer.Argument(
+                help=(
+                    "Run folder or directory tree containing timeout run folders "
+                    "(config.json + trajectory/llm_trajectory.jsonl)."
+                )
+            ),
+        ],
+        tasks_dir: Annotated[
+            Path | None,
+            typer.Option(
+                "--tasks-dir",
+                help="Directory holding task sources; required unless recorded task_path exists.",
+            ),
+        ] = None,
+        model: Annotated[
+            str | None,
+            typer.Option("--model", help="Override live-continuation model."),
+        ] = None,
+        timeout: Annotated[
+            int | None,
+            typer.Option("--timeout", help="Wall-clock budget per continuation."),
+        ] = None,
+        output: Annotated[
+            Path | None,
+            typer.Option("--output", help="Output jobs dir for continued runs."),
+        ] = None,
+        concurrency: Annotated[
+            int,
+            typer.Option(
+                "--concurrency",
+                help="Maximum number of continuation runs in flight.",
+            ),
+        ] = 100,
+        limit: Annotated[
+            int | None,
+            typer.Option("--limit", help="Limit discovered timeout folders."),
+        ] = None,
+        strict_divergence: Annotated[
+            bool,
+            typer.Option(
+                "--strict-divergence/--no-strict-divergence",
+                help="Abort a run if replay leaves the original rails.",
+            ),
+        ] = False,
+        proxy_mode: Annotated[
+            str,
+            typer.Option(
+                "--proxy-mode",
+                help=(
+                    "Replay proxy placement: auto, host, or sandbox. For PR5 "
+                    "Daytona runs, use the default auto or sandbox."
+                ),
+            ),
+        ] = "auto",
+    ) -> None:
+        """Continue all timed-out OpenHands runs under a directory tree."""
+        import json
+        from benchflow.continue_run.batch import (
+            continue_batch,
+            discover_timeout_run_folders,
+            summarize_batch,
+        )
+        _load_env_defaults()
+        folders = discover_timeout_run_folders(root, limit=limit)
+        if not folders:
+            typer.secho("No timeout run folders found.", fg=typer.colors.YELLOW)
+            return
+        typer.echo(
+            f"Continuing {len(folders)} timeout run(s) with concurrency={concurrency}"
+        )
+        results = asyncio.run(
+            continue_batch(
+                folders,
+                concurrency=concurrency,
+                tasks_dir=tasks_dir,
+                model=model,
+                timeout=timeout,
+                output_dir=output,
+                require_timeout=True,
+                strict_divergence=strict_divergence,
+                proxy_mode=proxy_mode,
+            )
+        )
+        summary = summarize_batch(results)
+        typer.echo(json.dumps(summary, indent=2))
+        if summary["failed"]:
+            raise typer.Exit(1)

{benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/main.py RENAMED Viewed

@@ -21,6 +21,7 @@ from benchflow._utils.config import (
     normalize_sandbox_user,
 )
 from benchflow.agents.registry import parse_agent_spec
+from benchflow.cli.continue_cmd import register_continue
 from benchflow.cli.trace_import import register_tasks_generate
 from benchflow.evaluation import DEFAULT_AGENT, effective_model
 from benchflow.skill_policy import SKILL_MODE_NO_SKILL
@@ -40,6 +41,9 @@ app = typer.Typer(
     no_args_is_help=True,
 )
+# Standalone `benchflow continue <orig-run-folder>` — resume a timed-out run.
+register_continue(app)
 def _version_callback(value: bool) -> None:
     if value:

benchflow-0.5.3.dev908/src/benchflow/continue_run/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Resume a previous, unfinished (timed-out) agent run to completion.
+``benchflow continue <orig-run-output-folder>`` is a *standalone* tool that does
+**not** touch benchflow's normal run path. It reconstructs a timed-out run's
+exact workspace and agent memory from the recorded trajectory (record-replay),
+then lets the agent continue as if the timeout had simply been larger, and
+writes a new HF-compatible result folder linked to the parent.
+The mechanism (agreed design):
+1. Load the original run folder (``config.json`` + ``trajectory/llm_trajectory.jsonl``).
+2. Boot a fresh *pristine* sandbox from the same base image.
+3. Stand up a :class:`~benchflow.continue_run.replay_proxy.ReplayProxy` that
+   OpenHands talks to via ``LLM_BASE_URL``. It serves the recorded LLM
+   responses **in order**, so the agent re-executes its own past decisions for
+   real — rebuilding the byte-exact workspace and its exact internal state.
+4. When the recorded responses run out (the timeout cut-point), the proxy flips
+   to the **live** model and the agent continues — no injected prompt.
+5. Re-verify and write a new folder with ``continued_from`` provenance.
+Only the ``openhands`` agent is supported for now (the LLM-proxy seam this relies
+on is wired for openhands via ``LLM_BASE_URL``).
+"""
+from benchflow.continue_run.run_folder import (
+    RunFolder,
+    RunFolderError,
+    load_run_folder,
+)
+__all__ = [
+    "RunFolder",
+    "RunFolderError",
+    "load_run_folder",
+]

benchflow-0.5.3.dev908/src/benchflow/continue_run/batch.py ADDED Viewed

@@ -0,0 +1,125 @@
+"""Batch orchestration for continuing many timed-out runs."""
+from __future__ import annotations
+import asyncio
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from benchflow.continue_run.orchestrator import ContinueResult, continue_run
+from benchflow.continue_run.run_folder import RunFolderError, load_run_folder
+ContinueRunner = Callable[..., Awaitable[ContinueResult]]
+@dataclass(frozen=True)
+class BatchContinueResult:
+    """Result for one source folder in a batch continuation."""
+    # Field notes below describe how ``continue_batch`` in this module fills them.
+    # Source run folder that was handed to the runner.
+    folder: Path
+    # False when the runner raised or returned a result with a truthy ``error``.
+    ok: bool
+    # Result returned by the runner; left as None when the runner raised.
+    continued: ContinueResult | None = None
+    # Failure text (from the exception or the result's ``error``); None on success.
+    error: str | None = None
+def discover_timeout_run_folders(
+    root: str | Path, *, limit: int | None = None
+) -> list[Path]:
+    """Find OpenHands timeout run folders below ``root``.
+    Discovery is intentionally artifact-based: a candidate must have a
+    ``config.json`` and a usable ``trajectory/llm_trajectory.jsonl``. Non-timeout
+    runs are skipped by ``load_run_folder(require_timeout=True)``.
+    """
+    root_path = Path(root).expanduser()
+    candidates = [root_path] if (root_path / "config.json").is_file() else []
+    candidates.extend(path.parent for path in root_path.rglob("config.json"))
+    folders: list[Path] = []
+    seen: set[Path] = set()
+    for folder in sorted(candidates):
+        resolved = folder.resolve()
+        if resolved in seen:
+            continue
+        seen.add(resolved)
+        try:
+            load_run_folder(folder, require_timeout=True)
+        except RunFolderError:
+            continue
+        folders.append(folder)
+        if limit is not None and len(folders) >= limit:
+            break
+    return folders
+async def continue_batch(
+    folders: list[Path],
+    *,
+    concurrency: int,
+    tasks_dir: str | Path | None,
+    model: str | None,
+    timeout: int | None,
+    output_dir: str | Path | None,
+    require_timeout: bool = True,
+    strict_divergence: bool = False,
+    proxy_mode: str = "auto",
+    runner: ContinueRunner = continue_run,
+) -> list[BatchContinueResult]:
+    """Run ``benchflow continue`` over folders with rolling concurrency."""
+    if concurrency < 1:
+        raise ValueError("concurrency must be >= 1")
+    semaphore = asyncio.Semaphore(concurrency)
+    async def _one(folder: Path) -> BatchContinueResult:
+        async with semaphore:
+            try:
+                result = await runner(
+                    folder,
+                    tasks_dir=tasks_dir,
+                    model=model,
+                    timeout=timeout,
+                    output_dir=output_dir,
+                    require_timeout=require_timeout,
+                    strict_divergence=strict_divergence,
+                    proxy_mode=proxy_mode,
+                )
+            except Exception as exc:
+                return BatchContinueResult(folder=folder, ok=False, error=str(exc))
+            if result.error:
+                return BatchContinueResult(
+                    folder=folder,
+                    ok=False,
+                    continued=result,
+                    error=result.error,
+                )
+            return BatchContinueResult(folder=folder, ok=True, continued=result)
+    return list(await asyncio.gather(*(_one(folder) for folder in folders)))
+def summarize_batch(results: list[BatchContinueResult]) -> dict[str, Any]:
+    """Small JSON-serializable summary for CLI output and dashboards."""
+    ok = [result for result in results if result.ok]
+    failed = [result for result in results if not result.ok]
+    return {
+        "total": len(results),
+        "succeeded": len(ok),
+        "failed": len(failed),
+        "outputs": [
+            str(result.continued.rollout_dir)
+            for result in ok
+            if result.continued is not None
+        ],
+        "errors": [
+            {
+                "folder": str(result.folder),
+                "output": str(result.continued.rollout_dir)
+                if result.continued is not None
+                else None,
+                "error": result.error,
+            }
+            for result in failed
+        ],
+    }

benchflow 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz

benchflow 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz