PyPI - inspect-swe - Versions diffs - 0.2.10__tar.gz → 0.2.11__tar.gz - Mend

inspect-swe 0.2.10.tar.gz → 0.2.11.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_swe
-Version: 0.2.10
+Version: 0.2.11
 Summary: Software engineering agents for Inspect AI.
 Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
 Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
@@ -10,7 +10,7 @@ License: MIT License
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: httpx
-Requires-Dist: inspect-ai>=0.3.129
+Requires-Dist: inspect-ai>=0.3.130
 Requires-Dist: nest-asyncio
 Requires-Dist: platformdirs
 Requires-Dist: pydantic>=2.11.4
@@ -19,6 +19,7 @@ Requires-Dist: sniffio
 Requires-Dist: typing-extensions>=4.9.0
 Provides-Extra: dev
 Requires-Dist: anthropic; extra == 'dev'
+Requires-Dist: ipython; extra == 'dev'
 Requires-Dist: mypy; extra == 'dev'
 Requires-Dist: openai; extra == 'dev'
 Requires-Dist: pytest; extra == 'dev'

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/pyproject.toml RENAMED Viewed

@@ -12,7 +12,7 @@ requires-python = ">=3.10"
 license = { text = "MIT License" }
 dependencies = [
     "httpx",
-    "inspect_ai>=0.3.129",
+    "inspect_ai>=0.3.130",
     "nest_asyncio",
     "platformdirs",
     "pydantic>=2.11.4",
@@ -39,6 +39,7 @@ dev = [
     "openai",
     "pytest-dotenv",
     "types-PyYAML",
+    "IPython",
 ]
 doc = ["quarto-cli==1.7.31"]
@@ -85,3 +86,4 @@ strict = true
 mypy_path = "src"
 namespace_packages = true
 explicit_package_bases = true
+python_executable = ".venv/bin/python"

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from ._claude_code.claude_code import claude_code
+from ._codex_cli.codex_cli import codex_cli
 from ._tools.download import download_agent_binary
 from ._util.sandbox import SandboxPlatform
@@ -10,6 +11,7 @@ except ImportError:
 __all__ = [
     "claude_code",
+    "codex_cli",
     "download_agent_binary",
     "SandboxPlatform",
     "__version__",

inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/download.py → inspect_swe-0.2.11/src/inspect_swe/_claude_code/agentbinary.py RENAMED Viewed

@@ -1,50 +1,43 @@
 import re
-from typing import Callable, Literal
+from pathlib import Path
 from pydantic import BaseModel
-from ..._util.checksum import verify_checksum
-from ..._util.download import download_file, download_text_file
-from ..._util.sandbox import SandboxPlatform
-from .cache import (
-    read_cached_claude_code_binary,
-    write_cached_claude_code_binary,
-)
-async def download_claude_code_async(
-    version: Literal["stable", "latest"] | str,
-    platform: SandboxPlatform,
-    logger: Callable[[str], None] | None = None,
-) -> bytes:
-    # resovle logger
-    logger = logger or print
-    # determine version and checksum
-    gcs_bucket = await _claude_code_gcs_bucket()
-    version = await _claude_code_version(gcs_bucket, version)
-    manifest = await _claude_code_manifest(gcs_bucket, version)
-    expected_checksum = _checksum_for_platform(manifest, platform)
-    # check the cache
-    binary_data = read_cached_claude_code_binary(version, platform, expected_checksum)
-    if binary_data is None:
-        # not in cache, download and verify checksum
-        binary_url = f"{gcs_bucket}/{version}/{platform}/claude"
-        binary_data = await download_file(binary_url)
-        if not verify_checksum(binary_data, expected_checksum):
-            raise ValueError("Checksum verification failed")
-        # save to cache
-        write_cached_claude_code_binary(binary_data, version, platform)
-        # trace
-        logger(f"Downloaded claude code binary: {version} ({platform})")
-    else:
-        logger(f"Used claude code binary from cache: {version} ({platform})")
-    # return data
-    return binary_data
+from typing_extensions import Literal
+from .._util.agentbinary import AgentBinarySource, AgentBinaryVersion
+from .._util.appdirs import package_cache_dir
+from .._util.download import download_text_file
+from .._util.sandbox import SandboxPlatform
+def claude_code_binary_source() -> AgentBinarySource:
+    """Build the `AgentBinarySource` describing how to obtain the Claude Code binary.
+    Cached downloads live in the "claude-code-downloads" package cache directory.
+    No post-download processing is applied; after installation the binary is
+    run once with the arguments "config list".
+    """
+    downloads_dir = package_cache_dir("claude-code-downloads")
+    async def resolve_version(
+        version: Literal["stable", "latest"] | str, platform: SandboxPlatform
+    ) -> AgentBinaryVersion:
+        # bucket, version, and manifest all come from module-level helpers
+        bucket = await _claude_code_gcs_bucket()
+        resolved = await _claude_code_version(bucket, version)
+        release_manifest = await _claude_code_manifest(bucket, resolved)
+        checksum = _checksum_for_platform(release_manifest, platform)
+        return AgentBinaryVersion(
+            resolved, checksum, f"{bucket}/{resolved}/{platform}/claude"
+        )
+    def cached_binary_path(version: str, platform: SandboxPlatform) -> Path:
+        file_name = f"claude-{version}-{platform}"
+        return downloads_dir / file_name
+    def list_cached_binaries() -> list[Path]:
+        return [*downloads_dir.glob("claude-*")]
+    return AgentBinarySource(
+        agent="claude code",
+        binary="claude",
+        resolve_version=resolve_version,
+        cached_binary_path=cached_binary_path,
+        list_cached_binaries=list_cached_binaries,
+        post_download=None,
+        post_install="config list",
+    )
 async def _claude_code_gcs_bucket() -> str:

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_claude_code/claude_code.py RENAMED Viewed

@@ -10,16 +10,16 @@ from inspect_ai.agent import (
     agent_with,
     sandbox_agent_bridge,
 )
-from inspect_ai.model import ChatMessageSystem, ChatMessageUser
+from inspect_ai.model import ChatMessageSystem, ChatMessageUser, GenerateFilter
 from inspect_ai.scorer import score
 from inspect_ai.tool import MCPServerConfig
 from inspect_ai.util import sandbox as sandbox_env
 from pydantic_core import to_json
-from inspect_swe._util.trace import trace
 from .._util._async import is_callable_coroutine
-from .install.install import ensure_claude_code_installed
+from .._util.agentbinary import ensure_agent_binary_installed
+from .._util.trace import trace
+from .agentbinary import claude_code_binary_source
 @agent
@@ -31,11 +31,11 @@ def claude_code(
     """),
     system_prompt: str | None = None,
     mcp_servers: Sequence[MCPServerConfig] | None = None,
-    allowed_tools: list[str] | None = None,
     disallowed_tools: list[str] | None = None,
     attempts: int | AgentAttempts = 1,
     model: str | None = None,
     small_model: str | None = None,
+    filter: GenerateFilter | None = None,
     cwd: str | None = None,
     env: dict[str, str] | None = None,
     user: str | None = None,
@@ -48,7 +48,7 @@ def claude_code(
     The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
-    Use `allowed_tools` and `disallowed_tools` to control access to tools. See [Tools available to Claude](https://docs.anthropic.com/en/docs/claude-code/settings#tools-available-to-claude) for the list of built-in tools and [How to use Allowed Tools in Claude Code](https://www.instructa.ai/blog/claude-code/how-to-use-allowed-tools-in-claude-code) for details on the supported syntax. Note that `allowed_tools` enables you to filter allowed parameter values and `disallowed_tools` enables you to remove tools entirely. In other words, `allowed_tools` is not a complete list of what tools are available but rather just filters on tool parameters---to remove tools you need to explicitly set `disallowed_tools`.
+    Use `disallowed_tools` to control access to tools. See [Tools available to Claude](https://docs.anthropic.com/en/docs/claude-code/settings#tools-available-to-claude) for the list of built-in tools which can be disallowed.
     Use the `attempts` option to enable additional submissions if the initial
     submission(s) are incorrect (by default, no additional attempts are permitted).
@@ -58,11 +58,11 @@ def claude_code(
         description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
         system_prompt: Additional system prompt to append to default system prompt.
         mcp_servers: MCP servers to make available to the agent.
-        allowed_tools: Parameter filters for built-in tools.
         disallowed_tools: List of tool names to disallow entirely.
         attempts: Configure agent to make multiple attempts.
         model: Model name to use for Opus and Sonnet calls (defaults to main model for task).
         small_model: Model to use for Haiku calls (defaults to main model for task).
+        filter: Filter for intercepting bridged model requests.
         cwd: Working directory to run claude code within.
         env: Environment variables to set for claude code.
         user: User to execute claude code with.
@@ -82,10 +82,10 @@ def claude_code(
     attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
     async def execute(state: AgentState) -> AgentState:
-        async with sandbox_agent_bridge(state) as bridge:
+        async with sandbox_agent_bridge(state, filter=filter) as bridge:
             # ensure claude is installed and get binary location
-            claude_binary = await ensure_claude_code_installed(
-                version, user, sandbox_env(sandbox)
+            claude_binary = await ensure_agent_binary_installed(
+                claude_code_binary_source(), version, user, sandbox_env(sandbox)
             )
             # allocate session_id
@@ -111,7 +111,7 @@ def claude_code(
                 cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])
             # mcp servers
-            cmd_allowed_tools = allowed_tools or []
+            cmd_allowed_tools: list[str] = []
             if mcp_servers:
                 mcp_server_args, mcp_allowed_tools = resolve_mcp_servers(mcp_servers)
                 cmd.extend(mcp_server_args)
@@ -146,7 +146,7 @@ def claude_code(
                 # run agent
                 result = await sbox.exec(
-                    cmd=agent_cmd,
+                    cmd=["bash", "-c", 'exec "$@"', "bash"] + agent_cmd,
                     cwd=cwd,
                     env={
                         "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
@@ -171,7 +171,9 @@ def claude_code(
                 # raise for error
                 if not result.success:
-                    f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
+                    raise RuntimeError(
+                        f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
+                    )
                 # exit if we are at max_attempts
                 attempt_count += 1

inspect_swe-0.2.11/src/inspect_swe/_codex_cli/agentbinary.py ADDED Viewed

@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+from typing import Any
+from typing_extensions import Literal
+from .._util.agentbinary import AgentBinarySource, AgentBinaryVersion
+from .._util.appdirs import package_cache_dir
+from .._util.download import download_text_file
+from .._util.sandbox import SandboxPlatform
+from .._util.tarball import extract_tarball
+def codex_cli_binary_source() -> AgentBinarySource:
+    """Build the `AgentBinarySource` describing how to obtain the Codex CLI binary.
+    Versions are resolved against the GitHub releases of `openai/codex`.
+    Downloaded archives are post-processed with `extract_tarball`, and cached
+    binaries live in the "codex-cli-downloads" package cache directory.
+    """
+    cached_binary_dir = package_cache_dir("codex-cli-downloads")
+    async def resolve_version(
+        version: Literal["stable", "latest"] | str, platform: SandboxPlatform
+    ) -> AgentBinaryVersion:
+        """Resolve `version` for `platform` to a concrete version, checksum, and URL.
+        Raises:
+            RuntimeError: If the release has no asset for the platform, or the
+                asset's digest is missing or not in "sha256:<hex>" form.
+            ValueError: If `platform` has no Codex architecture mapping.
+        """
+        # both aliases resolve to the newest stable release
+        if version in ("stable", "latest"):
+            version = await _fetch_latest_stable_version()
+        # Get release information
+        release = await _fetch_release_assets(version)
+        # Find the platform-specific archive among the release assets
+        asset_name = f"codex-{_platform_to_codex_arch(platform)}.tar.gz"
+        asset = next(
+            (a for a in release.get("assets") or [] if a.get("name") == asset_name),
+            None,
+        )
+        if asset is None:
+            raise RuntimeError(
+                f"No asset found for platform {platform} in version {version}"
+            )
+        # Extract checksum (format: "sha256:xxx"). A JSON null digest arrives
+        # as None, so normalize it to "" and report it as an invalid digest
+        # instead of failing on None.startswith.
+        digest = asset.get("digest") or ""
+        if not digest.startswith("sha256:"):
+            raise RuntimeError(f"Invalid digest format: {digest}")
+        expected_checksum = digest.removeprefix("sha256:")
+        # Get download URL
+        download_url = asset["browser_download_url"]
+        return AgentBinaryVersion(version, expected_checksum, download_url)
+    def cached_binary_path(version: str, platform: SandboxPlatform) -> Path:
+        return cached_binary_dir / f"codex-{version}-{platform}"
+    def list_cached_binaries() -> list[Path]:
+        return list(cached_binary_dir.glob("codex-*"))
+    return AgentBinarySource(
+        agent="codex cli",
+        binary="codex",
+        resolve_version=resolve_version,
+        cached_binary_path=cached_binary_path,
+        list_cached_binaries=list_cached_binaries,
+        post_download=extract_tarball,
+        post_install=None,
+    )
+def _platform_to_codex_arch(platform: SandboxPlatform) -> str:
+    """Translate a sandbox platform name into the Codex release target triple.
+    The glibc and musl platform names both map to the musl build, which is
+    statically linked and so does not depend on the system GLIBC version.
+    Raises:
+        ValueError: If `platform` is not one of the four known Linux platforms.
+    """
+    x64_musl = "x86_64-unknown-linux-musl"
+    arm64_musl = "aarch64-unknown-linux-musl"
+    targets = {
+        "linux-x64": x64_musl,
+        "linux-x64-musl": x64_musl,
+        "linux-arm64": arm64_musl,
+        "linux-arm64-musl": arm64_musl,
+    }
+    try:
+        return targets[platform]
+    except KeyError:
+        raise ValueError(f"Unsupported platform: {platform}") from None
+async def _fetch_latest_stable_version() -> str:
+    """Return the version number of the first stable Codex release listed on GitHub.
+    A release counts as stable when it is not flagged `prerelease` and its
+    tag name does not contain "-alpha". The version is the tag with its
+    "rust-v" prefix removed (e.g. "rust-v0.29.0" -> "0.29.0").
+    Raises:
+        RuntimeError: If no stable release is listed, or the chosen tag does
+            not start with "rust-v".
+    """
+    listing = await download_text_file(
+        "https://api.github.com/repos/openai/codex/releases"
+    )
+    def is_stable(release: Any) -> bool:
+        if release.get("prerelease", False):
+            return False
+        return "-alpha" not in release.get("tag_name", "")
+    candidates = [release for release in json.loads(listing) if is_stable(release)]
+    if not candidates:
+        raise RuntimeError("No stable releases found for codex")
+    # the first stable entry in the listing is taken as the most recent
+    tag_name = candidates[0]["tag_name"]
+    if not tag_name.startswith("rust-v"):
+        raise RuntimeError(f"Unexpected tag format: {tag_name}")
+    version: str = tag_name[len("rust-v") :]
+    return version
+async def _fetch_release_assets(version: str) -> dict[str, Any]:
+    """Download and decode the GitHub release record for tag `rust-v{version}`.
+    Returns:
+        The parsed JSON object for that release of `openai/codex`.
+    """
+    url = f"https://api.github.com/repos/openai/codex/releases/tags/rust-v{version}"
+    payload: dict[str, Any] = json.loads(await download_text_file(url))
+    return payload

inspect_swe-0.2.11/src/inspect_swe/_codex_cli/codex_cli.py ADDED Viewed

@@ -0,0 +1,252 @@
+import os
+from logging import getLogger
+from textwrap import dedent
+from typing import Any, Literal, Sequence
+from inspect_ai.agent import (
+    Agent,
+    AgentAttempts,
+    AgentState,
+    agent,
+    agent_with,
+    sandbox_agent_bridge,
+)
+from inspect_ai.model import ChatMessageSystem, ChatMessageUser, GenerateFilter
+from inspect_ai.scorer import score
+from inspect_ai.tool import MCPServerConfig
+from inspect_ai.util import SandboxEnvironment
+from inspect_ai.util import sandbox as sandbox_env
+from inspect_swe._util._async import is_callable_coroutine
+from inspect_swe._util.sandbox import sandbox_exec
+from inspect_swe._util.toml import to_toml
+from inspect_swe._util.trace import trace
+from .._util.agentbinary import ensure_agent_binary_installed
+from .agentbinary import codex_cli_binary_source
+# name the logger after the module (not its file path) so it participates in
+# the package's logger hierarchy
+logger = getLogger(__name__)
+@agent
+def codex_cli(
+    name: str = "Codex CLI",
+    description: str = dedent("""
+       Autonomous coding agent capable of writing, testing, debugging,
+       and iterating on code across multiple languages.
+    """),
+    system_prompt: str | None = None,
+    mcp_servers: Sequence[MCPServerConfig] | None = None,
+    disallowed_tools: list[Literal["web_search"]] | None = None,
+    attempts: int | AgentAttempts = 1,
+    model: str | None = None,
+    filter: GenerateFilter | None = None,
+    cwd: str | None = None,
+    env: dict[str, str] | None = None,
+    user: str | None = None,
+    sandbox: str | None = None,
+    version: Literal["auto", "sandbox", "latest"] | str = "auto",
+) -> Agent:
+    """Codex CLI.
+    Agent that uses OpenAI [Codex CLI](https://github.com/openai/codex) running in a sandbox.
+    Use the `attempts` option to enable additional submissions if the initial
+    submission(s) are incorrect (by default, no additional attempts are permitted).
+    Args:
+        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
+        description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
+        system_prompt: Additional system prompt to append to default system prompt.
+        mcp_servers: MCP servers to make available to the agent.
+        disallowed_tools: Optionally disallow tools (currently only web_search).
+        attempts: Configure agent to make multiple attempts.
+        model: Model name to use (defaults to main model for task).
+        filter: Filter for intercepting bridged model requests.
+        cwd: Working directory to run codex cli within.
+        env: Environment variables to set for codex cli
+        user: User to execute codex cli with.
+        sandbox: Optional sandbox environment name.
+        version: Version of codex cli to use. One of:
+            - "auto": Use any available version of codex cli in the sandbox, otherwise download the latest version.
+            - "sandbox": Use the version of codex cli in the sandbox (raises `RuntimeError` if codex is not available in the sandbox)
+            - "latest": Download and use the very latest version of codex cli.
+            - "x.x.x": Download and use a specific version of codex cli.
+    """
+    # local import: only used to quote paths interpolated into shell commands
+    import shlex
+    # resolve model
+    model = f"inspect/{model}" if model is not None else "inspect"
+    # resolve attempts
+    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
+    # ensure disallowed_tools list
+    disallowed_tools = disallowed_tools or []
+    async def execute(state: AgentState) -> AgentState:
+        async with sandbox_agent_bridge(state, model=model, filter=filter) as bridge:
+            # ensure codex is installed and get binary location
+            codex_binary = await ensure_agent_binary_installed(
+                codex_cli_binary_source(), version, user, sandbox_env(sandbox)
+            )
+            # helper to create codex cwd relative paths
+            def codex_path(file: str) -> str:
+                return (
+                    file if cwd is None else os.path.join(cwd, file).replace("\\", "/")
+                )
+            # build system prompt
+            system_messages = [
+                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
+            ]
+            if system_prompt is not None:
+                system_messages.append(system_prompt)
+            # resolve sandbox
+            sbox = sandbox_env(sandbox)
+            # determine CODEX_HOME (we want this to be whatever sandbox working dir is)
+            working_dir = (await sandbox_exec(sbox, "pwd", user=user, cwd=cwd)).strip()
+            if not working_dir.endswith("/"):
+                working_dir = f"{working_dir}/"
+            codex_home = f"{working_dir}.codex"
+            # quote the path: the working directory may contain spaces or
+            # other shell metacharacters
+            await sandbox_exec(
+                sbox, cmd=f"mkdir -p {shlex.quote(codex_home)}", user=user
+            )
+            # write system messages to AGENTS.md
+            if system_messages:
+                await sbox.write_file(
+                    codex_path("AGENTS.md"), "\n\n".join(system_messages)
+                )
+            # build full prompt
+            prompt = "\n\n".join(
+                [
+                    message.text
+                    for message in state.messages
+                    if isinstance(message, ChatMessageUser)
+                ]
+            )
+            # build agent cmd
+            cmd = [
+                codex_binary,
+                "exec",
+                "--model",
+                "gpt-5",  # real model is passed to the bridge above
+                "--skip-git-repo-check",
+                "--dangerously-bypass-approvals-and-sandbox",
+                "--color",
+                "never",
+            ]
+            # include the plan and apply patch tools.
+            # NOTE: update_plan not currently working in 'exec' mode:
+            # https://github.com/openai/codex/issues/1952
+            cmd.extend(["-c", "include_plan_tool=true"])
+            cmd.extend(["-c", "include_apply_patch_tool=true"])
+            # include web search if appropriate
+            if "web_search" not in disallowed_tools:
+                cmd.extend(["-c", "tools.web_search=true"])
+            # register mcp servers
+            if mcp_servers:
+                mcp_config: dict[str, Any] = {}
+                for mcp_server in mcp_servers:
+                    mcp_config[f"mcp_servers.{mcp_server.name}"] = (
+                        mcp_server.model_dump(
+                            exclude={"name", "tools"}, exclude_none=True
+                        )
+                    )
+                await sandbox_exec(
+                    sbox,
+                    cmd=f"mkdir -p {shlex.quote(codex_path('.codex'))}",
+                    user=user,
+                )
+                await sbox.write_file(
+                    codex_path(".codex/config.toml"), to_toml(mcp_config)
+                )
+            # execute the agent (track debug output)
+            debug_output: list[str] = []
+            agent_prompt = prompt
+            attempt_count = 0
+            resume_rollout: str | None = None
+            while True:
+                # resume if requested
+                agent_cmd = cmd.copy()
+                if resume_rollout is not None:
+                    agent_cmd.extend(["-c", f'experimental_resume="{resume_rollout}"'])
+                # append prompt
+                agent_cmd.append(agent_prompt)
+                # run agent
+                result = await sbox.exec(
+                    cmd=["bash", "-c", 'exec "$@"', "bash"] + agent_cmd,
+                    cwd=cwd,
+                    env={
+                        "CODEX_HOME": codex_home,
+                        "OPENAI_BASE_URL": f"http://localhost:{bridge.port}/v1",
+                        "RUST_LOG": "debug",
+                    }
+                    | (env or {}),
+                )
+                # record output for debug
+                debug_output.append(result.stdout)
+                debug_output.append(result.stderr)
+                # raise for error
+                if not result.success:
+                    raise RuntimeError(
+                        f"Error executing codex cli agent: {result.stdout}\n{result.stderr}"
+                    )
+                # exit if we are at max_attempts
+                attempt_count += 1
+                if attempt_count >= attempts.attempts:
+                    break
+                # score this attempt
+                answer_scores = await score(state)
+                # break if we score 'correct'
+                if attempts.score_value(answer_scores[0].value) == 1.0:
+                    break
+                # otherwise update prompt with incorrect message and continue
+                else:
+                    resume_rollout = await _last_rollout(sbox, codex_home, user)
+                    if callable(attempts.incorrect_message):
+                        if not is_callable_coroutine(attempts.incorrect_message):
+                            raise ValueError(
+                                "The incorrect_message function must be async."
+                            )
+                        agent_prompt = await attempts.incorrect_message(
+                            state, answer_scores
+                        )
+                    else:
+                        agent_prompt = attempts.incorrect_message
+            # trace debug info
+            debug_output.insert(0, "Codex CLI Debug Output:")
+            trace("\n".join(debug_output))
+        # return success
+        return bridge.state
+    return agent_with(execute, name=name, description=description)
+async def _last_rollout(
+    sandbox: SandboxEnvironment, codex_home: str, user: str | None
+) -> str | None:
+    """Return the first `rollout-*.jsonl` path listed under `{codex_home}/sessions`.
+    The files are listed with `ls -t` and the first line of output is taken.
+    Returns:
+        The path reported by the sandbox command, or None when the command
+        fails or prints nothing.
+    """
+    try:
+        rollout = await sandbox_exec(
+            sandbox,
+            f"find '{codex_home}/sessions' -type f -name 'rollout-*.jsonl' -exec ls -t -- {{}} + | head -n 1",
+            user=user,
+        )
+    except RuntimeError as ex:
+        logger.warning(f"Error attempting to read rollout file: {ex}")
+        return None
+    # empty output means no rollout file was listed; return None so the caller
+    # does not try to resume from an empty path
+    return rollout.strip() or None

inspect_swe-0.2.11/src/inspect_swe/_registry.py ADDED Viewed

@@ -0,0 +1,6 @@
+# ruff: noqa: F401
+from ._claude_code.claude_code import claude_code
+from ._codex_cli.codex_cli import codex_cli
+__all__ = ["codex_cli", "claude_code"]

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_tools/download.py RENAMED Viewed

@@ -1,12 +1,14 @@
 from typing import Literal
-from .._claude_code.install.download import download_claude_code_async
+from .._claude_code.agentbinary import claude_code_binary_source
+from .._codex_cli.agentbinary import codex_cli_binary_source
 from .._util._async import run_coroutine
+from .._util.agentbinary import download_agent_binary_async
 from .._util.sandbox import SandboxPlatform
 def download_agent_binary(
-    binary: Literal["claude_code"],
+    binary: Literal["claude_code", "codex_cli"],
     version: Literal["stable", "latest"] | str,
     platform: SandboxPlatform,
 ) -> None:
@@ -21,7 +23,12 @@ def download_agent_binary(
         version: Version to download ("stable", "latest", or an explicit version number).
         platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")
     """
-    if binary == "claude_code":
-        run_coroutine(download_claude_code_async(version, platform))
-    else:
-        raise ValueError(f"Unsuported agent binary type: {binary}")
+    match binary:
+        case "claude_code":
+            source = claude_code_binary_source()
+        case "codex_cli":
+            source = codex_cli_binary_source()
+        case _:
+            raise ValueError(f"Unsuported agent binary type: {binary}")
+    run_coroutine(download_agent_binary_async(source, version, platform))

inspect_swe-0.2.11/src/inspect_swe/_util/agentbinary.py ADDED Viewed

@@ -0,0 +1,185 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Awaitable, Callable, Literal, NamedTuple
+from inspect_ai.util import SandboxEnvironment, concurrency
+from inspect_ai.util import sandbox as sandbox_env
+from inspect_swe._util.trace import trace
+from .checksum import verify_checksum
+from .download import download_file
+from .sandbox import (
+    SandboxPlatform,
+    bash_command,
+    detect_sandbox_platform,
+    sandbox_exec,
+)
+class AgentBinaryVersion(NamedTuple):
+    """Outcome of resolving a requested agent version for a platform."""
+    # version string returned by the source's resolve_version callable
+    version: str
+    # checksum the downloaded bytes are verified against
+    expected_checksum: str
+    # URL the binary (or archive containing it) is downloaded from
+    download_url: str
+@dataclass
+class AgentBinarySource:
+    """Describes how to resolve, download, cache, and install one agent's binary."""
+    # human-readable agent name used in trace/log and error messages
+    agent: str
+    # executable name: looked up with `which` in the sandbox and used to
+    # name the installed file
+    binary: str
+    # async callable mapping (requested version, platform) to an AgentBinaryVersion
+    resolve_version: Callable[
+        [Literal["stable", "latest"] | str, SandboxPlatform],
+        Awaitable[AgentBinaryVersion],
+    ]
+    # local cache file path for a given (version, platform)
+    cached_binary_path: Callable[[str, SandboxPlatform], Path]
+    # all files currently in this agent's cache (used when evicting old entries)
+    list_cached_binaries: Callable[[], list[Path]]
+    # optional transform applied to the verified download before caching
+    # (e.g. extracting the binary from a tar.gz)
+    post_download: Callable[[bytes], bytes] | None
+    # optional arguments to run the installed binary with right after install
+    post_install: str | None
+async def ensure_agent_binary_installed(
+    source: AgentBinarySource,
+    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
+    user: str | None = None,
+    sandbox: SandboxEnvironment | None = None,
+) -> str:
+    """Make sure the agent binary is available in the sandbox and return its path.
+    Args:
+        source: Description of the agent binary (resolution, caching, install hooks).
+        version: "auto" and "sandbox" first look for an existing binary in the
+            sandbox; "auto" falls back to downloading "stable". Any other value
+            is downloaded (or read from the local cache) and installed.
+        user: Sandbox user for the `which` lookup and the post-install command.
+        sandbox: Sandbox to install into (defaults to the current sandbox).
+    Returns:
+        Path of the binary inside the sandbox.
+    Raises:
+        RuntimeError: If `version` is "sandbox" and the binary is not found.
+    """
+    # resolve sandbox
+    sandbox = sandbox or sandbox_env()
+    # look in the sandbox first if we need to
+    if version == "auto" or version == "sandbox":
+        result = await sandbox.exec(bash_command(f"which {source.binary}"), user=user)
+        if result.success:
+            binary_path = result.stdout.strip()
+            trace(f"Using {source.agent} installed in sandbox: {binary_path}")
+            return binary_path
+        # if version == "sandbox" and we don't find it that's an error
+        if version == "sandbox":
+            raise RuntimeError(f"unable to locate {source.agent} in sandbox")
+        # otherwise set to "stable"
+        version = "stable"
+    # detect the sandbox target platform
+    platform = await detect_sandbox_platform(sandbox)
+    # use concurrency so multiple samples don't attempt the same download all at once
+    async with concurrency(f"{source.binary}-install", 1, visible=False):
+        # if a specific version is requested, first try to read it directly from the cache
+        if version not in ["stable", "latest"]:
+            binary_bytes: bytes | None = read_cached_binary(
+                source, version, platform, None
+            )
+            if binary_bytes is not None:
+                # report the agent actually being installed (this function is
+                # shared by every agent, not just claude code)
+                trace(
+                    f"Used {source.agent} binary from cache: {version} ({platform})"
+                )
+        else:
+            binary_bytes = None
+        # download the binary
+        if binary_bytes is None:
+            binary_bytes, resolved_version = await download_agent_binary_async(
+                source, version, platform, trace
+            )
+        else:
+            # If we got it from cache, version is already the resolved version
+            resolved_version = version
+        # write it into the container and return it
+        binary_path = f"/opt/{source.binary}-{resolved_version}-{platform}"
+        await sandbox.write_file(binary_path, binary_bytes)
+        await sandbox_exec(sandbox, f"chmod +x {binary_path}")
+        if source.post_install:
+            await sandbox_exec(
+                sandbox, f"{binary_path} {source.post_install}", user=user
+            )
+        return binary_path
+async def download_agent_binary_async(
+    source: AgentBinarySource,
+    version: Literal["stable", "latest"] | str,
+    platform: SandboxPlatform,
+    logger: Callable[[str], None] | None = None,
+) -> tuple[bytes, str]:
+    """Obtain the agent binary for `version`/`platform`, using the local cache.
+    Args:
+        source: Description of the agent binary.
+        version: Requested version or alias, passed to `source.resolve_version`.
+        platform: Target sandbox platform.
+        logger: Callable receiving progress messages (defaults to `print`).
+    Returns:
+        Tuple of (binary bytes, resolved version).
+    Raises:
+        ValueError: If the downloaded bytes fail checksum verification.
+    """
+    # resolve logger
+    log = logger or print
+    # determine version, checksum, and download location
+    resolved = await source.resolve_version(version, platform)
+    resolved_version, expected_checksum, download_url = resolved
+    # a cached file that went through post_download no longer matches the
+    # download checksum, so only verify the cache when there is no post_download
+    cache_checksum = None if source.post_download else expected_checksum
+    cached = read_cached_binary(source, resolved_version, platform, cache_checksum)
+    if cached is not None:
+        log(f"Used {source.agent} binary from cache: {resolved_version} ({platform})")
+        return cached, resolved_version
+    # not in cache: download and verify before any processing
+    payload = await download_file(download_url)
+    if not verify_checksum(payload, expected_checksum):
+        raise ValueError("Checksum verification failed")
+    # optional post-download processing (e.g., extract from tar.gz)
+    if source.post_download is not None:
+        payload = source.post_download(payload)
+    # save to cache and report
+    write_cached_binary(source, payload, resolved_version, platform)
+    log(f"Downloaded {source.agent} binary: {resolved_version} ({platform})")
+    return payload, resolved_version
+def read_cached_binary(
+    source: AgentBinarySource,
+    version: str,
+    platform: SandboxPlatform,
+    expected_checksum: str | None,
+) -> bytes | None:
+    """Read a binary from the local cache, optionally verifying its checksum.
+    Returns:
+        The cached bytes, or None when there is no cache file or the file
+        fails verification (in which case the file is deleted).
+    """
+    cache_path = source.cached_binary_path(version, platform)
+    # no cached binary
+    if not cache_path.exists():
+        return None
+    payload = cache_path.read_bytes()
+    # discard a cache entry that fails verification
+    if expected_checksum is not None and not verify_checksum(
+        payload, expected_checksum
+    ):
+        cache_path.unlink()
+        return None
+    # refresh the file's timestamps to mark it as recently used
+    cache_path.touch()
+    return payload
+def write_cached_binary(
+    source: AgentBinarySource,
+    binary_data: bytes,
+    version: str,
+    platform: SandboxPlatform,
+) -> None:
+    """Store `binary_data` in the local cache, then evict old cache entries."""
+    destination = source.cached_binary_path(version, platform)
+    destination.write_bytes(binary_data)
+    # keep only the 3 most recently accessed binaries for this agent
+    _cleanup_binary_cache(source, keep_count=3)
+def _cleanup_binary_cache(source: AgentBinarySource, keep_count: int = 5) -> None:
+    """Delete the least recently accessed cached binaries beyond `keep_count`.
+    Args:
+        source: Agent binary source whose cache files are listed and pruned.
+        keep_count: Number of most recently accessed files to keep. Zero (or
+            a negative value) removes every cached file.
+    """
+    # get all cached binaries
+    cache_files = source.list_cached_binaries()
+    # count the surplus explicitly: slicing with `[:-keep_count]` selects
+    # nothing when keep_count is 0
+    excess = len(cache_files) - max(keep_count, 0)
+    if excess <= 0:
+        return
+    # remove oldest (by access time)
+    cache_files.sort(key=lambda f: f.stat().st_atime)
+    for file_path in cache_files[:excess]:
+        # tolerate a file that has already been removed
+        file_path.unlink(missing_ok=True)

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/appdirs.py RENAMED Viewed

@@ -1,8 +1,9 @@
 from pathlib import Path
-from inspect_ai._util.constants import PKG_NAME
 from platformdirs import user_cache_path, user_data_path
+from .constants import PKG_NAME
 def package_data_dir(subdir: str | None) -> Path:
     data_dir = user_data_path(PKG_NAME)

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/sandbox.py RENAMED Viewed

@@ -49,9 +49,12 @@ def bash_command(cmd: str) -> list[str]:
 async def sandbox_exec(
-    sandbox: SandboxEnvironment, cmd: str, user: str | None = None
+    sandbox: SandboxEnvironment,
+    cmd: str,
+    user: str | None = None,
+    cwd: str | None = None,
 ) -> str:
-    result = await sandbox.exec(bash_command(cmd), user=user)
+    result = await sandbox.exec(bash_command(cmd), user=user, cwd=cwd)
     if not result.success:
         raise RuntimeError(f"Error executing sandbox command {cmd}: {result.stderr}")
     return result.stdout.strip()

inspect_swe-0.2.11/src/inspect_swe/_util/tarball.py ADDED Viewed

@@ -0,0 +1,27 @@
+import gzip
+import tarfile
+from io import BytesIO
+from typing import Any, cast
+def extract_tarball(tarball_bytes: bytes) -> bytes:
+    """Return the contents of the single entry in a gzip-compressed tar archive.
+    Args:
+        tarball_bytes: Raw bytes of the tar.gz archive.
+    Returns:
+        The bytes of the archive's only member.
+    Raises:
+        ValueError: If the archive does not contain exactly one member, or
+            if `extractfile` yields no file object for that member.
+    """
+    # decompress the gzip layer first, then read the tar stream inside it
+    with (
+        BytesIO(tarball_bytes) as raw,
+        gzip.open(raw, "rb") as gunzipped,
+        tarfile.open(fileobj=cast(Any, gunzipped), mode="r") as archive,
+    ):
+        entries = archive.getmembers()
+        if len(entries) != 1:
+            raise ValueError(f"Expected 1 file in tarball, found {len(entries)}")
+        (entry,) = entries
+        stream = archive.extractfile(entry)
+        if stream is None:
+            raise ValueError(f"Could not extract {entry.name}")
+        payload: bytes = stream.read()
+        return payload

inspect_swe-0.2.11/src/inspect_swe/_util/toml.py ADDED Viewed

@@ -0,0 +1,62 @@
+from datetime import date, datetime, time
+from typing import Any, Dict, List
+def to_toml(data: Dict[str, Any]) -> str:
+    """Serialize a dictionary to a TOML document string.
+    Non-dict values are written first as `key = value` lines; each dict value
+    is then written as a `[key]` table. Keys and table names are emitted
+    verbatim (no quoting is applied).
+    Args:
+        data: Mapping to serialize.
+    Returns:
+        The TOML text, with lines joined by newlines and no trailing newline.
+    """
+    # partition entries: plain values come first, dict values become tables
+    scalars = {k: v for k, v in data.items() if not isinstance(v, dict)}
+    sections = {k: v for k, v in data.items() if isinstance(v, dict)}
+    output = [f"{name} = {_format_value(item)}" for name, item in scalars.items()]
+    for header, body in sections.items():
+        # separate each table from whatever precedes it with a blank line
+        if output:
+            output.append("")
+        output.append(f"[{header}]")
+        _write_table(output, body)
+    return "\n".join(output)
+def _format_value(value: Any) -> str:
+    """Format a Python value as a TOML value literal.
+    Supports str, bool, int, float, datetime, date, time and lists of those.
+    Args:
+        value: The value to format.
+    Returns:
+        The TOML text for the value.
+    Raises:
+        ValueError: If `value` is None (TOML has no null).
+        TypeError: If `value` is of any other unsupported type.
+    """
+    if isinstance(value, str):
+        # TOML basic strings may not contain raw control characters (e.g. a
+        # newline), so escape them in addition to backslash and double quote
+        short_escapes = {
+            "\\": "\\\\",
+            '"': '\\"',
+            "\b": "\\b",
+            "\t": "\\t",
+            "\n": "\\n",
+            "\f": "\\f",
+            "\r": "\\r",
+        }
+        pieces = []
+        for ch in value:
+            if ch in short_escapes:
+                pieces.append(short_escapes[ch])
+            elif ch < " " or ch == "\x7f":
+                # remaining control characters use the \uXXXX form
+                pieces.append(f"\\u{ord(ch):04X}")
+            else:
+                pieces.append(ch)
+        return '"' + "".join(pieces) + '"'
+    elif isinstance(value, bool):
+        # checked before int because bool is a subclass of int
+        return "true" if value else "false"
+    elif isinstance(value, (int, float)):
+        return str(value)
+    elif isinstance(value, (datetime, date, time)):
+        return value.isoformat()
+    elif isinstance(value, list):
+        formatted_items = [_format_value(item) for item in value]
+        return f"[{', '.join(formatted_items)}]"
+    elif value is None:
+        raise ValueError("TOML doesn't support null values")
+    else:
+        raise TypeError(f"Unsupported type: {type(value)}")
+def _write_table(lines: List[str], data: Dict[str, Any]) -> None:
+    """Append a `key = value` line to `lines` for each non-dict entry of `data`.
+    Args:
+        lines: Output list, mutated in place.
+        data: The table's entries.
+    """
+    for name, item in data.items():
+        if isinstance(item, dict):
+            # NOTE(review): dict values (nested tables) are left out of the
+            # output entirely; confirm callers do not rely on them
+            continue
+        lines.append(f"{name} = {_format_value(item)}")

{inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.2.10'
-__version_tuple__ = version_tuple = (0, 2, 10)
+__version__ = version = '0.2.11'
+__version_tuple__ = version_tuple = (0, 2, 11)
 __commit_id__ = commit_id = None

inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/cache.py DELETED Viewed

@@ -1,58 +0,0 @@
-from pathlib import Path
-from inspect_swe._util.sandbox import SandboxPlatform
-from ..._util.appdirs import package_cache_dir
-from ..._util.checksum import verify_checksum
-def read_cached_claude_code_binary(
-    version: str, platform: SandboxPlatform, expected_checksum: str | None
-) -> bytes | None:
-    # no cached binary
-    cache_path = _claude_code_cached_binary(version, platform)
-    if not cache_path.exists():
-        return None
-    # read binary
-    with open(cache_path, "rb") as f:
-        binary_data = f.read()
-    if expected_checksum is None or verify_checksum(binary_data, expected_checksum):
-        cache_path.touch()
-        return binary_data
-    else:
-        cache_path.unlink()
-        return None
-def write_cached_claude_code_binary(
-    binary_data: bytes, version: str, platform: SandboxPlatform
-) -> None:
-    binary_path = _claude_code_cached_binary(version, platform)
-    with open(binary_path, "wb") as f:
-        f.write(binary_data)
-    _cleanup_claude_code_binary_cache(keep_count=3)
-def _cleanup_claude_code_binary_cache(keep_count: int = 5) -> None:
-    # get all cached binaries
-    cache_files = list(_claude_code_cached_binary_dir().glob("claude-*"))
-    if len(cache_files) <= keep_count:
-        return
-    # remove oldest
-    cache_files.sort(key=lambda f: f.stat().st_atime)
-    files_to_remove = cache_files[:-keep_count]
-    for file_path in files_to_remove:
-        file_path.unlink()
-def _claude_code_cached_binary_dir() -> Path:
-    return package_cache_dir("claude-code-downloads")
-def _claude_code_cached_binary(version: str, platform: SandboxPlatform) -> Path:
-    return _claude_code_cached_binary_dir() / f"claude-{version}-{platform}"

inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/install.py DELETED Viewed

@@ -1,62 +0,0 @@
-from typing import Literal
-from inspect_ai.util import SandboxEnvironment, concurrency
-from inspect_ai.util import sandbox as sandbox_env
-from inspect_swe._claude_code.install.cache import read_cached_claude_code_binary
-from inspect_swe._util.trace import trace
-from ..._util.sandbox import bash_command, detect_sandbox_platform, sandbox_exec
-from .download import download_claude_code_async
-async def ensure_claude_code_installed(
-    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
-    user: str | None = None,
-    sandbox: SandboxEnvironment | None = None,
-) -> str:
-    # resolve sandbox
-    sandbox = sandbox or sandbox_env()
-    # look in the sandbox first if we need to
-    if version == "auto" or version == "sandbox":
-        result = await sandbox.exec(bash_command("which claude"), user=user)
-        if result.success:
-            claude_binary = result.stdout.strip()
-            trace(f"Using claude code installed in sandbox: {claude_binary}")
-            return claude_binary
-        # if version == "sandbox" and we don't find it that's an error
-        if version == "sandbox":
-            raise RuntimeError("unable to locate claude code in sandbox")
-        # otherwise set to "stable"
-        version = "stable"
-    # detect the sandbox target platform
-    platform = await detect_sandbox_platform(sandbox)
-    # use concurrency so multiple samples don't attempt the same download all at once
-    async with concurrency("claude-install", 1, visible=False):
-        # if a specific version is requested, first try to read it directly from the cache
-        if version not in ["stable", "latest"]:
-            claude_binary_bytes: bytes | None = read_cached_claude_code_binary(
-                version, platform, None
-            )
-            if claude_binary_bytes is not None:
-                trace(f"Used claude code binary from cache: {version} ({platform})")
-        else:
-            claude_binary_bytes = None
-        # download the binary
-        if claude_binary_bytes is None:
-            claude_binary_bytes = await download_claude_code_async(
-                version, platform, trace
-            )
-        # write it into the container and return it
-        claude_binary = f"/opt/claude-{version}-{platform}"
-        await sandbox.write_file(claude_binary, claude_binary_bytes)
-        await sandbox_exec(sandbox, f"chmod +x {claude_binary}")
-        await sandbox_exec(sandbox, f"{claude_binary} config list", user=user)
-        return claude_binary