inspect-swe 0.2.10__tar.gz → 0.2.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/PKG-INFO +3 -2
  2. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/pyproject.toml +3 -1
  3. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/__init__.py +2 -0
  4. inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/download.py → inspect_swe-0.2.11/src/inspect_swe/_claude_code/agentbinary.py +37 -44
  5. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_claude_code/claude_code.py +15 -13
  6. inspect_swe-0.2.11/src/inspect_swe/_codex_cli/agentbinary.py +122 -0
  7. inspect_swe-0.2.11/src/inspect_swe/_codex_cli/codex_cli.py +252 -0
  8. inspect_swe-0.2.11/src/inspect_swe/_registry.py +6 -0
  9. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_tools/download.py +13 -6
  10. inspect_swe-0.2.11/src/inspect_swe/_util/agentbinary.py +185 -0
  11. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/appdirs.py +2 -1
  12. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/sandbox.py +5 -2
  13. inspect_swe-0.2.11/src/inspect_swe/_util/tarball.py +27 -0
  14. inspect_swe-0.2.11/src/inspect_swe/_util/toml.py +62 -0
  15. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_version.py +2 -2
  16. inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/cache.py +0 -58
  17. inspect_swe-0.2.10/src/inspect_swe/_claude_code/install/install.py +0 -62
  18. inspect_swe-0.2.10/src/inspect_swe/_registry.py +0 -5
  19. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/.gitignore +0 -0
  20. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/LICENSE +0 -0
  21. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/README.md +0 -0
  22. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_claude_code/__init__.py +0 -0
  23. {inspect_swe-0.2.10/src/inspect_swe/_claude_code/install → inspect_swe-0.2.11/src/inspect_swe/_codex_cli}/__init__.py +0 -0
  24. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_tools/__init__.py +0 -0
  25. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/__init__.py +0 -0
  26. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/_async.py +0 -0
  27. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/_yaml.py +0 -0
  28. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/checksum.py +0 -0
  29. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/constants.py +0 -0
  30. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/download.py +0 -0
  31. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/platform.py +0 -0
  32. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/_util/trace.py +0 -0
  33. {inspect_swe-0.2.10 → inspect_swe-0.2.11}/src/inspect_swe/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: inspect_swe
3
- Version: 0.2.10
3
+ Version: 0.2.11
4
4
  Summary: Software engineering agents for Inspect AI.
5
5
  Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
6
6
  Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
@@ -10,7 +10,7 @@ License: MIT License
10
10
  License-File: LICENSE
11
11
  Requires-Python: >=3.10
12
12
  Requires-Dist: httpx
13
- Requires-Dist: inspect-ai>=0.3.129
13
+ Requires-Dist: inspect-ai>=0.3.130
14
14
  Requires-Dist: nest-asyncio
15
15
  Requires-Dist: platformdirs
16
16
  Requires-Dist: pydantic>=2.11.4
@@ -19,6 +19,7 @@ Requires-Dist: sniffio
19
19
  Requires-Dist: typing-extensions>=4.9.0
20
20
  Provides-Extra: dev
21
21
  Requires-Dist: anthropic; extra == 'dev'
22
+ Requires-Dist: ipython; extra == 'dev'
22
23
  Requires-Dist: mypy; extra == 'dev'
23
24
  Requires-Dist: openai; extra == 'dev'
24
25
  Requires-Dist: pytest; extra == 'dev'
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
12
12
  license = { text = "MIT License" }
13
13
  dependencies = [
14
14
  "httpx",
15
- "inspect_ai>=0.3.129",
15
+ "inspect_ai>=0.3.130",
16
16
  "nest_asyncio",
17
17
  "platformdirs",
18
18
  "pydantic>=2.11.4",
@@ -39,6 +39,7 @@ dev = [
39
39
  "openai",
40
40
  "pytest-dotenv",
41
41
  "types-PyYAML",
42
+ "IPython",
42
43
  ]
43
44
  doc = ["quarto-cli==1.7.31"]
44
45
 
@@ -85,3 +86,4 @@ strict = true
85
86
  mypy_path = "src"
86
87
  namespace_packages = true
87
88
  explicit_package_bases = true
89
+ python_executable = ".venv/bin/python"
@@ -1,4 +1,5 @@
1
1
  from ._claude_code.claude_code import claude_code
2
+ from ._codex_cli.codex_cli import codex_cli
2
3
  from ._tools.download import download_agent_binary
3
4
  from ._util.sandbox import SandboxPlatform
4
5
 
@@ -10,6 +11,7 @@ except ImportError:
10
11
 
11
12
  __all__ = [
12
13
  "claude_code",
14
+ "codex_cli",
13
15
  "download_agent_binary",
14
16
  "SandboxPlatform",
15
17
  "__version__",
@@ -1,50 +1,43 @@
1
1
  import re
2
- from typing import Callable, Literal
2
+ from pathlib import Path
3
3
 
4
4
  from pydantic import BaseModel
5
-
6
- from ..._util.checksum import verify_checksum
7
- from ..._util.download import download_file, download_text_file
8
- from ..._util.sandbox import SandboxPlatform
9
- from .cache import (
10
- read_cached_claude_code_binary,
11
- write_cached_claude_code_binary,
12
- )
13
-
14
-
15
- async def download_claude_code_async(
16
- version: Literal["stable", "latest"] | str,
17
- platform: SandboxPlatform,
18
- logger: Callable[[str], None] | None = None,
19
- ) -> bytes:
20
- # resovle logger
21
- logger = logger or print
22
-
23
- # determine version and checksum
24
- gcs_bucket = await _claude_code_gcs_bucket()
25
- version = await _claude_code_version(gcs_bucket, version)
26
- manifest = await _claude_code_manifest(gcs_bucket, version)
27
- expected_checksum = _checksum_for_platform(manifest, platform)
28
-
29
- # check the cache
30
- binary_data = read_cached_claude_code_binary(version, platform, expected_checksum)
31
- if binary_data is None:
32
- # not in cache, download and verify checksum
33
- binary_url = f"{gcs_bucket}/{version}/{platform}/claude"
34
- binary_data = await download_file(binary_url)
35
- if not verify_checksum(binary_data, expected_checksum):
36
- raise ValueError("Checksum verification failed")
37
-
38
- # save to cache
39
- write_cached_claude_code_binary(binary_data, version, platform)
40
-
41
- # trace
42
- logger(f"Downloaded claude code binary: {version} ({platform})")
43
- else:
44
- logger(f"Used claude code binary from cache: {version} ({platform})")
45
-
46
- # return data
47
- return binary_data
5
+ from typing_extensions import Literal
6
+
7
+ from .._util.agentbinary import AgentBinarySource, AgentBinaryVersion
8
+ from .._util.appdirs import package_cache_dir
9
+ from .._util.download import download_text_file
10
+ from .._util.sandbox import SandboxPlatform
11
+
12
+
13
def claude_code_binary_source() -> AgentBinarySource:
    """Build the `AgentBinarySource` describing the Claude Code binary.

    The returned source resolves versions against the Claude Code release
    bucket, keeps downloaded binaries in the "claude-code-downloads" package
    cache directory, and runs `config list` with the binary after install.
    """
    downloads_dir = package_cache_dir("claude-code-downloads")

    async def resolve_version(
        version: Literal["stable", "latest"] | str, platform: SandboxPlatform
    ) -> AgentBinaryVersion:
        # resolve the alias (or explicit version) and look up its checksum
        bucket = await _claude_code_gcs_bucket()
        resolved = await _claude_code_version(bucket, version)
        manifest = await _claude_code_manifest(bucket, resolved)
        return AgentBinaryVersion(
            version=resolved,
            expected_checksum=_checksum_for_platform(manifest, platform),
            download_url=f"{bucket}/{resolved}/{platform}/claude",
        )

    def cached_binary_path(version: str, platform: SandboxPlatform) -> Path:
        # one cache file per (version, platform) pair
        filename = f"claude-{version}-{platform}"
        return downloads_dir / filename

    def list_cached_binaries() -> list[Path]:
        # every cache file written by cached_binary_path() above
        return [*downloads_dir.glob("claude-*")]

    return AgentBinarySource(
        agent="claude code",
        binary="claude",
        resolve_version=resolve_version,
        cached_binary_path=cached_binary_path,
        list_cached_binaries=list_cached_binaries,
        post_download=None,
        post_install="config list",
    )
48
41
 
49
42
 
50
43
  async def _claude_code_gcs_bucket() -> str:
@@ -10,16 +10,16 @@ from inspect_ai.agent import (
10
10
  agent_with,
11
11
  sandbox_agent_bridge,
12
12
  )
13
- from inspect_ai.model import ChatMessageSystem, ChatMessageUser
13
+ from inspect_ai.model import ChatMessageSystem, ChatMessageUser, GenerateFilter
14
14
  from inspect_ai.scorer import score
15
15
  from inspect_ai.tool import MCPServerConfig
16
16
  from inspect_ai.util import sandbox as sandbox_env
17
17
  from pydantic_core import to_json
18
18
 
19
- from inspect_swe._util.trace import trace
20
-
21
19
  from .._util._async import is_callable_coroutine
22
- from .install.install import ensure_claude_code_installed
20
+ from .._util.agentbinary import ensure_agent_binary_installed
21
+ from .._util.trace import trace
22
+ from .agentbinary import claude_code_binary_source
23
23
 
24
24
 
25
25
  @agent
@@ -31,11 +31,11 @@ def claude_code(
31
31
  """),
32
32
  system_prompt: str | None = None,
33
33
  mcp_servers: Sequence[MCPServerConfig] | None = None,
34
- allowed_tools: list[str] | None = None,
35
34
  disallowed_tools: list[str] | None = None,
36
35
  attempts: int | AgentAttempts = 1,
37
36
  model: str | None = None,
38
37
  small_model: str | None = None,
38
+ filter: GenerateFilter | None = None,
39
39
  cwd: str | None = None,
40
40
  env: dict[str, str] | None = None,
41
41
  user: str | None = None,
@@ -48,7 +48,7 @@ def claude_code(
48
48
 
49
49
  The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
50
50
 
51
- Use `allowed_tools` and `disallowed_tools` to control access to tools. See [Tools available to Claude](https://docs.anthropic.com/en/docs/claude-code/settings#tools-available-to-claude) for the list of built-in tools and [How to use Allowed Tools in Claude Code](https://www.instructa.ai/blog/claude-code/how-to-use-allowed-tools-in-claude-code) for details on the supported syntax. Note that `allowed_tools` enables you to filter allowed parameter values and `disallowed_tools` enables you to remove tools entirely. In other words, `allowed_tools` is not a complete list of what tools are available but rather just filters on tool parameters---to remove tools you need to explicitly set `disallowed_tools`.
51
+ Use `disallowed_tools` to control access to tools. See [Tools available to Claude](https://docs.anthropic.com/en/docs/claude-code/settings#tools-available-to-claude) for the list of built-in tools which can be disallowed.
52
52
 
53
53
  Use the `attempts` option to enable additional submissions if the initial
54
54
  submission(s) are incorrect (by default, no additional attempts are permitted).
@@ -58,11 +58,11 @@ def claude_code(
58
58
  description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
59
59
  system_prompt: Additional system prompt to append to default system prompt.
60
60
  mcp_servers: MCP servers to make available to the agent.
61
- allowed_tools: Parameter filters for built-in tools.
62
61
  disallowed_tools: List of tool names to disallow entirely.
63
62
  attempts: Configure agent to make multiple attempts.
64
63
  model: Model name to use for Opus and Sonnet calls (defaults to main model for task).
65
64
  small_model: Model to use for Haiku calls (defaults to main model for task).
65
+ filter: Filter for intercepting bridged model requests.
66
66
  cwd: Working directory to run claude code within.
67
67
  env: Environment variables to set for claude code.
68
68
  user: User to execute claude code with.
@@ -82,10 +82,10 @@ def claude_code(
82
82
  attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
83
83
 
84
84
  async def execute(state: AgentState) -> AgentState:
85
- async with sandbox_agent_bridge(state) as bridge:
85
+ async with sandbox_agent_bridge(state, filter=filter) as bridge:
86
86
  # ensure claude is installed and get binary location
87
- claude_binary = await ensure_claude_code_installed(
88
- version, user, sandbox_env(sandbox)
87
+ claude_binary = await ensure_agent_binary_installed(
88
+ claude_code_binary_source(), version, user, sandbox_env(sandbox)
89
89
  )
90
90
 
91
91
  # allocate session_id
@@ -111,7 +111,7 @@ def claude_code(
111
111
  cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])
112
112
 
113
113
  # mcp servers
114
- cmd_allowed_tools = allowed_tools or []
114
+ cmd_allowed_tools: list[str] = []
115
115
  if mcp_servers:
116
116
  mcp_server_args, mcp_allowed_tools = resolve_mcp_servers(mcp_servers)
117
117
  cmd.extend(mcp_server_args)
@@ -146,7 +146,7 @@ def claude_code(
146
146
 
147
147
  # run agent
148
148
  result = await sbox.exec(
149
- cmd=agent_cmd,
149
+ cmd=["bash", "-c", 'exec "$@"', "bash"] + agent_cmd,
150
150
  cwd=cwd,
151
151
  env={
152
152
  "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
@@ -171,7 +171,9 @@ def claude_code(
171
171
 
172
172
  # raise for error
173
173
  if not result.success:
174
- f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
174
+ raise RuntimeError(
175
+ f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
176
+ )
175
177
 
176
178
  # exit if we are at max_attempts
177
179
  attempt_count += 1
@@ -0,0 +1,122 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from typing_extensions import Literal
6
+
7
+ from .._util.agentbinary import AgentBinarySource, AgentBinaryVersion
8
+ from .._util.appdirs import package_cache_dir
9
+ from .._util.download import download_text_file
10
+ from .._util.sandbox import SandboxPlatform
11
+ from .._util.tarball import extract_tarball
12
+
13
+
14
def codex_cli_binary_source() -> AgentBinarySource:
    """Build the `AgentBinarySource` describing the Codex CLI binary.

    Versions are resolved against the GitHub releases of `openai/codex`.
    Downloaded archives are unpacked with `extract_tarball` before they are
    cached in the "codex-cli-downloads" package cache directory.
    """
    cached_binary_dir = package_cache_dir("codex-cli-downloads")

    async def resolve_version(
        version: Literal["stable", "latest"] | str, platform: SandboxPlatform
    ) -> AgentBinaryVersion:
        """Resolve a version (or alias) to its checksum and download URL.

        Raises:
            RuntimeError: If the release has no asset for `platform`, or the
                asset does not carry a usable `sha256:` digest.
        """
        # Resolve version alias if needed
        if version in ["stable", "latest"]:
            version = await _fetch_latest_stable_version()

        # Get release information
        release = await _fetch_release_assets(version)

        # Find the platform-specific asset
        arch = _platform_to_codex_arch(platform)
        asset_name = f"codex-{arch}.tar.gz"
        asset = next(
            (a for a in release.get("assets", []) if a.get("name") == asset_name),
            None,
        )
        if asset is None:
            raise RuntimeError(
                f"No asset found for platform {platform} in version {version}"
            )

        # Extract checksum (format: "sha256:xxx"). The field may be missing or
        # null, so never call str methods on it before checking its type.
        digest = asset.get("digest")
        prefix = "sha256:"
        if not isinstance(digest, str) or not digest.startswith(prefix):
            raise RuntimeError(f"Invalid digest format: {digest}")
        expected_checksum = digest[len(prefix) :]

        # Get download URL
        download_url = asset["browser_download_url"]

        return AgentBinaryVersion(version, expected_checksum, download_url)

    def cached_binary_path(version: str, platform: SandboxPlatform) -> Path:
        # one cache file per (version, platform) pair
        return cached_binary_dir / f"codex-{version}-{platform}"

    def list_cached_binaries() -> list[Path]:
        # every cache file written by cached_binary_path() above
        return list(cached_binary_dir.glob("codex-*"))

    return AgentBinarySource(
        agent="codex cli",
        binary="codex",
        resolve_version=resolve_version,
        cached_binary_path=cached_binary_path,
        list_cached_binaries=list_cached_binaries,
        post_download=extract_tarball,
        post_install=None,
    )
69
+
70
+
71
def _platform_to_codex_arch(platform: SandboxPlatform) -> str:
    """Translate a sandbox platform name into the Codex release target triple.

    Both the glibc and musl sandbox platforms map to the musl build, which is
    statically linked and therefore independent of the sandbox's GLIBC version.

    Raises:
        ValueError: If `platform` has no known Codex build.
    """
    x64_target = "x86_64-unknown-linux-musl"
    arm64_target = "aarch64-unknown-linux-musl"
    targets = {
        "linux-x64": x64_target,
        "linux-x64-musl": x64_target,
        "linux-arm64": arm64_target,
        "linux-arm64-musl": arm64_target,
    }

    target = targets.get(platform)
    if target is None:
        raise ValueError(f"Unsupported platform: {platform}")
    return target
86
+
87
+
88
async def _fetch_latest_stable_version() -> str:
    """Fetch the latest stable Codex version from GitHub releases.

    Returns:
        The version number with the "rust-v" tag prefix removed
        (e.g. tag "rust-v0.29.0" yields "0.29.0").

    Raises:
        RuntimeError: If the listing contains no stable release, or the newest
            stable release has a tag that does not start with "rust-v".
    """
    # The releases listing is paginated (30 entries per page by default).
    # Ask for the largest page size so that a long run of alpha/pre-releases
    # cannot push every stable release off the page we inspect.
    releases_url = "https://api.github.com/repos/openai/codex/releases?per_page=100"
    releases = json.loads(await download_text_file(releases_url))

    # First entry that is neither flagged as a pre-release nor tagged alpha
    # (the listing is consumed in the order the API returns it)
    latest = next(
        (
            r
            for r in releases
            if not r.get("prerelease", False) and "-alpha" not in r.get("tag_name", "")
        ),
        None,
    )
    if latest is None:
        raise RuntimeError("No stable releases found for codex")

    # Extract version from tag (e.g., "rust-v0.29.0" -> "0.29.0")
    tag_name: str = latest["tag_name"]
    prefix = "rust-v"
    if not tag_name.startswith(prefix):
        raise RuntimeError(f"Unexpected tag format: {tag_name}")
    return tag_name[len(prefix) :]
114
+
115
+
116
async def _fetch_release_assets(version: str) -> dict[str, Any]:
    """Fetch the GitHub release record for the "rust-v{version}" tag.

    Returns:
        The decoded JSON document for the release.
    """
    release_url = (
        f"https://api.github.com/repos/openai/codex/releases/tags/rust-v{version}"
    )
    payload: dict[str, Any] = json.loads(await download_text_file(release_url))
    return payload
@@ -0,0 +1,252 @@
1
+ import os
2
+ from logging import getLogger
3
+ from textwrap import dedent
4
+ from typing import Any, Literal, Sequence
5
+
6
+ from inspect_ai.agent import (
7
+ Agent,
8
+ AgentAttempts,
9
+ AgentState,
10
+ agent,
11
+ agent_with,
12
+ sandbox_agent_bridge,
13
+ )
14
+ from inspect_ai.model import ChatMessageSystem, ChatMessageUser, GenerateFilter
15
+ from inspect_ai.scorer import score
16
+ from inspect_ai.tool import MCPServerConfig
17
+ from inspect_ai.util import SandboxEnvironment
18
+ from inspect_ai.util import sandbox as sandbox_env
19
+
20
+ from inspect_swe._util._async import is_callable_coroutine
21
+ from inspect_swe._util.sandbox import sandbox_exec
22
+ from inspect_swe._util.toml import to_toml
23
+ from inspect_swe._util.trace import trace
24
+
25
+ from .._util.agentbinary import ensure_agent_binary_installed
26
+ from .agentbinary import codex_cli_binary_source
27
+
28
# Name the logger after the module (not its file path) so it takes part in the
# package's logger hierarchy and can be configured by dotted name.
logger = getLogger(__name__)
29
+
30
+
31
@agent
def codex_cli(
    name: str = "Codex CLI",
    description: str = dedent("""
       Autonomous coding agent capable of writing, testing, debugging,
       and iterating on code across multiple languages.
    """),
    system_prompt: str | None = None,
    mcp_servers: Sequence[MCPServerConfig] | None = None,
    disallowed_tools: list[Literal["web_search"]] | None = None,
    attempts: int | AgentAttempts = 1,
    model: str | None = None,
    filter: GenerateFilter | None = None,
    cwd: str | None = None,
    env: dict[str, str] | None = None,
    user: str | None = None,
    sandbox: str | None = None,
    version: Literal["auto", "sandbox", "latest"] | str = "auto",
) -> Agent:
    """Codex CLI.

    Agent that uses OpenAI [Codex CLI](https://github.com/openai/codex) running in a sandbox.

    Use the `attempts` option to enable additional submissions if the initial
    submission(s) are incorrect (by default, no additional attempts are permitted).

    Args:
        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
        description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
        system_prompt: Additional system prompt to append to default system prompt.
        mcp_servers: MCP servers to make available to the agent.
        disallowed_tools: Optionally disallow tools (currently only web_search).
        attempts: Configure agent to make multiple attempts.
        model: Model name to use (defaults to main model for task).
        filter: Filter for intercepting bridged model requests.
        cwd: Working directory to run codex cli within.
        env: Environment variables to set for codex cli
        user: User to execute codex cli with.
        sandbox: Optional sandbox environment name.
        version: Version of codex cli to use. One of:
            - "auto": Use any available version of codex cli in the sandbox, otherwise download the latest version.
            - "sandbox": Use the version of codex cli in the sandbox (raises `RuntimeError` if codex is not available in the sandbox)
            - "latest": Download and use the very latest version of codex cli.
            - "x.x.x": Download and use a specific version of codex cli.
    """
    # resolve model
    model = f"inspect/{model}" if model is not None else "inspect"

    # resolve attempts
    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts

    # ensure disallowed_tools list
    disallowed_tools = disallowed_tools or []

    async def execute(state: AgentState) -> AgentState:
        # local import: only needed to quote paths placed on shell command lines
        import shlex

        async with sandbox_agent_bridge(state, model=model, filter=filter) as bridge:
            # ensure codex is installed and get binary location
            codex_binary = await ensure_agent_binary_installed(
                codex_cli_binary_source(), version, user, sandbox_env(sandbox)
            )

            # helper to create codex cwd relative paths
            def codex_path(file: str) -> str:
                return (
                    file if cwd is None else os.path.join(cwd, file).replace("\\", "/")
                )

            # build system prompt
            system_messages = [
                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
            ]
            if system_prompt is not None:
                system_messages.append(system_prompt)

            # resolve sandbox
            sbox = sandbox_env(sandbox)

            # determine CODEX_HOME (we want this to be whatever sandbox working dir is)
            working_dir = (await sandbox_exec(sbox, "pwd", user=user, cwd=cwd)).strip()
            if not working_dir.endswith("/"):
                working_dir = f"{working_dir}/"
            codex_home = f"{working_dir}.codex"
            # quote the path: the command string is handed to a shell and the
            # working directory may contain spaces or shell metacharacters
            await sandbox_exec(
                sbox, cmd=f"mkdir -p {shlex.quote(codex_home)}", user=user
            )

            # write system messages to AGENTS.md
            if system_messages:
                await sbox.write_file(
                    codex_path("AGENTS.md"), "\n\n".join(system_messages)
                )

            # build full prompt
            prompt = "\n\n".join(
                [
                    message.text
                    for message in state.messages
                    if isinstance(message, ChatMessageUser)
                ]
            )

            # build agent cmd
            cmd = [
                codex_binary,
                "exec",
                "--model",
                "gpt-5",  # real model is passed to the bridge above
                "--skip-git-repo-check",
                "--dangerously-bypass-approvals-and-sandbox",
                "--color",
                "never",
            ]

            # include the plan and apply patch tools.
            # NOTE: update_plan not currently working in 'exec' mode:
            # https://github.com/openai/codex/issues/1952
            cmd.extend(["-c", "include_plan_tool=true"])
            cmd.extend(["-c", "include_apply_patch_tool=true"])

            # include web search if appropriate
            if "web_search" not in disallowed_tools:
                cmd.extend(["-c", "tools.web_search=true"])

            # register mcp servers
            if mcp_servers:
                mcp_config: dict[str, Any] = {}
                for mcp_server in mcp_servers:
                    mcp_config[f"mcp_servers.{mcp_server.name}"] = (
                        mcp_server.model_dump(
                            exclude={"name", "tools"}, exclude_none=True
                        )
                    )
                await sandbox_exec(
                    sbox,
                    cmd=f"mkdir -p {shlex.quote(codex_path('.codex'))}",
                    user=user,
                )
                await sbox.write_file(
                    codex_path(".codex/config.toml"), to_toml(mcp_config)
                )

            # execute the agent (track debug output)
            debug_output: list[str] = []
            agent_prompt = prompt
            attempt_count = 0
            resume_rollout: str | None = None
            while True:
                # resume if requested (skip when no rollout file could be
                # located, so codex is never asked to resume from an empty path)
                agent_cmd = cmd.copy()
                if resume_rollout:
                    agent_cmd.extend(["-c", f'experimental_resume="{resume_rollout}"'])

                # append prompt
                agent_cmd.append(agent_prompt)

                # run agent
                result = await sbox.exec(
                    cmd=["bash", "-c", 'exec "$@"', "bash"] + agent_cmd,
                    cwd=cwd,
                    env={
                        "CODEX_HOME": codex_home,
                        "OPENAI_BASE_URL": f"http://localhost:{bridge.port}/v1",
                        "RUST_LOG": "debug",
                    }
                    | (env or {}),
                )

                # record output for debug
                debug_output.append(result.stdout)
                debug_output.append(result.stderr)

                # raise for error
                if not result.success:
                    raise RuntimeError(
                        f"Error executing codex cli agent: {result.stdout}\n{result.stderr}"
                    )

                # exit if we are at max_attempts
                attempt_count += 1
                if attempt_count >= attempts.attempts:
                    break

                # score this attempt
                answer_scores = await score(state)

                # break if we score 'correct'
                if attempts.score_value(answer_scores[0].value) == 1.0:
                    break

                # otherwise update prompt with incorrect message and continue
                else:
                    resume_rollout = await _last_rollout(sbox, codex_home, user)
                    if callable(attempts.incorrect_message):
                        if not is_callable_coroutine(attempts.incorrect_message):
                            raise ValueError(
                                "The incorrect_message function must be async."
                            )
                        agent_prompt = await attempts.incorrect_message(
                            state, answer_scores
                        )
                    else:
                        agent_prompt = attempts.incorrect_message

            # trace debug info
            debug_output.insert(0, "Codex CLI Debug Output:")
            trace("\n".join(debug_output))

        # return success
        return bridge.state

    return agent_with(execute, name=name, description=description)
238
+
239
+
240
async def _last_rollout(
    sandbox: SandboxEnvironment, codex_home: str, user: str | None
) -> str | None:
    """Locate the most recently modified rollout file under `codex_home`.

    Args:
        sandbox: Sandbox to search in.
        codex_home: Codex home directory; `sessions/` below it is searched.
        user: User to run the search command as.

    Returns:
        Path of the newest `rollout-*.jsonl` file, or `None` when the search
        fails or prints nothing (e.g. no rollout file exists yet).
    """
    try:
        rollout = await sandbox_exec(
            sandbox,
            f"find '{codex_home}/sessions' -type f -name 'rollout-*.jsonl' -exec ls -t -- {{}} + | head -n 1",
            user=user,
        )
    except RuntimeError as ex:
        logger.warning(f"Error attempting to read rollout file: {ex}")
        return None

    # an empty result means "nothing found": report it as None, not as ""
    return rollout.strip() or None
@@ -0,0 +1,6 @@
1
+ # ruff: noqa: F401
2
+
3
+ from ._claude_code.claude_code import claude_code
4
+ from ._codex_cli.codex_cli import codex_cli
5
+
6
+ __all__ = ["codex_cli", "claude_code"]
@@ -1,12 +1,14 @@
1
1
  from typing import Literal
2
2
 
3
- from .._claude_code.install.download import download_claude_code_async
3
+ from .._claude_code.agentbinary import claude_code_binary_source
4
+ from .._codex_cli.agentbinary import codex_cli_binary_source
4
5
  from .._util._async import run_coroutine
6
+ from .._util.agentbinary import download_agent_binary_async
5
7
  from .._util.sandbox import SandboxPlatform
6
8
 
7
9
 
8
10
  def download_agent_binary(
9
- binary: Literal["claude_code"],
11
+ binary: Literal["claude_code", "codex_cli"],
10
12
  version: Literal["stable", "latest"] | str,
11
13
  platform: SandboxPlatform,
12
14
  ) -> None:
@@ -21,7 +23,12 @@ def download_agent_binary(
21
23
  version: Version to download ("stable", "latest", or an explicit version number).
22
24
  platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")
23
25
  """
24
- if binary == "claude_code":
25
- run_coroutine(download_claude_code_async(version, platform))
26
- else:
27
- raise ValueError(f"Unsuported agent binary type: {binary}")
26
+ match binary:
27
+ case "claude_code":
28
+ source = claude_code_binary_source()
29
+ case "codex_cli":
30
+ source = codex_cli_binary_source()
31
+ case _:
32
+ raise ValueError(f"Unsuported agent binary type: {binary}")
33
+
34
+ run_coroutine(download_agent_binary_async(source, version, platform))
@@ -0,0 +1,185 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Awaitable, Callable, Literal, NamedTuple
4
+
5
+ from inspect_ai.util import SandboxEnvironment, concurrency
6
+ from inspect_ai.util import sandbox as sandbox_env
7
+
8
+ from inspect_swe._util.trace import trace
9
+
10
+ from .checksum import verify_checksum
11
+ from .download import download_file
12
+ from .sandbox import (
13
+ SandboxPlatform,
14
+ bash_command,
15
+ detect_sandbox_platform,
16
+ sandbox_exec,
17
+ )
18
+
19
+
20
class AgentBinaryVersion(NamedTuple):
    """Result of resolving a requested agent version for one platform."""

    # concrete version identifier (used in cache file names and install paths)
    version: str
    # checksum the downloaded bytes are verified against via verify_checksum()
    expected_checksum: str
    # URL the binary (or archive) is fetched from via download_file()
    download_url: str
24
+
25
+
26
@dataclass
class AgentBinarySource:
    """Describes how one agent's binary is resolved, cached, and installed."""

    # human-readable agent name used in trace/log and error messages
    agent: str
    # executable name: looked up in the sandbox with `which`, and used to
    # build the install path and the install concurrency key
    binary: str
    # async callback mapping a requested version (alias or explicit) and a
    # platform to the concrete version, checksum, and download URL
    resolve_version: Callable[
        [Literal["stable", "latest"] | str, SandboxPlatform],
        Awaitable[AgentBinaryVersion],
    ]
    # local cache file location for a given (version, platform)
    cached_binary_path: Callable[[str, SandboxPlatform], Path]
    # all cache files belonging to this source (consulted by cache cleanup)
    list_cached_binaries: Callable[[], list[Path]]
    # optional transform applied to the verified download before it is cached
    post_download: Callable[[bytes], bytes] | None
    # optional arguments; when set, "<installed binary> <post_install>" is
    # executed in the sandbox after installation
    post_install: str | None
38
+
39
+
40
async def ensure_agent_binary_installed(
    source: AgentBinarySource,
    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
    user: str | None = None,
    sandbox: SandboxEnvironment | None = None,
) -> str:
    """Make sure the agent binary is available in the sandbox.

    Args:
        source: Description of the agent binary to install.
        version: "auto" (use the sandbox copy if present, else download
            "stable"), "sandbox" (require the sandbox copy), "stable",
            "latest", or an explicit version.
        user: User for the lookup and post-install commands.
        sandbox: Sandbox to install into (defaults to the current sandbox).

    Returns:
        Path of the binary inside the sandbox.

    Raises:
        RuntimeError: If `version` is "sandbox" and the binary is not found.
    """
    # resolve sandbox
    sandbox = sandbox or sandbox_env()

    # look in the sandbox first if we need to
    if version == "auto" or version == "sandbox":
        result = await sandbox.exec(bash_command(f"which {source.binary}"), user=user)
        if result.success:
            binary_path = result.stdout.strip()
            trace(f"Using {source.agent} installed in sandbox: {binary_path}")
            return binary_path

        # if version == "sandbox" and we don't find it that's an error
        if version == "sandbox":
            raise RuntimeError(f"unable to locate {source.agent} in sandbox")

        # otherwise set to "stable"
        version = "stable"

    # detect the sandbox target platform
    platform = await detect_sandbox_platform(sandbox)

    # use concurrency so multiple samples don't attempt the same download all at once
    async with concurrency(f"{source.binary}-install", 1, visible=False):
        # if a specific version is requested, first try to read it directly from
        # the cache (no checksum is available without resolving the version)
        binary_bytes: bytes | None = None
        if version not in ["stable", "latest"]:
            binary_bytes = read_cached_binary(source, version, platform, None)
            if binary_bytes is not None:
                trace(f"Used {source.agent} binary from cache: {version} ({platform})")

        # download the binary
        if binary_bytes is None:
            binary_bytes, resolved_version = await download_agent_binary_async(
                source, version, platform, trace
            )
        else:
            # If we got it from cache, version is already the resolved version
            resolved_version = version

        # write it into the container and return it
        binary_path = f"/opt/{source.binary}-{resolved_version}-{platform}"
        await sandbox.write_file(binary_path, binary_bytes)
        await sandbox_exec(sandbox, f"chmod +x {binary_path}")
        if source.post_install:
            await sandbox_exec(
                sandbox, f"{binary_path} {source.post_install}", user=user
            )
        return binary_path
97
+
98
+
99
async def download_agent_binary_async(
    source: AgentBinarySource,
    version: Literal["stable", "latest"] | str,
    platform: SandboxPlatform,
    logger: Callable[[str], None] | None = None,
) -> tuple[bytes, str]:
    """Return the agent binary for `version`/`platform`, downloading if needed.

    Args:
        source: Description of the agent binary.
        version: "stable", "latest", or an explicit version.
        platform: Target sandbox platform.
        logger: Sink for progress messages (defaults to `print`).

    Returns:
        Tuple of (binary bytes, resolved version).

    Raises:
        ValueError: If the download does not match the expected checksum.
    """
    emit = logger or print

    # determine version and checksum
    resolved = await source.resolve_version(version, platform)
    resolved_version, expected_checksum, download_url = resolved

    # cached files hold post-processed data when post_download is used, so the
    # upstream checksum can only be applied to unprocessed cache entries
    cache_checksum = None if source.post_download else expected_checksum
    cached = read_cached_binary(source, resolved_version, platform, cache_checksum)
    if cached is not None:
        emit(
            f"Used {source.agent} binary from cache: {resolved_version} ({platform})"
        )
        return cached, resolved_version

    # not in cache, download and verify checksum
    payload = await download_file(download_url)
    if not verify_checksum(payload, expected_checksum):
        raise ValueError("Checksum verification failed")

    # apply post-download processing if provided (e.g., extract from tar.gz)
    if source.post_download is not None:
        payload = source.post_download(payload)

    # save to cache and report
    write_cached_binary(source, payload, resolved_version, platform)
    emit(f"Downloaded {source.agent} binary: {resolved_version} ({platform})")

    return payload, resolved_version
136
+
137
+
138
def read_cached_binary(
    source: AgentBinarySource,
    version: str,
    platform: SandboxPlatform,
    expected_checksum: str | None,
) -> bytes | None:
    """Read a binary from the local cache.

    Args:
        source: Description of the agent binary (provides the cache path).
        version: Version to look up.
        platform: Platform to look up.
        expected_checksum: Checksum to verify the cached bytes against, or
            `None` to accept the cached bytes unverified.

    Returns:
        The cached bytes, or `None` if there is no cache entry or the entry
        failed verification (in which case the entry is deleted).
    """
    cache_path = source.cached_binary_path(version, platform)
    if not cache_path.exists():
        return None

    cached_bytes = cache_path.read_bytes()

    # discard entries that no longer match the expected checksum
    if expected_checksum is not None and not verify_checksum(
        cached_bytes, expected_checksum
    ):
        cache_path.unlink()
        return None

    # refresh timestamps so recently used entries survive cache cleanup
    cache_path.touch()
    return cached_bytes
159
+
160
+
161
def write_cached_binary(
    source: AgentBinarySource,
    binary_data: bytes,
    version: str,
    platform: SandboxPlatform,
) -> None:
    """Store a binary in the local cache, then trim the cache to 3 entries."""
    destination = source.cached_binary_path(version, platform)
    destination.write_bytes(binary_data)

    # evict older entries so the cache does not grow without bound
    _cleanup_binary_cache(source, keep_count=3)
173
+
174
+
175
def _cleanup_binary_cache(source: AgentBinarySource, keep_count: int = 5) -> None:
    """Delete the least recently accessed cached binaries.

    Args:
        source: Description of the agent binary (provides the cache listing).
        keep_count: Number of most recently accessed files to keep. Zero (or a
            negative value) removes every cached file.
    """
    # get all cached binaries
    cache_files = source.list_cached_binaries()

    # Compute the number of files to drop explicitly: slicing with
    # `[:-keep_count]` yields an empty list when keep_count is 0.
    excess = len(cache_files) - max(keep_count, 0)
    if excess <= 0:
        return

    # remove oldest (by access time)
    cache_files.sort(key=lambda f: f.stat().st_atime)
    for file_path in cache_files[:excess]:
        # tolerate a file that was removed between listing and deletion
        file_path.unlink(missing_ok=True)
@@ -1,8 +1,9 @@
1
1
  from pathlib import Path
2
2
 
3
- from inspect_ai._util.constants import PKG_NAME
4
3
  from platformdirs import user_cache_path, user_data_path
5
4
 
5
+ from .constants import PKG_NAME
6
+
6
7
 
7
8
  def package_data_dir(subdir: str | None) -> Path:
8
9
  data_dir = user_data_path(PKG_NAME)
@@ -49,9 +49,12 @@ def bash_command(cmd: str) -> list[str]:
49
49
 
50
50
 
51
51
async def sandbox_exec(
    sandbox: SandboxEnvironment,
    cmd: str,
    user: str | None = None,
    cwd: str | None = None,
) -> str:
    """Execute ``cmd`` in the sandbox and return its stripped stdout.

    The command is wrapped with ``bash_command`` and run via ``sandbox.exec``,
    forwarding ``user`` and ``cwd`` unchanged.

    Raises:
        RuntimeError: If the execution result is not successful; the message
            includes the command and its stderr.
    """
    outcome = await sandbox.exec(bash_command(cmd), user=user, cwd=cwd)
    if outcome.success:
        return outcome.stdout.strip()
    raise RuntimeError(f"Error executing sandbox command {cmd}: {outcome.stderr}")
@@ -0,0 +1,27 @@
1
+ import gzip
2
+ import tarfile
3
+ from io import BytesIO
4
+ from typing import Any, cast
5
+
6
+
7
def extract_tarball(tarball_bytes: bytes) -> bytes:
    """Extract the single binary contained in a tar.gz archive.

    Only regular files are counted, so an archive laid out as a directory
    entry plus one file inside it is accepted.

    Args:
        tarball_bytes: Raw bytes of a gzip-compressed tar archive.

    Returns:
        The contents of the one regular file in the archive.

    Raises:
        ValueError: If the archive does not contain exactly one regular file,
            or that file cannot be extracted.
    """
    # Open the gzip-compressed tarball
    with BytesIO(tarball_bytes) as tarball_io:
        with gzip.open(tarball_io, "rb") as gz:
            with tarfile.open(fileobj=cast(Any, gz), mode="r") as tar:
                # Directory, link and device entries are not payload; count
                # only regular files (there should be just one).
                files = [member for member in tar.getmembers() if member.isfile()]
                if len(files) != 1:
                    raise ValueError(
                        f"Expected 1 file in tarball, found {len(files)}"
                    )

                # Extract the binary file
                member = files[0]
                extracted = tar.extractfile(member)
                if extracted is None:
                    raise ValueError(f"Could not extract {member.name}")

                with extracted:
                    result: bytes = extracted.read()
                return result
@@ -0,0 +1,62 @@
1
+ from datetime import date, datetime, time
2
+ from typing import Any, Dict, List
3
+
4
+
5
+ def to_toml(data: Dict[str, Any]) -> str:
6
+ """Convert a dictionary to TOML format string."""
7
+ lines = []
8
+
9
+ # Handle top-level key-value pairs first
10
+ top_level = {}
11
+ tables = {}
12
+
13
+ for key, value in data.items():
14
+ if isinstance(value, dict):
15
+ tables[key] = value
16
+ else:
17
+ top_level[key] = value
18
+
19
+ # Write top-level pairs
20
+ for key, value in top_level.items():
21
+ lines.append(f"{key} = {_format_value(value)}")
22
+
23
+ # Write tables
24
+ for table_name, table_data in tables.items():
25
+ if lines: # Add blank line before tables
26
+ lines.append("")
27
+ lines.append(f"[{table_name}]")
28
+ _write_table(lines, table_data)
29
+
30
+ return "\n".join(lines)
31
+
32
+
33
+ def _format_value(value: Any) -> str:
34
+ """Format a Python value as TOML."""
35
+ if isinstance(value, str):
36
+ # Escape special characters and quote
37
+ escaped = value.replace("\\", "\\\\").replace('"', '\\"')
38
+ return f'"{escaped}"'
39
+ elif isinstance(value, bool):
40
+ return "true" if value else "false"
41
+ elif isinstance(value, (int, float)):
42
+ return str(value)
43
+ elif isinstance(value, datetime):
44
+ return value.isoformat()
45
+ elif isinstance(value, date):
46
+ return value.isoformat()
47
+ elif isinstance(value, time):
48
+ return value.isoformat()
49
+ elif isinstance(value, list):
50
+ formatted_items = [_format_value(item) for item in value]
51
+ return f"[{', '.join(formatted_items)}]"
52
+ elif value is None:
53
+ raise ValueError("TOML doesn't support null values")
54
+ else:
55
+ raise TypeError(f"Unsupported type: {type(value)}")
56
+
57
+
58
+ def _write_table(lines: List[str], data: Dict[str, Any]) -> None:
59
+ """Write table contents."""
60
+ for key, value in data.items():
61
+ if not isinstance(value, dict):
62
+ lines.append(f"{key} = {_format_value(value)}")
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.10'
32
- __version_tuple__ = version_tuple = (0, 2, 10)
31
+ __version__ = version = '0.2.11'
32
+ __version_tuple__ = version_tuple = (0, 2, 11)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,58 +0,0 @@
1
- from pathlib import Path
2
-
3
- from inspect_swe._util.sandbox import SandboxPlatform
4
-
5
- from ..._util.appdirs import package_cache_dir
6
- from ..._util.checksum import verify_checksum
7
-
8
-
9
- def read_cached_claude_code_binary(
10
- version: str, platform: SandboxPlatform, expected_checksum: str | None
11
- ) -> bytes | None:
12
- # no cached binary
13
- cache_path = _claude_code_cached_binary(version, platform)
14
- if not cache_path.exists():
15
- return None
16
-
17
- # read binary
18
- with open(cache_path, "rb") as f:
19
- binary_data = f.read()
20
-
21
- if expected_checksum is None or verify_checksum(binary_data, expected_checksum):
22
- cache_path.touch()
23
- return binary_data
24
- else:
25
- cache_path.unlink()
26
- return None
27
-
28
-
29
- def write_cached_claude_code_binary(
30
- binary_data: bytes, version: str, platform: SandboxPlatform
31
- ) -> None:
32
- binary_path = _claude_code_cached_binary(version, platform)
33
-
34
- with open(binary_path, "wb") as f:
35
- f.write(binary_data)
36
-
37
- _cleanup_claude_code_binary_cache(keep_count=3)
38
-
39
-
40
- def _cleanup_claude_code_binary_cache(keep_count: int = 5) -> None:
41
- # get all cached binaries
42
- cache_files = list(_claude_code_cached_binary_dir().glob("claude-*"))
43
- if len(cache_files) <= keep_count:
44
- return
45
-
46
- # remove oldest
47
- cache_files.sort(key=lambda f: f.stat().st_atime)
48
- files_to_remove = cache_files[:-keep_count]
49
- for file_path in files_to_remove:
50
- file_path.unlink()
51
-
52
-
53
- def _claude_code_cached_binary_dir() -> Path:
54
- return package_cache_dir("claude-code-downloads")
55
-
56
-
57
- def _claude_code_cached_binary(version: str, platform: SandboxPlatform) -> Path:
58
- return _claude_code_cached_binary_dir() / f"claude-{version}-{platform}"
@@ -1,62 +0,0 @@
1
- from typing import Literal
2
-
3
- from inspect_ai.util import SandboxEnvironment, concurrency
4
- from inspect_ai.util import sandbox as sandbox_env
5
-
6
- from inspect_swe._claude_code.install.cache import read_cached_claude_code_binary
7
- from inspect_swe._util.trace import trace
8
-
9
- from ..._util.sandbox import bash_command, detect_sandbox_platform, sandbox_exec
10
- from .download import download_claude_code_async
11
-
12
-
13
- async def ensure_claude_code_installed(
14
- version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
15
- user: str | None = None,
16
- sandbox: SandboxEnvironment | None = None,
17
- ) -> str:
18
- # resolve sandbox
19
- sandbox = sandbox or sandbox_env()
20
-
21
- # look in the sandbox first if we need to
22
- if version == "auto" or version == "sandbox":
23
- result = await sandbox.exec(bash_command("which claude"), user=user)
24
- if result.success:
25
- claude_binary = result.stdout.strip()
26
- trace(f"Using claude code installed in sandbox: {claude_binary}")
27
- return claude_binary
28
-
29
- # if version == "sandbox" and we don't find it that's an error
30
- if version == "sandbox":
31
- raise RuntimeError("unable to locate claude code in sandbox")
32
-
33
- # otherwise set to "stable"
34
- version = "stable"
35
-
36
- # detect the sandbox target platform
37
- platform = await detect_sandbox_platform(sandbox)
38
-
39
- # use concurrency so multiple samples don't attempt the same download all at once
40
- async with concurrency("claude-install", 1, visible=False):
41
- # if a specific version is requested, first try to read it directly from the cache
42
- if version not in ["stable", "latest"]:
43
- claude_binary_bytes: bytes | None = read_cached_claude_code_binary(
44
- version, platform, None
45
- )
46
- if claude_binary_bytes is not None:
47
- trace(f"Used claude code binary from cache: {version} ({platform})")
48
- else:
49
- claude_binary_bytes = None
50
-
51
- # download the binary
52
- if claude_binary_bytes is None:
53
- claude_binary_bytes = await download_claude_code_async(
54
- version, platform, trace
55
- )
56
-
57
- # write it into the container and return it
58
- claude_binary = f"/opt/claude-{version}-{platform}"
59
- await sandbox.write_file(claude_binary, claude_binary_bytes)
60
- await sandbox_exec(sandbox, f"chmod +x {claude_binary}")
61
- await sandbox_exec(sandbox, f"{claude_binary} config list", user=user)
62
- return claude_binary
@@ -1,5 +0,0 @@
1
- # ruff: noqa: F401
2
-
3
- from ._claude_code.claude_code import claude_code
4
-
5
- __all__ = ["claude_code"]
File without changes
File without changes
File without changes