inspect-swe 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inspect_swe/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
- from ._claude_code.claude_code import claude_code
2
- from ._claude_code.install.download import download_claude_code
1
+ from ._claude_code.claude_code import ClaudeCodeOptions, claude_code
2
+ from ._tools.download import download_agent_binary
3
3
  from ._util.sandbox import SandboxPlatform
4
4
 
5
5
  try:
@@ -8,4 +8,10 @@ except ImportError:
8
8
  __version__ = "unknown"
9
9
 
10
10
 
11
- __all__ = ["claude_code", "download_claude_code", "SandboxPlatform", "__version__"]
11
+ __all__ = [
12
+ "claude_code",
13
+ "ClaudeCodeOptions",
14
+ "download_agent_binary",
15
+ "SandboxPlatform",
16
+ "__version__",
17
+ ]
@@ -1,19 +1,54 @@
1
- from typing import Literal
1
+ import uuid
2
+ from textwrap import dedent
3
+ from typing import Any, Literal, Sequence
2
4
 
3
5
  from inspect_ai.agent import (
4
6
  Agent,
5
7
  AgentState,
6
8
  agent,
9
+ agent_with,
7
10
  sandbox_agent_bridge,
8
11
  )
9
12
  from inspect_ai.model import ChatMessageSystem, ChatMessageUser
13
+ from inspect_ai.tool import MCPServerConfig
10
14
  from inspect_ai.util import sandbox as sandbox_env
15
+ from pydantic import BaseModel, Field
16
+ from pydantic_core import to_json
11
17
 
12
18
  from inspect_swe._claude_code.install.install import ensure_claude_code_installed
13
19
 
20
+ # TODO: AgentAttempts
21
+ # TODO: AgentContinue
22
+ # TODO: generate config merging (they are passing max_tokens=32000, temperature=1)
23
+
24
+
25
class ClaudeCodeOptions(BaseModel):
    """Claude Code options."""

    # Every option is optional; `None` means "not specified".

    system_prompt: str | None = None
    """Additional system prompt to append to default system prompt."""

    mcp_servers: Sequence[MCPServerConfig] | None = None
    """MCP servers to make available to the agent."""

    model: str | None = None
    """Model name to use for Opus and Sonnet calls (defaults to main model for task)."""

    small_model: str | None = None
    """Model to use for Haiku calls (defaults to main model for task)."""

    env: dict[str, str] | None = None
    """Environment variables to set for claude code."""
42
+
14
43
 
15
44
  @agent
16
45
  def claude_code(
46
+ name: str = "Claude Code",
47
+ description: str = dedent("""
48
+ Autonomous coding agent capable of writing, testing, debugging,
49
+ and iterating on code across multiple languages.
50
+ """),
51
+ options: ClaudeCodeOptions | None = None,
17
52
  version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
18
53
  user: str | None = None,
19
54
  sandbox: str | None = None,
@@ -25,6 +60,9 @@ def claude_code(
25
60
  The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
26
61
 
27
62
  Args:
63
+ name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
64
+ description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
65
+ options: Claude code options.
28
66
  version: Version of claude code to use. One of:
29
67
  - "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
30
68
  - "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
@@ -34,6 +72,16 @@ def claude_code(
34
72
  user: User to execute claude code with.
35
73
  sandbox: Optional sandbox environment name.
36
74
  """
75
+ # provide default options if none specified
76
+ options = options or ClaudeCodeOptions()
77
+
78
+ # resolve models
79
+ model = f"inspect/{options.model}" if options.model is not None else "inspect"
80
+ small_model = (
81
+ f"inspect/{options.small_model}"
82
+ if options.small_model is not None
83
+ else "inspect"
84
+ )
37
85
 
38
86
  async def execute(state: AgentState) -> AgentState:
39
87
  async with sandbox_agent_bridge(state) as bridge:
@@ -42,38 +90,59 @@ def claude_code(
42
90
  version, user, sandbox_env(sandbox)
43
91
  )
44
92
 
93
+ # allocate session_id
94
+ session_id = str(uuid.uuid4())
95
+
45
96
  # base options
46
97
  cmd = [
47
98
  claude_binary,
99
+ "--session-id",
100
+ session_id,
48
101
  "--print", # run without interactions
49
102
  "--dangerously-skip-permissions",
50
- "--model", # use current inspect model
51
- "inspect",
103
+ "--model",
104
+ model,
52
105
  ]
53
106
 
54
- # system message
55
- system_message = "\n\n".join(
56
- [m.text for m in state.messages if isinstance(m, ChatMessageSystem)]
57
- )
58
- if system_message:
59
- cmd.extend(["--append-system-prompt", system_message])
107
+ # system prompt
108
+ system_messages = [
109
+ m.text for m in state.messages if isinstance(m, ChatMessageSystem)
110
+ ]
111
+ if options.system_prompt is not None:
112
+ system_messages.append(options.system_prompt)
113
+ if system_messages:
114
+ cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])
115
+
116
+ # mcp servers
117
+ if options.mcp_servers:
118
+ cmd.extend(mcp_server_args(options.mcp_servers))
60
119
 
61
120
  # user prompt
62
121
  prompt = "\n\n".join(
63
122
  [m.text for m in state.messages if isinstance(m, ChatMessageUser)]
64
123
  )
124
+ cmd.append("--")
65
125
  cmd.append(prompt)
66
126
 
127
+ # resolve sandbox
128
+ sbox = sandbox_env(sandbox)
129
+
67
130
  # execute the agent
68
- result = await sandbox_env(sandbox).exec(
131
+ result = await sbox.exec(
69
132
  cmd=cmd,
70
133
  env={
71
134
  "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
72
135
  "ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
73
- "ANTHROPIC_SMALL_FAST_MODEL": "inspect",
136
+ "ANTHROPIC_MODEL": model,
137
+ "ANTHROPIC_DEFAULT_OPUS_MODEL": model,
138
+ "ANTHROPIC_DEFAULT_SONNET_MODEL": model,
139
+ "CLAUDE_CODE_SUBAGENT_MODEL": model,
140
+ "ANTHROPIC_DEFAULT_HAIKU_MODEL": small_model,
141
+ "ANTHROPIC_SMALL_FAST_MODEL": small_model,
74
142
  "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
75
143
  "IS_SANDBOX": "1",
76
- },
144
+ }
145
+ | (options.env or {}),
77
146
  user=user,
78
147
  )
79
148
 
@@ -84,4 +153,38 @@ def claude_code(
84
153
  f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
85
154
  )
86
155
 
87
- return execute
156
+ # return agent with specified name and description
157
+ return agent_with(execute, name=name, description=description)
158
+
159
+
160
def mcp_server_args(mcp_servers: Sequence[MCPServerConfig]) -> list[str]:
    """Translate MCP server configurations into Claude Code CLI arguments.

    Args:
        mcp_servers: MCP server configurations to expose to the agent.

    Returns:
        Argument list containing `--mcp-config` followed by a JSON document
        keyed by `mcpServers`, and `--allowed-tools` followed by a
        comma-separated tool list. Each pair is omitted when there is
        nothing to pass.

    Raises:
        ValueError: If a server's `tools` is neither "all" nor a list.
    """
    servers: dict[str, dict[str, Any]] = {}
    tools: list[str] = []
    for server in mcp_servers:
        # The server name becomes the JSON key and the tool selection is
        # passed via --allowed-tools, so both are left out of the dump.
        servers[server.name] = server.model_dump(
            exclude={"name", "tools"}, exclude_none=True
        )

        selected = server.tools
        if selected == "all":
            # NOTE(review): single underscore before the wildcard, whereas
            # explicit tools use "mcp__<server>__<tool>" -- confirm this is
            # the pattern the claude CLI expects.
            tools.append(f"mcp__{server.name}_*")
        elif isinstance(selected, list):
            tools += [f"mcp__{server.name}__{tool}" for tool in selected]
        else:
            raise ValueError(
                f"Unexpected value for mcp server tools: {selected}"
            )

    # map to cli args
    args: list[str] = []
    if servers:
        config = to_json({"mcpServers": servers}, exclude_none=True)
        args += ["--mcp-config", config.decode()]
    if tools:
        args += ["--allowed-tools", ",".join(tools)]
    return args
@@ -3,7 +3,6 @@ from typing import Literal
3
3
 
4
4
  from pydantic import BaseModel
5
5
 
6
- from ..._util._async import run_coroutine
7
6
  from ..._util.checksum import verify_checksum
8
7
  from ..._util.download import download_file, download_text_file
9
8
  from ..._util.sandbox import SandboxPlatform
@@ -14,22 +13,6 @@ from .cache import (
14
13
  )
15
14
 
16
15
 
17
- def download_claude_code(
18
- version: Literal["stable", "latest"] | str, platform: SandboxPlatform
19
- ) -> None:
20
- """Download Claude Code.
21
-
22
- Download a version of Claude Code. This version will be added to the cache of downloaded versions (which retains the 5 most recently downloaded versions).
23
-
24
- Use this if you need to ensure that a specific version of Claude Code is downloaded in advance (e.g. if you are going to run your evaluations offline). After downloading, explicit requests for the downloaded version (e.g. `claude_code(version="1.0.98")`) will not require network access.
25
-
26
- Args:
27
- version: Version to download ("stable", "latest", or an explicit version number).
28
- platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")
29
- """
30
- run_coroutine(download_claude_code_async(version, platform))
31
-
32
-
33
16
  async def download_claude_code_async(
34
17
  version: Literal["stable", "latest"] | str, platform: SandboxPlatform
35
18
  ) -> bytes:
@@ -45,6 +45,8 @@ async def ensure_claude_code_installed(
45
45
  )
46
46
  if claude_binary_bytes is not None:
47
47
  trace(f"Used claude code binary from cache: {version} ({platform})")
48
+ else:
49
+ claude_binary_bytes = None
48
50
 
49
51
  # download the binary
50
52
  if claude_binary_bytes is None:
File without changes
@@ -0,0 +1,27 @@
1
+ from typing import Literal
2
+
3
+ from .._claude_code.install.download import download_claude_code_async
4
+ from .._util._async import run_coroutine
5
+ from .._util.sandbox import SandboxPlatform
6
+
7
+
8
def download_agent_binary(
    binary: Literal["claude_code"],
    version: Literal["stable", "latest"] | str,
    platform: SandboxPlatform,
) -> None:
    """Download agent binary.

    Download an agent binary. This version will be added to the cache of downloaded versions (which retains the 5 most recently downloaded versions).

    Use this if you need to ensure that a specific version of an agent binary is downloaded in advance (e.g. if you are going to run your evaluations offline). After downloading, explicit requests for the downloaded version (e.g. `claude_code(version="1.0.98")`) will not require network access.

    Args:
        binary: Type of binary to download (currently only "claude_code")
        version: Version to download ("stable", "latest", or an explicit version number).
        platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")

    Raises:
        ValueError: If `binary` is not a supported agent binary type.
    """
    # Reject unknown binary types up front so nothing is downloaded for them.
    if binary != "claude_code":
        raise ValueError(f"Unsupported agent binary type: {binary}")

    # The downloader is async; run it to completion from this sync entry point.
    run_coroutine(download_claude_code_async(version, platform))
@@ -0,0 +1,21 @@
1
+ import re
2
+
3
+ import yaml
4
+
5
+
6
+ def read_front_matter_name(content: str) -> str | None:
7
+ # front-matter
8
+ frontmatter_match = re.match(r"^\s*---\s*\n(.*?)\n---", content, re.DOTALL)
9
+ if not frontmatter_match:
10
+ return None
11
+ frontmatter = frontmatter_match.group(1)
12
+
13
+ try:
14
+ # Parse as YAML
15
+ data = yaml.safe_load(frontmatter)
16
+ if "name" in data:
17
+ return str(data.get("name"))
18
+ else:
19
+ return None
20
+ except yaml.YAMLError:
21
+ return None
@@ -45,7 +45,7 @@ async def detect_sandbox_platform(sandbox: SandboxEnvironment) -> SandboxPlatfor
45
45
 
46
46
 
47
47
  def bash_command(cmd: str) -> list[str]:
48
- return ["bash", "--login", "-c", cmd]
48
+ return ["bash", "-c", cmd]
49
49
 
50
50
 
51
51
  async def sandbox_exec(
@@ -53,7 +53,5 @@ async def sandbox_exec(
53
53
  ) -> str:
54
54
  result = await sandbox.exec(bash_command(cmd), user=user)
55
55
  if not result.success:
56
- raise RuntimeError(
57
- f"Error executing sandbox command {','.join(cmd)}: {result.stderr}"
58
- )
56
+ raise RuntimeError(f"Error executing sandbox command {cmd}: {result.stderr}")
59
57
  return result.stdout.strip()
inspect_swe/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.3'
32
- __version_tuple__ = version_tuple = (0, 2, 3)
31
+ __version__ = version = '0.2.4'
32
+ __version_tuple__ = version_tuple = (0, 2, 4)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: inspect_swe
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Software engineering agents for Inspect AI.
5
5
  Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
6
6
  Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
@@ -14,6 +14,7 @@ Requires-Dist: inspect-ai>=0.3.125
14
14
  Requires-Dist: nest-asyncio
15
15
  Requires-Dist: platformdirs
16
16
  Requires-Dist: pydantic>=2.11.4
17
+ Requires-Dist: pyyaml
17
18
  Requires-Dist: sniffio
18
19
  Requires-Dist: typing-extensions>=4.9.0
19
20
  Provides-Extra: dev
@@ -23,6 +24,7 @@ Requires-Dist: openai; extra == 'dev'
23
24
  Requires-Dist: pytest; extra == 'dev'
24
25
  Requires-Dist: pytest-dotenv; extra == 'dev'
25
26
  Requires-Dist: ruff; extra == 'dev'
27
+ Requires-Dist: types-pyyaml; extra == 'dev'
26
28
  Provides-Extra: doc
27
29
  Requires-Dist: quarto-cli==1.7.31; extra == 'doc'
28
30
  Description-Content-Type: text/markdown
@@ -1,24 +1,27 @@
1
- inspect_swe/__init__.py,sha256=Jg2VYr_eK8_fOXA4Oj0UAQj-g-RxDJuXrIhxKhassko,335
1
+ inspect_swe/__init__.py,sha256=aqHkY79cer0TXcw2dy7RRLkGWoTCmCPFprPuCtXR_6k,386
2
2
  inspect_swe/_registry.py,sha256=jM37ysrY39Ufd67GRKbiwfSViOLlm-82lm_JEaWKshw,97
3
- inspect_swe/_version.py,sha256=kBRz0P2plw1eVdIpt70W6m1LMbEIhLY3RyOfVGdubaI,704
3
+ inspect_swe/_version.py,sha256=NRw4Jle4n9v_DD2wtplRqflGCvX8OU5eAjycYY0vY3Y,704
4
4
  inspect_swe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  inspect_swe/_claude_code/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- inspect_swe/_claude_code/claude_code.py,sha256=YfxNLgohMMhAohLdclgGyLsfcjocwgmMyOxl2-HlepA,3297
6
+ inspect_swe/_claude_code/claude_code.py,sha256=-E_Ibu_xwIuDGGYgSFKXeBitfny0kINjwu-n-2rQdj8,7114
7
7
  inspect_swe/_claude_code/install/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  inspect_swe/_claude_code/install/cache.py,sha256=k08bCxGq-iYVpO16LNQhPjxTM9p2iecpqMjqYd2WBss,1708
9
- inspect_swe/_claude_code/install/download.py,sha256=QKlFuDqCV55coTumIjyTXt2MU-vUQg8qPL3z3LHIUq8,4132
10
- inspect_swe/_claude_code/install/install.py,sha256=cJP2JOUZNfPphz0eWbzrY7ULjSUU_SbSlPy3QecBltw,2430
9
+ inspect_swe/_claude_code/install/download.py,sha256=s1y4CDHVbJenfsR7OUwwxr5QFp-rDi4XnIxumDEvmws,3217
10
+ inspect_swe/_claude_code/install/install.py,sha256=nbf1SZJzr4DBPfUmBH64zWcdI4AnKiKhm4Q4Zelh_TM,2483
11
+ inspect_swe/_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ inspect_swe/_tools/download.py,sha256=Jn_gcFR5Kw2vTYA1dWOFYRpqFtoFnKFv2Kv-4xT8tz4,1283
11
13
  inspect_swe/_util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
14
  inspect_swe/_util/_async.py,sha256=cL8_Smmj2Es41TefceGDYLyVaO7gZ56VJcA4oByrWfQ,1520
15
+ inspect_swe/_util/_yaml.py,sha256=sRgf0UryF9Bd7pEEyfzL1qZBCgrpYe0l3l3U7bYeU44,505
13
16
  inspect_swe/_util/appdirs.py,sha256=V3o1ERdSYLjKP-m4O1T_Hvkx0UsP2HdfvsshLSQgP6E,562
14
17
  inspect_swe/_util/checksum.py,sha256=i-_GhtgCFd5eFj3PPJiGSCHDhZdPcIPNwiqddX93Sls,186
15
18
  inspect_swe/_util/constants.py,sha256=xKvGgaJ0MwNbdzaken5HMbxYyKBEw_3VrBwCgkvAIWo,25
16
19
  inspect_swe/_util/download.py,sha256=cCUau4ZBOKezpotJV5-v3JY_5CuYDZ-VcWlLf_EyNL0,340
17
20
  inspect_swe/_util/platform.py,sha256=wm4efIFfdyTeaV2oxOXVvYl1u22MHX3jQMERHJMgv7A,339
18
- inspect_swe/_util/sandbox.py,sha256=RixiEY1asFHa8HTsAHAxYXcPL-mUMgprQke1-TRbWYE,1812
21
+ inspect_swe/_util/sandbox.py,sha256=2wYmVz5EGUDBhqbN3NgLAOsyKeU-KRI161MZMJ54n4M,1769
19
22
  inspect_swe/_util/trace.py,sha256=mFHmBKn2F8iJP9PpTHaCseMHnTMz3ErRx6RCKV83rZk,139
20
- inspect_swe-0.2.3.dist-info/METADATA,sha256=yod5MyJGNjnpnlPCPczXyXMfx5BXhBrHJDoIkcTGpDI,1658
21
- inspect_swe-0.2.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
22
- inspect_swe-0.2.3.dist-info/entry_points.txt,sha256=OzpvUhd7M3T2Rog4MjwJAxIKeX5ljiR0mVYM9GefBKg,49
23
- inspect_swe-0.2.3.dist-info/licenses/LICENSE,sha256=Hi3UDcbD6yCKZ1mcgt7pprzSG0rDEnSrbrm3XinyiDA,1070
24
- inspect_swe-0.2.3.dist-info/RECORD,,
23
+ inspect_swe-0.2.4.dist-info/METADATA,sha256=wxryGFAjtZarLk41tmkyAGPVuiIh5OWFQq1QylHw0VM,1724
24
+ inspect_swe-0.2.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
25
+ inspect_swe-0.2.4.dist-info/entry_points.txt,sha256=OzpvUhd7M3T2Rog4MjwJAxIKeX5ljiR0mVYM9GefBKg,49
26
+ inspect_swe-0.2.4.dist-info/licenses/LICENSE,sha256=Hi3UDcbD6yCKZ1mcgt7pprzSG0rDEnSrbrm3XinyiDA,1070
27
+ inspect_swe-0.2.4.dist-info/RECORD,,