inspect-swe 0.2.4__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/PKG-INFO +2 -2
  2. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/pyproject.toml +1 -1
  3. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/__init__.py +1 -2
  4. inspect_swe-0.2.7/src/inspect_swe/_claude_code/claude_code.py +223 -0
  5. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/_async.py +10 -1
  6. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_version.py +2 -2
  7. inspect_swe-0.2.4/src/inspect_swe/_claude_code/claude_code.py +0 -190
  8. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/.gitignore +0 -0
  9. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/LICENSE +0 -0
  10. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/README.md +0 -0
  11. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_claude_code/__init__.py +0 -0
  12. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_claude_code/install/__init__.py +0 -0
  13. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_claude_code/install/cache.py +0 -0
  14. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_claude_code/install/download.py +0 -0
  15. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_claude_code/install/install.py +0 -0
  16. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_registry.py +0 -0
  17. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_tools/__init__.py +0 -0
  18. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_tools/download.py +0 -0
  19. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/__init__.py +0 -0
  20. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/_yaml.py +0 -0
  21. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/appdirs.py +0 -0
  22. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/checksum.py +0 -0
  23. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/constants.py +0 -0
  24. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/download.py +0 -0
  25. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/platform.py +0 -0
  26. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/sandbox.py +0 -0
  27. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/_util/trace.py +0 -0
  28. {inspect_swe-0.2.4 → inspect_swe-0.2.7}/src/inspect_swe/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: inspect_swe
3
- Version: 0.2.4
3
+ Version: 0.2.7
4
4
  Summary: Software engineering agents for Inspect AI.
5
5
  Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
6
6
  Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
@@ -10,7 +10,7 @@ License: MIT License
10
10
  License-File: LICENSE
11
11
  Requires-Python: >=3.10
12
12
  Requires-Dist: httpx
13
- Requires-Dist: inspect-ai>=0.3.125
13
+ Requires-Dist: inspect-ai>=0.3.126
14
14
  Requires-Dist: nest-asyncio
15
15
  Requires-Dist: platformdirs
16
16
  Requires-Dist: pydantic>=2.11.4
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
12
12
  license = { text = "MIT License" }
13
13
  dependencies = [
14
14
  "httpx",
15
- "inspect_ai>=0.3.125",
15
+ "inspect_ai>=0.3.126",
16
16
  "nest_asyncio",
17
17
  "platformdirs",
18
18
  "pydantic>=2.11.4",
@@ -1,4 +1,4 @@
1
- from ._claude_code.claude_code import ClaudeCodeOptions, claude_code
1
+ from ._claude_code.claude_code import claude_code
2
2
  from ._tools.download import download_agent_binary
3
3
  from ._util.sandbox import SandboxPlatform
4
4
 
@@ -10,7 +10,6 @@ except ImportError:
10
10
 
11
11
  __all__ = [
12
12
  "claude_code",
13
- "ClaudeCodeOptions",
14
13
  "download_agent_binary",
15
14
  "SandboxPlatform",
16
15
  "__version__",
@@ -0,0 +1,223 @@
1
+ import uuid
2
+ from textwrap import dedent
3
+ from typing import Any, Literal, Sequence
4
+
5
+ from inspect_ai.agent import (
6
+ Agent,
7
+ AgentAttempts,
8
+ AgentState,
9
+ agent,
10
+ agent_with,
11
+ sandbox_agent_bridge,
12
+ )
13
+ from inspect_ai.model import ChatMessageSystem, ChatMessageUser
14
+ from inspect_ai.scorer import score
15
+ from inspect_ai.tool import MCPServerConfig
16
+ from inspect_ai.util import sandbox as sandbox_env
17
+ from pydantic_core import to_json
18
+
19
+ from .._util._async import is_callable_coroutine
20
+ from .install.install import ensure_claude_code_installed
21
+
22
+
23
@agent
def claude_code(
    name: str = "Claude Code",
    description: str = dedent("""
       Autonomous coding agent capable of writing, testing, debugging,
       and iterating on code across multiple languages.
    """),
    system_prompt: str | None = None,
    mcp_servers: Sequence[MCPServerConfig] | None = None,
    allowed_tools: list[str] | None = None,
    disallowed_tools: list[str] | None = None,
    attempts: int | AgentAttempts = 1,
    model: str | None = None,
    small_model: str | None = None,
    env: dict[str, str] | None = None,
    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
    user: str | None = None,
    sandbox: str | None = None,
) -> Agent:
    """Claude Code agent.

    Agent that uses [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) running in a sandbox.

    The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).

    Use `allowed_tools` and `disallowed_tools` to control access to tools. See [Tools available to Claude](https://docs.anthropic.com/en/docs/claude-code/settings#tools-available-to-claude) for the list of built-in tools and [How to use Allowed Tools in Claude Code](https://www.instructa.ai/blog/claude-code/how-to-use-allowed-tools-in-claude-code) for details on the supported syntax. Note that `allowed_tools` enables you to filter allowed parameter values and `disallowed_tools` enables you to remove tools entirely. In other words, `allowed_tools` is not a complete list of what tools are available but rather just filters on tool parameters---to remove tools you need to explicitly set `disallowed_tools`.

    Use the `attempts` option to enable additional submissions if the initial
    submission(s) are incorrect (by default, no additional attempts are permitted).

    When run, the returned agent raises `RuntimeError` if the claude code
    process exits unsuccessfully, and `ValueError` if a callable
    `incorrect_message` configured via `attempts` is not async.

    Args:
        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
        description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
        system_prompt: Additional system prompt to append to default system prompt.
        mcp_servers: MCP servers to make available to the agent.
        allowed_tools: Parameter filters for built-in tools.
        disallowed_tools: List of tool names to disallow entirely.
        attempts: Configure agent to make multiple attempts.
        model: Model name to use for Opus and Sonnet calls (defaults to main model for task).
        small_model: Model to use for Haiku calls (defaults to main model for task).
        env: Environment variables to set for claude code.
        version: Version of claude code to use. One of:
            - "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
            - "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
            - "stable": Download and use the current stable version of claude code.
            - "latest": Download and use the very latest version of claude code.
            - "x.x.x": Download and use a specific version of claude code.
        user: User to execute claude code with.
        sandbox: Optional sandbox environment name.
    """
    # resolve models (an explicit model is namespaced under "inspect/",
    # otherwise the bare "inspect" name is passed to claude code)
    model = f"inspect/{model}" if model is not None else "inspect"
    small_model = f"inspect/{small_model}" if small_model is not None else "inspect"

    # resolve attempts (a plain int is shorthand for AgentAttempts(int))
    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts

    async def execute(state: AgentState) -> AgentState:
        async with sandbox_agent_bridge(state) as bridge:
            # ensure claude is installed and get binary location
            claude_binary = await ensure_claude_code_installed(
                version, user, sandbox_env(sandbox)
            )

            # allocate session_id (reused with --resume on later attempts)
            session_id = str(uuid.uuid4())

            # base options
            cmd = [
                "--print",  # run without interactions
                "--dangerously-skip-permissions",
                "--model",
                model,
            ]

            # system prompt: system messages from the conversation followed
            # by the explicitly configured system_prompt
            system_messages = [
                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
            ]
            if system_prompt is not None:
                system_messages.append(system_prompt)
            if system_messages:
                cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])

            # mcp servers
            # copy the caller's list: extending it in place would mutate the
            # `allowed_tools` argument and accumulate MCP tool patterns on
            # every invocation of the agent
            cmd_allowed_tools = list(allowed_tools) if allowed_tools else []
            if mcp_servers:
                mcp_server_args, mcp_allowed_tools = resolve_mcp_servers(mcp_servers)
                cmd.extend(mcp_server_args)
                cmd_allowed_tools.extend(mcp_allowed_tools)

            # add allowed and disallowed tools
            if cmd_allowed_tools:
                cmd.append("--allowed-tools")
                cmd.append(",".join(cmd_allowed_tools))
            if disallowed_tools:
                cmd.append("--disallowed-tools")
                cmd.append(",".join(disallowed_tools))

            # user prompt
            prompt = "\n\n".join(
                [m.text for m in state.messages if isinstance(m, ChatMessageUser)]
            )

            # resolve sandbox
            sbox = sandbox_env(sandbox)

            # execute the agent
            agent_prompt = prompt
            attempt_count = 0
            while True:
                # either starting a new session or resuming one
                id_param = "--session-id" if attempt_count == 0 else "--resume"
                agent_cmd = (
                    [claude_binary, id_param, session_id] + cmd + ["--", agent_prompt]
                )

                # run agent (caller-supplied env entries override the defaults)
                result = await sbox.exec(
                    cmd=agent_cmd,
                    env={
                        "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
                        # NOTE(review): looks like a placeholder key, since
                        # requests are pointed at the local bridge port above
                        # -- confirm this is not a live credential
                        "ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
                        "ANTHROPIC_MODEL": model,
                        "ANTHROPIC_DEFAULT_OPUS_MODEL": model,
                        "ANTHROPIC_DEFAULT_SONNET_MODEL": model,
                        "CLAUDE_CODE_SUBAGENT_MODEL": model,
                        "ANTHROPIC_DEFAULT_HAIKU_MODEL": small_model,
                        "ANTHROPIC_SMALL_FAST_MODEL": small_model,
                        "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
                        "IS_SANDBOX": "1",
                    }
                    | (env or {}),
                    user=user,
                )

                # raise for error (previously the message was built but never
                # raised, so failures were silently ignored)
                if not result.success:
                    raise RuntimeError(
                        f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
                    )

                # exit if we are at max_attempts
                attempt_count += 1
                if attempt_count >= attempts.attempts:
                    break

                # score this attempt
                answer_scores = await score(state)

                # break if we score 'correct'
                if attempts.score_value(answer_scores[0].value) == 1.0:
                    break

                # otherwise update prompt with incorrect message and continue
                if callable(attempts.incorrect_message):
                    if not is_callable_coroutine(attempts.incorrect_message):
                        raise ValueError(
                            "The incorrect_message function must be async."
                        )
                    agent_prompt = await attempts.incorrect_message(
                        state, answer_scores
                    )
                else:
                    agent_prompt = attempts.incorrect_message

        return bridge.state

    # return agent with specified name and description
    return agent_with(execute, name=name, description=description)
192
+
193
+
194
def resolve_mcp_servers(
    mcp_servers: Sequence[MCPServerConfig],
) -> tuple[list[str], list[str]]:
    """Translate MCP server configs into claude code CLI arguments.

    Args:
        mcp_servers: MCP server configurations to expose to the agent.

    Returns:
        A pair `(config_args, allowed_tools)`. `config_args` is either empty
        or `["--mcp-config", <json>]`, where the JSON maps each server name
        to its dumped config (without `name`/`tools`) under `mcpServers`.
        `allowed_tools` holds one allowed-tool pattern per server whose
        `tools` is `"all"`, or one entry per listed tool otherwise.

    Raises:
        ValueError: If a server's `tools` is neither `"all"` nor a list.
    """
    server_configs: dict[str, dict[str, Any]] = {}
    tool_patterns: list[str] = []

    for server in mcp_servers:
        # name and tools are conveyed separately, so leave them out of the dump
        server_configs[server.name] = server.model_dump(
            exclude={"name", "tools"}, exclude_none=True
        )

        tools = server.tools
        if tools == "all":
            tool_patterns.append(f"mcp__{server.name}_*")
            continue
        if not isinstance(tools, list):
            raise ValueError(f"Unexpected value for mcp server tools: {tools}")
        for tool in tools:
            tool_patterns.append(f"mcp__{server.name}__{tool}")

    # no servers means no --mcp-config argument at all
    if not server_configs:
        return [], tool_patterns

    config_json = to_json({"mcpServers": server_configs}, exclude_none=True).decode()
    return ["--mcp-config", config_json], tool_patterns
@@ -1,5 +1,6 @@
1
1
  import asyncio
2
- from typing import Coroutine, Literal, TypeVar, cast
2
+ import inspect
3
+ from typing import Any, Coroutine, Literal, TypeVar, cast
3
4
 
4
5
  import nest_asyncio # type: ignore
5
6
  import sniffio
@@ -9,6 +10,14 @@ from .platform import running_in_notebook
9
10
  T = TypeVar("T")
10
11
 
11
12
 
13
def is_callable_coroutine(func_or_cls: Any) -> bool:
    """Report whether calling `func_or_cls` goes through a coroutine function.

    Args:
        func_or_cls: Any object; typically a function or a callable instance.

    Returns:
        True if the object itself is a coroutine function, or if it is
        callable and its `__call__` is a coroutine function; False otherwise
        (including for non-callable objects).
    """
    # plain `async def` functions (and methods) are detected directly
    if inspect.iscoroutinefunction(func_or_cls):
        return True

    # anything that cannot be called at all is not a coroutine callable
    if not callable(func_or_cls):
        return False

    # callable objects: inspect the `__call__` they dispatch to
    return inspect.iscoroutinefunction(func_or_cls.__call__)
19
+
20
+
12
21
  def run_coroutine(coroutine: Coroutine[None, None, T]) -> T:
13
22
  if current_async_backend() == "trio":
14
23
  raise RuntimeError("run_coroutine cannot be used with trio")
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.4'
32
- __version_tuple__ = version_tuple = (0, 2, 4)
31
+ __version__ = version = '0.2.7'
32
+ __version_tuple__ = version_tuple = (0, 2, 7)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,190 +0,0 @@
1
- import uuid
2
- from textwrap import dedent
3
- from typing import Any, Literal, Sequence
4
-
5
- from inspect_ai.agent import (
6
- Agent,
7
- AgentState,
8
- agent,
9
- agent_with,
10
- sandbox_agent_bridge,
11
- )
12
- from inspect_ai.model import ChatMessageSystem, ChatMessageUser
13
- from inspect_ai.tool import MCPServerConfig
14
- from inspect_ai.util import sandbox as sandbox_env
15
- from pydantic import BaseModel, Field
16
- from pydantic_core import to_json
17
-
18
- from inspect_swe._claude_code.install.install import ensure_claude_code_installed
19
-
20
- # TODO: AgentAttempts
21
- # TODO: AgentContinue
22
- # TODO: generate config merging (they are passing max_tokens=32000, temperature=1)
23
-
24
-
25
class ClaudeCodeOptions(BaseModel):
    """Optional settings for the Claude Code agent.

    Every field defaults to `None`.
    """

    system_prompt: str | None = None
    """Extra system prompt text appended to the default system prompt."""

    mcp_servers: Sequence[MCPServerConfig] | None = None
    """MCP servers exposed to the agent."""

    model: str | None = None
    """Model name for Opus and Sonnet calls (defaults to main model for task)."""

    small_model: str | None = None
    """Model name for Haiku calls (defaults to main model for task)."""

    env: dict[str, str] | None = None
    """Environment variables to set for claude code."""
42
-
43
-
44
@agent
def claude_code(
    name: str = "Claude Code",
    description: str = dedent("""
       Autonomous coding agent capable of writing, testing, debugging,
       and iterating on code across multiple languages.
    """),
    options: ClaudeCodeOptions | None = None,
    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
    user: str | None = None,
    sandbox: str | None = None,
) -> Agent:
    """Claude Code agent.

    Agent that uses [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) running in a sandbox.

    The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).

    When run, the returned agent raises `RuntimeError` if the claude code
    process exits unsuccessfully.

    Args:
        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
        description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
        options: Claude code options.
        version: Version of claude code to use. One of:
            - "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
            - "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
            - "stable": Download and use the current stable version of claude code.
            - "latest": Download and use the very latest version of claude code.
            - "x.x.x": Download and use a specific version of claude code.
        user: User to execute claude code with.
        sandbox: Optional sandbox environment name.
    """
    # fall back to an all-defaults options object
    opts = options if options else ClaudeCodeOptions()

    def _qualify(model_name: str | None) -> str:
        # explicit models are namespaced under "inspect/", otherwise bare "inspect"
        return "inspect" if model_name is None else f"inspect/{model_name}"

    main_model = _qualify(opts.model)
    fast_model = _qualify(opts.small_model)

    async def execute(state: AgentState) -> AgentState:
        async with sandbox_agent_bridge(state) as bridge:
            # locate (installing if required) the claude binary
            binary = await ensure_claude_code_installed(
                version, user, sandbox_env(sandbox)
            )

            # fresh session id plus the fixed non-interactive flags
            argv = [
                binary,
                "--session-id",
                str(uuid.uuid4()),
                "--print",  # run without interactions
                "--dangerously-skip-permissions",
                "--model",
                main_model,
            ]

            # conversation system messages first, then the configured prompt
            system_text = [
                msg.text for msg in state.messages if isinstance(msg, ChatMessageSystem)
            ]
            if opts.system_prompt is not None:
                system_text.append(opts.system_prompt)
            if system_text:
                argv += ["--append-system-prompt", "\n\n".join(system_text)]

            # mcp server configuration
            if opts.mcp_servers:
                argv += mcp_server_args(opts.mcp_servers)

            # all user messages joined into one prompt, passed after "--"
            user_text = "\n\n".join(
                msg.text for msg in state.messages if isinstance(msg, ChatMessageUser)
            )
            argv += ["--", user_text]

            # resolve sandbox
            sbox = sandbox_env(sandbox)

            # defaults first so that entries from options.env take precedence
            agent_env = {
                "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
                "ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
                "ANTHROPIC_MODEL": main_model,
                "ANTHROPIC_DEFAULT_OPUS_MODEL": main_model,
                "ANTHROPIC_DEFAULT_SONNET_MODEL": main_model,
                "CLAUDE_CODE_SUBAGENT_MODEL": main_model,
                "ANTHROPIC_DEFAULT_HAIKU_MODEL": fast_model,
                "ANTHROPIC_SMALL_FAST_MODEL": fast_model,
                "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
                "IS_SANDBOX": "1",
                **(opts.env or {}),
            }

            # run the agent and surface any failure
            result = await sbox.exec(cmd=argv, env=agent_env, user=user)
            if not result.success:
                raise RuntimeError(
                    f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
                )
            return bridge.state

    # wrap the executor with the requested name and description
    return agent_with(execute, name=name, description=description)
158
-
159
-
160
def mcp_server_args(mcp_servers: Sequence[MCPServerConfig]) -> list[str]:
    """Build the claude code CLI arguments describing the given MCP servers.

    Args:
        mcp_servers: MCP server configurations to expose to the agent.

    Returns:
        A flat argument list: `--mcp-config <json>` when at least one server
        is given, followed by `--allowed-tools <comma-separated patterns>`
        when any tool patterns were collected.

    Raises:
        ValueError: If a server's `tools` is neither `"all"` nor a list.
    """
    server_configs: dict[str, dict[str, Any]] = {}
    tool_patterns: list[str] = []

    for server in mcp_servers:
        # name and tools are conveyed separately, so leave them out of the dump
        server_configs[server.name] = server.model_dump(
            exclude={"name", "tools"}, exclude_none=True
        )

        tools = server.tools
        if tools == "all":
            tool_patterns.append(f"mcp__{server.name}_*")
            continue
        if not isinstance(tools, list):
            raise ValueError(f"Unexpected value for mcp server tools: {tools}")
        for tool in tools:
            tool_patterns.append(f"mcp__{server.name}__{tool}")

    # assemble the cli arguments
    args: list[str] = []
    if server_configs:
        config_json = to_json(
            {"mcpServers": server_configs}, exclude_none=True
        ).decode()
        args += ["--mcp-config", config_json]
    if tool_patterns:
        args += ["--allowed-tools", ",".join(tool_patterns)]
    return args
File without changes
File without changes
File without changes