inspect-swe 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/PKG-INFO +2 -2
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/pyproject.toml +1 -1
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/__init__.py +1 -2
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/claude_code.py +86 -69
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/_async.py +10 -1
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_version.py +2 -2
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/.gitignore +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/LICENSE +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/README.md +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/__init__.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/install/__init__.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/install/cache.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/install/download.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_claude_code/install/install.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_registry.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_tools/__init__.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_tools/download.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/__init__.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/_yaml.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/appdirs.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/checksum.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/constants.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/download.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/platform.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/sandbox.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/_util/trace.py +0 -0
- {inspect_swe-0.2.4 → inspect_swe-0.2.6}/src/inspect_swe/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: inspect_swe
|
3
|
-
Version: 0.2.4
|
3
|
+
Version: 0.2.6
|
4
4
|
Summary: Software engineering agents for Inspect AI.
|
5
5
|
Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
|
6
6
|
Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
|
@@ -10,7 +10,7 @@ License: MIT License
|
|
10
10
|
License-File: LICENSE
|
11
11
|
Requires-Python: >=3.10
|
12
12
|
Requires-Dist: httpx
|
13
|
-
Requires-Dist: inspect-ai>=0.3.
|
13
|
+
Requires-Dist: inspect-ai>=0.3.126
|
14
14
|
Requires-Dist: nest-asyncio
|
15
15
|
Requires-Dist: platformdirs
|
16
16
|
Requires-Dist: pydantic>=2.11.4
|
@@ -1,4 +1,4 @@
|
|
1
|
-
from ._claude_code.claude_code import
|
1
|
+
from ._claude_code.claude_code import claude_code
|
2
2
|
from ._tools.download import download_agent_binary
|
3
3
|
from ._util.sandbox import SandboxPlatform
|
4
4
|
|
@@ -10,7 +10,6 @@ except ImportError:
|
|
10
10
|
|
11
11
|
__all__ = [
|
12
12
|
"claude_code",
|
13
|
-
"ClaudeCodeOptions",
|
14
13
|
"download_agent_binary",
|
15
14
|
"SandboxPlatform",
|
16
15
|
"__version__",
|
@@ -4,41 +4,20 @@ from typing import Any, Literal, Sequence
|
|
4
4
|
|
5
5
|
from inspect_ai.agent import (
|
6
6
|
Agent,
|
7
|
+
AgentAttempts,
|
7
8
|
AgentState,
|
8
9
|
agent,
|
9
10
|
agent_with,
|
10
11
|
sandbox_agent_bridge,
|
11
12
|
)
|
12
13
|
from inspect_ai.model import ChatMessageSystem, ChatMessageUser
|
14
|
+
from inspect_ai.scorer import score
|
13
15
|
from inspect_ai.tool import MCPServerConfig
|
14
16
|
from inspect_ai.util import sandbox as sandbox_env
|
15
|
-
from pydantic import BaseModel, Field
|
16
17
|
from pydantic_core import to_json
|
17
18
|
|
18
|
-
from
|
19
|
-
|
20
|
-
# TODO: AgentAttempts
|
21
|
-
# TODO: AgentContinue
|
22
|
-
# TODO: generate config merging (they are passing max_tokens=32000, temperature=1)
|
23
|
-
|
24
|
-
|
25
|
-
class ClaudeCodeOptions(BaseModel):
|
26
|
-
"""Claude Code options."""
|
27
|
-
|
28
|
-
system_prompt: str | None = Field(default=None)
|
29
|
-
"""Additional system prompt to append to default system prompt."""
|
30
|
-
|
31
|
-
mcp_servers: Sequence[MCPServerConfig] | None = Field(default=None)
|
32
|
-
"""MCP servers to make available to the agent."""
|
33
|
-
|
34
|
-
model: str | None = Field(default=None)
|
35
|
-
""" Model name to use for Opus and Sonnet calls (defaults to main model for task)."""
|
36
|
-
|
37
|
-
small_model: str | None = Field(default=None)
|
38
|
-
"""Model to use for Haiku calls (defaults to main model for task)."""
|
39
|
-
|
40
|
-
env: dict[str, str] | None = Field(default=None)
|
41
|
-
"""Environment variables to set for claude code."""
|
19
|
+
from .._util._async import is_callable_coroutine
|
20
|
+
from .install.install import ensure_claude_code_installed
|
42
21
|
|
43
22
|
|
44
23
|
@agent
|
@@ -48,7 +27,12 @@ def claude_code(
|
|
48
27
|
Autonomous coding agent capable of writing, testing, debugging,
|
49
28
|
and iterating on code across multiple languages.
|
50
29
|
"""),
|
51
|
-
|
30
|
+
system_prompt: str | None = None,
|
31
|
+
mcp_servers: Sequence[MCPServerConfig] | None = None,
|
32
|
+
attempts: int | AgentAttempts = 1,
|
33
|
+
model: str | None = None,
|
34
|
+
small_model: str | None = None,
|
35
|
+
env: dict[str, str] | None = None,
|
52
36
|
version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
|
53
37
|
user: str | None = None,
|
54
38
|
sandbox: str | None = None,
|
@@ -59,10 +43,18 @@ def claude_code(
|
|
59
43
|
|
60
44
|
The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
|
61
45
|
|
46
|
+
Use the `attempts` option to enable additional submissions if the initial
|
47
|
+
submission(s) are incorrect (by default, no additional attempts are permitted).
|
48
|
+
|
62
49
|
Args:
|
63
50
|
name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
|
64
51
|
description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
|
65
|
-
|
52
|
+
system_prompt: Additional system prompt to append to default system prompt.
|
53
|
+
mcp_servers: MCP servers to make available to the agent.
|
54
|
+
attempts: Configure agent to make multiple attempts.
|
55
|
+
model: Model name to use for Opus and Sonnet calls (defaults to main model for task).
|
56
|
+
small_model: Model to use for Haiku calls (defaults to main model for task).
|
57
|
+
env: Environment variables to set for claude code.
|
66
58
|
version: Version of claude code to use. One of:
|
67
59
|
- "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
|
68
60
|
- "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
|
@@ -72,16 +64,12 @@ def claude_code(
|
|
72
64
|
user: User to execute claude code with.
|
73
65
|
sandbox: Optional sandbox environment name.
|
74
66
|
"""
|
75
|
-
# provide default options if none specified
|
76
|
-
options = options or ClaudeCodeOptions()
|
77
|
-
|
78
67
|
# resolve models
|
79
|
-
model = f"inspect/{
|
80
|
-
small_model =
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
)
|
68
|
+
model = f"inspect/{model}" if model is not None else "inspect"
|
69
|
+
small_model = f"inspect/{small_model}" if small_model is not None else "inspect"
|
70
|
+
|
71
|
+
# resolve attempts
|
72
|
+
attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
|
85
73
|
|
86
74
|
async def execute(state: AgentState) -> AgentState:
|
87
75
|
async with sandbox_agent_bridge(state) as bridge:
|
@@ -95,9 +83,6 @@ def claude_code(
|
|
95
83
|
|
96
84
|
# base options
|
97
85
|
cmd = [
|
98
|
-
claude_binary,
|
99
|
-
"--session-id",
|
100
|
-
session_id,
|
101
86
|
"--print", # run without interactions
|
102
87
|
"--dangerously-skip-permissions",
|
103
88
|
"--model",
|
@@ -108,50 +93,82 @@ def claude_code(
|
|
108
93
|
system_messages = [
|
109
94
|
m.text for m in state.messages if isinstance(m, ChatMessageSystem)
|
110
95
|
]
|
111
|
-
if
|
112
|
-
system_messages.append(
|
96
|
+
if system_prompt is not None:
|
97
|
+
system_messages.append(system_prompt)
|
113
98
|
if system_messages:
|
114
99
|
cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])
|
115
100
|
|
116
101
|
# mcp servers
|
117
|
-
if
|
118
|
-
cmd.extend(mcp_server_args(
|
102
|
+
if mcp_servers:
|
103
|
+
cmd.extend(mcp_server_args(mcp_servers))
|
119
104
|
|
120
105
|
# user prompt
|
121
106
|
prompt = "\n\n".join(
|
122
107
|
[m.text for m in state.messages if isinstance(m, ChatMessageUser)]
|
123
108
|
)
|
124
|
-
cmd.append("--")
|
125
|
-
cmd.append(prompt)
|
126
109
|
|
127
110
|
# resolve sandbox
|
128
111
|
sbox = sandbox_env(sandbox)
|
129
112
|
|
130
113
|
# execute the agent
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
"
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
114
|
+
agent_prompt = prompt
|
115
|
+
attempt_count = 0
|
116
|
+
while True:
|
117
|
+
# either starting a new session or resuming one
|
118
|
+
id_param = "--session-id" if attempt_count == 0 else "--resume"
|
119
|
+
agent_cmd = (
|
120
|
+
[claude_binary, id_param, session_id] + cmd + ["--", agent_prompt]
|
121
|
+
)
|
122
|
+
|
123
|
+
# run agent
|
124
|
+
result = await sbox.exec(
|
125
|
+
cmd=agent_cmd,
|
126
|
+
env={
|
127
|
+
"ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
|
128
|
+
"ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
|
129
|
+
"ANTHROPIC_MODEL": model,
|
130
|
+
"ANTHROPIC_DEFAULT_OPUS_MODEL": model,
|
131
|
+
"ANTHROPIC_DEFAULT_SONNET_MODEL": model,
|
132
|
+
"CLAUDE_CODE_SUBAGENT_MODEL": model,
|
133
|
+
"ANTHROPIC_DEFAULT_HAIKU_MODEL": small_model,
|
134
|
+
"ANTHROPIC_SMALL_FAST_MODEL": small_model,
|
135
|
+
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
|
136
|
+
"IS_SANDBOX": "1",
|
137
|
+
}
|
138
|
+
| (env or {}),
|
139
|
+
user=user,
|
140
|
+
)
|
141
|
+
|
142
|
+
# raise for error
|
143
|
+
if not result.success:
|
144
|
+
f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
|
145
|
+
|
146
|
+
# exit if we are at max_attempts
|
147
|
+
attempt_count += 1
|
148
|
+
if attempt_count >= attempts.attempts:
|
149
|
+
break
|
150
|
+
|
151
|
+
# score this attempt
|
152
|
+
answer_scores = await score(state)
|
153
|
+
|
154
|
+
# break if we score 'correct'
|
155
|
+
if attempts.score_value(answer_scores[0].value) == 1.0:
|
156
|
+
break
|
157
|
+
|
158
|
+
# otherwise update prompt with incorrect message and continue
|
159
|
+
else:
|
160
|
+
if callable(attempts.incorrect_message):
|
161
|
+
if not is_callable_coroutine(attempts.incorrect_message):
|
162
|
+
raise ValueError(
|
163
|
+
"The incorrect_message function must be async."
|
164
|
+
)
|
165
|
+
agent_prompt = await attempts.incorrect_message(
|
166
|
+
state, answer_scores
|
167
|
+
)
|
168
|
+
else:
|
169
|
+
agent_prompt = attempts.incorrect_message
|
170
|
+
|
171
|
+
return bridge.state
|
155
172
|
|
156
173
|
# return agent with specified name and description
|
157
174
|
return agent_with(execute, name=name, description=description)
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
|
-
|
2
|
+
import inspect
|
3
|
+
from typing import Any, Coroutine, Literal, TypeVar, cast
|
3
4
|
|
4
5
|
import nest_asyncio # type: ignore
|
5
6
|
import sniffio
|
@@ -9,6 +10,14 @@ from .platform import running_in_notebook
|
|
9
10
|
T = TypeVar("T")
|
10
11
|
|
11
12
|
|
13
|
+
def is_callable_coroutine(func_or_cls: Any) -> bool:
|
14
|
+
if inspect.iscoroutinefunction(func_or_cls):
|
15
|
+
return True
|
16
|
+
elif callable(func_or_cls):
|
17
|
+
return inspect.iscoroutinefunction(func_or_cls.__call__)
|
18
|
+
return False
|
19
|
+
|
20
|
+
|
12
21
|
def run_coroutine(coroutine: Coroutine[None, None, T]) -> T:
|
13
22
|
if current_async_backend() == "trio":
|
14
23
|
raise RuntimeError("run_coroutine cannot be used with trio")
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
28
28
|
commit_id: COMMIT_ID
|
29
29
|
__commit_id__: COMMIT_ID
|
30
30
|
|
31
|
-
__version__ = version = '0.2.4'
|
32
|
-
__version_tuple__ = version_tuple = (0, 2, 4)
|
31
|
+
__version__ = version = '0.2.6'
|
32
|
+
__version_tuple__ = version_tuple = (0, 2, 6)
|
33
33
|
|
34
34
|
__commit_id__ = commit_id = None
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|