PyPI - inspect-swe - Versions diffs - 0.2.3__tar.gz → 0.2.4__tar.gz - Mend

inspect-swe 0.2.3tar.gz → 0.2.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_swe
-Version: 0.2.3
+Version: 0.2.4
 Summary: Software engineering agents for Inspect AI.
 Project-URL: Documentation, https://meridianlabs-ai.github.io/inspect_swe/
 Project-URL: Source Code, https://github.com/meridianlabs-ai/inspect_swe
@@ -14,6 +14,7 @@ Requires-Dist: inspect-ai>=0.3.125
 Requires-Dist: nest-asyncio
 Requires-Dist: platformdirs
 Requires-Dist: pydantic>=2.11.4
+Requires-Dist: pyyaml
 Requires-Dist: sniffio
 Requires-Dist: typing-extensions>=4.9.0
 Provides-Extra: dev
@@ -23,6 +24,7 @@ Requires-Dist: openai; extra == 'dev'
 Requires-Dist: pytest; extra == 'dev'
 Requires-Dist: pytest-dotenv; extra == 'dev'
 Requires-Dist: ruff; extra == 'dev'
+Requires-Dist: types-pyyaml; extra == 'dev'
 Provides-Extra: doc
 Requires-Dist: quarto-cli==1.7.31; extra == 'doc'
 Description-Content-Type: text/markdown

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/pyproject.toml RENAMED Viewed

@@ -16,6 +16,7 @@ dependencies = [
     "nest_asyncio",
     "platformdirs",
     "pydantic>=2.11.4",
+    "pyyaml",
     "sniffio",
     "typing_extensions>=4.9.0",
 ]
@@ -30,7 +31,15 @@ inspect_swe = "inspect_swe._registry"
 [project.optional-dependencies]
-dev = ["ruff", "mypy", "pytest", "anthropic", "openai", "pytest-dotenv"]
+dev = [
+    "ruff",
+    "mypy",
+    "pytest",
+    "anthropic",
+    "openai",
+    "pytest-dotenv",
+    "types-PyYAML",
+]
 doc = ["quarto-cli==1.7.31"]
 [tool.hatch.build]

inspect_swe-0.2.4/src/inspect_swe/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+from ._claude_code.claude_code import ClaudeCodeOptions, claude_code
+from ._tools.download import download_agent_binary
+from ._util.sandbox import SandboxPlatform
+# _version may not be importable; fall back to a placeholder version string
+# instead of failing the package import.
+try:
+    from ._version import __version__
+except ImportError:
+    __version__ = "unknown"
+# Names exported as the package's public API.
+__all__ = [
+    "claude_code",
+    "ClaudeCodeOptions",
+    "download_agent_binary",
+    "SandboxPlatform",
+    "__version__",
+]

inspect_swe-0.2.4/src/inspect_swe/_claude_code/claude_code.py ADDED Viewed

@@ -0,0 +1,190 @@
+import uuid
+from textwrap import dedent
+from typing import Any, Literal, Sequence
+from inspect_ai.agent import (
+    Agent,
+    AgentState,
+    agent,
+    agent_with,
+    sandbox_agent_bridge,
+)
+from inspect_ai.model import ChatMessageSystem, ChatMessageUser
+from inspect_ai.tool import MCPServerConfig
+from inspect_ai.util import sandbox as sandbox_env
+from pydantic import BaseModel, Field
+from pydantic_core import to_json
+from inspect_swe._claude_code.install.install import ensure_claude_code_installed
+# TODO: AgentAttempts
+# TODO: AgentContinue
+# TODO: generate config merging (they are passing max_tokens=32000, temperature=1)
+class ClaudeCodeOptions(BaseModel):
+    """Claude Code options."""
+    # Every option is optional and defaults to None (i.e. "not specified").
+    system_prompt: str | None = None
+    """Additional system prompt to append to default system prompt."""
+    mcp_servers: Sequence[MCPServerConfig] | None = None
+    """MCP servers to make available to the agent."""
+    model: str | None = None
+    """Model name to use for Opus and Sonnet calls (defaults to main model for task)."""
+    small_model: str | None = None
+    """Model to use for Haiku calls (defaults to main model for task)."""
+    env: dict[str, str] | None = None
+    """Environment variables to set for claude code."""
+@agent
+def claude_code(
+    name: str = "Claude Code",
+    description: str = dedent("""
+       Autonomous coding agent capable of writing, testing, debugging,
+       and iterating on code across multiple languages.
+    """),
+    options: ClaudeCodeOptions | None = None,
+    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
+    user: str | None = None,
+    sandbox: str | None = None,
+) -> Agent:
+    """Claude Code agent.
+    Agent that uses [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) running in a sandbox.
+    The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
+    Args:
+        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
+        description: Agent description (used in multi-agent systems with `as_tool()` and `handoff()`)
+        options: Claude code options. Entries in `options.env` are merged over the environment variables set by this agent, so they take precedence on key collisions.
+        version: Version of claude code to use. One of:
+            - "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
+            - "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
+            - "stable": Download and use the current stable version of claude code.
+            - "latest": Download and use the very latest version of claude code.
+            - "x.x.x": Download and use a specific version of claude code.
+        user: User to execute claude code with.
+        sandbox: Optional sandbox environment name.
+    Returns:
+        Agent wrapping the execution function, registered with the given `name` and `description`.
+    Raises:
+        RuntimeError: (when the agent runs) if the claude code command does not complete successfully; the message includes its stdout and stderr.
+    """
+    # provide default options if none specified
+    options = options or ClaudeCodeOptions()
+    # resolve models (an explicit model is passed as "inspect/<model>", otherwise plain "inspect")
+    model = f"inspect/{options.model}" if options.model is not None else "inspect"
+    small_model = (
+        f"inspect/{options.small_model}"
+        if options.small_model is not None
+        else "inspect"
+    )
+    async def execute(state: AgentState) -> AgentState:
+        async with sandbox_agent_bridge(state) as bridge:
+            # resolve the sandbox once and reuse it for both install and exec
+            sbox = sandbox_env(sandbox)
+            # ensure claude is installed and get binary location
+            claude_binary = await ensure_claude_code_installed(version, user, sbox)
+            # allocate session_id
+            session_id = str(uuid.uuid4())
+            # base options
+            cmd = [
+                claude_binary,
+                "--session-id",
+                session_id,
+                "--print",  # run without interactions
+                "--dangerously-skip-permissions",
+                "--model",
+                model,
+            ]
+            # system prompt: system messages from the conversation, followed by
+            # the additional prompt from options (if any)
+            system_messages = [
+                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
+            ]
+            if options.system_prompt is not None:
+                system_messages.append(options.system_prompt)
+            if system_messages:
+                cmd.extend(["--append-system-prompt", "\n\n".join(system_messages)])
+            # mcp servers
+            if options.mcp_servers:
+                cmd.extend(mcp_server_args(options.mcp_servers))
+            # user prompt (all user messages joined); "--" precedes it so it is
+            # passed as a positional argument rather than parsed as an option
+            prompt = "\n\n".join(
+                [m.text for m in state.messages if isinstance(m, ChatMessageUser)]
+            )
+            cmd.append("--")
+            cmd.append(prompt)
+            # execute the agent
+            result = await sbox.exec(
+                cmd=cmd,
+                env={
+                    # API traffic is pointed at the bridge port on localhost
+                    "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
+                    # NOTE(review): presumably a placeholder key, since requests
+                    # go to the local bridge -- confirm it is not a live credential
+                    "ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
+                    "ANTHROPIC_MODEL": model,
+                    "ANTHROPIC_DEFAULT_OPUS_MODEL": model,
+                    "ANTHROPIC_DEFAULT_SONNET_MODEL": model,
+                    "CLAUDE_CODE_SUBAGENT_MODEL": model,
+                    "ANTHROPIC_DEFAULT_HAIKU_MODEL": small_model,
+                    "ANTHROPIC_SMALL_FAST_MODEL": small_model,
+                    "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
+                    "IS_SANDBOX": "1",
+                }
+                # caller-supplied variables win on key collisions
+                | (options.env or {}),
+                user=user,
+            )
+        if not result.success:
+            raise RuntimeError(
+                f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
+            )
+        return bridge.state
+    # return agent with specified name and description
+    return agent_with(execute, name=name, description=description)
+def mcp_server_args(mcp_servers: Sequence[MCPServerConfig]) -> list[str]:
+    """Translate MCP server configurations into Claude Code CLI arguments.
+    Args:
+        mcp_servers: MCP server configurations to expose to the agent.
+    Returns:
+        Flat argument list: `--mcp-config <json>` when at least one server is given, followed by `--allowed-tools <comma-separated patterns>` when any tools are allowed. Empty when there is nothing to pass.
+    Raises:
+        ValueError: If a server's `tools` is neither "all" nor a list.
+    """
+    server_configs: dict[str, dict[str, Any]] = {}
+    tool_patterns: list[str] = []
+    for server in mcp_servers:
+        # the name becomes the dict key and tools go to --allowed-tools,
+        # so both are left out of the per-server config
+        server_configs[server.name] = server.model_dump(
+            exclude={"name", "tools"}, exclude_none=True
+        )
+        tools = server.tools
+        if tools == "all":
+            # NOTE(review): single underscore before the wildcard, whereas
+            # explicit tools below use a double underscore -- confirm intended
+            tool_patterns.append(f"mcp__{server.name}_*")
+            continue
+        if not isinstance(tools, list):
+            raise ValueError(f"Unexpected value for mcp server tools: {tools}")
+        tool_patterns.extend(f"mcp__{server.name}__{tool}" for tool in tools)
+    # map to cli args
+    args: list[str] = []
+    if server_configs:
+        payload = to_json({"mcpServers": server_configs}, exclude_none=True)
+        args += ["--mcp-config", payload.decode()]
+    if tool_patterns:
+        args += ["--allowed-tools", ",".join(tool_patterns)]
+    return args

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/src/inspect_swe/_claude_code/install/download.py RENAMED Viewed

@@ -3,7 +3,6 @@ from typing import Literal
 from pydantic import BaseModel
-from ..._util._async import run_coroutine
 from ..._util.checksum import verify_checksum
 from ..._util.download import download_file, download_text_file
 from ..._util.sandbox import SandboxPlatform
@@ -14,22 +13,6 @@ from .cache import (
 )
-def download_claude_code(
-    version: Literal["stable", "latest"] | str, platform: SandboxPlatform
-) -> None:
-    """Download Claude Code.
-    Download a version of Claude Code. This version will be added to the cache of downloaded versions (which retains the 5 most recently downloaded versions).
-    Use this if you need to ensure that a specific version of Claude Code is downloaded in advance (e.g. if you are going to run your evaluations offline). After downloading, explicit requests for the downloaded version (e.g. `claude_code(version="1.0.98")`) will not require network access.
-    Args:
-        version: Version to download ("stable", "latest", or an explicit version number).
-        platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")
-    """
-    run_coroutine(download_claude_code_async(version, platform))
 async def download_claude_code_async(
     version: Literal["stable", "latest"] | str, platform: SandboxPlatform
 ) -> bytes:

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/src/inspect_swe/_claude_code/install/install.py RENAMED Viewed

@@ -45,6 +45,8 @@ async def ensure_claude_code_installed(
             )
             if claude_binary_bytes is not None:
                 trace(f"Used claude code binary from cache: {version} ({platform})")
+        else:
+            claude_binary_bytes = None
         # download the binary
         if claude_binary_bytes is None:

inspect_swe-0.2.4/src/inspect_swe/_tools/download.py ADDED Viewed

@@ -0,0 +1,27 @@
+from typing import Literal
+from .._claude_code.install.download import download_claude_code_async
+from .._util._async import run_coroutine
+from .._util.sandbox import SandboxPlatform
+def download_agent_binary(
+    binary: Literal["claude_code"],
+    version: Literal["stable", "latest"] | str,
+    platform: SandboxPlatform,
+) -> None:
+    """Download agent binary.
+    Download an agent binary. This version will be added to the cache of downloaded versions (which retains the 5 most recently downloaded versions).
+    Use this if you need to ensure that a specific version of an agent binary is downloaded in advance (e.g. if you are going to run your evaluations offline). After downloading, explicit requests for the downloaded version (e.g. `claude_code(version="1.0.98")`) will not require network access.
+    Args:
+        binary: Type of binary to download (currently only "claude_code")
+        version: Version to download ("stable", "latest", or an explicit version number).
+        platform: Target platform ("linux-x64", "linux-arm64", "linux-x64-musl", or "linux-arm64-musl")
+    Raises:
+        ValueError: If `binary` is not a supported agent binary type.
+    """
+    # reject unknown binary types before doing any work
+    if binary != "claude_code":
+        raise ValueError(f"Unsupported agent binary type: {binary}")
+    # synchronous wrapper: drive the async download to completion
+    run_coroutine(download_claude_code_async(version, platform))

inspect_swe-0.2.4/src/inspect_swe/_util/_yaml.py ADDED Viewed

@@ -0,0 +1,21 @@
+import re
+import yaml
+def read_front_matter_name(content: str) -> str | None:
+    """Read the `name` field from a document's YAML front matter.
+    Args:
+        content: Document text, optionally starting with a front matter block delimited by `---` lines.
+    Returns:
+        The `name` value converted to `str`, or `None` when there is no front matter, the front matter is not valid YAML, it is not a mapping, or it has no (non-null) `name` entry.
+    """
+    # front-matter: leading block between two `---` delimiter lines
+    frontmatter_match = re.match(r"^\s*---\s*\n(.*?)\n---", content, re.DOTALL)
+    if not frontmatter_match:
+        return None
+    try:
+        # Parse as YAML
+        data = yaml.safe_load(frontmatter_match.group(1))
+    except yaml.YAMLError:
+        return None
+    # safe_load returns None, a scalar or a list for non-mapping documents;
+    # only a mapping can carry a "name" key
+    if not isinstance(data, dict):
+        return None
+    name = data.get("name")
+    # a missing or null name is reported as None (not the string "None")
+    return None if name is None else str(name)

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/src/inspect_swe/_util/sandbox.py RENAMED Viewed

@@ -45,7 +45,7 @@ async def detect_sandbox_platform(sandbox: SandboxEnvironment) -> SandboxPlatfor
 def bash_command(cmd: str) -> list[str]:
-    return ["bash", "--login", "-c", cmd]
+    return ["bash", "-c", cmd]
 async def sandbox_exec(
@@ -53,7 +53,5 @@ async def sandbox_exec(
 ) -> str:
     result = await sandbox.exec(bash_command(cmd), user=user)
     if not result.success:
-        raise RuntimeError(
-            f"Error executing sandbox command {','.join(cmd)}: {result.stderr}"
-        )
+        raise RuntimeError(f"Error executing sandbox command {cmd}: {result.stderr}")
     return result.stdout.strip()

{inspect_swe-0.2.3 → inspect_swe-0.2.4}/src/inspect_swe/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.2.3'
-__version_tuple__ = version_tuple = (0, 2, 3)
+__version__ = version = '0.2.4'
+__version_tuple__ = version_tuple = (0, 2, 4)
 __commit_id__ = commit_id = None

inspect_swe-0.2.4/src/inspect_swe/py.typed ADDED Viewed

File without changes

inspect_swe-0.2.3/src/inspect_swe/__init__.py DELETED Viewed

@@ -1,11 +0,0 @@
-from ._claude_code.claude_code import claude_code
-from ._claude_code.install.download import download_claude_code
-from ._util.sandbox import SandboxPlatform
-try:
-    from ._version import __version__
-except ImportError:
-    __version__ = "unknown"
-__all__ = ["claude_code", "download_claude_code", "SandboxPlatform", "__version__"]

inspect_swe-0.2.3/src/inspect_swe/_claude_code/claude_code.py DELETED Viewed

@@ -1,87 +0,0 @@
-from typing import Literal
-from inspect_ai.agent import (
-    Agent,
-    AgentState,
-    agent,
-    sandbox_agent_bridge,
-)
-from inspect_ai.model import ChatMessageSystem, ChatMessageUser
-from inspect_ai.util import sandbox as sandbox_env
-from inspect_swe._claude_code.install.install import ensure_claude_code_installed
-@agent
-def claude_code(
-    version: Literal["auto", "sandbox", "stable", "latest"] | str = "auto",
-    user: str | None = None,
-    sandbox: str | None = None,
-) -> Agent:
-    """Claude Code agent.
-    Agent that uses [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) running in a sandbox.
-    The agent can either use a version of Claude Code installed in the sandbox, or can download a version and install it in the sandbox (see docs on `version` option below for details).
-    Args:
-        version: Version of claude code to use. One of:
-            - "auto": Use any available version of claude code in the sandbox, otherwise download the current stable version.
-            - "sandbox": Use the version of claude code in the sandbox (raises `RuntimeError` if claude is not available in the sandbox)
-            - "stable": Download and use the current stable version of claude code.
-            - "latest": Download and use the very latest version of claude code.
-            - "x.x.x": Download and use a specific version of claude code.
-        user: User to execute claude code with.
-        sandbox: Optional sandbox environment name.
-    """
-    async def execute(state: AgentState) -> AgentState:
-        async with sandbox_agent_bridge(state) as bridge:
-            # ensure claude is installed and get binary location
-            claude_binary = await ensure_claude_code_installed(
-                version, user, sandbox_env(sandbox)
-            )
-            # base options
-            cmd = [
-                claude_binary,
-                "--print",  # run without interactions
-                "--dangerously-skip-permissions",
-                "--model",  # use current inspect model
-                "inspect",
-            ]
-            # system message
-            system_message = "\n\n".join(
-                [m.text for m in state.messages if isinstance(m, ChatMessageSystem)]
-            )
-            if system_message:
-                cmd.extend(["--append-system-prompt", system_message])
-            # user prompt
-            prompt = "\n\n".join(
-                [m.text for m in state.messages if isinstance(m, ChatMessageUser)]
-            )
-            cmd.append(prompt)
-            # execute the agent
-            result = await sandbox_env(sandbox).exec(
-                cmd=cmd,
-                env={
-                    "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
-                    "ANTHROPIC_API_KEY": "sk-ant-api03-DOq5tyLPrk9M4hPE",
-                    "ANTHROPIC_SMALL_FAST_MODEL": "inspect",
-                    "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
-                    "IS_SANDBOX": "1",
-                },
-                user=user,
-            )
-        if result.success:
-            return bridge.state
-        else:
-            raise RuntimeError(
-                f"Error executing claude code agent: {result.stdout}\n{result.stderr}"
-            )
-    return execute