PyPI - inspect-ai - Versions diffs - 0.3.75__py3-none-any.whl → 0.3.76__py3-none-any.whl - Mend

inspect-ai 0.3.75__py3-none-any.whl → 0.3.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

inspect_ai/model/_providers/anthropic.py CHANGED Viewed

@@ -1,23 +1,12 @@
 import functools
 import os
 import re
-import sys
 from copy import copy
 from logging import getLogger
-from typing import Any, Literal, Optional, Tuple, TypedDict, cast
+from typing import Any, Literal, NamedTuple, Optional, Tuple, cast
 import httpcore
 import httpx
-from inspect_ai._util.http import is_retryable_http_status
-from .util.hooks import HttpxHooks
-if sys.version_info >= (3, 11):
-    from typing import NotRequired
-else:
-    from typing_extensions import NotRequired
 from anthropic import (
     APIConnectionError,
     APIStatusError,
@@ -39,19 +28,19 @@ from anthropic.types import (
     TextBlockParam,
     ThinkingBlock,
     ThinkingBlockParam,
+    ToolBash20250124Param,
     ToolParam,
     ToolResultBlockParam,
+    ToolTextEditor20250124Param,
     ToolUseBlock,
     ToolUseBlockParam,
     message_create_params,
 )
+from anthropic.types.beta import BetaToolComputerUse20250124Param
 from pydantic import JsonValue
 from typing_extensions import override
-from inspect_ai._util.constants import (
-    BASE_64_DATA_REMOVED,
-    NO_CONTENT,
-)
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
 from inspect_ai._util.content import (
     Content,
     ContentImage,
@@ -59,6 +48,7 @@ from inspect_ai._util.content import (
     ContentText,
 )
 from inspect_ai._util.error import exception_message
+from inspect_ai._util.http import is_retryable_http_status
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.logger import warn_once
 from inspect_ai._util.url import data_uri_mime_type, data_uri_to_base64
@@ -70,11 +60,14 @@ from .._model import ModelAPI
 from .._model_call import ModelCall
 from .._model_output import ChatCompletionChoice, ModelOutput, ModelUsage, StopReason
 from .util import environment_prerequisite_error, model_base_url
+from .util.hooks import HttpxHooks
 logger = getLogger(__name__)
 ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
+INTERNAL_COMPUTER_TOOL_NAME = "computer"
 class AnthropicAPI(ModelAPI):
     def __init__(
@@ -93,7 +86,7 @@ class AnthropicAPI(ModelAPI):
         else:
             self.service = None
-        # collect gemerate model_args (then delete them so we can pass the rest on)
+        # collect generate model_args (then delete them so we can pass the rest on)
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
             value = model_args.get(name, None)
@@ -193,14 +186,11 @@ class AnthropicAPI(ModelAPI):
         # generate
         try:
-            (
-                system_param,
-                tools_param,
-                messages,
-                computer_use,
-            ) = await self.resolve_chat_input(input, tools, config)
+            system_param, tools_param, messages = await self.resolve_chat_input(
+                input, tools, config
+            )
-            # prepare request params (assembed this way so we can log the raw model call)
+            # prepare request params (assembled this way so we can log the raw model call)
             request = dict(messages=messages)
             # system messages and tools
@@ -218,7 +208,13 @@ class AnthropicAPI(ModelAPI):
             # extra headers (for time tracker and computer use)
             extra_headers = headers | {HttpxHooks.REQUEST_ID_HEADER: request_id}
-            if computer_use:
+            if any(
+                tool.get("type", None) == "computer_20250124" for tool in tools_param
+            ):
+                # From: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#claude-3-7-sonnet-beta-flag
+                # Note: The Bash (bash_20250124) and Text Editor (text_editor_20250124)
+                # tools are generally available for Claude 3.5 Sonnet (new) as well and
+                # can be used without the computer use beta header.
                 betas.append("computer-use-2025-01-24")
             if len(betas) > 0:
                 extra_headers["anthropic-beta"] = ",".join(betas)
@@ -405,9 +401,7 @@ class AnthropicAPI(ModelAPI):
         input: list[ChatMessage],
         tools: list[ToolInfo],
         config: GenerateConfig,
-    ) -> Tuple[
-        list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
-    ]:
+    ) -> Tuple[list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam]]:
         # extract system message
         system_messages, messages = split_system_messages(input, config)
@@ -420,7 +414,7 @@ class AnthropicAPI(ModelAPI):
         )
         # tools
-        tools_params, computer_use = self.tool_params_for_tools(tools, config)
+        tools_params = [self.tool_param_for_tool_info(tool, config) for tool in tools]
         # system messages
         if len(system_messages) > 0:
@@ -470,40 +464,35 @@ class AnthropicAPI(ModelAPI):
                     add_cache_control(cast(dict[str, Any], content[-1]))
         # return chat input
-        return system_param, tools_params, message_params, computer_use
-    def tool_params_for_tools(
-        self, tools: list[ToolInfo], config: GenerateConfig
-    ) -> tuple[list["ToolParamDef"], bool]:
-        # tool params and computer_use bit to return
-        tool_params: list["ToolParamDef"] = []
-        computer_use = False
-        # for each tool, check if it has a native computer use implementation and use that
-        # when available (noting that we need to set the computer use request header)
-        for tool in tools:
-            computer_use_tool = (
+        return system_param, tools_params, message_params
+    def tool_param_for_tool_info(
+        self, tool: ToolInfo, config: GenerateConfig
+    ) -> "ToolParamDef":
+        # Use a native tool implementation when available. Otherwise, use the
+        # standard tool implementation
+        return self.maybe_native_tool_param(tool, config) or ToolParam(
+            name=tool.name,
+            description=tool.description,
+            input_schema=tool.parameters.model_dump(exclude_none=True),
+        )
+    def maybe_native_tool_param(
+        self, tool: ToolInfo, config: GenerateConfig
+    ) -> Optional["ToolParamDef"]:
+        return (
+            (
                 self.computer_use_tool_param(tool)
-                if config.internal_tools is not False
-                else None
+                or self.text_editor_tool_param(tool)
+                or self.bash_tool_param(tool)
             )
-            if computer_use_tool:
-                tool_params.append(computer_use_tool)
-                computer_use = True
-            else:
-                tool_params.append(
-                    ToolParam(
-                        name=tool.name,
-                        description=tool.description,
-                        input_schema=tool.parameters.model_dump(exclude_none=True),
-                    )
-                )
-        return tool_params, computer_use
+            if config.internal_tools is not False
+            else None
+        )
     def computer_use_tool_param(
         self, tool: ToolInfo
-    ) -> Optional["ComputerUseToolParam"]:
+    ) -> Optional[BetaToolComputerUse20250124Param]:
         # check for compatible 'computer' tool
         if tool.name == "computer" and (
             sorted(tool.parameters.properties.keys())
@@ -525,7 +514,7 @@ class AnthropicAPI(ModelAPI):
                     "Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
                 )
                 return None
-            return ComputerUseToolParam(
+            return BetaToolComputerUse20250124Param(
                 type="computer_20250124",
                 name="computer",
                 # Note: The dimensions passed here for display_width_px and display_height_px should
@@ -542,23 +531,58 @@ class AnthropicAPI(ModelAPI):
         else:
             return None
+    def text_editor_tool_param(
+        self, tool: ToolInfo
+    ) -> Optional[ToolTextEditor20250124Param]:
+        # check for compatible 'text editor' tool
+        if tool.name == "text_editor" and (
+            sorted(tool.parameters.properties.keys())
+            == sorted(
+                [
+                    "command",
+                    "file_text",
+                    "insert_line",
+                    "new_str",
+                    "old_str",
+                    "path",
+                    "view_range",
+                ]
+            )
+        ):
+            return ToolTextEditor20250124Param(
+                type="text_editor_20250124", name="str_replace_editor"
+            )
+        # not a text_editor tool
+        else:
+            return None
-# native anthropic tool definitions for computer use beta
-# https://docs.anthropic.com/en/docs/build-with-claude/computer-use
-class ComputerUseToolParam(TypedDict):
-    type: str
-    name: str
-    display_width_px: NotRequired[int]
-    display_height_px: NotRequired[int]
-    display_number: NotRequired[int]
+    def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
+        # check for compatible 'bash' tool
+        if tool.name == "bash_session" and (
+            sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
+        ):
+            return ToolBash20250124Param(type="bash_20250124", name="bash")
+        # not a bash tool
+        else:
+            return None
-# tools can be either a stock tool param or a special computer use tool param
-ToolParamDef = ToolParam | ComputerUseToolParam
+# tools can be either a stock tool param or a special Anthropic native use tool param
+ToolParamDef = (
+    ToolParam
+    | BetaToolComputerUse20250124Param
+    | ToolTextEditor20250124Param
+    | ToolBash20250124Param
+)
 def add_cache_control(
-    param: TextBlockParam | ToolParam | ComputerUseToolParam | dict[str, Any],
+    param: TextBlockParam
+    | ToolParam
+    | BetaToolComputerUse20250124Param
+    | ToolTextEditor20250124Param
+    | ToolBash20250124Param
+    | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
@@ -567,10 +591,10 @@ def consecutive_user_message_reducer(
     messages: list[MessageParam],
     message: MessageParam,
 ) -> list[MessageParam]:
-    return consective_message_reducer(messages, message, "user")
+    return consecutive_message_reducer(messages, message, "user")
-def consective_message_reducer(
+def consecutive_message_reducer(
     messages: list[MessageParam],
     message: MessageParam,
     role: Literal["user", "assistant"],
@@ -583,6 +607,7 @@ def consective_message_reducer(
 def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
+    # TODO: Fix this code as it currently drops interesting properties when combining
     role = a["role"]
     a_content = a["content"]
     b_content = b["content"]
@@ -702,7 +727,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
                 ToolUseBlockParam(
                     type="tool_use",
                     id=tool_call.id,
-                    name=tool_call.function,
+                    name=tool_call.internal_name or tool_call.function,
                     input=tool_call.arguments,
                 )
             )
@@ -749,11 +774,13 @@ async def model_output_from_message(
             content.append(ContentText(type="text", text=content_text))
         elif isinstance(content_block, ToolUseBlock):
             tool_calls = tool_calls or []
+            info = maybe_mapped_call_info(content_block.name, tools)
             tool_calls.append(
                 ToolCall(
-                    type="function",
+                    type=info.internal_type,
                     id=content_block.id,
-                    function=content_block.name,
+                    function=info.inspect_name,
+                    internal_name=info.internal_name,
                     arguments=content_block.model_dump().get("input", {}),
                 )
             )
@@ -803,6 +830,37 @@ async def model_output_from_message(
     )
+class CallInfo(NamedTuple):
+    internal_name: str | None
+    internal_type: str
+    inspect_name: str
+def maybe_mapped_call_info(tool_called: str, tools: list[ToolInfo]) -> CallInfo:
+    """
+    Return call info - potentially transformed by native tool mappings.
+    Anthropic prescribes names for their native tools - `computer`, `bash`, and
+    `str_replace_editor`. For a variety of reasons, Inspect's tool names do not
+    necessarily conform to internal names. Anthropic also provides specific tool
+    types for these built-in tools.
+    """
+    mappings = (
+        (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
+        ("str_replace_editor", "text_editor_20250124", "text_editor"),
+        ("bash", "bash_20250124", "bash_session"),
+    )
+    return next(
+        (
+            CallInfo(entry[0], entry[1], entry[2])
+            for entry in mappings
+            if entry[0] == tool_called and any(tool.name == entry[2] for tool in tools)
+        ),
+        CallInfo(None, "function", tool_called),
+    )
 def message_stop_reason(message: Message) -> StopReason:
     match message.stop_reason:
         case "end_turn" | "stop_sequence":

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -67,6 +67,16 @@ class OpenAIAPI(ModelAPI):
         config: GenerateConfig = GenerateConfig(),
         **model_args: Any,
     ) -> None:
+        # extract azure service prefix from model name (other providers
+        # that subclass from us like together expect to have the qualifier
+        # in the model name e.g. google/gemma-2b-it)
+        parts = model_name.split("/")
+        if parts[0] == "azure" and len(parts) > 1:
+            self.service: str | None = parts[0]
+            model_name = "/".join(parts[1:])
+        else:
+            self.service = None
         # call super
         super().__init__(
             model_name=model_name,
@@ -76,14 +86,6 @@ class OpenAIAPI(ModelAPI):
             config=config,
         )
-        # extract any service prefix from model name
-        parts = model_name.split("/")
-        if len(parts) > 1:
-            self.service: str | None = parts[0]
-            model_name = "/".join(parts[1:])
-        else:
-            self.service = None
         # resolve api_key
         if not self.api_key:
             self.api_key = os.environ.get(
@@ -322,6 +324,7 @@ class OpenAIAPI(ModelAPI):
             config.reasoning_effort is not None
             and not self.is_gpt()
             and not self.is_o1_mini()
+            and not self.is_o1_preview()
         ):
             params["reasoning_effort"] = config.reasoning_effort
         if config.response_schema is not None:

inspect_ai/model/_providers/vertex.py CHANGED Viewed

@@ -34,8 +34,8 @@ from inspect_ai._util.content import (
     Content,
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
-    ContentVideo,
 )
 from inspect_ai._util.http import is_retryable_http_status
 from inspect_ai._util.images import file_as_data
@@ -336,10 +336,13 @@ async def content_part(content: Content | str) -> Part:
     elif isinstance(content, ContentImage):
         image_bytes, mime_type = await file_as_data(content.image)
         return Part.from_image(image=Image.from_bytes(data=image_bytes))
+    elif isinstance(content, ContentReasoning):
+        return Part.from_text(content.reasoning or NO_CONTENT)
     else:
         if isinstance(content, ContentAudio):
             file = content.audio
-        elif isinstance(content, ContentVideo):
+        else:
+            # it's ContentVideo
             file = content.video
         file_bytes, mime_type = await file_as_data(file)
         return Part.from_data(file_bytes, mime_type)

inspect_ai/tool/__init__.py CHANGED Viewed

@@ -22,17 +22,21 @@ from ._tool_def import ToolDef
 from ._tool_info import ToolInfo
 from ._tool_params import ToolParam, ToolParams
 from ._tool_with import tool_with
+from ._tools._bash_session import bash_session
 from ._tools._computer import computer
 from ._tools._execute import bash, python
+from ._tools._text_editor import text_editor
 from ._tools._web_browser import web_browser
 from ._tools._web_search import web_search
 __all__ = [
     "bash",
+    "bash_session",
     "computer",
     "python",
     "web_browser",
     "web_search",
+    "text_editor",
     "tool",
     "tool_with",
     "Tool",

inspect_ai/tool/_tool_call.py CHANGED Viewed

@@ -44,8 +44,11 @@ class ToolCall:
     arguments: dict[str, Any]
     """Arguments to function."""
-    type: Literal["function"]
-    """Type of tool call (currently only 'function')"""
+    type: str
+    """Type of tool call ('function' or a model specific internal tool type)"""
+    internal_name: str | None = field(default=None)
+    """Model's internal name for the tool - if any."""
     parse_error: str | None = field(default=None)
     """Error which occurred parsing tool call."""

inspect_ai/tool/_tool_support_helpers.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""
+This module provides helper code for handling JSON-RPC communication between the inspect process and the `inspect-tool-support` package code running in the sandbox environment.
+It includes definitions for JSON-RPC request and response models, as well as functions to create and parse JSON-RPC requests and responses.
+"""
+import json
+from itertools import count
+from textwrap import dedent
+from typing import Literal, Type, TypeVar, cast
+from pydantic import BaseModel, RootModel
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai.tool._tool import ToolError, ToolParsingError
+from inspect_ai.util import sandbox_with
+from inspect_ai.util._sandbox.environment import SandboxEnvironment
+class JSONRPCResponseBase(BaseModel):
+    jsonrpc: Literal["2.0"]
+    id: int | float | str
+class JSONRPCSuccessResponse(JSONRPCResponseBase):
+    result: object
+class JSONRPCError(BaseModel):
+    """See: https://www.jsonrpc.org/specification#error_object"""
+    code: int
+    message: str
+    data: object | None = None
+class JSONRPCErrorResponse(JSONRPCResponseBase):
+    error: JSONRPCError
+class JSONRPCResponse(RootModel[JSONRPCSuccessResponse | JSONRPCErrorResponse]):
+    pass
+BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
+StrOrModelT = TypeVar("StrOrModelT", bound=str | BaseModel)
+id_generator = count(666)
+async def exec_sandbox_rpc(
+    sandbox: SandboxEnvironment,
+    method: str,
+    params: dict[str, object] | tuple[object, ...],
+    result_cls: Type[StrOrModelT],
+    timeout: int | None = None,
+    user: str | None = None,
+) -> StrOrModelT:
+    """
+    Execute a JSON-RPC command to a sandbox environment.
+    Note that the JSON RPC request is sent to the exec'ed program via stdin.
+    Args:
+      sandbox (SandboxEnvironment): The sandbox environment to execute the command in.
+      method (str): The JSON-RPC method to call.
+      params (dict[str, object] | tuple[object, ...]): The parameters for the JSON-RPC method.
+      result_cls (Type[StrOrModelT]): The class to use for parsing the result.
+      timeout (int | None, optional): The timeout for the execution. Defaults to None.
+      user: Optional username or UID to run the command as.
+    Returns:
+      StrOrModelT: The parsed result of the JSON-RPC call.
+    Raises:
+      RuntimeError: If the sandbox execution fails or if there is an error in the JSON-RPC response.
+      ToolParsingError: If the JSON-RPC response contains error code -32601 or -32602, indicating a parsing error.
+      ToolError: If the JSON-RPC response contains error code -32000.
+    """
+    exec_result = await sandbox.exec(
+        [SANDBOX_CLI, "exec"],
+        input=_create_json_rpc_request(method, params),
+        timeout=timeout,
+        user=user,
+    )
+    if not exec_result.success:
+        raise RuntimeError(
+            f"Sandbox.exec failure executing {_rpc_call_description(method, params)}: {exec_result.stderr}"
+        )
+    match _parse_json_rpc_response(exec_result.stdout, result_cls):
+        case JSONRPCError(code=-32601 | -32602, message=message):
+            raise ToolParsingError(message)
+        case JSONRPCError(code=-32000, message=message):
+            raise ToolError(message)
+        case JSONRPCError(code=code, message=message):
+            raise RuntimeError(
+                f"Error executing tool command {_rpc_call_description(method, params)}: {code=} {message}"
+            )
+        # case result_cls() as model: yields a mypy error since it has narrowed model down
+        # to BaseModel and not BaseModelT. ???
+        case model if isinstance(model, result_cls):
+            return model
+        case not_possible:
+            raise RuntimeError(
+                f"Error executing tool command {_rpc_call_description(method, params)}: {not_possible}"
+            )
+SANDBOX_CLI = "inspect-tool-support"
+INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB = "aisiuk/inspect-tool-support"
+async def tool_container_sandbox(tool_name: str) -> SandboxEnvironment:
+    sb = await sandbox_with(SANDBOX_CLI, True)
+    if sb:
+        return sb
+    else:
+        msg = dedent(f"""
+                The {tool_name} service was not found in any of the sandboxes for this sample. Please add the {tool_name} to your configuration.
+                For example, the following Docker compose file uses the {INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB} reference image as its default sandbox:
+                services:
+                  default:
+                    image: "{INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB}"
+                    init: true
+                Alternatively, you can include the service into your own Dockerfile:
+                RUN python -m venv /opt/inspect_tool_support
+                ENV PATH="/opt/inspect_tool_support/bin:$PATH"
+                RUN pip install inspect-tool-support
+                RUN inspect-tool-support post-install
+                """).strip()
+        raise PrerequisiteError(msg)
+def _create_json_rpc_request(
+    method: str, params: dict[str, object] | tuple[object, ...]
+) -> str:
+    return json.dumps(
+        {
+            "jsonrpc": "2.0",
+            "method": method,
+            "id": next(id_generator),
+            "params": list(params) if isinstance(params, tuple) else params,
+        }
+    )
+def _rpc_call_description(
+    method: str, params: dict[str, object] | tuple[object, ...]
+) -> str:
+    """
+    Generate a string description of an RPC call.
+    Args:
+        method (str): The name of the RPC method.
+        params (dict[str, object] | tuple[object, ...]): The parameters for the RPC method.
+    Returns:
+        str: A string description of the RPC call.
+    Examples:
+        >>> _rpc_call_description("subtract", {"minuend": 42, "subtrahend": 23})
+        'subtract(minuend: 42, subtrahend: 23)'
+        >>> _rpc_call_description("subtract", (42, 23))
+        'subtract(42, 23)'
+    """
+    normalized_params = (
+        list(map(str, params))
+        if isinstance(params, tuple)
+        else [f"{k}: {v}" for k, v in params.items()]
+    )
+    return f"{method}({', '.join(normalized_params)})"
+def _parse_json_rpc_response(
+    response_str: str,
+    result_cls: Type[StrOrModelT],
+) -> StrOrModelT | JSONRPCError:
+    match JSONRPCResponse.model_validate_json(response_str).root:
+        case JSONRPCErrorResponse(error=error):
+            return error
+        case JSONRPCSuccessResponse(result=rpc_result):
+            # TODO: Wow. Is there really no way to convince Python to narrow these types
+            # and avoid the casts?
+            if result_cls is str:
+                if not isinstance(rpc_result, str):
+                    raise ValueError(f"Expected string result, got {type(rpc_result)}")
+                return cast(StrOrModelT, rpc_result)
+            else:
+                return cast(
+                    StrOrModelT,
+                    cast(BaseModel, result_cls).model_validate(rpc_result, strict=True),
+                )
+        case _:
+            raise ValueError(f"Unexpected JSON RPC response: {response_str}")

inspect-ai 0.3.75__py3-none-any.whl → 0.3.76__py3-none-any.whl

inspect-ai 0.3.75__py3-none-any.whl → 0.3.76__py3-none-any.whl