inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. inspect_ai/__init__.py +3 -2
  2. inspect_ai/_cli/cache.py +1 -1
  3. inspect_ai/_cli/common.py +15 -0
  4. inspect_ai/_cli/eval.py +4 -5
  5. inspect_ai/_cli/log.py +1 -1
  6. inspect_ai/_cli/sandbox.py +1 -1
  7. inspect_ai/_cli/trace.py +1 -1
  8. inspect_ai/_cli/view.py +1 -1
  9. inspect_ai/_display/core/config.py +3 -1
  10. inspect_ai/_eval/eval.py +55 -61
  11. inspect_ai/_eval/evalset.py +64 -154
  12. inspect_ai/_eval/loader.py +27 -54
  13. inspect_ai/_eval/registry.py +4 -15
  14. inspect_ai/_eval/run.py +7 -4
  15. inspect_ai/_eval/task/__init__.py +8 -2
  16. inspect_ai/_eval/task/log.py +9 -1
  17. inspect_ai/_eval/task/resolved.py +35 -0
  18. inspect_ai/_eval/task/run.py +4 -0
  19. inspect_ai/_eval/task/task.py +50 -69
  20. inspect_ai/_eval/task/tasks.py +30 -0
  21. inspect_ai/_util/constants.py +3 -0
  22. inspect_ai/_util/dotenv.py +17 -0
  23. inspect_ai/_util/logger.py +3 -0
  24. inspect_ai/_util/registry.py +43 -2
  25. inspect_ai/_view/server.py +28 -10
  26. inspect_ai/_view/www/dist/assets/index.css +32 -19
  27. inspect_ai/_view/www/dist/assets/index.js +17682 -29989
  28. inspect_ai/_view/www/log-schema.json +79 -9
  29. inspect_ai/_view/www/package.json +2 -2
  30. inspect_ai/_view/www/src/appearance/styles.ts +6 -5
  31. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
  32. inspect_ai/_view/www/src/constants.ts +3 -0
  33. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
  34. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
  35. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
  36. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
  37. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
  38. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
  39. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
  40. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
  41. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
  42. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
  43. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
  44. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
  45. inspect_ai/_view/www/src/types/log.d.ts +11 -5
  46. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
  47. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
  48. inspect_ai/_view/www/yarn.lock +12 -5
  49. inspect_ai/log/_log.py +10 -1
  50. inspect_ai/log/_recorders/eval.py +27 -8
  51. inspect_ai/log/_recorders/json.py +10 -2
  52. inspect_ai/log/_transcript.py +13 -4
  53. inspect_ai/model/_call_tools.py +13 -4
  54. inspect_ai/model/_chat_message.py +15 -1
  55. inspect_ai/model/_model.py +30 -12
  56. inspect_ai/model/_model_output.py +6 -1
  57. inspect_ai/model/_openai.py +11 -6
  58. inspect_ai/model/_providers/anthropic.py +167 -77
  59. inspect_ai/model/_providers/google.py +6 -2
  60. inspect_ai/model/_providers/none.py +31 -0
  61. inspect_ai/model/_providers/openai.py +11 -8
  62. inspect_ai/model/_providers/providers.py +7 -0
  63. inspect_ai/model/_providers/vertex.py +5 -2
  64. inspect_ai/solver/_bridge/bridge.py +1 -1
  65. inspect_ai/solver/_chain.py +7 -6
  66. inspect_ai/tool/__init__.py +4 -0
  67. inspect_ai/tool/_tool_call.py +5 -2
  68. inspect_ai/tool/_tool_support_helpers.py +200 -0
  69. inspect_ai/tool/_tools/_bash_session.py +119 -0
  70. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  71. inspect_ai/tool/_tools/_text_editor.py +121 -0
  72. inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
  73. inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
  74. inspect_ai/tool/_tools/_web_search.py +2 -2
  75. inspect_ai/util/_json.py +28 -0
  76. inspect_ai/util/_sandbox/context.py +18 -8
  77. inspect_ai/util/_sandbox/docker/config.py +1 -1
  78. inspect_ai/util/_sandbox/docker/internal.py +3 -3
  79. inspect_ai/util/_sandbox/environment.py +17 -2
  80. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
  81. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
  82. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
  83. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
  84. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
  85. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
  86. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
  87. inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
  88. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
  89. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
  90. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
  91. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
  92. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
  93. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
  94. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
  95. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
  96. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
  97. inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
  98. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
  99. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
  100. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
  101. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
  102. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
  103. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
  104. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
  105. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
  106. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
  107. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
  108. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
  109. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
  110. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
  111. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
  112. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
  113. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
  114. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
  115. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
@@ -52,19 +52,22 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
52
52
 
53
53
 
54
54
  def is_o_series(name: str) -> bool:
55
- return bool(re.match(r"(^|.*\/)o\d+", name))
55
+ if bool(re.match(r"^o\d+", name)):
56
+ return True
57
+ else:
58
+ return not is_gpt(name) and bool(re.search(r"o\d+", name))
56
59
 
57
60
 
58
61
  def is_o1_mini(name: str) -> bool:
59
- return name.startswith("o1-mini")
62
+ return "o1-mini" in name
60
63
 
61
64
 
62
65
  def is_o1_preview(name: str) -> bool:
63
- return name.startswith("o1-preview")
66
+ return "o1-preview" in name
64
67
 
65
68
 
66
69
  def is_gpt(name: str) -> bool:
67
- return name.startswith("gpt")
70
+ return "gpt" in name
68
71
 
69
72
 
70
73
  def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
@@ -80,12 +83,13 @@ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
80
83
  def openai_chat_tool_call_param(
81
84
  tool_call: ToolCall,
82
85
  ) -> ChatCompletionMessageToolCallParam:
86
+ assert tool_call.type == "function", f"Unexpected tool call type {tool_call.type}"
83
87
  return ChatCompletionMessageToolCallParam(
84
88
  id=tool_call.id,
85
89
  function=dict(
86
90
  name=tool_call.function, arguments=json.dumps(tool_call.arguments)
87
91
  ),
88
- type=tool_call.type,
92
+ type="function", # Type narrowing couldn't figure it out
89
93
  )
90
94
 
91
95
 
@@ -108,7 +112,8 @@ async def openai_chat_completion_part(
108
112
  image_url=dict(url=image_url, detail=detail),
109
113
  )
110
114
  elif content.type == "audio":
111
- audio_data = await file_as_data_uri(content.audio)
115
+ audio_data_uri = await file_as_data_uri(content.audio)
116
+ audio_data = audio_data_uri.split("base64,")[1]
112
117
 
113
118
  return ChatCompletionContentPartInputAudioParam(
114
119
  type="input_audio", input_audio=dict(data=audio_data, format=content.format)
@@ -1,23 +1,12 @@
1
1
  import functools
2
2
  import os
3
3
  import re
4
- import sys
5
4
  from copy import copy
6
5
  from logging import getLogger
7
- from typing import Any, Literal, Optional, Tuple, TypedDict, cast
6
+ from typing import Any, Literal, NamedTuple, Optional, Tuple, cast
8
7
 
9
8
  import httpcore
10
9
  import httpx
11
-
12
- from inspect_ai._util.http import is_retryable_http_status
13
-
14
- from .util.hooks import HttpxHooks
15
-
16
- if sys.version_info >= (3, 11):
17
- from typing import NotRequired
18
- else:
19
- from typing_extensions import NotRequired
20
-
21
10
  from anthropic import (
22
11
  APIConnectionError,
23
12
  APIStatusError,
@@ -39,19 +28,19 @@ from anthropic.types import (
39
28
  TextBlockParam,
40
29
  ThinkingBlock,
41
30
  ThinkingBlockParam,
31
+ ToolBash20250124Param,
42
32
  ToolParam,
43
33
  ToolResultBlockParam,
34
+ ToolTextEditor20250124Param,
44
35
  ToolUseBlock,
45
36
  ToolUseBlockParam,
46
37
  message_create_params,
47
38
  )
39
+ from anthropic.types.beta import BetaToolComputerUse20250124Param
48
40
  from pydantic import JsonValue
49
41
  from typing_extensions import override
50
42
 
51
- from inspect_ai._util.constants import (
52
- BASE_64_DATA_REMOVED,
53
- NO_CONTENT,
54
- )
43
+ from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
55
44
  from inspect_ai._util.content import (
56
45
  Content,
57
46
  ContentImage,
@@ -59,6 +48,7 @@ from inspect_ai._util.content import (
59
48
  ContentText,
60
49
  )
61
50
  from inspect_ai._util.error import exception_message
51
+ from inspect_ai._util.http import is_retryable_http_status
62
52
  from inspect_ai._util.images import file_as_data_uri
63
53
  from inspect_ai._util.logger import warn_once
64
54
  from inspect_ai._util.url import data_uri_mime_type, data_uri_to_base64
@@ -70,11 +60,14 @@ from .._model import ModelAPI
70
60
  from .._model_call import ModelCall
71
61
  from .._model_output import ChatCompletionChoice, ModelOutput, ModelUsage, StopReason
72
62
  from .util import environment_prerequisite_error, model_base_url
63
+ from .util.hooks import HttpxHooks
73
64
 
74
65
  logger = getLogger(__name__)
75
66
 
76
67
  ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
77
68
 
69
+ INTERNAL_COMPUTER_TOOL_NAME = "computer"
70
+
78
71
 
79
72
  class AnthropicAPI(ModelAPI):
80
73
  def __init__(
@@ -93,7 +86,7 @@ class AnthropicAPI(ModelAPI):
93
86
  else:
94
87
  self.service = None
95
88
 
96
- # collect gemerate model_args (then delete them so we can pass the rest on)
89
+ # collect generate model_args (then delete them so we can pass the rest on)
97
90
  def collect_model_arg(name: str) -> Any | None:
98
91
  nonlocal model_args
99
92
  value = model_args.get(name, None)
@@ -193,14 +186,11 @@ class AnthropicAPI(ModelAPI):
193
186
 
194
187
  # generate
195
188
  try:
196
- (
197
- system_param,
198
- tools_param,
199
- messages,
200
- computer_use,
201
- ) = await self.resolve_chat_input(input, tools, config)
189
+ system_param, tools_param, messages = await self.resolve_chat_input(
190
+ input, tools, config
191
+ )
202
192
 
203
- # prepare request params (assembed this way so we can log the raw model call)
193
+ # prepare request params (assembled this way so we can log the raw model call)
204
194
  request = dict(messages=messages)
205
195
 
206
196
  # system messages and tools
@@ -218,7 +208,13 @@ class AnthropicAPI(ModelAPI):
218
208
 
219
209
  # extra headers (for time tracker and computer use)
220
210
  extra_headers = headers | {HttpxHooks.REQUEST_ID_HEADER: request_id}
221
- if computer_use:
211
+ if any(
212
+ tool.get("type", None) == "computer_20250124" for tool in tools_param
213
+ ):
214
+ # From: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#claude-3-7-sonnet-beta-flag
215
+ # Note: The Bash (bash_20250124) and Text Editor (text_editor_20250124)
216
+ # tools are generally available for Claude 3.5 Sonnet (new) as well and
217
+ # can be used without the computer use beta header.
222
218
  betas.append("computer-use-2025-01-24")
223
219
  if len(betas) > 0:
224
220
  extra_headers["anthropic-beta"] = ",".join(betas)
@@ -240,7 +236,9 @@ class AnthropicAPI(ModelAPI):
240
236
  response = message.model_dump()
241
237
 
242
238
  # extract output
243
- output = model_output_from_message(message, tools)
239
+ output = await model_output_from_message(
240
+ self.client, self.model_name, message, tools
241
+ )
244
242
 
245
243
  # return output and call
246
244
  return output, model_call()
@@ -403,9 +401,7 @@ class AnthropicAPI(ModelAPI):
403
401
  input: list[ChatMessage],
404
402
  tools: list[ToolInfo],
405
403
  config: GenerateConfig,
406
- ) -> Tuple[
407
- list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
408
- ]:
404
+ ) -> Tuple[list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam]]:
409
405
  # extract system message
410
406
  system_messages, messages = split_system_messages(input, config)
411
407
 
@@ -418,7 +414,7 @@ class AnthropicAPI(ModelAPI):
418
414
  )
419
415
 
420
416
  # tools
421
- tools_params, computer_use = self.tool_params_for_tools(tools, config)
417
+ tools_params = [self.tool_param_for_tool_info(tool, config) for tool in tools]
422
418
 
423
419
  # system messages
424
420
  if len(system_messages) > 0:
@@ -468,40 +464,35 @@ class AnthropicAPI(ModelAPI):
468
464
  add_cache_control(cast(dict[str, Any], content[-1]))
469
465
 
470
466
  # return chat input
471
- return system_param, tools_params, message_params, computer_use
472
-
473
- def tool_params_for_tools(
474
- self, tools: list[ToolInfo], config: GenerateConfig
475
- ) -> tuple[list["ToolParamDef"], bool]:
476
- # tool params and computer_use bit to return
477
- tool_params: list["ToolParamDef"] = []
478
- computer_use = False
479
-
480
- # for each tool, check if it has a native computer use implementation and use that
481
- # when available (noting that we need to set the computer use request header)
482
- for tool in tools:
483
- computer_use_tool = (
467
+ return system_param, tools_params, message_params
468
+
469
+ def tool_param_for_tool_info(
470
+ self, tool: ToolInfo, config: GenerateConfig
471
+ ) -> "ToolParamDef":
472
+ # Use a native tool implementation when available. Otherwise, use the
473
+ # standard tool implementation
474
+ return self.maybe_native_tool_param(tool, config) or ToolParam(
475
+ name=tool.name,
476
+ description=tool.description,
477
+ input_schema=tool.parameters.model_dump(exclude_none=True),
478
+ )
479
+
480
+ def maybe_native_tool_param(
481
+ self, tool: ToolInfo, config: GenerateConfig
482
+ ) -> Optional["ToolParamDef"]:
483
+ return (
484
+ (
484
485
  self.computer_use_tool_param(tool)
485
- if config.internal_tools is not False
486
- else None
486
+ or self.text_editor_tool_param(tool)
487
+ or self.bash_tool_param(tool)
487
488
  )
488
- if computer_use_tool:
489
- tool_params.append(computer_use_tool)
490
- computer_use = True
491
- else:
492
- tool_params.append(
493
- ToolParam(
494
- name=tool.name,
495
- description=tool.description,
496
- input_schema=tool.parameters.model_dump(exclude_none=True),
497
- )
498
- )
499
-
500
- return tool_params, computer_use
489
+ if config.internal_tools is not False
490
+ else None
491
+ )
501
492
 
502
493
  def computer_use_tool_param(
503
494
  self, tool: ToolInfo
504
- ) -> Optional["ComputerUseToolParam"]:
495
+ ) -> Optional[BetaToolComputerUse20250124Param]:
505
496
  # check for compatible 'computer' tool
506
497
  if tool.name == "computer" and (
507
498
  sorted(tool.parameters.properties.keys())
@@ -523,7 +514,7 @@ class AnthropicAPI(ModelAPI):
523
514
  "Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
524
515
  )
525
516
  return None
526
- return ComputerUseToolParam(
517
+ return BetaToolComputerUse20250124Param(
527
518
  type="computer_20250124",
528
519
  name="computer",
529
520
  # Note: The dimensions passed here for display_width_px and display_height_px should
@@ -540,23 +531,58 @@ class AnthropicAPI(ModelAPI):
540
531
  else:
541
532
  return None
542
533
 
534
+ def text_editor_tool_param(
535
+ self, tool: ToolInfo
536
+ ) -> Optional[ToolTextEditor20250124Param]:
537
+ # check for compatible 'text editor' tool
538
+ if tool.name == "text_editor" and (
539
+ sorted(tool.parameters.properties.keys())
540
+ == sorted(
541
+ [
542
+ "command",
543
+ "file_text",
544
+ "insert_line",
545
+ "new_str",
546
+ "old_str",
547
+ "path",
548
+ "view_range",
549
+ ]
550
+ )
551
+ ):
552
+ return ToolTextEditor20250124Param(
553
+ type="text_editor_20250124", name="str_replace_editor"
554
+ )
555
+ # not a text_editor tool
556
+ else:
557
+ return None
543
558
 
544
- # native anthropic tool definitions for computer use beta
545
- # https://docs.anthropic.com/en/docs/build-with-claude/computer-use
546
- class ComputerUseToolParam(TypedDict):
547
- type: str
548
- name: str
549
- display_width_px: NotRequired[int]
550
- display_height_px: NotRequired[int]
551
- display_number: NotRequired[int]
559
+ def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
560
+ # check for compatible 'bash' tool
561
+ if tool.name == "bash_session" and (
562
+ sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
563
+ ):
564
+ return ToolBash20250124Param(type="bash_20250124", name="bash")
565
+ # not a bash tool
566
+ else:
567
+ return None
552
568
 
553
569
 
554
- # tools can be either a stock tool param or a special computer use tool param
555
- ToolParamDef = ToolParam | ComputerUseToolParam
570
+ # tools can be either a stock tool param or a special Anthropic native use tool param
571
+ ToolParamDef = (
572
+ ToolParam
573
+ | BetaToolComputerUse20250124Param
574
+ | ToolTextEditor20250124Param
575
+ | ToolBash20250124Param
576
+ )
556
577
 
557
578
 
558
579
  def add_cache_control(
559
- param: TextBlockParam | ToolParam | ComputerUseToolParam | dict[str, Any],
580
+ param: TextBlockParam
581
+ | ToolParam
582
+ | BetaToolComputerUse20250124Param
583
+ | ToolTextEditor20250124Param
584
+ | ToolBash20250124Param
585
+ | dict[str, Any],
560
586
  ) -> None:
561
587
  cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
562
588
 
@@ -565,10 +591,10 @@ def consecutive_user_message_reducer(
565
591
  messages: list[MessageParam],
566
592
  message: MessageParam,
567
593
  ) -> list[MessageParam]:
568
- return consective_message_reducer(messages, message, "user")
594
+ return consecutive_message_reducer(messages, message, "user")
569
595
 
570
596
 
571
- def consective_message_reducer(
597
+ def consecutive_message_reducer(
572
598
  messages: list[MessageParam],
573
599
  message: MessageParam,
574
600
  role: Literal["user", "assistant"],
@@ -581,6 +607,7 @@ def consective_message_reducer(
581
607
 
582
608
 
583
609
  def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
610
+ # TODO: Fix this code as it currently drops interesting properties when combining
584
611
  role = a["role"]
585
612
  a_content = a["content"]
586
613
  b_content = b["content"]
@@ -700,7 +727,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
700
727
  ToolUseBlockParam(
701
728
  type="tool_use",
702
729
  id=tool_call.id,
703
- name=tool_call.function,
730
+ name=tool_call.internal_name or tool_call.function,
704
731
  input=tool_call.arguments,
705
732
  )
706
733
  )
@@ -724,9 +751,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
724
751
  )
725
752
 
726
753
 
727
- def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelOutput:
754
+ async def model_output_from_message(
755
+ client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
756
+ model: str,
757
+ message: Message,
758
+ tools: list[ToolInfo],
759
+ ) -> ModelOutput:
728
760
  # extract content and tool calls
729
761
  content: list[Content] = []
762
+ reasoning_tokens = 0
730
763
  tool_calls: list[ToolCall] | None = None
731
764
 
732
765
  for content_block in message.content:
@@ -741,11 +774,13 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
741
774
  content.append(ContentText(type="text", text=content_text))
742
775
  elif isinstance(content_block, ToolUseBlock):
743
776
  tool_calls = tool_calls or []
777
+ info = maybe_mapped_call_info(content_block.name, tools)
744
778
  tool_calls.append(
745
779
  ToolCall(
746
- type="function",
780
+ type=info.internal_type,
747
781
  id=content_block.id,
748
- function=content_block.name,
782
+ function=info.inspect_name,
783
+ internal_name=info.internal_name,
749
784
  arguments=content_block.model_dump().get("input", {}),
750
785
  )
751
786
  )
@@ -754,6 +789,9 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
754
789
  ContentReasoning(reasoning=content_block.data, redacted=True)
755
790
  )
756
791
  elif isinstance(content_block, ThinkingBlock):
792
+ reasoning_tokens += await count_tokens(
793
+ client, model, content_block.thinking
794
+ )
757
795
  content.append(
758
796
  ContentReasoning(
759
797
  reasoning=content_block.thinking, signature=content_block.signature
@@ -787,7 +825,39 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
787
825
  total_tokens=total_tokens,
788
826
  input_tokens_cache_write=input_tokens_cache_write,
789
827
  input_tokens_cache_read=input_tokens_cache_read,
828
+ reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
829
+ ),
830
+ )
831
+
832
+
833
+ class CallInfo(NamedTuple):
834
+ internal_name: str | None
835
+ internal_type: str
836
+ inspect_name: str
837
+
838
+
839
+ def maybe_mapped_call_info(tool_called: str, tools: list[ToolInfo]) -> CallInfo:
840
+ """
841
+ Return call info - potentially transformed by native tool mappings.
842
+
843
+ Anthropic prescribes names for their native tools - `computer`, `bash`, and
844
+ `str_replace_editor`. For a variety of reasons, Inspect's tool names do not
845
+ necessarily conform to internal names. Anthropic also provides specific tool
846
+ types for these built-in tools.
847
+ """
848
+ mappings = (
849
+ (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
850
+ ("str_replace_editor", "text_editor_20250124", "text_editor"),
851
+ ("bash", "bash_20250124", "bash_session"),
852
+ )
853
+
854
+ return next(
855
+ (
856
+ CallInfo(entry[0], entry[1], entry[2])
857
+ for entry in mappings
858
+ if entry[0] == tool_called and any(tool.name == entry[2] for tool in tools)
790
859
  ),
860
+ CallInfo(None, "function", tool_called),
791
861
  )
792
862
 
793
863
 
@@ -852,6 +922,26 @@ async def message_param_content(
852
922
  )
853
923
 
854
924
 
925
+ async def count_tokens(
926
+ client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
927
+ model: str,
928
+ text: str,
929
+ ) -> int:
930
+ try:
931
+ response = await client.messages.count_tokens(
932
+ model=model,
933
+ messages=[{"role": "user", "content": text}],
934
+ )
935
+ return response.input_tokens
936
+ except Exception as e:
937
+ logger.warning(
938
+ f"Error counting tokens (falling back to estimated tokens): {str(e)}"
939
+ )
940
+ words = text.split()
941
+ estimated_tokens = int(len(words) * 1.3)
942
+ return estimated_tokens
943
+
944
+
855
945
  def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
856
946
  # remove base64 encoded images
857
947
  if (
@@ -267,8 +267,12 @@ class GoogleGenAIAPI(ModelAPI):
267
267
  import requests # type: ignore
268
268
 
269
269
  # standard http errors
270
- if isinstance(ex, APIError):
271
- return is_retryable_http_status(ex.status)
270
+ if (
271
+ isinstance(ex, APIError)
272
+ and isinstance(ex.status, str)
273
+ and ex.status.isdigit()
274
+ ):
275
+ return is_retryable_http_status(int(ex.status))
272
276
 
273
277
  # low-level requests exceptions
274
278
  elif isinstance(ex, requests.exceptions.RequestException):
@@ -0,0 +1,31 @@
1
+ from inspect_ai._util.error import PrerequisiteError
2
+ from inspect_ai.tool import ToolChoice, ToolInfo
3
+
4
+ from .._chat_message import ChatMessage
5
+ from .._generate_config import GenerateConfig
6
+ from .._model import ModelAPI
7
+ from .._model_output import ModelOutput
8
+
9
+
10
+ class NoModel(ModelAPI):
11
+ """A sentinel model type indicating there is no model specified."""
12
+
13
+ def __init__(
14
+ self,
15
+ model_name: str = "none",
16
+ base_url: str | None = None,
17
+ api_key: str | None = None,
18
+ config: GenerateConfig = GenerateConfig(),
19
+ ) -> None:
20
+ super().__init__(model_name, base_url, api_key, [], config)
21
+
22
+ async def generate(
23
+ self,
24
+ input: list[ChatMessage],
25
+ tools: list[ToolInfo],
26
+ tool_choice: ToolChoice,
27
+ config: GenerateConfig,
28
+ ) -> ModelOutput:
29
+ raise PrerequisiteError(
30
+ "No model specified (and no INSPECT_EVAL_MODEL defined)"
31
+ )
@@ -67,6 +67,16 @@ class OpenAIAPI(ModelAPI):
67
67
  config: GenerateConfig = GenerateConfig(),
68
68
  **model_args: Any,
69
69
  ) -> None:
70
+ # extract azure service prefix from model name (other providers
71
+ # that subclass from us like together expect to have the qualifier
72
+ # in the model name e.g. google/gemma-2b-it)
73
+ parts = model_name.split("/")
74
+ if parts[0] == "azure" and len(parts) > 1:
75
+ self.service: str | None = parts[0]
76
+ model_name = "/".join(parts[1:])
77
+ else:
78
+ self.service = None
79
+
70
80
  # call super
71
81
  super().__init__(
72
82
  model_name=model_name,
@@ -76,14 +86,6 @@ class OpenAIAPI(ModelAPI):
76
86
  config=config,
77
87
  )
78
88
 
79
- # extract any service prefix from model name
80
- parts = model_name.split("/")
81
- if len(parts) > 1:
82
- self.service: str | None = parts[0]
83
- model_name = "/".join(parts[1:])
84
- else:
85
- self.service = None
86
-
87
89
  # resolve api_key
88
90
  if not self.api_key:
89
91
  self.api_key = os.environ.get(
@@ -322,6 +324,7 @@ class OpenAIAPI(ModelAPI):
322
324
  config.reasoning_effort is not None
323
325
  and not self.is_gpt()
324
326
  and not self.is_o1_mini()
327
+ and not self.is_o1_preview()
325
328
  ):
326
329
  params["reasoning_effort"] = config.reasoning_effort
327
330
  if config.response_schema is not None:
@@ -250,6 +250,13 @@ def mockllm() -> type[ModelAPI]:
250
250
  return MockLLM
251
251
 
252
252
 
253
+ @modelapi(name="none")
254
+ def none() -> type[ModelAPI]:
255
+ from .none import NoModel
256
+
257
+ return NoModel
258
+
259
+
253
260
  @modelapi("goodfire")
254
261
  def goodfire() -> type[ModelAPI]:
255
262
  """Get the Goodfire API provider."""
@@ -34,8 +34,8 @@ from inspect_ai._util.content import (
34
34
  Content,
35
35
  ContentAudio,
36
36
  ContentImage,
37
+ ContentReasoning,
37
38
  ContentText,
38
- ContentVideo,
39
39
  )
40
40
  from inspect_ai._util.http import is_retryable_http_status
41
41
  from inspect_ai._util.images import file_as_data
@@ -336,10 +336,13 @@ async def content_part(content: Content | str) -> Part:
336
336
  elif isinstance(content, ContentImage):
337
337
  image_bytes, mime_type = await file_as_data(content.image)
338
338
  return Part.from_image(image=Image.from_bytes(data=image_bytes))
339
+ elif isinstance(content, ContentReasoning):
340
+ return Part.from_text(content.reasoning or NO_CONTENT)
339
341
  else:
340
342
  if isinstance(content, ContentAudio):
341
343
  file = content.audio
342
- elif isinstance(content, ContentVideo):
344
+ else:
345
+ # it's ContentVideo
343
346
  file = content.video
344
347
  file_bytes, mime_type = await file_as_data(file)
345
348
  return Part.from_data(file_bytes, mime_type)
@@ -17,7 +17,7 @@ from .._task_state import TaskState
17
17
  def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
18
18
  """Bridge an external agent into an Inspect Solver.
19
19
 
20
- See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
20
+ See documentation at <https://inspect.aisi.org.uk/agent-bridge.html>
21
21
 
22
22
  Args:
23
23
  agent: Callable which takes a sample `dict` and returns a result `dict`.
@@ -2,10 +2,11 @@ from typing import Sequence, overload
2
2
 
3
3
  from typing_extensions import override
4
4
 
5
- from ._solver import Generate, Solver
5
+ from ._solver import Generate, Solver, solver
6
6
  from ._task_state import TaskState
7
7
 
8
8
 
9
+ @solver
9
10
  def chain(*solvers: Solver | list[Solver]) -> Solver:
10
11
  """Compose a solver from multiple other solvers.
11
12
 
@@ -22,8 +23,8 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
22
23
  """
23
24
  # flatten lists and chains
24
25
  all_solvers: list[Solver] = []
25
- for solver in solvers:
26
- all_solvers.extend(unroll(solver))
26
+ for s in solvers:
27
+ all_solvers.extend(unroll(s))
27
28
 
28
29
  return Chain(all_solvers)
29
30
 
@@ -72,9 +73,9 @@ class Chain(Sequence[Solver], Solver):
72
73
  ) -> TaskState:
73
74
  from ._transcript import solver_transcript
74
75
 
75
- for solver in self._solvers:
76
- with solver_transcript(solver, state) as st:
77
- state = await solver(state, generate)
76
+ for slv in self._solvers:
77
+ with solver_transcript(slv, state) as st:
78
+ state = await slv(state, generate)
78
79
  st.complete(state)
79
80
  if state.completed:
80
81
  break
@@ -22,17 +22,21 @@ from ._tool_def import ToolDef
22
22
  from ._tool_info import ToolInfo
23
23
  from ._tool_params import ToolParam, ToolParams
24
24
  from ._tool_with import tool_with
25
+ from ._tools._bash_session import bash_session
25
26
  from ._tools._computer import computer
26
27
  from ._tools._execute import bash, python
28
+ from ._tools._text_editor import text_editor
27
29
  from ._tools._web_browser import web_browser
28
30
  from ._tools._web_search import web_search
29
31
 
30
32
  __all__ = [
31
33
  "bash",
34
+ "bash_session",
32
35
  "computer",
33
36
  "python",
34
37
  "web_browser",
35
38
  "web_search",
39
+ "text_editor",
36
40
  "tool",
37
41
  "tool_with",
38
42
  "Tool",