inspect-ai 0.3.71__py3-none-any.whl → 0.3.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -3
- inspect_ai/_cli/sandbox.py +3 -3
- inspect_ai/_cli/score.py +6 -4
- inspect_ai/_cli/trace.py +53 -6
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +2 -1
- inspect_ai/_display/core/footer.py +6 -6
- inspect_ai/_display/plain/display.py +11 -6
- inspect_ai/_display/rich/display.py +23 -13
- inspect_ai/_display/textual/app.py +10 -9
- inspect_ai/_display/textual/display.py +2 -2
- inspect_ai/_display/textual/widgets/footer.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +14 -5
- inspect_ai/_eval/context.py +1 -2
- inspect_ai/_eval/eval.py +54 -41
- inspect_ai/_eval/loader.py +9 -2
- inspect_ai/_eval/run.py +148 -81
- inspect_ai/_eval/score.py +13 -8
- inspect_ai/_eval/task/images.py +31 -21
- inspect_ai/_eval/task/run.py +62 -59
- inspect_ai/_eval/task/rundir.py +16 -9
- inspect_ai/_eval/task/sandbox.py +7 -8
- inspect_ai/_eval/task/util.py +7 -0
- inspect_ai/_util/_async.py +118 -10
- inspect_ai/_util/constants.py +0 -2
- inspect_ai/_util/file.py +15 -29
- inspect_ai/_util/future.py +37 -0
- inspect_ai/_util/http.py +3 -99
- inspect_ai/_util/httpx.py +60 -0
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/json.py +5 -52
- inspect_ai/_util/logger.py +30 -86
- inspect_ai/_util/retry.py +10 -61
- inspect_ai/_util/trace.py +2 -2
- inspect_ai/_view/server.py +86 -3
- inspect_ai/_view/www/dist/assets/index.js +25837 -13269
- inspect_ai/_view/www/log-schema.json +253 -186
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +8 -3
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +2 -3
- inspect_ai/_view/www/src/types/log.d.ts +122 -94
- inspect_ai/approval/_human/manager.py +6 -10
- inspect_ai/approval/_human/panel.py +2 -2
- inspect_ai/dataset/_sources/util.py +7 -6
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +35 -61
- inspect_ai/log/_log.py +18 -1
- inspect_ai/log/_recorders/eval.py +14 -23
- inspect_ai/log/_recorders/json.py +3 -18
- inspect_ai/log/_samples.py +27 -2
- inspect_ai/log/_transcript.py +8 -8
- inspect_ai/model/__init__.py +2 -1
- inspect_ai/model/_call_tools.py +60 -40
- inspect_ai/model/_chat_message.py +3 -2
- inspect_ai/model/_generate_config.py +25 -0
- inspect_ai/model/_model.py +74 -36
- inspect_ai/model/_openai.py +9 -1
- inspect_ai/model/_providers/anthropic.py +172 -154
- inspect_ai/model/_providers/azureai.py +11 -9
- inspect_ai/model/_providers/bedrock.py +33 -24
- inspect_ai/model/_providers/cloudflare.py +8 -9
- inspect_ai/model/_providers/goodfire.py +7 -3
- inspect_ai/model/_providers/google.py +47 -13
- inspect_ai/model/_providers/groq.py +15 -15
- inspect_ai/model/_providers/hf.py +24 -17
- inspect_ai/model/_providers/mistral.py +36 -20
- inspect_ai/model/_providers/openai.py +30 -25
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +3 -4
- inspect_ai/model/_providers/util/__init__.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +6 -19
- inspect_ai/model/_providers/util/hooks.py +165 -0
- inspect_ai/model/_providers/vertex.py +20 -3
- inspect_ai/model/_providers/vllm.py +16 -19
- inspect_ai/scorer/_multi.py +5 -2
- inspect_ai/solver/_bridge/patch.py +31 -1
- inspect_ai/solver/_fork.py +5 -3
- inspect_ai/solver/_human_agent/agent.py +3 -2
- inspect_ai/tool/__init__.py +8 -2
- inspect_ai/tool/_tool_info.py +4 -90
- inspect_ai/tool/_tool_params.py +4 -34
- inspect_ai/tool/_tools/_computer/_common.py +117 -58
- inspect_ai/tool/_tools/_computer/_computer.py +80 -57
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_computer/test_args.py +151 -0
- inspect_ai/tool/_tools/_web_search.py +30 -24
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_concurrency.py +5 -6
- inspect_ai/util/_display.py +6 -0
- inspect_ai/util/_json.py +170 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +13 -9
- inspect_ai/util/_sandbox/docker/docker.py +5 -0
- inspect_ai/util/_sandbox/environment.py +56 -9
- inspect_ai/util/_sandbox/service.py +12 -5
- inspect_ai/util/_subprocess.py +94 -113
- inspect_ai/util/_subtask.py +2 -4
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/METADATA +6 -2
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/RECORD +111 -103
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/WHEEL +1 -1
- inspect_ai/_util/timeouts.py +0 -160
- inspect_ai/model/_providers/util/tracker.py +0 -92
- inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.71.dist-info → inspect_ai-0.3.73.dist-info}/top_level.txt +0 -0
inspect_ai/model/_model.py
CHANGED
```diff
@@ -13,6 +13,7 @@ from typing import Any, AsyncIterator, Callable, Literal, Type, cast
 
 from pydantic_core import to_jsonable_python
 from tenacity import (
+    RetryCallState,
     retry,
     retry_if_exception,
     stop_after_attempt,
@@ -20,8 +21,9 @@ from tenacity import (
     stop_never,
     wait_exponential_jitter,
 )
+from tenacity.stop import StopBaseT
 
-from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
+from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS, HTTP
 from inspect_ai._util.content import (
     Content,
     ContentImage,
@@ -30,6 +32,7 @@ from inspect_ai._util.content import (
 )
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.interrupt import check_sample_interrupt
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import (
     RegistryInfo,
@@ -37,7 +40,7 @@ from inspect_ai._util.registry import (
     registry_info,
     registry_unqualified_name,
 )
-from inspect_ai._util.retry import log_rate_limit_retry
+from inspect_ai._util.retry import report_http_retry
 from inspect_ai._util.trace import trace_action
 from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
@@ -173,11 +176,11 @@ class ModelAPI(abc.ABC):
         """Scope for enforcement of max_connections."""
         return "default"
 
-    def is_rate_limit(self, ex: Exception) -> bool:
-        """Is this exception a rate limit error?
+    def should_retry(self, ex: Exception) -> bool:
+        """Should this exception be retried?
 
         Args:
-           ex: Exception to check for
+           ex: Exception to check for retry
         """
         return False
 
```
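The `should_retry()` hook above replaces the deprecated `is_rate_limit()`. A minimal sketch of how a third-party provider might override it — `MyProviderError`, its `status_code` attribute, and `MyProviderAPI` are illustrative stand-ins, not part of inspect_ai:

```python
import httpx

from inspect_ai.model import ModelAPI


class MyProviderError(Exception):
    """Hypothetical provider error carrying an HTTP status code."""

    def __init__(self, status_code: int) -> None:
        super().__init__(f"provider error (status {status_code})")
        self.status_code = status_code


class MyProviderAPI(ModelAPI):
    # generate() and other required members omitted for brevity

    def should_retry(self, ex: Exception) -> bool:
        # transient transport failures are always worth retrying
        if isinstance(ex, (httpx.ConnectError, httpx.ReadTimeout)):
            return True
        # retry rate limits and typical transient server statuses
        if isinstance(ex, MyProviderError):
            return ex.status_code in (408, 429, 500, 502, 503, 504)
        return False
```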
```diff
@@ -331,14 +334,17 @@ class Model:
         start_time = datetime.now()
         working_start = sample_working_time()
         async with self._connection_concurrency(config):
+            from inspect_ai.log._samples import track_active_sample_retries
+
             # generate
-            output = await self._generate(
-                input=input,
-                tools=tools,
-                tool_choice=tool_choice,
-                config=config,
-                cache=cache,
-            )
+            with track_active_sample_retries():
+                output = await self._generate(
+                    input=input,
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    config=config,
+                    cache=cache,
+                )
 
             # update the most recent ModelEvent with the actual start/completed
             # times as well as a computation of working time (events are
@@ -418,27 +424,27 @@ class Model:
         if self.api.collapse_assistant_messages():
             input = collapse_consecutive_assistant_messages(input)
 
-        # retry for
+        # retry for transient http errors:
+        # - no default timeout or max_retries (try forever)
+        # - exponential backoff starting at 3 seconds (will wait 25 minutes
+        #   on the 10th retry,then will wait no longer than 30 minutes on
+        #   subsequent retries)
+        if config.max_retries is not None and config.timeout is not None:
+            stop: StopBaseT = stop_after_attempt(config.max_retries) | stop_after_delay(
+                config.timeout
+            )
+        elif config.max_retries is not None:
+            stop = stop_after_attempt(config.max_retries)
+        elif config.timeout is not None:
+            stop = stop_after_delay(config.timeout)
+        else:
+            stop = stop_never
+
         @retry(
-            wait=wait_exponential_jitter(max=(30 * 60), jitter=
-            retry=retry_if_exception(self.
-            stop=(
-                (
-                    stop_after_delay(config.timeout)
-                    | stop_after_attempt(config.max_retries)
-                )
-                if config.timeout and config.max_retries
-                else (
-                    stop_after_delay(config.timeout)
-                    if config.timeout
-                    else (
-                        stop_after_attempt(config.max_retries)
-                        if config.max_retries
-                        else stop_never
-                    )
-                )
-            ),
-            before_sleep=functools.partial(log_rate_limit_retry, self.api.model_name),
+            wait=wait_exponential_jitter(initial=3, max=(30 * 60), jitter=3),
+            retry=retry_if_exception(self.should_retry),
+            stop=stop,
+            before_sleep=functools.partial(log_model_retry, self.api.model_name),
         )
         async def generate() -> ModelOutput:
             check_sample_interrupt()
@@ -555,6 +561,30 @@ class Model:
         # return results
         return model_output
 
+    def should_retry(self, ex: BaseException) -> bool:
+        if isinstance(ex, Exception):
+            # check standard should_retry() method
+            retry = self.api.should_retry(ex)
+            if retry:
+                report_http_retry()
+                return True
+
+            # see if the API implements legacy is_rate_limit() method
+            is_rate_limit = getattr(self.api, "is_rate_limit", None)
+            if is_rate_limit:
+                warn_once(
+                    logger,
+                    f"provider '{self.name}' implements deprecated is_rate_limit() method, "
+                    + "please change to should_retry()",
+                )
+                retry = cast(bool, is_rate_limit(ex))
+                if retry:
+                    report_http_retry()
+                    return True
+
+        # no retry
+        return False
+
     # function to verify that its okay to call model apis
     def verify_model_apis(self) -> None:
         if (
@@ -1064,6 +1094,7 @@ def tool_result_images_reducer(
         messages
         + [
             ChatMessageTool(
+                id=message.id,
                 content=edited_tool_message_content,
                 tool_call_id=message.tool_call_id,
                 function=message.function,
@@ -1170,19 +1201,26 @@ def combine_messages(
    a: ChatMessage, b: ChatMessage, message_type: Type[ChatMessage]
 ) -> ChatMessage:
     if isinstance(a.content, str) and isinstance(b.content, str):
-        return message_type(content=f"{a.content}\n{b.content}")
+        return message_type(id=a.id, content=f"{a.content}\n{b.content}")
     elif isinstance(a.content, list) and isinstance(b.content, list):
-        return message_type(content=a.content + b.content)
+        return message_type(id=a.id, content=a.content + b.content)
     elif isinstance(a.content, str) and isinstance(b.content, list):
-        return message_type(content=[ContentText(text=a.content), *b.content])
+        return message_type(id=a.id, content=[ContentText(text=a.content), *b.content])
     elif isinstance(a.content, list) and isinstance(b.content, str):
-        return message_type(content=a.content + [ContentText(text=b.content)])
+        return message_type(id=a.id, content=a.content + [ContentText(text=b.content)])
     else:
         raise TypeError(
             f"Cannot combine messages with invalid content types: {a.content!r}, {b.content!r}"
         )
 
 
+def log_model_retry(model_name: str, retry_state: RetryCallState) -> None:
+    logger.log(
+        HTTP,
+        f"-> {model_name} retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}",
+    )
+
+
 def init_active_model(model: Model, config: GenerateConfig) -> None:
     active_model_context_var.set(model)
     set_active_generate_config(config)
```
inspect_ai/model/_openai.py
CHANGED
```diff
@@ -52,7 +52,7 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
 
 
 def is_o_series(name: str) -> bool:
-    return bool(re.match(r"
+    return bool(re.match(r"(^|.*\/)o\d+", name))
 
 
 def is_o1_mini(name: str) -> bool:
@@ -396,6 +396,9 @@ def content_from_openai(
     content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
     parse_reasoning: bool = False,
 ) -> list[Content]:
+    # Some providers omit the type tag and use "object-with-a-single-field" encoding
+    if "type" not in content and len(content) == 1:
+        content["type"] = list(content.keys())[0]  # type: ignore[arg-type]
     if content["type"] == "text":
         text = content["text"]
         if parse_reasoning:
@@ -413,6 +416,8 @@
             return [ContentText(text=text)]
         else:
             return [ContentText(text=text)]
+    elif content["type"] == "reasoning":  # type: ignore[comparison-overlap]
+        return [ContentReasoning(reasoning=content["reasoning"])]
     elif content["type"] == "image_url":
         return [
             ContentImage(
@@ -428,6 +433,9 @@
         ]
     elif content["type"] == "refusal":
         return [ContentText(text=content["refusal"])]
+    else:
+        content_type = content["type"]
+        raise ValueError(f"Unexpected content type '{content_type}' in message.")
 
 
 def chat_message_assistant_from_openai(
```
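The new guard at the top of `content_from_openai()` tolerates providers that send a content part without a `"type"` tag by inferring the tag from the part's single key. A standalone sketch of that normalization (`normalize_content_part` is an illustrative helper, not an inspect_ai function):

```python
from typing import Any


def normalize_content_part(content: dict[str, Any]) -> dict[str, Any]:
    # if the type tag is missing and the part has exactly one field,
    # treat that field's name as the type
    if "type" not in content and len(content) == 1:
        content["type"] = list(content.keys())[0]
    return content


assert normalize_content_part({"reasoning": "step 1 ..."})["type"] == "reasoning"
assert normalize_content_part({"type": "text", "text": "hi"})["type"] == "text"
```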
inspect_ai/model/_providers/anthropic.py
CHANGED
```diff
@@ -4,9 +4,14 @@ import re
 import sys
 from copy import copy
 from logging import getLogger
-from typing import Any, Literal, Tuple, TypedDict, cast
+from typing import Any, Literal, Optional, Tuple, TypedDict, cast
 
-
+import httpcore
+import httpx
+
+from inspect_ai._util.http import is_retryable_http_status
+
+from .util.hooks import HttpxHooks
 
 if sys.version_info >= (3, 11):
     from typing import NotRequired
@@ -16,13 +21,12 @@ else:
 from anthropic import (
     APIConnectionError,
     APIStatusError,
+    APITimeoutError,
     AsyncAnthropic,
     AsyncAnthropicBedrock,
     AsyncAnthropicVertex,
     BadRequestError,
-    InternalServerError,
     NotGiven,
-    RateLimitError,
 )
 from anthropic._types import Body
 from anthropic.types import (
@@ -46,7 +50,6 @@ from typing_extensions import override
 
 from inspect_ai._util.constants import (
     BASE_64_DATA_REMOVED,
-    DEFAULT_MAX_RETRIES,
     NO_CONTENT,
 )
 from inspect_ai._util.content import (
@@ -125,9 +128,6 @@ class AnthropicAPI(ModelAPI):
             AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex
         ) = AsyncAnthropicBedrock(
             base_url=base_url,
-            max_retries=(
-                config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
-            ),
             aws_region=aws_region,
             **model_args,
         )
@@ -141,9 +141,6 @@ class AnthropicAPI(ModelAPI):
                 region=region,
                 project_id=project_id,
                 base_url=base_url,
-                max_retries=(
-                    config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
-                ),
                 **model_args,
             )
         else:
@@ -156,14 +153,11 @@ class AnthropicAPI(ModelAPI):
             self.client = AsyncAnthropic(
                 base_url=base_url,
                 api_key=self.api_key,
-                max_retries=(
-                    config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
-                ),
                 **model_args,
             )
 
         # create time tracker
-        self.
+        self._http_hooks = HttpxHooks(self.client._client)
 
     @override
     async def close(self) -> None:
@@ -183,7 +177,7 @@ class AnthropicAPI(ModelAPI):
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # allocate request_id (so we can see it from ModelCall)
-        request_id = self.
+        request_id = self._http_hooks.start_request()
 
         # setup request and response for ModelCall
         request: dict[str, Any] = {}
@@ -194,7 +188,7 @@ class AnthropicAPI(ModelAPI):
                 request=request,
                 response=response,
                 filter=model_call_filter,
-                time=self.
+                time=self._http_hooks.end_request(request_id),
             )
 
         # generate
@@ -204,7 +198,7 @@ class AnthropicAPI(ModelAPI):
                 tools_param,
                 messages,
                 computer_use,
-            ) = await resolve_chat_input(
+            ) = await self.resolve_chat_input(input, tools, config)
 
             # prepare request params (assembed this way so we can log the raw model call)
             request = dict(messages=messages)
@@ -223,9 +217,9 @@ class AnthropicAPI(ModelAPI):
             request = request | req
 
             # extra headers (for time tracker and computer use)
-            extra_headers = headers | {
+            extra_headers = headers | {HttpxHooks.REQUEST_ID_HEADER: request_id}
             if computer_use:
-                betas.append("computer-use-2024-10-22")
+                betas.append("computer-use-2025-01-24")
             if len(betas) > 0:
                 extra_headers["anthropic-beta"] = ",".join(betas)
 
@@ -291,8 +285,6 @@ class AnthropicAPI(ModelAPI):
             betas.append("output-128k-2025-02-19")
 
         # config that applies to all models
-        if config.timeout is not None:
-            params["timeout"] = float(config.timeout)
         if config.stop_seqs is not None:
             params["stop_sequences"] = config.stop_seqs
 
@@ -326,18 +318,27 @@ class AnthropicAPI(ModelAPI):
     def is_claude_3_5(self) -> bool:
         return "claude-3-5-" in self.model_name
 
+    def is_claude_3_7(self) -> bool:
+        return "claude-3-7-" in self.model_name
+
     @override
     def connection_key(self) -> str:
         return str(self.api_key)
 
     @override
-    def is_rate_limit(self, ex: Exception) -> bool:
-
-
-
-
-
-
+    def should_retry(self, ex: Exception) -> bool:
+        if isinstance(ex, APIStatusError):
+            return is_retryable_http_status(ex.status_code)
+        elif isinstance(
+            ex,
+            APIConnectionError
+            | APITimeoutError
+            | httpx.RemoteProtocolError
+            | httpcore.RemoteProtocolError,
+        ):
+            return True
+        else:
+            return False
 
     @override
     def collapse_user_messages(self) -> bool:
```
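`should_retry()` now classifies `APIStatusError` via `is_retryable_http_status()` from `inspect_ai._util.http`, whose exact rule is not shown in this diff. A plausible approximation, offered as an assumption only:

```python
def is_retryable_http_status(status_code: int) -> bool:
    # ASSUMPTION: approximates inspect_ai._util.http.is_retryable_http_status;
    # 408 (request timeout), 429 (rate limit), and most 5xx responses are
    # transient, while 501 (not implemented) is permanent
    return status_code in (408, 429) or (status_code >= 500 and status_code != 501)
```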
```diff
@@ -397,6 +398,148 @@ class AnthropicAPI(ModelAPI):
         else:
             return ex
 
+    async def resolve_chat_input(
+        self,
+        input: list[ChatMessage],
+        tools: list[ToolInfo],
+        config: GenerateConfig,
+    ) -> Tuple[
+        list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
+    ]:
+        # extract system message
+        system_messages, messages = split_system_messages(input, config)
+
+        # messages
+        message_params = [(await message_param(message)) for message in messages]
+
+        # collapse user messages (as Inspect 'tool' messages become Claude 'user' messages)
+        message_params = functools.reduce(
+            consecutive_user_message_reducer, message_params, []
+        )
+
+        # tools
+        tools_params, computer_use = self.tool_params_for_tools(tools, config)
+
+        # system messages
+        if len(system_messages) > 0:
+            system_param: list[TextBlockParam] | None = [
+                TextBlockParam(type="text", text=message.text)
+                for message in system_messages
+            ]
+        else:
+            system_param = None
+
+        # add caching directives if necessary
+        cache_prompt = (
+            config.cache_prompt
+            if isinstance(config.cache_prompt, bool)
+            else True
+            if len(tools_params)
+            else False
+        )
+
+        # only certain claude models qualify
+        if cache_prompt:
+            if (
+                "claude-3-sonnet" in self.model_name
+                or "claude-2" in self.model_name
+                or "claude-instant" in self.model_name
+            ):
+                cache_prompt = False
+
+        if cache_prompt:
+            # system
+            if system_param:
+                add_cache_control(system_param[-1])
+            # tools
+            if tools_params:
+                add_cache_control(tools_params[-1])
+            # last 2 user messages
+            user_message_params = list(
+                filter(lambda m: m["role"] == "user", reversed(message_params))
+            )
+            for message in user_message_params[:2]:
+                if isinstance(message["content"], str):
+                    text_param = TextBlockParam(type="text", text=message["content"])
+                    add_cache_control(text_param)
+                    message["content"] = [text_param]
+                else:
+                    content = list(message["content"])
+                    add_cache_control(cast(dict[str, Any], content[-1]))
+
+        # return chat input
+        return system_param, tools_params, message_params, computer_use
+
+    def tool_params_for_tools(
+        self, tools: list[ToolInfo], config: GenerateConfig
+    ) -> tuple[list["ToolParamDef"], bool]:
+        # tool params and computer_use bit to return
+        tool_params: list["ToolParamDef"] = []
+        computer_use = False
+
+        # for each tool, check if it has a native computer use implementation and use that
+        # when available (noting that we need to set the computer use request header)
+        for tool in tools:
+            computer_use_tool = (
+                self.computer_use_tool_param(tool)
+                if config.internal_tools is not False
+                else None
+            )
+            if computer_use_tool:
+                tool_params.append(computer_use_tool)
+                computer_use = True
+            else:
+                tool_params.append(
+                    ToolParam(
+                        name=tool.name,
+                        description=tool.description,
+                        input_schema=tool.parameters.model_dump(exclude_none=True),
+                    )
+                )
+
+        return tool_params, computer_use
+
+    def computer_use_tool_param(
+        self, tool: ToolInfo
+    ) -> Optional["ComputerUseToolParam"]:
+        # check for compatible 'computer' tool
+        if tool.name == "computer" and (
+            sorted(tool.parameters.properties.keys())
+            == sorted(
+                [
+                    "action",
+                    "coordinate",
+                    "duration",
+                    "scroll_amount",
+                    "scroll_direction",
+                    "start_coordinate",
+                    "text",
+                ]
+            )
+        ):
+            if self.is_claude_3_5():
+                warn_once(
+                    logger,
+                    "Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
+                )
+                return None
+            return ComputerUseToolParam(
+                type="computer_20250124",
+                name="computer",
+                # Note: The dimensions passed here for display_width_px and display_height_px should
+                # match the dimensions of screenshots returned by the tool.
+                # Those dimensions will always be one of the values in MAX_SCALING_TARGETS
+                # in _x11_client.py.
+                # TODO: enhance this code to calculate the dimensions based on the scaled screen
+                # size used by the container.
+                display_width_px=1366,
+                display_height_px=768,
+                display_number=1,
+            )
+        # not a computer_use tool
+        else:
+            return None
+
 
 # native anthropic tool definitions for computer use beta
 # https://docs.anthropic.com/en/docs/build-with-claude/computer-use
@@ -412,131 +555,6 @@ class ComputerUseToolParam(TypedDict):
 ToolParamDef = ToolParam | ComputerUseToolParam
 
 
-async def resolve_chat_input(
-    model: str,
-    input: list[ChatMessage],
-    tools: list[ToolInfo],
-    config: GenerateConfig,
-) -> Tuple[list[TextBlockParam] | None, list[ToolParamDef], list[MessageParam], bool]:
-    # extract system message
-    system_messages, messages = split_system_messages(input, config)
-
-    # messages
-    message_params = [(await message_param(message)) for message in messages]
-
-    # collapse user messages (as Inspect 'tool' messages become Claude 'user' messages)
-    message_params = functools.reduce(
-        consecutive_user_message_reducer, message_params, []
-    )
-
-    # tools
-    tools_params, computer_use = tool_params_for_tools(tools, config)
-
-    # system messages
-    if len(system_messages) > 0:
-        system_param: list[TextBlockParam] | None = [
-            TextBlockParam(type="text", text=message.text)
-            for message in system_messages
-        ]
-    else:
-        system_param = None
-
-    # add caching directives if necessary
-    cache_prompt = (
-        config.cache_prompt
-        if isinstance(config.cache_prompt, bool)
-        else True
-        if len(tools_params)
-        else False
-    )
-
-    # only certain claude models qualify
-    if cache_prompt:
-        if (
-            "claude-3-sonnet" in model
-            or "claude-2" in model
-            or "claude-instant" in model
-        ):
-            cache_prompt = False
-
-    if cache_prompt:
-        # system
-        if system_param:
-            add_cache_control(system_param[-1])
-        # tools
-        if tools_params:
-            add_cache_control(tools_params[-1])
-        # last 2 user messages
-        user_message_params = list(
-            filter(lambda m: m["role"] == "user", reversed(message_params))
-        )
-        for message in user_message_params[:2]:
-            if isinstance(message["content"], str):
-                text_param = TextBlockParam(type="text", text=message["content"])
-                add_cache_control(text_param)
-                message["content"] = [text_param]
-            else:
-                content = list(message["content"])
-                add_cache_control(cast(dict[str, Any], content[-1]))
-
-    # return chat input
-    return system_param, tools_params, message_params, computer_use
-
-
-def tool_params_for_tools(
-    tools: list[ToolInfo], config: GenerateConfig
-) -> tuple[list[ToolParamDef], bool]:
-    # tool params and computer_use bit to return
-    tool_params: list[ToolParamDef] = []
-    computer_use = False
-
-    # for each tool, check if it has a native computer use implementation and use that
-    # when available (noting that we need to set the computer use request header)
-    for tool in tools:
-        computer_use_tool = (
-            computer_use_tool_param(tool)
-            if config.internal_tools is not False
-            else None
-        )
-        if computer_use_tool:
-            tool_params.append(computer_use_tool)
-            computer_use = True
-        else:
-            tool_params.append(
-                ToolParam(
-                    name=tool.name,
-                    description=tool.description,
-                    input_schema=tool.parameters.model_dump(exclude_none=True),
-                )
-            )
-
-    return tool_params, computer_use
-
-
-def computer_use_tool_param(tool: ToolInfo) -> ComputerUseToolParam | None:
-    # check for compatible 'computer' tool
-    if tool.name == "computer" and (
-        sorted(tool.parameters.properties.keys())
-        == sorted(["action", "coordinate", "text"])
-    ):
-        return ComputerUseToolParam(
-            type="computer_20241022",
-            name="computer",
-            # Note: The dimensions passed here for display_width_px and display_height_px should
-            # match the dimensions of screenshots returned by the tool.
-            # Those dimensions will always be one of the values in MAX_SCALING_TARGETS
-            # in _x11_client.py.
-            # TODO: enhance this code to calculate the dimensions based on the scaled screen
-            # size used by the container.
-            display_width_px=1366,
-            display_height_px=768,
-            display_number=1,
-        )
-    # not a computer_use tool
-    else:
-        return None
-
-
 def add_cache_control(
     param: TextBlockParam | ToolParam | ComputerUseToolParam | dict[str, Any],
 ) -> None:
```