PyPI - inspect-ai - Versions diffs - 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl - Mend

inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180)

inspect_ai/__init__.py +2 -1
inspect_ai/_display/textual/app.py +14 -3
inspect_ai/_display/textual/display.py +4 -0
inspect_ai/_display/textual/widgets/samples.py +9 -3
inspect_ai/_display/textual/widgets/task_detail.py +3 -4
inspect_ai/_display/textual/widgets/tasks.py +17 -1
inspect_ai/_display/textual/widgets/vscode.py +48 -0
inspect_ai/_eval/eval.py +36 -24
inspect_ai/_eval/evalset.py +17 -18
inspect_ai/_eval/loader.py +34 -11
inspect_ai/_eval/run.py +8 -13
inspect_ai/_eval/score.py +13 -3
inspect_ai/_eval/task/generate.py +8 -9
inspect_ai/_eval/task/log.py +2 -0
inspect_ai/_eval/task/task.py +23 -9
inspect_ai/_util/file.py +13 -0
inspect_ai/_util/json.py +2 -1
inspect_ai/_util/registry.py +1 -0
inspect_ai/_util/vscode.py +37 -0
inspect_ai/_view/www/App.css +6 -0
inspect_ai/_view/www/dist/assets/index.css +304 -128
inspect_ai/_view/www/dist/assets/index.js +47495 -27519
inspect_ai/_view/www/log-schema.json +124 -31
inspect_ai/_view/www/package.json +3 -0
inspect_ai/_view/www/src/App.tsx +12 -0
inspect_ai/_view/www/src/appearance/icons.ts +1 -0
inspect_ai/_view/www/src/components/Card.tsx +6 -4
inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
inspect_ai/_view/www/src/components/Modal.module.css +38 -0
inspect_ai/_view/www/src/components/Modal.tsx +77 -0
inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
inspect_ai/_view/www/src/state/hooks.ts +5 -3
inspect_ai/_view/www/src/state/logPolling.ts +5 -1
inspect_ai/_view/www/src/state/logSlice.ts +10 -0
inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
inspect_ai/_view/www/src/types/log.d.ts +34 -26
inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
inspect_ai/_view/www/yarn.lock +94 -1
inspect_ai/agent/__init__.py +36 -0
inspect_ai/agent/_agent.py +268 -0
inspect_ai/agent/_as_solver.py +72 -0
inspect_ai/agent/_as_tool.py +122 -0
inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
inspect_ai/agent/_filter.py +46 -0
inspect_ai/agent/_handoff.py +93 -0
inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
inspect_ai/agent/_react.py +241 -0
inspect_ai/agent/_run.py +36 -0
inspect_ai/agent/_types.py +81 -0
inspect_ai/log/_log.py +11 -2
inspect_ai/log/_transcript.py +13 -9
inspect_ai/model/__init__.py +7 -1
inspect_ai/model/_call_tools.py +256 -52
inspect_ai/model/_chat_message.py +7 -4
inspect_ai/model/_conversation.py +13 -62
inspect_ai/model/_display.py +85 -0
inspect_ai/model/_model.py +113 -14
inspect_ai/model/_model_output.py +14 -9
inspect_ai/model/_openai.py +16 -4
inspect_ai/model/_openai_computer_use.py +162 -0
inspect_ai/model/_openai_responses.py +319 -165
inspect_ai/model/_providers/anthropic.py +20 -21
inspect_ai/model/_providers/azureai.py +24 -13
inspect_ai/model/_providers/bedrock.py +1 -7
inspect_ai/model/_providers/cloudflare.py +3 -3
inspect_ai/model/_providers/goodfire.py +2 -6
inspect_ai/model/_providers/google.py +11 -10
inspect_ai/model/_providers/groq.py +6 -3
inspect_ai/model/_providers/hf.py +7 -3
inspect_ai/model/_providers/mistral.py +7 -10
inspect_ai/model/_providers/openai.py +47 -17
inspect_ai/model/_providers/openai_o1.py +11 -4
inspect_ai/model/_providers/openai_responses.py +12 -14
inspect_ai/model/_providers/providers.py +2 -2
inspect_ai/model/_providers/together.py +12 -2
inspect_ai/model/_providers/util/chatapi.py +7 -2
inspect_ai/model/_providers/util/hf_handler.py +4 -2
inspect_ai/model/_providers/util/llama31.py +4 -2
inspect_ai/model/_providers/vertex.py +11 -9
inspect_ai/model/_providers/vllm.py +4 -4
inspect_ai/scorer/__init__.py +2 -0
inspect_ai/scorer/_metrics/__init__.py +2 -0
inspect_ai/scorer/_metrics/grouped.py +84 -0
inspect_ai/scorer/_score.py +26 -6
inspect_ai/solver/__init__.py +2 -2
inspect_ai/solver/_basic_agent.py +22 -9
inspect_ai/solver/_bridge.py +31 -0
inspect_ai/solver/_chain.py +20 -12
inspect_ai/solver/_fork.py +5 -1
inspect_ai/solver/_human_agent.py +52 -0
inspect_ai/solver/_prompt.py +3 -1
inspect_ai/solver/_run.py +59 -0
inspect_ai/solver/_solver.py +14 -4
inspect_ai/solver/_task_state.py +5 -3
inspect_ai/tool/_tool_call.py +15 -8
inspect_ai/tool/_tool_def.py +17 -12
inspect_ai/tool/_tool_support_helpers.py +2 -2
inspect_ai/tool/_tool_with.py +14 -11
inspect_ai/tool/_tools/_bash_session.py +11 -2
inspect_ai/tool/_tools/_computer/_common.py +18 -2
inspect_ai/tool/_tools/_computer/_computer.py +18 -2
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
inspect_ai/tool/_tools/_think.py +1 -1
inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
inspect_ai/util/__init__.py +2 -0
inspect_ai/util/_anyio.py +27 -0
inspect_ai/util/_sandbox/__init__.py +2 -1
inspect_ai/util/_sandbox/context.py +32 -7
inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
inspect_ai/util/_sandbox/docker/compose.py +2 -2
inspect_ai/util/_sandbox/docker/docker.py +12 -1
inspect_ai/util/_store_model.py +30 -7
inspect_ai/util/_subprocess.py +13 -3
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/METADATA +1 -1
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/RECORD +179 -153
inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
/inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/top_level.txt +0 -0

inspect_ai/model/_model.py CHANGED Viewed

@@ -6,7 +6,7 @@ import logging
 import os
 import time
 from contextvars import ContextVar
-from copy import deepcopy
+from copy import copy, deepcopy
 from datetime import datetime
 from types import TracebackType
 from typing import Any, AsyncIterator, Callable, Literal, Type, cast
@@ -45,11 +45,17 @@ from inspect_ai._util.retry import report_http_retry
 from inspect_ai._util.trace import trace_action
 from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
+from inspect_ai.tool._tool_call import ToolCallModelInputHints
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
-from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
+from ._call_tools import (
+    disable_parallel_tools,
+    execute_tools,
+    tool_call_view,
+    tools_info,
+)
 from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
@@ -57,7 +63,10 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
-from ._conversation import conversation_assistant_error, conversation_assistant_message
+from ._display import (
+    display_conversation_assistant,
+    display_conversation_assistant_error,
+)
 from ._generate_config import (
     GenerateConfig,
     active_generate_config,
@@ -123,9 +132,20 @@ class ModelAPI(abc.ABC):
         # set any explicitly specified api key
         self.api_key = api_key
-    async def close(self) -> None:
-        """Close method for closing any client allocated for the model."""
-        pass
+    async def aclose(self) -> None:
+        """Async close method for closing any client allocated for the model."""
+        self.close()
+    def close(self) -> None:
+        """Sync close method for closing any client allocated for the model."""
+        # if this is is called and aclose is implemented by a subclass then
+        # raise a runtime error (as this model reuqires async close)
+        aclose_method = getattr(self.__class__, "aclose")
+        base_aclose_method = getattr(ModelAPI, "aclose")
+        if aclose_method != base_aclose_method:
+            raise RuntimeError(
+                f"{self.__class__.__name__} models require an async close / context manager."
+            )
     @abc.abstractmethod
     async def generate(
@@ -201,6 +221,10 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False
+    def disable_computer_screenshot_truncation(self) -> bool:
+        """Some models do not support truncation of computer screenshots."""
+        return False
     def emulate_reasoning_history(self) -> bool:
         """Chat message assistant messages with reasoning should playback reasoning with emulation (.e.g. <think> tags)"""
         return True
@@ -255,10 +279,23 @@ class Model:
         # get hit before score() or eval() so we activate nest_asyncio
         platform_init()
-    async def __aenter__(self: "Model") -> "Model":
+    def __enter__(self: "Model") -> "Model":
         self._context_bound = True
         return self
+    async def __aenter__(self: "Model") -> "Model":
+        return self.__enter__()
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        if not self._closed:
+            self.api.close()
+            self._closed = True
     async def __aexit__(
         self,
         exc_type: type[BaseException] | None,
@@ -266,7 +303,7 @@ class Model:
         exc_tb: TracebackType | None,
     ) -> None:
         if not self._closed:
-            await self.api.close()
+            await self.api.aclose()
             self._closed = True
     @property
@@ -373,6 +410,55 @@ class Model:
             # return output
             return output
+    async def generate_loop(
+        self,
+        input: str | list[ChatMessage],
+        tools: list[Tool] | list[ToolDef] | list[Tool | ToolDef] = [],
+        config: GenerateConfig = GenerateConfig(),
+        cache: bool | CachePolicy = False,
+    ) -> tuple[list[ChatMessage], ModelOutput]:
+        """Generate output from the model, looping as long as the model calls tools.
+        Similar to `generate()`, but runs in a loop resolving model tool calls.
+        The loop terminates when the model stops calling tools. The final `ModelOutput`
+        as well the message list for the conversation are returned as a tuple.
+        Args:
+          input: Chat message input (if a `str` is passed it is converted
+            to a `ChatMessageUser`).
+          tools: Tools available for the model to call.
+          config: Model configuration.
+          cache: Caching behavior for generate responses (defaults to no caching).
+        Returns:
+           Tuple of list[ChatMessage], ModelOutput
+        """
+        # initialise messages
+        input = [ChatMessageUser(content=input)] if isinstance(input, str) else input
+        messages = copy(input)
+        while True:
+            # call model
+            output = await self.generate(
+                input=messages,
+                tools=tools,  # type:ignore[arg-type]
+                config=config,
+                cache=cache,
+            )
+            # append to new messages
+            messages.append(output.message)
+            # make tool calls or terminate if there are none
+            if output.message.tool_calls:
+                tools_messages, tools_output = await execute_tools(
+                    messages, tools, config.max_tool_output
+                )
+                messages.extend(tools_messages)
+                if tools_output is not None:
+                    output = tools_output
+            else:
+                return messages[len(input) :], output
     async def _generate(
         self,
         input: list[ChatMessage],
@@ -414,7 +500,13 @@ class Model:
         input = resolve_reasoning_history(input, config, self.api)
         # apply any tool model_input handlers
-        input = resolve_tool_model_input(tdefs, input)
+        input = resolve_tool_model_input(
+            tdefs,
+            input,
+            ToolCallModelInputHints(
+                disable_computer_screenshot_truncation=self.api.disable_computer_screenshot_truncation()
+            ),
+        )
         # break tool image content out into user messages if the model doesn't
         # support tools returning images
@@ -664,10 +756,10 @@ class Model:
             # trace
             if isinstance(result, ModelOutput):
                 if result.choices:
-                    conversation_assistant_message(input, result.choices[0].message)
+                    display_conversation_assistant(input, result.choices[0].message)
                 event.output = result
             else:
-                conversation_assistant_error(result)
+                display_conversation_assistant_error(result)
                 event.error = repr(result)
             event.call = updated_call
@@ -1034,7 +1126,7 @@ def resolve_reasoning_history(
 def resolve_tool_model_input(
-    tdefs: list[ToolDef], messages: list[ChatMessage]
+    tdefs: list[ToolDef], messages: list[ChatMessage], hints: ToolCallModelInputHints
 ) -> list[ChatMessage]:
     # filter on tooldefs that have a model input handler
     tdefs = [tdef for tdef in tdefs if tdef.model_input is not None]
@@ -1060,7 +1152,7 @@ def resolve_tool_model_input(
         # call the function for each tool, passing the index, total, and content
         for index, message in enumerate(tdef_tool_messages):
             message.content = tdef.model_input(
-                index, len(tool_messages), message.content
+                index, len(tool_messages), message.content, hints
             )
     # return modified messages
@@ -1116,7 +1208,7 @@ def tool_result_images_reducer(
                     content=edited_tool_message_content,
                     tool_call_id=message.tool_call_id,
                     function=message.function,
-                    internal_name=message.internal_name,
+                    internal=message.internal,
                 )
             ],
             pending_content + new_user_message_content,
@@ -1219,6 +1311,13 @@ def consecutive_message_reducer(
 def combine_messages(
     a: ChatMessage, b: ChatMessage, message_type: Type[ChatMessage]
 ) -> ChatMessage:
+    # TODO: Although unlikely to happen based on the current call sites, these
+    # fabricated messages drop interesting fields from the source messages -
+    # such as `internal_name`, `tool_calls`, etc.
+    # To be more specific, since all `ChatMessageXxx` fields other than `id` and
+    # `content` have default values, it's more the case that they're reset to
+    # default values rather than dropped.
     if isinstance(a.content, str) and isinstance(b.content, str):
         return message_type(id=a.id, content=f"{a.content}\n{b.content}")
     elif isinstance(a.content, list) and isinstance(b.content, list):

inspect_ai/model/_model_output.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import uuid
 from typing import Any, Literal, Type
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, JsonValue, model_validator
 from inspect_ai.tool._tool_call import ToolCall
@@ -123,6 +123,10 @@ class ModelOutput(BaseModel):
     error: str | None = Field(default=None)
     """Error message in the case of content moderation refusals."""
+    @property
+    def empty(self) -> bool:
+        return len(self.choices) == 0
     @property
     def stop_reason(self) -> StopReason:
         """First message stop reason."""
@@ -153,7 +157,8 @@ class ModelOutput(BaseModel):
         else:
             self.choices.append(
                 ChatCompletionChoice(
-                    message=ChatMessageAssistant(content=completion), stop_reason="stop"
+                    message=ChatMessageAssistant(content=completion, model=self.model),
+                    stop_reason="stop",
                 )
             )
@@ -176,7 +181,9 @@ class ModelOutput(BaseModel):
             model=model,
             choices=[
                 ChatCompletionChoice(
-                    message=ChatMessageAssistant(content=content, source="generate"),
+                    message=ChatMessageAssistant(
+                        content=content, model=model, source="generate"
+                    ),
                     stop_reason=stop_reason,
                 )
             ],
@@ -188,10 +195,9 @@ class ModelOutput(BaseModel):
         model: str,
         tool_name: str,
         tool_arguments: dict[str, Any],
-        internal_tool_name: str | None = None,
+        internal: JsonValue | None = None,
         tool_call_id: str | None = None,
         content: str | None = None,
-        type: str = "function",
     ) -> "ModelOutput":
         """
         Returns a ModelOutput for requesting a tool call.
@@ -199,8 +205,7 @@ class ModelOutput(BaseModel):
         Args:
             model: model name
             tool_name: The name of the tool.
-            internal_tool_name: The model's internal name for the tool (if any).
-            type: The model's type for the tool. e.g. "function", "computer_use_preview"
+            internal: The model's internal info for the tool (if any).
             tool_arguments: The arguments passed to the tool.
             tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
             content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
@@ -220,14 +225,14 @@ class ModelOutput(BaseModel):
                 ChatCompletionChoice(
                     message=ChatMessageAssistant(
                         content=content,
+                        model=model,
                         source="generate",
                         tool_calls=[
                             ToolCall(
                                 id=tool_call_id,
                                 function=tool_name,
-                                internal_name=internal_tool_name,
+                                internal=internal,
                                 arguments=tool_arguments,
-                                type=type,
                             )
                         ],
                     ),

inspect_ai/model/_openai.py CHANGED Viewed

@@ -83,6 +83,10 @@ def is_o1_preview(name: str) -> bool:
     return "o1-preview" in name
+def is_computer_use_preview(name: str) -> bool:
+    return "computer-use-preview" in name
 def is_gpt(name: str) -> bool:
     return "gpt" in name
@@ -100,13 +104,12 @@ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
 def openai_chat_tool_call_param(
     tool_call: ToolCall,
 ) -> ChatCompletionMessageToolCallParam:
-    assert tool_call.type == "function", f"Unexpected tool call type {tool_call.type}"
     return ChatCompletionMessageToolCallParam(
         id=tool_call.id,
         function=dict(
             name=tool_call.function, arguments=json.dumps(tool_call.arguments)
         ),
-        type="function",  # Type narrowing couldn't figure it out
+        type="function",
     )
@@ -308,6 +311,7 @@ def chat_tool_calls_from_openai(
 def chat_messages_from_openai(
+    model: str,
     messages: list[ChatCompletionMessageParam],
 ) -> list[ChatMessage]:
     # track tool names by id
@@ -386,6 +390,8 @@ def chat_messages_from_openai(
                 ChatMessageAssistant(
                     content=content,
                     tool_calls=tool_calls or None,
+                    model=model,
+                    source="generate",
                 )
             )
         elif message["role"] == "tool":
@@ -464,7 +470,7 @@ def content_from_openai(
 def chat_message_assistant_from_openai(
-    message: ChatCompletionMessage, tools: list[ToolInfo]
+    model: str, message: ChatCompletionMessage, tools: list[ToolInfo]
 ) -> ChatMessageAssistant:
     refusal = getattr(message, "refusal", None)
     reasoning = getattr(message, "reasoning_content", None) or getattr(
@@ -484,6 +490,7 @@ def chat_message_assistant_from_openai(
     return ChatMessageAssistant(
         content=content,
+        model=model,
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
     )
@@ -496,7 +503,9 @@ def chat_choices_from_openai(
     choices.sort(key=lambda c: c.index)
     return [
         ChatCompletionChoice(
-            message=chat_message_assistant_from_openai(choice.message, tools),
+            message=chat_message_assistant_from_openai(
+                response.model, choice.message, tools
+            ),
             stop_reason=as_stop_reason(choice.finish_reason),
             logprobs=(
                 Logprobs(**choice.logprobs.model_dump())
@@ -538,6 +547,9 @@ def openai_handle_bad_request(
 def openai_media_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
     # remove images from raw api call
+    if key == "output" and isinstance(value, dict) and "image_url" in value:
+        value = copy(value)
+        value.update(image_url=BASE_64_DATA_REMOVED)
     if key == "image_url" and isinstance(value, dict) and "url" in value:
         url = str(value.get("url"))
         if url.startswith("data:"):

inspect_ai/model/_openai_computer_use.py ADDED Viewed

@@ -0,0 +1,162 @@
+from openai.types.responses import (
+    ComputerToolParam,
+    ResponseComputerToolCall,
+    ResponseComputerToolCallOutputScreenshotParam,
+)
+from openai.types.responses.response_input_item_param import ComputerCallOutput
+from inspect_ai._util.content import Content, ContentImage
+from inspect_ai.model._chat_message import ChatMessageTool
+from inspect_ai.tool._tool_call import ToolCall
+from inspect_ai.tool._tool_info import ToolInfo
+def tool_call_from_openai_computer_tool_call(
+    output: ResponseComputerToolCall,
+) -> ToolCall:
+    return ToolCall(
+        id=output.call_id,
+        function="computer",
+        arguments=_parse_computer_tool_call_arguments(output),
+        internal=output.model_dump(),
+    )
+def maybe_computer_use_preview_tool(tool: ToolInfo) -> ComputerToolParam | None:
+    # check for compatible 'computer' tool
+    return (
+        ComputerToolParam(
+            type="computer_use_preview",
+            # The OpenAI model is ahead of the sdk — "ubuntu" -> "linux"
+            environment="linux",  # type: ignore
+            # Note: The dimensions passed here for display_width and display_height should
+            # match the dimensions of screenshots returned by the tool.
+            # Those dimensions will always be one of the values in MAX_SCALING_TARGETS
+            # in _x11_client.py.
+            # TODO: enhance this code to calculate the dimensions based on the scaled screen
+            # size used by the container.
+            display_width=1366,
+            display_height=768,
+        )
+        if tool.name == "computer"
+        and (
+            sorted(tool.parameters.properties.keys())
+            == sorted(
+                [
+                    "action",
+                    "coordinate",
+                    "duration",
+                    "scroll_amount",
+                    "scroll_direction",
+                    "start_coordinate",
+                    "text",
+                ]
+            )
+        )
+        else None
+    )
+def computer_call_output(
+    message: ChatMessageTool,
+    # internal is passed in despite being within message to avoid an extra
+    # validation step
+    internal: ResponseComputerToolCall,
+) -> ComputerCallOutput:
+    return ComputerCallOutput(
+        call_id=internal.call_id,
+        type="computer_call_output",
+        output=ResponseComputerToolCallOutputScreenshotParam(
+            type="computer_screenshot",
+            image_url=_content_image(message.content),
+        ),
+    )
+def _parse_computer_tool_call_arguments(
+    output: ResponseComputerToolCall,
+) -> dict[str, object]:
+    action = output.action
+    if action.type == "click":
+        coordinate = [action.x, action.y]
+        match action.button:
+            case "left":
+                return {"action": "left_click", "coordinate": coordinate}
+            case "right":
+                return {"action": "right_click", "coordinate": coordinate}
+            case "wheel":
+                return {"action": "middle_click", "coordinate": coordinate}
+            case "back":
+                return {"action": "back_click", "coordinate": coordinate}
+            case "forward":
+                return {"action": "forward_click", "coordinate": coordinate}
+    elif action.type == "double_click":
+        return {"action": "double_click", "coordinate": [action.x, action.y]}
+    elif action.type == "drag":
+        # TODO: For now, we go directly from the first to the last coordinate in
+        # the path. Ultimately, we'll need to extend the tool to support all of
+        # the intermediate coordinates in the path.
+        path = action.path
+        assert len(path) >= 2
+        start = path[0]
+        end = path[-1]
+        return {
+            "action": "left_click_drag",
+            "start_coordinate": [start.x, start.y],
+            "coordinate": [end.x, end.y],
+        }
+    elif action.type == "keypress":
+        # TODO: This mapping logic is copied from their example, but seems incomplete
+        mapping = {
+            "ENTER": "Return",
+            "LEFT": "Left",
+            "RIGHT": "Right",
+            "UP": "Up",
+            "DOWN": "Down",
+            "ESC": "Escape",
+            "SPACE": "space",
+            "BACKSPACE": "BackSpace",
+            "TAB": "Tab",
+        }
+        return {
+            "action": "key",
+            "text": "+".join([mapping.get(key, key) for key in action.keys]),
+        }
+    elif action.type == "move":
+        return {"action": "mouse_move", "coordinate": [action.x, action.y]}
+    elif action.type == "screenshot":
+        return {"action": "screenshot"}
+    elif action.type == "scroll":
+        # TODO: OpenAI spec's with x/y distances. Their example code treats the
+        # unit of measurement as a "click" of the scroll wheel. Since it's not
+        # really a thing to scroll both horizontally and vertically at the same
+        # time, we'll just pick one of the potentially two directions and
+        # scroll along that dimension.
+        (scroll_direction, scroll_amount) = (
+            ("right" if action.scroll_x > 0 else "left", abs(action.scroll_x))
+            if action.scroll_x
+            else ("down" if action.scroll_y > 0 else "up", abs(action.scroll_y))
+        )
+        return {
+            "action": "scroll",
+            "coordinate": [action.x, action.y],
+            "scroll_direction": scroll_direction,
+            "scroll_amount": scroll_amount,
+        }
+    elif action.type == "type":
+        return {"action": "type", "text": action.text}
+    elif action.type == "wait":
+        return {"action": "wait", "duration": 1}
+    assert False, f"Unexpected action type: {action.type}"
+def _content_image(input: str | list[Content]) -> str:
+    result = (
+        next((item.image for item in input if isinstance(item, ContentImage)), None)
+        if isinstance(input, list)
+        else None
+    )
+    assert result, "Must find image in content"
+    return result

inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl

inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl