inspect-ai 0.3.70__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +134 -26
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  157. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  158. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  159. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  160. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  161. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  162. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  163. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  164. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  165. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  166. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  167. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  168. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  173. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  174. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  180. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  184. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  185. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  186. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  187. inspect_ai/util/__init__.py +2 -1
  188. inspect_ai/util/_display.py +12 -0
  189. inspect_ai/util/_sandbox/events.py +55 -21
  190. inspect_ai/util/_sandbox/self_check.py +131 -43
  191. inspect_ai/util/_subtask.py +11 -0
  192. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +1 -1
  193. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +197 -182
  194. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  195. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  196. inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
  197. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  198. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  199. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  200. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  201. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  203. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  204. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  205. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  207. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  208. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0

inspect_ai/model/_openai.py +106 -40

@@ -27,11 +27,18 @@ from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
 
-from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.model._call_tools import parse_tool_call
 from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
+from inspect_ai.model._reasoning import parse_content_with_reasoning
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
 
 from ._chat_message import (
@@ -148,14 +155,14 @@ async def openai_chat_message(
         if message.tool_calls:
             return ChatCompletionAssistantMessageParam(
                 role=message.role,
-                content=message.text,
+                content=openai_assistant_content(message),
                 tool_calls=[
                     openai_chat_tool_call_param(call) for call in message.tool_calls
                 ],
             )
         else:
             return ChatCompletionAssistantMessageParam(
-                role=message.role, content=message.text
+                role=message.role, content=openai_assistant_content(message)
             )
     elif message.role == "tool":
         return ChatCompletionToolMessageParam(
@@ -175,16 +182,29 @@ async def openai_chat_messages(
     return [await openai_chat_message(message, model) for message in messages]
 
 
+def openai_assistant_content(message: ChatMessageAssistant) -> str:
+    if isinstance(message.content, str):
+        content = message.content
+    else:
+        content = ""
+        for c in message.content:
+            if c.type == "reasoning":
+                attribs = ""
+                if c.signature is not None:
+                    attribs = f'{attribs} signature="{c.signature}"'
+                if c.redacted:
+                    attribs = f'{attribs} redacted="true"'
+                content = f"{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n"
+            elif c.type == "text":
+                content = f"{content}\n{c.text}"
+    return content
+
+
 def openai_chat_choices(choices: list[ChatCompletionChoice]) -> list[Choice]:
     oai_choices: list[Choice] = []
 
     for index, choice in enumerate(choices):
-        if isinstance(choice.message.content, str):
-            content = choice.message.content
-        else:
-            content = "\n".join(
-                [c.text for c in choice.message.content if c.type == "text"]
-            )
+        content = openai_assistant_content(choice.message)
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
@@ -274,35 +294,47 @@ def chat_messages_from_openai(
     chat_messages: list[ChatMessage] = []
 
     for message in messages:
+        content: str | list[Content] = []
         if message["role"] == "system" or message["role"] == "developer":
             sys_content = message["content"]
             if isinstance(sys_content, str):
                 chat_messages.append(ChatMessageSystem(content=sys_content))
             else:
-                chat_messages.append(
-                    ChatMessageSystem(
-                        content=[content_from_openai(c) for c in sys_content]
-                    )
-                )
+                content = []
+                for sc in sys_content:
+                    content.extend(content_from_openai(sc))
+                chat_messages.append(ChatMessageSystem(content=content))
         elif message["role"] == "user":
             user_content = message["content"]
            if isinstance(user_content, str):
                 chat_messages.append(ChatMessageUser(content=user_content))
             else:
-                chat_messages.append(
-                    ChatMessageUser(
-                        content=[content_from_openai(c) for c in user_content]
-                    )
-                )
+                content = []
+                for uc in user_content:
+                    content.extend(content_from_openai(uc))
+                chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
-            asst_content = message["content"]
+            asst_content = message.get("content", None)
             if isinstance(asst_content, str):
-                content: str | list[Content] = asst_content
+                result = parse_content_with_reasoning(asst_content)
+                if result is not None:
+                    content = [
+                        ContentReasoning(
+                            reasoning=result.reasoning,
+                            signature=result.signature,
+                            redacted=result.redacted,
+                        ),
+                        ContentText(text=result.content),
+                    ]
+                else:
+                    content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
             else:
-                content = [content_from_openai(c) for c in asst_content]
+                content = []
+                for ac in asst_content:
+                    content.extend(content_from_openai(ac, parse_reasoning=True))
 
             # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
             # interfaces e.g. DeepSeek do include this field so we pluck it out)
@@ -310,22 +342,25 @@
                 "reasoning", None
             )
             if reasoning is not None:
-                reasoning = str(reasoning)
+                if isinstance(content, str):
+                    content = [ContentText(text=content)]
+                else:
+                    content.insert(0, ContentReasoning(reasoning=str(reasoning)))
 
             # return message
             if "tool_calls" in message:
                 tool_calls: list[ToolCall] = []
-                for tc in message["tool_calls"]:
-                    tool_calls.append(tool_call_from_openai(tc))
-                    tool_names[tc["id"]] = tc["function"]["name"]
+                for call in message["tool_calls"]:
+                    tool_calls.append(tool_call_from_openai(call))
+                    tool_names[call["id"]] = call["function"]["name"]
 
             else:
                 tool_calls = []
+
             chat_messages.append(
                 ChatMessageAssistant(
                     content=content,
                     tool_calls=tool_calls or None,
-                    reasoning=reasoning,
                 )
             )
         elif message["role"] == "tool":
@@ -333,7 +368,9 @@
             if isinstance(tool_content, str):
                 content = tool_content
             else:
-                content = [content_from_openai(c) for c in tool_content]
+                content = []
+                for tc in tool_content:
+                    content.extend(content_from_openai(tc))
             chat_messages.append(
                 ChatMessageTool(
                     content=content,
@@ -357,20 +394,40 @@ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> Tool
 
 def content_from_openai(
     content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
-) -> Content:
+    parse_reasoning: bool = False,
+) -> list[Content]:
     if content["type"] == "text":
-        return ContentText(text=content["text"])
+        text = content["text"]
+        if parse_reasoning:
+            result = parse_content_with_reasoning(text)
+            if result:
+                return [
+                    ContentReasoning(
+                        reasoning=result.reasoning,
+                        signature=result.signature,
+                        redacted=result.redacted,
+                    ),
+                    ContentText(text=result.content),
+                ]
+            else:
+                return [ContentText(text=text)]
+        else:
+            return [ContentText(text=text)]
     elif content["type"] == "image_url":
-        return ContentImage(
-            image=content["image_url"]["url"], detail=content["image_url"]["detail"]
-        )
+        return [
+            ContentImage(
+                image=content["image_url"]["url"], detail=content["image_url"]["detail"]
+            )
+        ]
    elif content["type"] == "input_audio":
-        return ContentAudio(
-            audio=content["input_audio"]["data"],
-            format=content["input_audio"]["format"],
-        )
+        return [
+            ContentAudio(
+                audio=content["input_audio"]["data"],
+                format=content["input_audio"]["format"],
+            )
+        ]
     elif content["type"] == "refusal":
-        return ContentText(text=content["refusal"])
+        return [ContentText(text=content["refusal"])]
 
 
 def chat_message_assistant_from_openai(
@@ -380,11 +437,20 @@ def chat_message_assistant_from_openai(
     reasoning = getattr(message, "reasoning_content", None) or getattr(
         message, "reasoning", None
     )
+
+    msg_content = refusal or message.content or ""
+    if reasoning is not None:
+        content: str | list[Content] = [
+            ContentReasoning(reasoning=str(reasoning)),
+            ContentText(text=msg_content),
+        ]
+    else:
+        content = msg_content
+
     return ChatMessageAssistant(
-        content=refusal or message.content or "",
+        content=content,
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
-        reasoning=reasoning,
     )
 
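The `openai_assistant_content()` helper added above serializes `ContentReasoning` blocks into `<think>` tags (optionally carrying `signature` and `redacted` attributes) so that reasoning survives a round trip through OpenAI-compatible chat messages; `parse_content_with_reasoning()` and `content_from_openai(..., parse_reasoning=True)` recover it on the way back in. A minimal standalone sketch of that wrapping format follows; the `ReasoningBlock` dataclass and `wrap_with_think()` function are illustrative stand-ins, not part of the package.

    from dataclasses import dataclass

    @dataclass
    class ReasoningBlock:
        # mirrors the fields used on ContentReasoning in the hunks above
        reasoning: str
        signature: str | None = None
        redacted: bool = False

    def wrap_with_think(block: ReasoningBlock, answer: str) -> str:
        # same attribute handling as openai_assistant_content() in the diff
        attribs = ""
        if block.signature is not None:
            attribs += f' signature="{block.signature}"'
        if block.redacted:
            attribs += ' redacted="true"'
        return f"\n<think{attribs}>\n{block.reasoning}\n</think>\n\n{answer}"

    print(wrap_with_think(ReasoningBlock("work through the steps", signature="sig123"), "Final answer."))
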

inspect_ai/model/_providers/anthropic.py +134 -26

@@ -1,5 +1,6 @@
 import functools
 import os
+import re
 import sys
 from copy import copy
 from logging import getLogger
@@ -28,8 +29,12 @@ from anthropic.types import (
     ImageBlockParam,
     Message,
     MessageParam,
+    RedactedThinkingBlock,
+    RedactedThinkingBlockParam,
     TextBlock,
     TextBlockParam,
+    ThinkingBlock,
+    ThinkingBlockParam,
     ToolParam,
     ToolResultBlockParam,
     ToolUseBlock,
@@ -44,7 +49,12 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
     NO_CONTENT,
 )
-from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.logger import warn_once
@@ -204,23 +214,33 @@ class AnthropicAPI(ModelAPI):
         request["system"] = system_param
         request["tools"] = tools_param
         if len(tools) > 0:
-            request["tool_choice"] = message_tool_choice(tool_choice)
+            request["tool_choice"] = message_tool_choice(
+                tool_choice, self.is_using_thinking(config)
+            )
 
         # additional options
-        request = request | self.completion_params(config)
+        req, headers, betas = self.completion_config(config)
+        request = request | req
 
         # extra headers (for time tracker and computer use)
-        extra_headers = {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
+        extra_headers = headers | {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
         if computer_use:
-            extra_headers["anthropic-beta"] = "computer-use-2024-10-22"
+            betas.append("computer-use-2024-10-22")
+        if len(betas) > 0:
+            extra_headers["anthropic-beta"] = ",".join(betas)
+
         request["extra_headers"] = extra_headers
 
         # extra_body
         if self.extra_body is not None:
             request["extra_body"] = self.extra_body
 
-        # make request
-        message = await self.client.messages.create(**request, stream=False)
+        # make request (stream if we are using reasoning)
+        if self.is_using_thinking(config):
+            async with self.client.messages.stream(**request) as stream:
+                message = await stream.get_final_message()
+        else:
+            message = await self.client.messages.create(**request, stream=False)
 
         # set response for ModelCall
         response = message.model_dump()
@@ -245,27 +265,67 @@
             else:
                 raise ex
 
-    def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
-        params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
-        if config.temperature is not None:
-            params["temperature"] = config.temperature
-        if config.top_p is not None:
-            params["top_p"] = config.top_p
-        if config.top_k is not None:
-            params["top_k"] = config.top_k
+    def completion_config(
+        self, config: GenerateConfig
+    ) -> tuple[dict[str, Any], dict[str, str], list[str]]:
+        max_tokens = cast(int, config.max_tokens)
+        params = dict(model=self.model_name, max_tokens=max_tokens)
+        headers: dict[str, str] = {}
+        betas: list[str] = []
+        # some params not compatible with thinking models
+        if not self.is_using_thinking(config):
+            if config.temperature is not None:
+                params["temperature"] = config.temperature
+            if config.top_p is not None:
+                params["top_p"] = config.top_p
+            if config.top_k is not None:
+                params["top_k"] = config.top_k
+
+        # some thinking-only stuff
+        if self.is_using_thinking(config):
+            params["thinking"] = dict(
+                type="enabled", budget_tokens=config.reasoning_tokens
+            )
+            headers["anthropic-version"] = "2023-06-01"
+            if max_tokens > 8192:
+                betas.append("output-128k-2025-02-19")
+
+        # config that applies to all models
         if config.timeout is not None:
             params["timeout"] = float(config.timeout)
         if config.stop_seqs is not None:
             params["stop_sequences"] = config.stop_seqs
-        return params
+
+        # return config
+        return params, headers, betas
 
     @override
     def max_tokens(self) -> int | None:
         # anthropic requires you to explicitly specify max_tokens (most others
         # set it to the maximum allowable output tokens for the model).
-        # set to 4096 which is the lowest documented max_tokens for claude models
+        # set to 4096 which is the highest possible for claude 3 (claude 3.5
+        # allows up to 8192)
         return 4096
 
+    @override
+    def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
+        max_tokens = cast(int, self.max_tokens())
+        if self.is_thinking_model() and config.reasoning_tokens is not None:
+            max_tokens = max_tokens + config.reasoning_tokens
+        return max_tokens
+
+    def is_using_thinking(self, config: GenerateConfig) -> bool:
+        return self.is_thinking_model() and config.reasoning_tokens is not None
+
+    def is_thinking_model(self) -> bool:
+        return not self.is_claude_3() and not self.is_claude_3_5()
+
+    def is_claude_3(self) -> bool:
+        return re.search(r"claude-3-[a-zA-Z]", self.model_name) is not None
+
+    def is_claude_3_5(self) -> bool:
+        return "claude-3-5-" in self.model_name
+
     @override
     def connection_key(self) -> str:
         return str(self.api_key)
@@ -295,6 +355,14 @@
     def tool_result_images(self) -> bool:
         return True
 
+    @override
+    def emulate_reasoning_history(self) -> bool:
+        return False
+
+    @override
+    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
+        return "all"
+
     # convert some common BadRequestError states into 'refusal' model output
     def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | Exception:
         error = exception_message(ex).lower()
@@ -498,7 +566,7 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
     role = a["role"]
     a_content = a["content"]
     b_content = b["content"]
-    if isinstance(a_content, str) and isinstance(a_content, str):
+    if isinstance(a_content, str) and isinstance(b_content, str):
         return MessageParam(role=role, content=f"{a_content}\n{b_content}")
     elif isinstance(a_content, list) and isinstance(b_content, list):
         return MessageParam(role=role, content=a_content + b_content)
@@ -514,9 +582,15 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
         raise ValueError(f"Unexpected content types for messages: {a}, {b}")
 
 
-def message_tool_choice(tool_choice: ToolChoice) -> message_create_params.ToolChoice:
+def message_tool_choice(
+    tool_choice: ToolChoice, thinking_model: bool
+) -> message_create_params.ToolChoice:
     if isinstance(tool_choice, ToolFunction):
-        return {"type": "tool", "name": tool_choice.name}
+        # forced tool use not compatible with thinking models
+        if thinking_model:
+            return {"type": "any"}
+        else:
+            return {"type": "tool", "name": tool_choice.name}
     elif tool_choice == "any":
         return {"type": "any"}
     elif tool_choice == "none":
@@ -544,9 +618,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
     # "tool" means serving a tool call result back to claude
     elif message.role == "tool":
         if message.error is not None:
-            content: str | list[TextBlockParam | ImageBlockParam] = (
-                message.error.message
-            )
+            content: (
+                str
+                | list[
+                    TextBlockParam
+                    | ImageBlockParam
+                    | ThinkingBlockParam
+                    | RedactedThinkingBlockParam
+                ]
+            ) = message.error.message
             # anthropic requires that content be populated when
             # is_error is true (throws bad_request_error when not)
             # so make sure this precondition is met
@@ -567,7 +647,7 @@
                 ToolResultBlockParam(
                     tool_use_id=str(message.tool_call_id),
                     type="tool_result",
-                    content=content,
+                    content=cast(list[TextBlockParam | ImageBlockParam], content),
                     is_error=message.error is not None,
                 )
             ],
@@ -576,7 +656,13 @@
     # tool_calls means claude is attempting to call our tools
     elif message.role == "assistant" and message.tool_calls:
         # first include content (claude <thinking>)
-        tools_content: list[TextBlockParam | ImageBlockParam | ToolUseBlockParam] = (
+        tools_content: list[
+            TextBlockParam
+            | ThinkingBlockParam
+            | RedactedThinkingBlockParam
+            | ImageBlockParam
+            | ToolUseBlockParam
+        ] = (
             [TextBlockParam(type="text", text=message.content or NO_CONTENT)]
             if isinstance(message.content, str)
             else (
@@ -645,6 +731,16 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
                     arguments=content_block.model_dump().get("input", {}),
                 )
             )
+        elif isinstance(content_block, RedactedThinkingBlock):
+            content.append(
+                ContentReasoning(reasoning=content_block.data, redacted=True)
+            )
+        elif isinstance(content_block, ThinkingBlock):
+            content.append(
+                ContentReasoning(
+                    reasoning=content_block.thinking, signature=content_block.signature
+                )
+            )
 
     # resolve choice
     choice = ChatCompletionChoice(
@@ -702,7 +798,7 @@ def split_system_messages(
 
 async def message_param_content(
     content: Content,
-) -> TextBlockParam | ImageBlockParam:
+) -> TextBlockParam | ImageBlockParam | ThinkingBlockParam | RedactedThinkingBlockParam:
     if isinstance(content, ContentText):
         return TextBlockParam(type="text", text=content.text or NO_CONTENT)
     elif isinstance(content, ContentImage):
@@ -720,6 +816,18 @@
             type="image",
             source=dict(type="base64", media_type=cast(Any, media_type), data=image),
         )
+    elif isinstance(content, ContentReasoning):
+        if content.redacted:
+            return RedactedThinkingBlockParam(
+                type="redacted_thinking",
+                data=content.reasoning,
+            )
+        else:
+            if content.signature is None:
+                raise ValueError("Thinking content without signature.")
+            return ThinkingBlockParam(
+                type="thinking", thinking=content.reasoning, signature=content.signature
+            )
     else:
         raise RuntimeError(
            "Anthropic models do not currently support audio or video inputs."

inspect_ai/model/_providers/google.py +27 -8

@@ -38,10 +38,13 @@ from pydantic import JsonValue
 from typing_extensions import override
 
 from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
-from inspect_ai._util.content import Content as InspectContent
+from inspect_ai._util.content import (
+    Content as InspectContent,
+)
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -250,7 +253,10 @@ class GoogleGenAIAPI(ModelAPI):
 
     @override
     def is_rate_limit(self, ex: BaseException) -> bool:
-        return isinstance(ex, APIError) and ex.code in (429, 500, 503, 504)
+        # see https://cloud.google.com/storage/docs/retry-strategy
+        return isinstance(ex, APIError) and (
+            ex.code in (408, 429, 429) or ex.code >= 500
+        )
 
     @override
     def connection_key(self) -> str:
@@ -405,6 +411,8 @@ async def content_part(client: Client, content: InspectContent | str) -> Part:
         return Part.from_text(text=content or NO_CONTENT)
     elif isinstance(content, ContentText):
         return Part.from_text(text=content.text or NO_CONTENT)
+    elif isinstance(content, ContentReasoning):
+        return Part.from_text(text=content.reasoning or NO_CONTENT)
     else:
         return await chat_content_to_part(client, content)
 
@@ -417,7 +425,8 @@ async def chat_content_to_part(
         content_bytes, mime_type = await file_as_data(content.image)
         return Part.from_bytes(mime_type=mime_type, data=content_bytes)
     else:
-        return await file_for_content(client, content)
+        file = await file_for_content(client, content)
+        return Part.from_uri(file_uri=file.uri, mime_type=file.mime_type)
 
 
 async def extract_system_message_as_parts(
@@ -552,11 +561,19 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
     # stop reason
     stop_reason = finish_reason_to_stop_reason(candidate.finish_reason)
 
+    # choice content may include reasoning
+    if reasoning:
+        choice_content: str | list[Content] = [
+            ContentReasoning(reasoning=reasoning),
+            ContentText(text=content),
+        ]
+    else:
+        choice_content = content
+
     # build choice
     choice = ChatCompletionChoice(
         message=ChatMessageAssistant(
-            content=content,
-            reasoning=reasoning,
+            content=choice_content,
             tool_calls=tool_calls if len(tool_calls) > 0 else None,
             source="generate",
         ),
@@ -742,7 +759,7 @@ async def file_for_content(
     uploaded_file = files_db.get(content_sha256)
     if uploaded_file:
         try:
-            upload: File = client.files.get(uploaded_file)
+            upload: File = client.files.get(name=uploaded_file)
             if upload.state.name == "ACTIVE":
                 trace(f"Using uploaded file: {uploaded_file}")
                 return upload
@@ -754,10 +771,12 @@
             trace(f"Error attempting to access uploaded file: {ex}")
             files_db.delete(content_sha256)
     # do the upload (and record it)
-    upload = client.files.upload(BytesIO(content_bytes), mime_type=mime_type)
+    upload = client.files.upload(
+        file=BytesIO(content_bytes), config=dict(mime_type=mime_type)
+    )
     while upload.state.name == "PROCESSING":
         await asyncio.sleep(3)
-        upload = client.files.get(upload.name)
+        upload = client.files.get(name=upload.name)
     if upload.state.name == "FAILED":
         trace(f"Failed to upload file '{upload.name}: {upload.error}")
         raise ValueError(f"Google file upload failed: {upload.error}")
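
At this point in the diff the OpenAI, Anthropic, and Google providers have all stopped passing a separate `reasoning=` argument to `ChatMessageAssistant` and instead place `ContentReasoning` entries inside `message.content` (the Groq hunks below make the same change). A small sketch of how downstream code can read reasoning back out under that convention; the `extract_reasoning()` helper is illustrative, not an API added by this release.

    def extract_reasoning(message) -> list[str]:
        # message is a ChatMessageAssistant; its content is either a plain string
        # or a list of content blocks whose .type field discriminates
        # "text" / "reasoning" / "image" / ...
        if isinstance(message.content, str):
            return []
        return [c.reasoning for c in message.content if c.type == "reasoning"]
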

inspect_ai/model/_providers/groq.py +9 -4

@@ -28,7 +28,7 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
     DEFAULT_MAX_TOKENS,
 )
-from inspect_ai._util.content import Content
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
@@ -326,12 +326,17 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
 def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
     reasoning = getattr(message, "reasoning", None)
     if reasoning is not None:
-        reasoning = str(reasoning)
+        content: str | list[Content] = [
+            ContentReasoning(reasoning=str(reasoning)),
+            ContentText(text=message.content or ""),
+        ]
+    else:
+        content = message.content or ""
+
     return ChatMessageAssistant(
-        content=message.content or "",
+        content=content,
         source="generate",
         tool_calls=chat_tool_calls(message, tools),
-        reasoning=reasoning,
     )
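
The Anthropic hunks earlier in this diff key extended thinking off `GenerateConfig.reasoning_tokens`: when it is set for a thinking-capable model, `completion_config()` adds `thinking=dict(type="enabled", budget_tokens=...)`, drops temperature/top_p/top_k, requests the long-output beta when max_tokens exceeds 8192, streams the response, and `max_tokens_for_config()` raises the effective token cap by the reasoning budget. A hedged usage sketch; the model name and token values below are illustrative assumptions, not values prescribed by the diff.

    from inspect_ai.model import GenerateConfig, get_model

    # assumed: a thinking-capable Claude model, i.e. one not matched by
    # is_claude_3() / is_claude_3_5() in the provider code above
    model = get_model(
        "anthropic/claude-3-7-sonnet-latest",
        config=GenerateConfig(
            reasoning_tokens=4096,  # enables thinking=dict(type="enabled", budget_tokens=4096)
            max_tokens=16384,       # > 8192, so the "output-128k-2025-02-19" beta is requested
        ),
    )

Reasoning returned this way arrives as `ContentReasoning` blocks (with a `signature`, or `redacted=True`) in the assistant message content, as handled by `model_output_from_message()` in the Anthropic provider.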