PyPI - inspect-ai - Versions diffs - 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl - Mend

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (242) hide show

inspect_ai/model/_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import abc
-import asyncio
+import contextlib
 import functools
 import json
 import logging
@@ -7,8 +7,9 @@ import os
 import time
 from contextvars import ContextVar
 from copy import deepcopy
+from datetime import datetime
 from types import TracebackType
-from typing import Any, Callable, Literal, Type, cast
+from typing import Any, AsyncIterator, Callable, Literal, Type, cast
 from pydantic_core import to_jsonable_python
 from tenacity import (
@@ -21,7 +22,12 @@ from tenacity import (
 )
 from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
-from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.interrupt import check_sample_interrupt
 from inspect_ai._util.platform import platform_init
@@ -33,6 +39,7 @@ from inspect_ai._util.registry import (
 )
 from inspect_ai._util.retry import log_rate_limit_retry
 from inspect_ai._util.trace import trace_action
+from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -147,6 +154,17 @@ class ModelAPI(abc.ABC):
         """Default max_tokens."""
         return None
+    def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
+        """Default max_tokens for a given config.
+        Args:
+           config: Generation config.
+        Returns:
+           Default maximum tokens for specified configuration.
+        """
+        return None
     def max_connections(self) -> int:
         """Default max_connections."""
         return DEFAULT_MAX_CONNECTIONS
@@ -179,9 +197,17 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False
-    def has_reasoning_history(self) -> bool:
-        """Chat message assistant messages can include reasoning."""
-        return False
+    def emulate_reasoning_history(self) -> bool:
+        """Chat message assistant messages with reasoning should playback reasoning with emulation (.e.g. <think> tags)"""
+        return True
+    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
+        """Force a specific reasoning history behavior for this provider."""
+        return None
+    def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
+        """Behavior to use for reasoning_history='auto'"""
+        return "all"
 class Model:
@@ -284,9 +310,10 @@ class Model:
         config = base_config.merge(config)
         # provide max_tokens from the model api if required
-        config.max_tokens = (
-            config.max_tokens if config.max_tokens else self.api.max_tokens()
-        )
+        if config.max_tokens is None:
+            config.max_tokens = self.api.max_tokens_for_config(config)
+            if config.max_tokens is None:
+                config.max_tokens = self.api.max_tokens()
         # disable parallel tool calls if requested by any of our tools
         if disable_parallel_tools(tools):
@@ -301,8 +328,11 @@ class Model:
             input = [ChatMessageSystem(content=config.system_message)] + input
         # enforce concurrency limits
+        start_time = datetime.now()
+        working_start = sample_working_time()
         async with self._connection_concurrency(config):
-            return await self._generate(
+            # generate
+            output = await self._generate(
                 input=input,
                 tools=tools,
                 tool_choice=tool_choice,
@@ -310,6 +340,28 @@ class Model:
                 cache=cache,
             )
+            # update the most recent ModelEvent with the actual start/completed
+            # times as well as a computation of working time (events are
+            # created _after_ the call to _generate, potentially in response
+            # to retries, so they need their timestamp updated so it accurately
+            # reflects the full start/end time which we know here)
+            from inspect_ai.log._transcript import ModelEvent, transcript
+            last_model_event = transcript().find_last_event(ModelEvent)
+            if last_model_event:
+                last_model_event.timestamp = start_time
+                last_model_event.working_start = working_start
+                completed = datetime.now()
+                last_model_event.completed = completed
+                last_model_event.working_time = (
+                    output.time
+                    if output.time is not None
+                    else (completed - start_time).total_seconds()
+                )
+            # return output
+            return output
     async def _generate(
         self,
         input: list[ChatMessage],
@@ -348,9 +400,7 @@ class Model:
             tool_choice = "none"
         # handle reasoning history
-        input = resolve_reasoning_history(
-            input, config, self.api.has_reasoning_history()
-        )
+        input = resolve_reasoning_history(input, config, self.api)
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)
@@ -435,14 +485,16 @@ class Model:
             )
             with trace_action(logger, "Model", f"generate ({str(self)})"):
-                time_start = time.perf_counter()
-                result = await self.api.generate(
-                    input=input,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    config=config,
-                )
-                time_elapsed = time.perf_counter() - time_start
+                time_start = time.monotonic()
+                try:
+                    result = await self.api.generate(
+                        input=input,
+                        tools=tools,
+                        tool_choice=tool_choice,
+                        config=config,
+                    )
+                finally:
+                    time_elapsed = time.monotonic() - time_start
             if isinstance(result, tuple):
                 output, call = result
@@ -461,8 +513,12 @@ class Model:
                 error_message = f"{error}\n\nRequest:\n{request}"
                 raise RuntimeError(error_message)
-            # update output with time elapsed
-            output.time = time_elapsed
+            # update output with time (call.time captures time spent
+            # on the actual request that succeeds w/ status 200)
+            if call and call.time is not None:
+                output.time = call.time
+            else:
+                output.time = time_elapsed
             # add views to tool calls
             for choice in output.choices:
@@ -488,8 +544,13 @@ class Model:
             return output
-        # call the model
+        # call the model (this will do retries, etc., so report waiting time
+        # as elapsed time - actual time for successful model call)
+        time_start = time.monotonic()
         model_output = await generate()
+        total_time = time.monotonic() - time_start
+        if model_output.time:
+            report_sample_waiting_time(total_time - model_output.time)
         # return results
         return model_output
@@ -513,7 +574,10 @@ class Model:
     # override the _connection_key() argument to provide a scope within which
     # to enforce max_connections (e.g. by account/api_key, by endpoint, etc.)
-    def _connection_concurrency(self, config: GenerateConfig) -> asyncio.Semaphore:
+    @contextlib.asynccontextmanager
+    async def _connection_concurrency(
+        self, config: GenerateConfig
+    ) -> AsyncIterator[None]:
         """Get the appropriate connection semaphore for this model instance."""
         max_connections = (
             config.max_connections
@@ -521,11 +585,12 @@ class Model:
             else self.api.max_connections()
         )
         model_name = ModelName(self)
-        return concurrency(
+        async with concurrency(
             name=f"{model_name.api}",
             concurrency=max_connections,
             key=f"Model{self.api.connection_key()}",
-        )
+        ):
+            yield
     def _record_model_interaction(
         self,
@@ -833,68 +898,91 @@ def simple_input_messages(
 def resolve_reasoning_history(
-    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+    messages: list[ChatMessage],
+    config: GenerateConfig,
+    model_api: ModelAPI,
 ) -> list[ChatMessage]:
-    # determine if we are including reasoning history
-    reasoning_history = config.reasoning_history is not False
     # determine up front if we have any reasoning content
     have_reasoning = any(
         [
-            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            isinstance(m, ChatMessageAssistant)
+            and isinstance(m.content, list)
+            and any([c for c in m.content if isinstance(c, ContentReasoning)])
             for m in messages
         ]
     )
     if not have_reasoning:
         return messages
-    # API asssistant message format directly supports reasoning history so we will:
-    #   (a) Remove reasoning content entirely if config says not to include it; or
-    #   (b) Leave the messages alone if config says to include it
-    if api_has_reasoning_history:
-        # remove reasoning history as per config
-        if not reasoning_history:
-            resolved_messages: list[ChatMessage] = []
-            for message in messages:
-                if isinstance(message, ChatMessageAssistant):
-                    resolved_messages.append(
-                        message.model_copy(update={"reasoning": None})
-                    )
-                else:
-                    resolved_messages.append(message)
-            return resolved_messages
-        # include reasoning history as per config
-        else:
-            return messages
+    # determine reasoning history configuration
+    reasoning_history = (
+        config.reasoning_history if config.reasoning_history is not None else "auto"
+    )
-    # API can't represent reasoning natively so include <think> tags
-    elif reasoning_history:
+    # see if the provider is forcing a reasoning history
+    force = model_api.force_reasoning_history()
+    if force is not None:
+        reasoning_history = force
+    # if it's 'auto' then defer to the provider
+    elif reasoning_history == "auto":
+        reasoning_history = model_api.auto_reasoning_history()
+    # generate a version of message history with the correct history
+    if reasoning_history == "all":
+        resolved_messages: list[ChatMessage] = messages
+    else:
+        found_last = False
         resolved_messages = []
-        for message in messages:
-            if (
-                isinstance(message, ChatMessageAssistant)
-                and message.reasoning is not None
+        for message in reversed(messages):
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
             ):
-                message = deepcopy(message)
-                if isinstance(message.content, str):
-                    message.content = (
-                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
-                    )
-                else:
-                    message.content.insert(
-                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
-                    )
-                message.reasoning = None
+                # is there reasoning in this message?
+                has_reasoning = any(
+                    isinstance(c, ContentReasoning) for c in message.content
+                )
+                # remove it unless we are in "last" mode and haven't yet found last
+                if has_reasoning:
+                    if reasoning_history == "none" or found_last:
+                        message = message.model_copy(
+                            update={
+                                "content": [
+                                    content
+                                    for content in message.content
+                                    if not isinstance(content, ContentReasoning)
+                                ]
+                            }
+                        )
+                    found_last = True
             resolved_messages.append(message)
-        return resolved_messages
+        # reverse them back
+        resolved_messages.reverse()
-    # api doesn't handle reasoning and config says no reasoning_history, nothing to do
-    else:
-        return messages
+    # api can't represent reasoning natively so emulate it
+    if model_api.emulate_reasoning_history():
+        emulated_messages: list[ChatMessage] = []
+        for message in resolved_messages:
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
+            ):
+                content: list[Content] = []
+                for c in message.content:
+                    if isinstance(c, ContentReasoning):
+                        content.append(
+                            ContentText(text=f"<think>\n{c.reasoning}\n</think>")
+                        )
+                    else:
+                        content.append(c)
+                message = message.model_copy(update={"content": content})
+            emulated_messages.append(message)
+        resolved_messages = emulated_messages
+    # return messages
+    return resolved_messages
 def resolve_tool_model_input(
@@ -1184,6 +1272,10 @@ def set_model_usage(
             if total_usage.input_tokens_cache_read is None:
                 total_usage.input_tokens_cache_read = 0
             total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
+        if usage.reasoning_tokens is not None:
+            if total_usage.reasoning_tokens is None:
+                total_usage.reasoning_tokens = 0
+            total_usage.reasoning_tokens += usage.reasoning_tokens
         model_usage[model] = total_usage

inspect_ai/model/_model_call.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from typing import Any, Callable
-from pydantic import BaseModel, JsonValue
+from pydantic import BaseModel, Field, JsonValue
 from inspect_ai._util.json import jsonable_python
@@ -22,9 +22,15 @@ class ModelCall(BaseModel):
     response: dict[str, JsonValue]
     """Raw response data from model."""
+    time: float | None = Field(default=None)
+    """Time taken for underlying model call."""
     @staticmethod
     def create(
-        request: Any, response: Any, filter: ModelCallFilter | None = None
+        request: Any,
+        response: Any,
+        filter: ModelCallFilter | None = None,
+        time: float | None = None,
     ) -> "ModelCall":
         """Create a ModelCall object.
@@ -36,6 +42,7 @@ class ModelCall(BaseModel):
            request (Any): Request object (dict, dataclass, BaseModel, etc.)
            response (Any): Response object (dict, dataclass, BaseModel, etc.)
            filter (ModelCallFilter): Function for filtering model call data.
+           time: Time taken for underlying ModelCall
         """
         request_dict = jsonable_python(request)
         if filter:
@@ -43,7 +50,7 @@ class ModelCall(BaseModel):
         response_dict = jsonable_python(response)
         if filter:
             response_dict = _walk_json_value(None, response_dict, filter)
-        return ModelCall(request=request_dict, response=response_dict)
+        return ModelCall(request=request_dict, response=response_dict, time=time)
 def _walk_json_value(

inspect_ai/model/_model_output.py CHANGED Viewed

@@ -26,6 +26,9 @@ class ModelUsage(BaseModel):
     input_tokens_cache_read: int | None = Field(default=None)
     """Number of tokens retrieved from the cache."""
+    reasoning_tokens: int | None = Field(default=None)
+    """Number of tokens used for reasoning."""
 StopReason = Literal[
     "stop",

inspect_ai/model/_openai.py CHANGED Viewed

@@ -27,11 +27,18 @@ from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
-from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.model._call_tools import parse_tool_call
 from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
+from inspect_ai.model._reasoning import parse_content_with_reasoning
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
 from ._chat_message import (
@@ -148,14 +155,14 @@ async def openai_chat_message(
         if message.tool_calls:
             return ChatCompletionAssistantMessageParam(
                 role=message.role,
-                content=message.text,
+                content=openai_assistant_content(message),
                 tool_calls=[
                     openai_chat_tool_call_param(call) for call in message.tool_calls
                 ],
             )
         else:
             return ChatCompletionAssistantMessageParam(
-                role=message.role, content=message.text
+                role=message.role, content=openai_assistant_content(message)
             )
     elif message.role == "tool":
         return ChatCompletionToolMessageParam(
@@ -175,16 +182,29 @@ async def openai_chat_messages(
     return [await openai_chat_message(message, model) for message in messages]
+def openai_assistant_content(message: ChatMessageAssistant) -> str:
+    if isinstance(message.content, str):
+        content = message.content
+    else:
+        content = ""
+        for c in message.content:
+            if c.type == "reasoning":
+                attribs = ""
+                if c.signature is not None:
+                    attribs = f'{attribs} signature="{c.signature}"'
+                if c.redacted:
+                    attribs = f'{attribs} redacted="true"'
+                content = f"{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n"
+            elif c.type == "text":
+                content = f"{content}\n{c.text}"
+    return content
 def openai_chat_choices(choices: list[ChatCompletionChoice]) -> list[Choice]:
     oai_choices: list[Choice] = []
     for index, choice in enumerate(choices):
-        if isinstance(choice.message.content, str):
-            content = choice.message.content
-        else:
-            content = "\n".join(
-                [c.text for c in choice.message.content if c.type == "text"]
-            )
+        content = openai_assistant_content(choice.message)
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
@@ -274,35 +294,47 @@ def chat_messages_from_openai(
     chat_messages: list[ChatMessage] = []
     for message in messages:
+        content: str | list[Content] = []
         if message["role"] == "system" or message["role"] == "developer":
             sys_content = message["content"]
             if isinstance(sys_content, str):
                 chat_messages.append(ChatMessageSystem(content=sys_content))
             else:
-                chat_messages.append(
-                    ChatMessageSystem(
-                        content=[content_from_openai(c) for c in sys_content]
-                    )
-                )
+                content = []
+                for sc in sys_content:
+                    content.extend(content_from_openai(sc))
+                chat_messages.append(ChatMessageSystem(content=content))
         elif message["role"] == "user":
             user_content = message["content"]
             if isinstance(user_content, str):
                 chat_messages.append(ChatMessageUser(content=user_content))
             else:
-                chat_messages.append(
-                    ChatMessageUser(
-                        content=[content_from_openai(c) for c in user_content]
-                    )
-                )
+                content = []
+                for uc in user_content:
+                    content.extend(content_from_openai(uc))
+                chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
-            asst_content = message["content"]
+            asst_content = message.get("content", None)
             if isinstance(asst_content, str):
-                content: str | list[Content] = asst_content
+                result = parse_content_with_reasoning(asst_content)
+                if result is not None:
+                    content = [
+                        ContentReasoning(
+                            reasoning=result.reasoning,
+                            signature=result.signature,
+                            redacted=result.redacted,
+                        ),
+                        ContentText(text=result.content),
+                    ]
+                else:
+                    content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
             else:
-                content = [content_from_openai(c) for c in asst_content]
+                content = []
+                for ac in asst_content:
+                    content.extend(content_from_openai(ac, parse_reasoning=True))
             # resolve reasoning (OpenAI doesn't support this; however, OpenAI-compatible
             # interfaces e.g. DeepSeek do include this field so we pluck it out)
@@ -310,22 +342,25 @@ def chat_messages_from_openai(
                 "reasoning", None
             )
             if reasoning is not None:
-                reasoning = str(reasoning)
+                if isinstance(content, str):
+                    content = [ContentText(text=content)]
+                else:
+                    content.insert(0, ContentReasoning(reasoning=str(reasoning)))
             # return message
             if "tool_calls" in message:
                 tool_calls: list[ToolCall] = []
-                for tc in message["tool_calls"]:
-                    tool_calls.append(tool_call_from_openai(tc))
-                    tool_names[tc["id"]] = tc["function"]["name"]
+                for call in message["tool_calls"]:
+                    tool_calls.append(tool_call_from_openai(call))
+                    tool_names[call["id"]] = call["function"]["name"]
             else:
                 tool_calls = []
             chat_messages.append(
                 ChatMessageAssistant(
                     content=content,
                     tool_calls=tool_calls or None,
-                    reasoning=reasoning,
                 )
             )
         elif message["role"] == "tool":
@@ -333,7 +368,9 @@ def chat_messages_from_openai(
             if isinstance(tool_content, str):
                 content = tool_content
             else:
-                content = [content_from_openai(c) for c in tool_content]
+                content = []
+                for tc in tool_content:
+                    content.extend(content_from_openai(tc))
             chat_messages.append(
                 ChatMessageTool(
                     content=content,
@@ -357,20 +394,40 @@ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> Tool
 def content_from_openai(
     content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
-) -> Content:
+    parse_reasoning: bool = False,
+) -> list[Content]:
     if content["type"] == "text":
-        return ContentText(text=content["text"])
+        text = content["text"]
+        if parse_reasoning:
+            result = parse_content_with_reasoning(text)
+            if result:
+                return [
+                    ContentReasoning(
+                        reasoning=result.reasoning,
+                        signature=result.signature,
+                        redacted=result.redacted,
+                    ),
+                    ContentText(text=result.content),
+                ]
+            else:
+                return [ContentText(text=text)]
+        else:
+            return [ContentText(text=text)]
     elif content["type"] == "image_url":
-        return ContentImage(
-            image=content["image_url"]["url"], detail=content["image_url"]["detail"]
-        )
+        return [
+            ContentImage(
+                image=content["image_url"]["url"], detail=content["image_url"]["detail"]
+            )
+        ]
     elif content["type"] == "input_audio":
-        return ContentAudio(
-            audio=content["input_audio"]["data"],
-            format=content["input_audio"]["format"],
-        )
+        return [
+            ContentAudio(
+                audio=content["input_audio"]["data"],
+                format=content["input_audio"]["format"],
+            )
+        ]
     elif content["type"] == "refusal":
-        return ContentText(text=content["refusal"])
+        return [ContentText(text=content["refusal"])]
 def chat_message_assistant_from_openai(
@@ -380,11 +437,20 @@ def chat_message_assistant_from_openai(
     reasoning = getattr(message, "reasoning_content", None) or getattr(
         message, "reasoning", None
     )
+    msg_content = refusal or message.content or ""
+    if reasoning is not None:
+        content: str | list[Content] = [
+            ContentReasoning(reasoning=str(reasoning)),
+            ContentText(text=msg_content),
+        ]
+    else:
+        content = msg_content
     return ChatMessageAssistant(
-        content=refusal or message.content or "",
+        content=content,
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
-        reasoning=reasoning,
     )

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl