PyPI - inspect-ai - Versions diffs - 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl - Mend

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (219) hide show

inspect_ai/model/_chat_message.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union
 from pydantic import BaseModel, Field, model_validator
-from inspect_ai._util.content import Content, ContentText
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError
@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
             self.content = text
         else:
             all_other = [content for content in self.content if content.type != "text"]
-            self.content = [ContentText(text=text)] + all_other
+            self.content = all_other + [ContentText(text=text)]
 class ChatMessageSystem(ChatMessageBase):
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""
-    reasoning: str | None = Field(default=None)
-    """Reasoning content."""
     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
     # content, however since this field doesn't exist in the OpenAI interface,
     # hosting providers (so far we've seen this with Together and Groq) may
@@ -110,12 +107,30 @@ class ChatMessageAssistant(ChatMessageBase):
     @classmethod
     def extract_reasoning(cls, data: Any) -> Any:
         if isinstance(data, dict):
+            # cleave apart <think> blocks
             content = data.get("content", None)
             if isinstance(content, str):
                 parsed = parse_content_with_reasoning(content)
                 if parsed:
-                    data["reasoning"] = parsed.reasoning
-                    data["content"] = parsed.content
+                    data["content"] = [
+                        ContentReasoning(reasoning=parsed.reasoning),
+                        ContentText(text=parsed.content),
+                    ]
+            # migrate messages that has explicit 'reasoning' field
+            # (which was our original representation of reasoning)
+            reasoning = data.get("reasoning", None)
+            if isinstance(reasoning, str):
+                # ensure that content is a list
+                content = data.get("content", None)
+                if content is None:
+                    data["content"] = []
+                elif isinstance(content, str):
+                    data["content"] = [ContentText(text=content)]
+                elif not isinstance(content, list):
+                    data["content"] = []
+                data["content"].insert(0, ContentReasoning(reasoning=reasoning))
+                del data["reasoning"]
         return data

inspect_ai/model/_conversation.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from rich.console import RenderableType
 from rich.text import Text
+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
@@ -41,14 +42,15 @@ def conversation_assistant_message(
         # build content
         content: list[RenderableType] = []
-        # reasoning
-        if message.reasoning:
-            content.extend(transcript_reasoning(message.reasoning))
-        # message text
-        content.extend(
-            [transcript_markdown(message.text, escape=True)] if message.text else []
-        )
+        # deal with plain text or with content blocks
+        if isinstance(message.content, str):
+            content.extend([transcript_markdown(message.text.strip(), escape=True)])
+        else:
+            for c in message.content:
+                if isinstance(c, ContentReasoning):
+                    content.extend(transcript_reasoning(c))
+                elif isinstance(c, ContentText) and c.text:
+                    content.extend([transcript_markdown(c.text.strip(), escape=True)])
         # print tool calls
         if message.tool_calls:

inspect_ai/model/_generate_config.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from contextvars import ContextVar
 from copy import deepcopy
-from typing import Literal, Union
+from typing import Any, Literal, Union
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing_extensions import TypedDict
@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
-    reasoning_history: bool | None
+    reasoning_tokens: int | None
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+    reasoning_history: Literal["none", "all", "last", "auto"] | None
     """Include reasoning in chat message history sent to generate."""
@@ -148,9 +151,27 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
-    reasoning_history: bool | None = Field(default=None)
+    reasoning_tokens: int | None = Field(default=None)
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+    reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
+        default=None
+    )
     """Include reasoning in chat message history sent to generate."""
+    # migrate reasoning_history as a bool
+    @model_validator(mode="before")
+    @classmethod
+    def migrate_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            reasoning_history = data.get("reasoning_history", None)
+            if reasoning_history is True:
+                data["reasoning_history"] = "all"
+            elif reasoning_history is False:
+                data["reasoning_history"] = "none"
+        return data
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":

inspect_ai/model/_model.py CHANGED Viewed

@@ -7,6 +7,7 @@ import os
 import time
 from contextvars import ContextVar
 from copy import deepcopy
+from datetime import datetime
 from types import TracebackType
 from typing import Any, AsyncIterator, Callable, Literal, Type, cast
@@ -21,7 +22,12 @@ from tenacity import (
 )
 from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
-from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.interrupt import check_sample_interrupt
 from inspect_ai._util.platform import platform_init
@@ -33,7 +39,7 @@ from inspect_ai._util.registry import (
 )
 from inspect_ai._util.retry import log_rate_limit_retry
 from inspect_ai._util.trace import trace_action
-from inspect_ai._util.working import report_sample_waiting_time
+from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -148,6 +154,17 @@ class ModelAPI(abc.ABC):
         """Default max_tokens."""
         return None
+    def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
+        """Default max_tokens for a given config.
+        Args:
+           config: Generation config.
+        Returns:
+           Default maximum tokens for specified configuration.
+        """
+        return None
     def max_connections(self) -> int:
         """Default max_connections."""
         return DEFAULT_MAX_CONNECTIONS
@@ -180,9 +197,17 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False
-    def has_reasoning_history(self) -> bool:
-        """Chat message assistant messages can include reasoning."""
-        return False
+    def emulate_reasoning_history(self) -> bool:
+        """Chat message assistant messages with reasoning should playback reasoning with emulation (.e.g. <think> tags)"""
+        return True
+    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
+        """Force a specific reasoning history behavior for this provider."""
+        return None
+    def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
+        """Behavior to use for reasoning_history='auto'"""
+        return "all"
 class Model:
@@ -285,9 +310,10 @@ class Model:
         config = base_config.merge(config)
         # provide max_tokens from the model api if required
-        config.max_tokens = (
-            config.max_tokens if config.max_tokens else self.api.max_tokens()
-        )
+        if config.max_tokens is None:
+            config.max_tokens = self.api.max_tokens_for_config(config)
+            if config.max_tokens is None:
+                config.max_tokens = self.api.max_tokens()
         # disable parallel tool calls if requested by any of our tools
         if disable_parallel_tools(tools):
@@ -302,8 +328,11 @@ class Model:
             input = [ChatMessageSystem(content=config.system_message)] + input
         # enforce concurrency limits
+        start_time = datetime.now()
+        working_start = sample_working_time()
         async with self._connection_concurrency(config):
-            return await self._generate(
+            # generate
+            output = await self._generate(
                 input=input,
                 tools=tools,
                 tool_choice=tool_choice,
@@ -311,6 +340,28 @@ class Model:
                 cache=cache,
             )
+            # update the most recent ModelEvent with the actual start/completed
+            # times as well as a computation of working time (events are
+            # created _after_ the call to _generate, potentially in response
+            # to retries, so they need their timestamp updated so it accurately
+            # reflects the full start/end time which we know here)
+            from inspect_ai.log._transcript import ModelEvent, transcript
+            last_model_event = transcript().find_last_event(ModelEvent)
+            if last_model_event:
+                last_model_event.timestamp = start_time
+                last_model_event.working_start = working_start
+                completed = datetime.now()
+                last_model_event.completed = completed
+                last_model_event.working_time = (
+                    output.time
+                    if output.time is not None
+                    else (completed - start_time).total_seconds()
+                )
+            # return output
+            return output
     async def _generate(
         self,
         input: list[ChatMessage],
@@ -349,9 +400,7 @@ class Model:
             tool_choice = "none"
         # handle reasoning history
-        input = resolve_reasoning_history(
-            input, config, self.api.has_reasoning_history()
-        )
+        input = resolve_reasoning_history(input, config, self.api)
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)
@@ -849,68 +898,91 @@ def simple_input_messages(
 def resolve_reasoning_history(
-    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+    messages: list[ChatMessage],
+    config: GenerateConfig,
+    model_api: ModelAPI,
 ) -> list[ChatMessage]:
-    # determine if we are including reasoning history
-    reasoning_history = config.reasoning_history is not False
     # determine up front if we have any reasoning content
     have_reasoning = any(
         [
-            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            isinstance(m, ChatMessageAssistant)
+            and isinstance(m.content, list)
+            and any([c for c in m.content if isinstance(c, ContentReasoning)])
             for m in messages
         ]
     )
     if not have_reasoning:
         return messages
-    # API asssistant message format directly supports reasoning history so we will:
-    #   (a) Remove reasoning content entirely if config says not to include it; or
-    #   (b) Leave the messages alone if config says to include it
-    if api_has_reasoning_history:
-        # remove reasoning history as per config
-        if not reasoning_history:
-            resolved_messages: list[ChatMessage] = []
-            for message in messages:
-                if isinstance(message, ChatMessageAssistant):
-                    resolved_messages.append(
-                        message.model_copy(update={"reasoning": None})
-                    )
-                else:
-                    resolved_messages.append(message)
-            return resolved_messages
-        # include reasoning history as per config
-        else:
-            return messages
+    # determine reasoning history configuration
+    reasoning_history = (
+        config.reasoning_history if config.reasoning_history is not None else "auto"
+    )
-    # API can't represent reasoning natively so include <think> tags
-    elif reasoning_history:
+    # see if the provider is forcing a reasoning history
+    force = model_api.force_reasoning_history()
+    if force is not None:
+        reasoning_history = force
+    # if it's 'auto' then defer to the provider
+    elif reasoning_history == "auto":
+        reasoning_history = model_api.auto_reasoning_history()
+    # generate a version of message history with the correct history
+    if reasoning_history == "all":
+        resolved_messages: list[ChatMessage] = messages
+    else:
+        found_last = False
         resolved_messages = []
-        for message in messages:
-            if (
-                isinstance(message, ChatMessageAssistant)
-                and message.reasoning is not None
+        for message in reversed(messages):
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
             ):
-                message = deepcopy(message)
-                if isinstance(message.content, str):
-                    message.content = (
-                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
-                    )
-                else:
-                    message.content.insert(
-                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
-                    )
-                message.reasoning = None
+                # is there reasoning in this message?
+                has_reasoning = any(
+                    isinstance(c, ContentReasoning) for c in message.content
+                )
+                # remove it unless we are in "last" mode and haven't yet found last
+                if has_reasoning:
+                    if reasoning_history == "none" or found_last:
+                        message = message.model_copy(
+                            update={
+                                "content": [
+                                    content
+                                    for content in message.content
+                                    if not isinstance(content, ContentReasoning)
+                                ]
+                            }
+                        )
+                    found_last = True
             resolved_messages.append(message)
-        return resolved_messages
+        # reverse them back
+        resolved_messages.reverse()
-    # api doesn't handle reasoning and config says no reasoning_history, nothing to do
-    else:
-        return messages
+    # api can't represent reasoning natively so emulate it
+    if model_api.emulate_reasoning_history():
+        emulated_messages: list[ChatMessage] = []
+        for message in resolved_messages:
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
+            ):
+                content: list[Content] = []
+                for c in message.content:
+                    if isinstance(c, ContentReasoning):
+                        content.append(
+                            ContentText(text=f"<think>\n{c.reasoning}\n</think>")
+                        )
+                    else:
+                        content.append(c)
+                message = message.model_copy(update={"content": content})
+            emulated_messages.append(message)
+        resolved_messages = emulated_messages
+    # return messages
+    return resolved_messages
 def resolve_tool_model_input(
@@ -1200,6 +1272,10 @@ def set_model_usage(
             if total_usage.input_tokens_cache_read is None:
                 total_usage.input_tokens_cache_read = 0
             total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
+        if usage.reasoning_tokens is not None:
+            if total_usage.reasoning_tokens is None:
+                total_usage.reasoning_tokens = 0
+            total_usage.reasoning_tokens += usage.reasoning_tokens
         model_usage[model] = total_usage

inspect_ai/model/_model_output.py CHANGED Viewed

@@ -26,6 +26,9 @@ class ModelUsage(BaseModel):
     input_tokens_cache_read: int | None = Field(default=None)
     """Number of tokens retrieved from the cache."""
+    reasoning_tokens: int | None = Field(default=None)
+    """Number of tokens used for reasoning."""
 StopReason = Literal[
     "stop",

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl