inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only and reflects the package contents exactly as released.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import LOG_SCHEMA_VERSION
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -252,7 +253,11 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
         for message in inputs:
             if not isinstance(message.content, str):
                 filtered_content: list[
-                    ContentText
+                    ContentText
+                    | ContentReasoning
+                    | ContentImage
+                    | ContentAudio
+                    | ContentVideo
                 ] = []
                 for content in message.content:
                     if content.type == "text":
inspect_ai/log/_samples.py
CHANGED
@@ -23,6 +23,7 @@ class ActiveSample:
         message_limit: int | None,
         token_limit: int | None,
         time_limit: int | None,
+        working_limit: int | None,
         fails_on_error: bool,
         transcript: Transcript,
         sandboxes: dict[str, SandboxConnection],
@@ -37,6 +38,7 @@ class ActiveSample:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.fails_on_error = fails_on_error
         self.total_messages = 0
         self.total_tokens = 0
@@ -45,7 +47,7 @@ class ActiveSample:
         self._interrupt_action: Literal["score", "error"] | None = None

     @property
-    def
+    def running_time(self) -> float:
         if self.started is not None:
             completed = (
                 self.completed
@@ -78,6 +80,7 @@ async def active_sample(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     fails_on_error: bool,
     transcript: Transcript,
 ) -> AsyncGenerator[ActiveSample, None]:
@@ -90,6 +93,7 @@ async def active_sample(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         sandboxes=await sandbox_connections(),
         fails_on_error=fails_on_error,
         transcript=transcript,
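Note on the new working_limit plumbing: a sample's working time is its elapsed clock time minus time spent waiting (on semaphores, rate limits, and model retries), and the new limit is enforced against that value. A minimal sketch of that relationship, using a hypothetical helper rather than the package source:

# Illustrative sketch only (hypothetical helper, not package code): working time
# is clock time with waiting time subtracted, which is the quantity the new
# working_limit is compared against.
def working_time(clock_time: float, waiting_time: float) -> float:
    return clock_time - waiting_time

assert working_time(clock_time=120.0, waiting_time=45.0) == 75.0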
inspect_ai/log/_transcript.py
CHANGED
@@ -8,7 +8,9 @@ from typing import (
     Iterator,
     Literal,
     Sequence,
+    Type,
     TypeAlias,
+    TypeVar,
     Union,
 )

@@ -17,6 +19,7 @@ from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
 from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.json import JsonChange, json_changes
+from inspect_ai._util.working import sample_working_time
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.log._message import LoggingMessage
 from inspect_ai.model._chat_message import ChatMessage
@@ -41,7 +44,10 @@ logger = getLogger(__name__)

 class BaseEvent(BaseModel):
     timestamp: datetime = Field(default_factory=datetime.now)
-    """
+    """Clock time at which event occurred."""
+
+    working_start: float = Field(default_factory=sample_working_time)
+    """Working time (within sample) at which the event occurred."""

     pending: bool | None = Field(default=None)
     """Is this event pending?"""
@@ -70,7 +76,7 @@ class SampleLimitEvent(BaseEvent):
     event: Literal["sample_limit"] = Field(default="sample_limit")
     """Event type."""

-    type: Literal["message", "time", "token", "operator", "custom"]
+    type: Literal["message", "time", "working", "token", "operator", "custom"]
     """Type of limit that halted processing"""

     message: str
@@ -133,6 +139,18 @@ class ModelEvent(BaseEvent):
     call: ModelCall | None = Field(default=None)
     """Raw call made to model API."""

+    completed: datetime | None = Field(default=None)
+    """Time that model call completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """working time for model call that succeeded (i.e. was not retried)."""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        if dt is None:
+            return None
+        return dt.astimezone().isoformat()
+

 class ToolEvent(BaseEvent):
     """Call to a tool."""
@@ -167,18 +185,28 @@ class ToolEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for tool."""

+    completed: datetime | None = Field(default=None)
+    """Time that tool call completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """Working time for tool call (i.e. time not spent waiting on semaphores)."""
+
     def _set_result(
         self,
         result: ToolResult,
         truncated: tuple[int, int] | None,
         error: ToolCallError | None,
         events: list["Event"],
+        waiting_time: float,
     ) -> None:
         self.result = result
         self.truncated = truncated
         self.error = error
         self.events = events
         self.pending = None
+        completed = datetime.now()
+        self.completed = completed
+        self.working_time = (completed - self.timestamp).total_seconds() - waiting_time

     # mechanism for operator to cancel the tool call

@@ -206,6 +234,45 @@ class ToolEvent(BaseEvent):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     """Required so that we can include '_task' as a member."""

+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+
+
+class SandboxEvent(BaseEvent):
+    """Sandbox execution or I/O"""
+
+    event: Literal["sandbox"] = Field(default="sandbox")
+    """Event type"""
+
+    action: Literal["exec", "read_file", "write_file"]
+    """Sandbox action"""
+
+    cmd: str | None = Field(default=None)
+    """Command (for exec)"""
+
+    options: dict[str, JsonValue] | None = Field(default=None)
+    """Options (for exec)"""
+
+    file: str | None = Field(default=None)
+    """File (for read_file and write_file)"""
+
+    input: str | None = Field(default=None)
+    """Input (for cmd and write_file). Truncated to 100 lines."""
+
+    result: int | None = Field(default=None)
+    """Result (for exec)"""
+
+    output: str | None = Field(default=None)
+    """Output (for exec and read_file). Truncated to 100 lines."""
+
+    completed: datetime | None = Field(default=None)
+    """Time that sandbox action completed (see `timestamp` for started)"""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+

 class ApprovalEvent(BaseEvent):
     """Tool approval."""
@@ -338,14 +405,26 @@ class SubtaskEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for subtask."""

+    completed: datetime | None = Field(default=None)
+    """Time that subtask completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+

 Event: TypeAlias = Union[
     SampleInitEvent
     | SampleLimitEvent
+    | SandboxEvent
     | StateEvent
     | StoreEvent
     | ModelEvent
     | ToolEvent
+    | SandboxEvent
     | ApprovalEvent
     | InputEvent
     | ScoreEvent
@@ -357,6 +436,8 @@ Event: TypeAlias = Union[
 ]
 """Event in a transcript."""

+ET = TypeVar("ET", bound=BaseEvent)
+

 class Transcript:
     """Transcript of events."""
@@ -396,6 +477,12 @@ class Transcript:
     def events(self) -> Sequence[Event]:
         return self._events

+    def find_last_event(self, event_cls: Type[ET]) -> ET | None:
+        for event in reversed(self.events):
+            if isinstance(event, event_cls):
+                return event
+        return None
+
     def _event(self, event: Event) -> None:
         self._events.append(event)
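The transcript changes above add completed/working_time fields to model, tool, and subtask events, introduce SandboxEvent, and add Transcript.find_last_event(). A hedged usage sketch (it assumes it runs inside an active sample, and imports the event classes from the private _transcript module; public export paths may differ):

# Illustrative sketch only: query the current sample transcript for the most
# recent event of a given type. find_last_event() scans events in reverse and
# returns None if no event of that class has been recorded.
from inspect_ai.log import transcript
from inspect_ai.log._transcript import ModelEvent, SandboxEvent

last_model = transcript().find_last_event(ModelEvent)
if last_model is not None and last_model.working_time is not None:
    print(f"last model call used {last_model.working_time:.2f}s of working time")

last_sandbox = transcript().find_last_event(SandboxEvent)
if last_sandbox is not None:
    print(f"last sandbox action: {last_sandbox.action}")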
inspect_ai/model/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from inspect_ai._util.content import (
     Content,
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -51,6 +52,7 @@ __all__ = [
     "CachePolicy",
     "ContentAudio",
     "ContentImage",
+    "ContentReasoning",
     "ContentText",
     "ContentVideo",
     "Content",
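With ContentReasoning now exported from inspect_ai.model, assistant messages can carry an explicit reasoning block ahead of their visible text. A minimal sketch (constructor arguments beyond content are left at their defaults):

# Illustrative sketch only: build an assistant message whose content holds a
# reasoning block followed by the visible answer text.
from inspect_ai.model import ChatMessageAssistant, ContentReasoning, ContentText

message = ChatMessageAssistant(
    content=[
        ContentReasoning(reasoning="First check the file, then summarize it."),
        ContentText(text="The file contains three sections."),
    ]
)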
inspect_ai/model/_call_tools.py
CHANGED
@@ -36,6 +36,7 @@ from inspect_ai._util.content import (
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.text import truncate_string_to_bytes
 from inspect_ai._util.trace import trace_action
+from inspect_ai._util.working import sample_waiting_time
 from inspect_ai.model._conversation import conversation_tool_mesage
 from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
 from inspect_ai.tool._tool import ToolApprovalError, ToolParsingError
@@ -180,6 +181,10 @@ async def call_tools(
         task = asyncio.create_task(call_tool_task(call))

         # create pending tool event and add it to the transcript
+        # (record the waiting time for the sample so we can compare
+        # it at the end to deduce total waiting time inside the tool
+        # call (in turn used to calculate working time)
+        waiting_time_start = sample_waiting_time()
         event = ToolEvent(
             id=call.id,
             function=call.function,
@@ -227,11 +232,13 @@ async def call_tools(
         conversation_tool_mesage(tool_message)

         # update the event with the results
+        waiting_time_end = sample_waiting_time()
         event._set_result(
             result=result_event.result,
             truncated=result_event.truncated,
             error=result_event.error,
             events=result_event.events,
+            waiting_time=waiting_time_end - waiting_time_start,
         )

         # return tool messages
@@ -407,7 +414,7 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
         return tuple(input)
     elif origin is dict or origin is Dict:
         if args and len(args) > 1:
-            return {k: tool_param(args[1], v) for k, v in input}
+            return {k: tool_param(args[1], v) for k, v in input.items()}
         else:
             return input
     elif origin is Union or origin is types.UnionType:
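The tool_param() change is a straight bug fix: iterating a dict directly yields only its keys, so unpacking k, v over the dict itself fails, while .items() yields the key/value pairs. A minimal illustration:

# Minimal illustration of the tool_param() fix above.
input = {"a": 1, "b": 2}

try:
    broken = {k: str(v) for k, v in input}  # unpacking keys raises ValueError
except ValueError as ex:
    print(ex)  # not enough values to unpack (expected 2, got 1)

fixed = {k: str(v) for k, v in input.items()}
print(fixed)  # {'a': '1', 'b': '2'}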
inspect_ai/model/_chat_message.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union

 from pydantic import BaseModel, Field, model_validator

-from inspect_ai._util.content import Content, ContentText
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError

@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
             self.content = text
         else:
             all_other = [content for content in self.content if content.type != "text"]
-            self.content = [ContentText(text=text)]
+            self.content = all_other + [ContentText(text=text)]


 class ChatMessageSystem(ChatMessageBase):
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""

-    reasoning: str | None = Field(default=None)
-    """Reasoning content."""
-
     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
     # content, however since this field doesn't exist in the OpenAI interface,
     # hosting providers (so far we've seen this with Together and Groq) may
@@ -110,12 +107,30 @@ class ChatMessageAssistant(ChatMessageBase):
     @classmethod
     def extract_reasoning(cls, data: Any) -> Any:
         if isinstance(data, dict):
+            # cleave apart <think> blocks
             content = data.get("content", None)
             if isinstance(content, str):
                 parsed = parse_content_with_reasoning(content)
                 if parsed:
-                    data["
-
+                    data["content"] = [
+                        ContentReasoning(reasoning=parsed.reasoning),
+                        ContentText(text=parsed.content),
+                    ]
+            # migrate messages that has explicit 'reasoning' field
+            # (which was our original representation of reasoning)
+            reasoning = data.get("reasoning", None)
+            if isinstance(reasoning, str):
+                # ensure that content is a list
+                content = data.get("content", None)
+                if content is None:
+                    data["content"] = []
+                elif isinstance(content, str):
+                    data["content"] = [ContentText(text=content)]
+                elif not isinstance(content, list):
+                    data["content"] = []
+                data["content"].insert(0, ContentReasoning(reasoning=reasoning))
+
+                del data["reasoning"]
         return data
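The extract_reasoning validator above both splits <think> blocks out of string content and migrates the legacy top-level reasoning field into a leading ContentReasoning block. A sketch of the migration path, assuming the validator runs in "before" mode on ChatMessageAssistant as in this release:

# Illustrative sketch only: validate a legacy message dict that still carries
# the old 'reasoning' field and observe the migrated content structure.
from inspect_ai.model import ChatMessageAssistant, ContentReasoning

legacy = {
    "role": "assistant",
    "content": "The answer is 42.",
    "reasoning": "Multiply 6 by 7.",
}

message = ChatMessageAssistant.model_validate(legacy)
# content is now a list: a ContentReasoning block first, then the text
assert isinstance(message.content[0], ContentReasoning)
assert message.content[0].reasoning == "Multiply 6 by 7."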
inspect_ai/model/_conversation.py
CHANGED
@@ -1,6 +1,7 @@
 from rich.console import RenderableType
 from rich.text import Text

+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
@@ -19,7 +20,7 @@ def conversation_tool_mesage(message: ChatMessageTool) -> None:
         message.error.message.strip() if message.error else message.text.strip()
     )
     if output:
-        content = lines_display(output,
+        content = lines_display(output, 50)

         conversation_panel(
             title=f"Tool Output: {message.function}",
@@ -41,14 +42,15 @@ def conversation_assistant_message(
     # build content
     content: list[RenderableType] = []

-    #
-    if message.
-        content.extend(
-
-
-
-
-
+    # deal with plain text or with content blocks
+    if isinstance(message.content, str):
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
+    else:
+        for c in message.content:
+            if isinstance(c, ContentReasoning):
+                content.extend(transcript_reasoning(c))
+            elif isinstance(c, ContentText) and c.text:
+                content.extend([transcript_markdown(c.text.strip(), escape=True)])

     # print tool calls
     if message.tool_calls:
inspect_ai/model/_generate_config.py
CHANGED
@@ -1,8 +1,8 @@
 from contextvars import ContextVar
 from copy import deepcopy
-from typing import Literal, Union
+from typing import Any, Literal, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing_extensions import TypedDict


@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""

-
+    reasoning_tokens: int | None
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None
     """Include reasoning in chat message history sent to generate."""


@@ -148,9 +151,27 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""

-
+    reasoning_tokens: int | None = Field(default=None)
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
+        default=None
+    )
     """Include reasoning in chat message history sent to generate."""

+    # migrate reasoning_history as a bool
+    @model_validator(mode="before")
+    @classmethod
+    def migrate_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            reasoning_history = data.get("reasoning_history", None)
+            if reasoning_history is True:
+                data["reasoning_history"] = "all"
+            elif reasoning_history is False:
+                data["reasoning_history"] = "none"
+
+        return data
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":