PyPI - inspect-ai - Versions diffs - 0.3.89__py3-none-any.whl → 0.3.91__py3-none-any.whl - Mend

inspect-ai 0.3.89__py3-none-any.whl → 0.3.91__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (370) hide show

inspect_ai/log/_file.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import re
 from logging import getLogger
+from pathlib import Path
 from typing import Any, Callable, Generator, Literal
 from pydantic import BaseModel
@@ -97,7 +98,7 @@ def list_eval_logs(
 def write_eval_log(
     log: EvalLog,
-    location: str | FileInfo | None = None,
+    location: str | Path | FileInfo | None = None,
     format: Literal["eval", "json", "auto"] = "auto",
 ) -> None:
     """Write an evaluation log.
@@ -121,7 +122,7 @@ def write_eval_log(
 async def write_eval_log_async(
     log: EvalLog,
-    location: str | FileInfo | None = None,
+    location: str | Path | FileInfo | None = None,
     format: Literal["eval", "json", "auto"] = "auto",
 ) -> None:
     """Write an evaluation log.
@@ -140,7 +141,13 @@ async def write_eval_log_async(
             raise ValueError(
                 "EvalLog passe to write_eval_log does not have a location, so you must pass an explicit location"
             )
-    location = location if isinstance(location, str) else location.name
+    location = (
+        location
+        if isinstance(location, str)
+        else location.as_posix()
+        if isinstance(location, Path)
+        else location.name
+    )
     logger.debug(f"Writing eval log to {location}")
@@ -197,7 +204,7 @@ def write_log_dir_manifest(
 def read_eval_log(
-    log_file: str | EvalLogInfo,
+    log_file: str | Path | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -235,7 +242,7 @@ def read_eval_log(
 async def read_eval_log_async(
-    log_file: str | EvalLogInfo,
+    log_file: str | Path | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -255,7 +262,13 @@ async def read_eval_log_async(
        EvalLog object read from file.
     """
     # resolve to file path
-    log_file = log_file if isinstance(log_file, str) else log_file.name
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
     logger.debug(f"Reading eval log from {log_file}")
     # get recorder type
@@ -291,7 +304,7 @@ def read_eval_log_headers(
 async def read_eval_log_headers_async(
-    log_files: list[str] | list[EvalLogInfo],
+    log_files: list[str] | list[Path] | list[EvalLogInfo],
 ) -> list[EvalLog]:
     return [
         await read_eval_log_async(log_file, header_only=True) for log_file in log_files
@@ -299,7 +312,7 @@ async def read_eval_log_headers_async(
 def read_eval_log_sample(
-    log_file: str | EvalLogInfo,
+    log_file: str | Path | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -336,7 +349,7 @@ def read_eval_log_sample(
 async def read_eval_log_sample_async(
-    log_file: str | EvalLogInfo,
+    log_file: str | Path | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -360,7 +373,13 @@ async def read_eval_log_sample_async(
        IndexError: If the passed id and epoch are not found.
     """
     # resolve to file path
-    log_file = log_file if isinstance(log_file, str) else log_file.name
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
     if format == "auto":
         recorder_type = recorder_type_for_location(log_file)
@@ -375,7 +394,7 @@ async def read_eval_log_sample_async(
 def read_eval_log_samples(
-    log_file: str | EvalLogInfo,
+    log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",

inspect_ai/log/_log.py CHANGED Viewed

@@ -87,6 +87,9 @@ class EvalConfig(BaseModel):
     of samples fails.
     """
+    retry_on_error: int | None = Field(default=None)
+    """Number of times to retry samples if they encounter errors."""
     message_limit: int | None = Field(default=None)
     """Maximum messages to allow per sample."""
@@ -255,6 +258,9 @@ class EvalSample(BaseModel):
     error: EvalError | None = Field(default=None)
     """Error that halted sample."""
+    error_retries: list[EvalError] | None = Field(default=None)
+    """Errors that were retried for this sample."""
     attachments: dict[str, str] = Field(default_factory=dict)
     """Attachments referenced from messages and events.
@@ -703,7 +709,7 @@ def rich_traceback(
         exc_value=exc_value,
         traceback=exc_traceback,
         suppress=[click, asyncio, tenacity, sys.modules[PKG_NAME]],
-        show_locals=False,
+        show_locals=os.environ.get("INSPECT_TRACEBACK_LOCALS", None) == "1",
         width=CONSOLE_DISPLAY_WIDTH,
     )
     return rich_tb

inspect_ai/log/_recorders/eval.py CHANGED Viewed

@@ -329,6 +329,9 @@ class ZipLogFile:
                         limit=f"{sample.limit.type}"
                         if sample.limit is not None
                         else None,
+                        retries=len(sample.error_retries)
+                        if sample.error_retries is not None
+                        else None,
                     )
                 )
             self._samples.clear()

inspect_ai/log/_recorders/types.py CHANGED Viewed

@@ -20,6 +20,7 @@ class SampleSummary(BaseModel):
     scores: dict[str, Score] | None = Field(default=None)
     error: str | None = Field(default=None)
     limit: str | None = Field(default=None)
+    retries: int | None = Field(default=None)
     @model_validator(mode="after")
     def thin_scores(self) -> "SampleSummary":

inspect_ai/log/_samples.py CHANGED Viewed

@@ -18,6 +18,7 @@ class ActiveSample:
         self,
         *,
         task: str,
+        log_location: str,
         model: str,
         sample: Sample,
         epoch: int,
@@ -33,6 +34,7 @@ class ActiveSample:
         self.started: float | None = None
         self.completed: float | None = None
         self.task = task
+        self.log_location = log_location
         self.model = model
         self.sample = sample
         self.epoch = epoch
@@ -76,6 +78,7 @@ def init_active_samples() -> None:
 async def active_sample(
     *,
     task: str,
+    log_location: str,
     model: str,
     sample: Sample,
     epoch: int,
@@ -89,6 +92,7 @@ async def active_sample(
     # create the sample
     active = ActiveSample(
         task=task,
+        log_location=log_location,
         model=model,
         sample=sample,
         epoch=epoch,

inspect_ai/model/_call_tools.py CHANGED Viewed

@@ -3,6 +3,7 @@ import json
 import types
 from copy import copy
 from dataclasses import is_dataclass
+from datetime import date, datetime, time
 from logging import getLogger
 from textwrap import dedent
 from types import UnionType
@@ -13,6 +14,8 @@ from typing import (
     List,
     NamedTuple,
     Optional,
+    Sequence,
+    Set,
     Tuple,
     Type,
     Union,
@@ -45,7 +48,12 @@ from inspect_ai._util.working import sample_waiting_time
 from inspect_ai.model._display import display_conversation_message
 from inspect_ai.model._model_output import ModelOutput
 from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
-from inspect_ai.tool._tool import ToolApprovalError, ToolParsingError, ToolResult
+from inspect_ai.tool._tool import (
+    ToolApprovalError,
+    ToolParsingError,
+    ToolResult,
+    ToolSource,
+)
 from inspect_ai.tool._tool_call import ToolCallContent, ToolCallError
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.tool._tool_info import parse_docstring
@@ -83,7 +91,7 @@ class ExecuteToolsResult(NamedTuple):
 async def execute_tools(
     messages: list[ChatMessage],
-    tools: list[Tool] | list[ToolDef] | list[Tool | ToolDef],
+    tools: Sequence[Tool | ToolDef | ToolSource] | ToolSource,
     max_output: int | None = None,
 ) -> ExecuteToolsResult:
     """Perform tool calls in the last assistant message.
@@ -108,7 +116,7 @@ async def execute_tools(
             transcript,
         )
-        tdefs = tool_defs(tools)
+        tdefs = await tool_defs(tools)
         async def call_tool_task(
             call: ToolCall,
@@ -385,7 +393,6 @@ async def call_tool(
         # normal tool call
         else:
-            arguments = tool_params(call.arguments, tool_def.tool)
             result: ToolResult = await tool_def.tool(**arguments)
             return result, [], None, None
@@ -498,10 +505,7 @@ def prepend_agent_name(
 def tools_info(
-    tools: list[Tool]
-    | list[ToolDef]
-    | list[ToolInfo]
-    | list[Tool | ToolDef | ToolInfo],
+    tools: Sequence[Tool | ToolDef | ToolInfo],
 ) -> list[ToolInfo]:
     tools_info: list[ToolInfo] = []
     for tool in tools:
@@ -521,16 +525,14 @@ def tools_info(
 def disable_parallel_tools(
-    tools: list[Tool]
-    | list[ToolDef]
-    | list[ToolInfo]
-    | list[Tool | ToolDef | ToolInfo],
+    tools: Sequence[Tool | ToolDef | ToolInfo | ToolSource] | ToolSource,
 ) -> bool:
-    for tool in tools:
-        if isinstance(tool, Tool):
-            tool = ToolDef(tool)
-        if isinstance(tool, ToolDef) and not tool.parallel:
-            return True
+    if not isinstance(tools, ToolSource):
+        for tool in tools:
+            if isinstance(tool, Tool):
+                tool = ToolDef(tool)
+            if isinstance(tool, ToolDef) and not tool.parallel:
+                return True
     return False
@@ -598,6 +600,15 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
                 raise ToolParsingError(
                     f"Unable to convert '{input}' to {type_hint.__name__}"
                 )
+        elif type_hint == datetime:
+            if input.endswith("Z"):
+                # convert trailing Z to +00:00
+                input = input[:-1] + "+00:00"
+            return datetime.fromisoformat(input)
+        elif type_hint == date:
+            return date.fromisoformat(input)
+        elif type_hint == time:
+            return time.fromisoformat(input)
         elif is_typeddict(type_hint):
             typeddict_data: dict[str, Any] = {}
             annotations = get_type_hints(type_hint)
@@ -619,6 +630,11 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
             return [tool_param(args[0], x) for x in input]
         else:
             return input
+    elif origin is set or origin is Set:
+        if args:
+            return {tool_param(args[0], x) for x in input}
+        else:
+            return set(input)
     elif origin is tuple or origin is Tuple:
         if args:
             return tuple([tool_param(args[0], x) for x in input])

inspect_ai/model/_generate_config.py CHANGED Viewed

@@ -29,7 +29,7 @@ class GenerateConfigArgs(TypedDict, total=False):
     """Type for kwargs that selectively override GenerateConfig."""
     max_retries: int | None
-    """Maximum number of times to retry request (defaults to 5)."""
+    """Maximum number of times to retry request (defaults to unlimited)."""
     timeout: int | None
     """Request timeout (in seconds)."""
@@ -97,6 +97,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_tokens: int | None
     """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+    reasoning_summary: Literal["concise", "detailed", "auto"] | None
+    """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""
     reasoning_history: Literal["none", "all", "last", "auto"] | None
     """Include reasoning in chat message history sent to generate."""
@@ -108,7 +111,7 @@ class GenerateConfig(BaseModel):
     """Model generation options."""
     max_retries: int | None = Field(default=None)
-    """Maximum number of times to retry request (defaults to 5)."""
+    """Maximum number of times to retry request (defaults to unlimited)."""
     timeout: int | None = Field(default=None)
     """Request timeout (in seconds)."""
@@ -176,6 +179,11 @@ class GenerateConfig(BaseModel):
     reasoning_tokens: int | None = Field(default=None)
     """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+    reasoning_summary: Literal["concise", "detailed", "auto"] | None = Field(
+        default=None
+    )
+    """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""
     reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
         default=None
     )

inspect_ai/model/_model.py CHANGED Viewed

@@ -9,7 +9,15 @@ from contextvars import ContextVar
 from copy import copy, deepcopy
 from datetime import datetime
 from types import TracebackType
-from typing import Any, AsyncIterator, Callable, Literal, Type, cast
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Literal,
+    Sequence,
+    Type,
+    cast,
+)
 from pydantic_core import to_jsonable_python
 from tenacity import (
@@ -45,6 +53,7 @@ from inspect_ai._util.retry import report_http_retry
 from inspect_ai._util.trace import trace_action
 from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
+from inspect_ai.tool._tool import ToolSource
 from inspect_ai.tool._tool_call import ToolCallModelInputHints
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -54,7 +63,9 @@ from ._call_tools import (
     disable_parallel_tools,
     execute_tools,
     tool_call_view,
-    tools_info,
+)
+from ._call_tools import (
+    tools_info as get_tools_info,
 )
 from ._chat_message import (
     ChatMessage,
@@ -326,10 +337,7 @@ class Model:
     async def generate(
         self,
         input: str | list[ChatMessage],
-        tools: list[Tool]
-        | list[ToolDef]
-        | list[ToolInfo]
-        | list[Tool | ToolDef | ToolInfo] = [],
+        tools: Sequence[Tool | ToolDef | ToolInfo | ToolSource] | ToolSource = [],
         tool_choice: ToolChoice | None = None,
         config: GenerateConfig = GenerateConfig(),
         cache: bool | CachePolicy = False,
@@ -422,7 +430,7 @@ class Model:
     async def generate_loop(
         self,
         input: str | list[ChatMessage],
-        tools: list[Tool] | list[ToolDef] | list[Tool | ToolDef] = [],
+        tools: Sequence[Tool | ToolDef | ToolSource] | ToolSource = [],
         config: GenerateConfig = GenerateConfig(),
         cache: bool | CachePolicy = False,
     ) -> tuple[list[ChatMessage], ModelOutput]:
@@ -471,10 +479,7 @@ class Model:
     async def _generate(
         self,
         input: list[ChatMessage],
-        tools: list[Tool]
-        | list[ToolDef]
-        | list[ToolInfo]
-        | list[Tool | ToolDef | ToolInfo],
+        tools: Sequence[Tool | ToolDef | ToolInfo | ToolSource] | ToolSource,
         tool_choice: ToolChoice | None,
         config: GenerateConfig,
         cache: bool | CachePolicy = False,
@@ -482,15 +487,30 @@ class Model:
         # default to 'auto' for tool_choice (same as underlying model apis)
         tool_choice = tool_choice if tool_choice else "auto"
+        # resolve top level tool source
+        if isinstance(tools, ToolSource):
+            tools = await tools.tools()
+        # resolve tool sources
+        resolved_tools: list[Tool | ToolDef | ToolInfo] = []
+        for tool in tools:
+            if isinstance(tool, ToolSource):
+                source_tools = await tool.tools()
+                resolved_tools.extend(source_tools)
+            else:
+                resolved_tools.append(tool)
         # extract tool defs if we can
-        tdefs = tool_defs([tool for tool in tools if not isinstance(tool, ToolInfo)])
+        tdefs = await tool_defs(
+            [tool for tool in resolved_tools if not isinstance(tool, ToolInfo)]
+        )
         # resolve all tools into tool_info
-        tools = tools_info(tools)
+        tools_info = get_tools_info(resolved_tools)
         # if we have a specific tool selected then filter out the others
         if isinstance(tool_choice, ToolFunction):
-            tools = [tool for tool in tools if tool.name == tool_choice.name]
+            tools_info = [tool for tool in tools_info if tool.name == tool_choice.name]
         # if tool_choice is "none" or if there are no tools then fully purge
         # the tools (as some models (e.g. openai and mistral) get confused
@@ -498,11 +518,11 @@ class Model:
         # (they both 'semi' use the tool by placing the arguments in JSON
         # in their output!). on the other hand, anthropic actually errors if
         # there are tools anywhere in the message stream and no tools defined.
-        if tool_choice == "none" or len(tools) == 0:
+        if tool_choice == "none" or len(tools_info) == 0:
             # allow model providers to implement a tools_required() method to
             # force tools to be passed (we need this for anthropic)
             if not self.api.tools_required():
-                tools = []
+                tools_info = []
             tool_choice = "none"
         # handle reasoning history
@@ -569,13 +589,13 @@ class Model:
                     model=str(self),
                     policy=policy,
                     tool_choice=tool_choice,
-                    tools=tools,
+                    tools=tools_info,
                 )
                 existing = cache_fetch(cache_entry)
                 if isinstance(existing, ModelOutput):
                     self._record_model_interaction(
                         input=input,
-                        tools=tools,
+                        tools=tools_info,
                         tool_choice=tool_choice,
                         config=config,
                         cache="read",
@@ -593,7 +613,7 @@ class Model:
             # (we'll update it with the results once we have them)
             complete = self._record_model_interaction(
                 input=input,
-                tools=tools,
+                tools=tools_info,
                 tool_choice=tool_choice,
                 config=config,
                 cache="write" if cache else None,
@@ -604,7 +624,7 @@ class Model:
                 try:
                     result = await self.api.generate(
                         input=input,
-                        tools=tools,
+                        tools=tools_info,
                         tool_choice=tool_choice,
                         config=config,
                     )
@@ -1371,7 +1391,7 @@ def combine_messages(
 def log_model_retry(model_name: str, retry_state: RetryCallState) -> None:
     logger.log(
         HTTP,
-        f"-> {model_name} retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}",
+        f"-> {model_name} retry {retry_state.attempt_number} (retrying in {retry_state.upcoming_sleep:,.0f} seconds)",
     )

inspect_ai/model/_model_output.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Any, Literal, Type
 from pydantic import BaseModel, Field, JsonValue, model_validator
+from inspect_ai._util.content import Content
 from inspect_ai.tool._tool_call import ToolCall
 from ._chat_message import ChatMessageAssistant
@@ -165,7 +166,7 @@ class ModelOutput(BaseModel):
     @staticmethod
     def from_content(
         model: str,
-        content: str,
+        content: str | list[Content],
         stop_reason: StopReason = "stop",
         error: str | None = None,
     ) -> "ModelOutput":

inspect_ai/model/_openai.py CHANGED Viewed

@@ -82,16 +82,16 @@ def is_o_series(name: str) -> bool:
         return not is_gpt(name) and bool(re.search(r"o\d+", name))
-def is_o1_pro(name: str) -> bool:
-    return "o1-pro" in name
+def is_o1(name: str) -> bool:
+    return "o1" in name and not is_o1_early(name)
-def is_o1_mini(name: str) -> bool:
-    return "o1-mini" in name
+def is_o1_early(name: str) -> bool:
+    return "o1-mini" in name or "o1-preview" in name
-def is_o1_preview(name: str) -> bool:
-    return "o1-preview" in name
+def is_o3_mini(name: str) -> bool:
+    return "o3-mini" in name
 def is_computer_use_preview(name: str) -> bool:
@@ -423,10 +423,12 @@ def chat_messages_from_openai(
                 "reasoning", None
             )
             if reasoning is not None:
+                # normalize content to an array
                 if isinstance(content, str):
                     content = [ContentText(text=content, refusal=refusal)]
-                else:
-                    content.insert(0, ContentReasoning(reasoning=str(reasoning)))
+                # insert reasoning
+                content.insert(0, ContentReasoning(reasoning=str(reasoning)))
             # return message
             if "tool_calls" in message:

inspect-ai 0.3.89__py3-none-any.whl → 0.3.91__py3-none-any.whl

inspect-ai 0.3.89__py3-none-any.whl → 0.3.91__py3-none-any.whl