PyPI - inspect-ai - Versions diffs - 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl - Mend

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (430)

inspect_ai/dataset/_dataset.py CHANGED Viewed

@@ -144,6 +144,14 @@ class Dataset(Sequence[Sample], abc.ABC):
     @abc.abstractmethod
     def shuffled(self) -> bool: ...
+    @abc.abstractmethod
+    def shuffle_choices(self, seed: int | None = None) -> None:
+        """Shuffle the order of the choices with each sample.
+        Args:
+           seed: (int | None): Random seed for shuffling (optional).
+        """
     @overload
     def __getitem__(self, index: int) -> Sample: ...
@@ -315,6 +323,34 @@ class MemoryDataset(Dataset):
             random.shuffle(self.samples)
         self._shuffled = True
+    @override
+    def shuffle_choices(self, seed: int | None = None) -> None:
+        rand = random.Random(seed)
+        for sample in self.samples:
+            if not sample.choices:
+                continue
+            # The original positions
+            positions = list(range(len(sample.choices)))
+            # Shuffle the choices
+            rand.shuffle(positions)
+            shuffled_choices = [sample.choices[i] for i in positions]
+            # Map of original position / target letter
+            position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
+            # Update to the shuffled choices and target
+            sample.choices = shuffled_choices
+            sample.target = self._remap_target(sample.target, position_map=position_map)
+    def _remap_target(
+        self, target: str | list[str], position_map: dict[int, str]
+    ) -> str | list[str]:
+        if isinstance(target, list):
+            return [position_map[ord(t) - 65] for t in target]
+        else:
+            return position_map[ord(target) - 65]
     @override
     def sort(
         self,

inspect_ai/dataset/_sources/csv.py CHANGED Viewed

@@ -23,6 +23,7 @@ def csv_dataset(
     auto_id: bool = False,
     shuffle: bool = False,
     seed: int | None = None,
+    shuffle_choices: bool | int | None = None,
     limit: int | None = None,
     dialect: str = "unix",
     encoding: str = "utf-8",
@@ -45,6 +46,7 @@ def csv_dataset(
         auto_id (bool): Assign an auto-incrementing ID for each sample.
         shuffle (bool): Randomly shuffle the dataset order.
         seed: (int | None): Seed used for random shuffle.
+        shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
         limit (int | None): Limit the number of records to read.
         dialect (str): CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
         encoding (str): Text encoding for file (defaults to "utf-8").
@@ -86,6 +88,12 @@ def csv_dataset(
         if shuffle:
             dataset.shuffle(seed=seed)
+        # shuffle choices, if requested
+        if isinstance(shuffle_choices, int):
+            dataset.shuffle_choices(seed=shuffle_choices)
+        elif shuffle_choices is True:
+            dataset.shuffle_choices()
         # limit if requested
         if limit:
             return dataset[0:limit]

inspect_ai/dataset/_sources/file.py CHANGED Viewed

@@ -16,6 +16,7 @@ def file_dataset(
     auto_id: bool = False,
     shuffle: bool = False,
     seed: int | None = None,
+    shuffle_choices: bool | int | None = None,
     limit: int | None = None,
     dialect: str = "unix",
     encoding: str = "utf-8",
@@ -40,6 +41,7 @@ def file_dataset(
         auto_id (bool): Assign an auto-incrementing ID for each sample.
         shuffle (bool): Randomly shuffle the dataset order.
         seed: (int | None): Seed used for random shuffle.
+        shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
         limit (int | None): Limit the number of records to read.
         dialect (str): CSV dialect ("unix" or "excel", defaults to "unix"). Only
             applies to reading CSV files.
@@ -66,6 +68,7 @@ def file_dataset(
                 auto_id=auto_id,
                 shuffle=shuffle,
                 seed=seed,
+                shuffle_choices=shuffle_choices,
                 limit=limit,
                 encoding=encoding,
                 name=name,
@@ -78,6 +81,7 @@ def file_dataset(
                 auto_id=auto_id,
                 shuffle=shuffle,
                 seed=seed,
+                shuffle_choices=shuffle_choices,
                 limit=limit,
                 dialect=dialect,
                 encoding=encoding,

inspect_ai/dataset/_sources/hf.py CHANGED Viewed

@@ -29,6 +29,7 @@ def hf_dataset(
     auto_id: bool = False,
     shuffle: bool = False,
     seed: int | None = None,
+    shuffle_choices: bool | int | None = None,
     limit: int | None = None,
     trust: bool = False,
     cached: bool = True,
@@ -59,6 +60,7 @@ def hf_dataset(
         auto_id (bool): Assign an auto-incrementing ID for each sample.
         shuffle (bool): Randomly shuffle the dataset order.
         seed: (int | None): Seed used for random shuffle.
+        shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
         limit (int | None): Limit the number of records to read.
         trust (bool): Whether or not to allow for datasets defined on the Hub
           using a dataset script. This option should only be set to True for
@@ -117,8 +119,16 @@ def hf_dataset(
         dataset = dataset.select(range(limit))
     # return the dataset
-    return MemoryDataset(
+    memory_dataset = MemoryDataset(
         samples=data_to_samples(dataset.to_list(), data_to_sample, auto_id),
         name=Path(path).stem if Path(path).exists() else path,
         location=path,
     )
+    # maybe shuffle the choices
+    if isinstance(shuffle_choices, int):
+        memory_dataset.shuffle_choices(seed=shuffle_choices)
+    elif shuffle_choices is True:
+        memory_dataset.shuffle_choices()
+    return memory_dataset

inspect_ai/dataset/_sources/json.py CHANGED Viewed

@@ -25,6 +25,7 @@ def json_dataset(
     auto_id: bool = False,
     shuffle: bool = False,
     seed: int | None = None,
+    shuffle_choices: bool | int | None = None,
     limit: int | None = None,
     encoding: str = "utf-8",
     name: str | None = None,
@@ -49,6 +50,7 @@ def json_dataset(
       auto_id (bool): Assign an auto-incrementing ID for each sample.
       shuffle (bool): Randomly shuffle the dataset order.
       seed: (int | None): Seed used for random shuffle.
+      shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
       limit (int | None): Limit the number of records to read.
       encoding (str): Text encoding for file (defaults to "utf-8").
       name (str): Optional name for dataset (for logging). If not specified,
@@ -86,6 +88,12 @@ def json_dataset(
         if shuffle:
             dataset.shuffle(seed=seed)
+        # shuffle choices, if requested
+        if isinstance(shuffle_choices, int):
+            dataset.shuffle_choices(seed=shuffle_choices)
+        elif shuffle_choices is True:
+            dataset.shuffle_choices()
         # limit if requested
         if limit:
             return dataset[0:limit]

inspect_ai/log/_log.py CHANGED Viewed

@@ -17,12 +17,7 @@ from inspect_ai._util.error import EvalError, exception_message
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
 from inspect_ai.dataset._dataset import MT, metadata_as
-from inspect_ai.model import (
-    ChatMessage,
-    GenerateConfig,
-    ModelOutput,
-    ModelUsage,
-)
+from inspect_ai.model import ChatMessage, GenerateConfig, ModelOutput, ModelUsage
 from inspect_ai.scorer import Score
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._store import Store
@@ -404,6 +399,8 @@ class EvalResults(BaseModel):
             if "metrics" in values:
                 metrics = values["metrics"]
                 del values["metrics"]
+            else:
+                metrics = None
             # Convert the scorer to the new schema
             score = values["scorer"]
             if metrics:

inspect_ai/log/_message.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Any, Literal, Type, cast
 from pydantic import BaseModel, Field, model_validator
 LoggingLevel = Literal[
-    "debug", "http", "sandbox", "info", "warning", "error", "critical"
+    "debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
 ]
 """Logging level."""

inspect_ai/log/_recorders/eval.py CHANGED Viewed

@@ -203,7 +203,7 @@ class EvalRecorder(FileRecorder):
         # of small fetches from the zip file streams)
         temp_log: str | None = None
         fs = filesystem(location)
-        if not fs.is_local():
+        if not fs.is_local() and header_only is False:
             with tempfile.NamedTemporaryFile(delete=False) as temp:
                 temp_log = temp.name
                 fs.get_file(location, temp_log)

inspect_ai/log/_recorders/json.py CHANGED Viewed

@@ -9,12 +9,7 @@ from typing_extensions import override
 from inspect_ai._util.constants import LOG_SCHEMA_VERSION
 from inspect_ai._util.error import EvalError
-from inspect_ai._util.file import (
-    absolute_file_path,
-    async_fileystem,
-    file,
-    filesystem,
-)
+from inspect_ai._util.file import absolute_file_path, async_fileystem, file, filesystem
 from inspect_ai._util.trace import trace_action
 from .._log import (
@@ -236,12 +231,13 @@ def _read_header_streaming(log_file: str) -> EvalLog:
         f.seek(0)
         # Parse the log file, stopping before parsing samples
+        status: Literal["started", "success", "cancelled", "error"] | None = None
         for k, v in ijson.kvitems(f, ""):
             if k == "status":
                 assert v in get_args(
                     Literal["started", "success", "cancelled", "error"]
                 )
-                status: Literal["started", "success", "cancelled", "error"] = v
+                status = v
             if k == "eval":
                 eval = EvalSpec(**v)
             elif k == "plan":
@@ -257,6 +253,8 @@ def _read_header_streaming(log_file: str) -> EvalLog:
                 error = EvalError(**v)
                 break
+    assert status, "Must encounter a 'status'"
     return EvalLog(
         eval=eval,
         plan=plan,

inspect_ai/model/_call_tools.py CHANGED Viewed

@@ -133,7 +133,8 @@ async def call_tools(
             ):
                 content: str | list[Content] = [result]
             elif isinstance(result, list) and (
-                isinstance(
+                len(result) == 0
+                or isinstance(
                     result[0], ContentText | ContentImage | ContentAudio | ContentVideo
                 )
             ):

inspect_ai/model/_chat_message.py CHANGED Viewed

@@ -7,6 +7,8 @@ from inspect_ai._util.content import Content, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError
+from ._reasoning import parse_content_with_reasoning
 logger = getLogger(__name__)
@@ -83,6 +85,31 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""
+    reasoning: str | None = Field(default=None)
+    """Reasoning content."""
+    # Some OpenAI compatible REST endpoints include reasoning as a field alongside
+    # content, however since this field doesn't exist in the OpenAI interface,
+    # hosting providers (so far we've seen this with Together and Groq) may
+    # include the reasoning in a <think></think> tag before the main response.
+    # We expect this pattern to be repeated elsewhere, so include this hook to
+    # automatically extract the reasoning content when the response is prefaced
+    # with a <think> block. If this ends up being an overeach we can fall back
+    # to each provider manually parsing out <think> using a helper function.
+    # The implementation isn't important here, the critical thing to establish
+    # is that Inspect makes reasoning content available separately.
+    @model_validator(mode="before")
+    @classmethod
+    def extract_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            content = data.get("content", None)
+            if isinstance(content, str):
+                parsed = parse_content_with_reasoning(content)
+                if parsed:
+                    data["reasoning"] = parsed.reasoning
+                    data["content"] = parsed.content
+        return data
 class ChatMessageTool(ChatMessageBase):
     role: Literal["tool"] = Field(default="tool")

inspect_ai/model/_conversation.py CHANGED Viewed

@@ -2,7 +2,7 @@ from rich.console import RenderableType
 from rich.text import Text
 from inspect_ai._util.rich import lines_display
-from inspect_ai._util.transcript import transcript_markdown
+from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
 from inspect_ai.util._display import display_type
@@ -38,8 +38,15 @@ def conversation_assistant_message(
                 content=transcript_markdown(m.text, escape=True),
             )
-        # start with assistant content
-        content: list[RenderableType] = (
+        # build content
+        content: list[RenderableType] = []
+        # reasoning
+        if message.reasoning:
+            content.extend(transcript_reasoning(message.reasoning))
+        # message text
+        content.extend(
             [transcript_markdown(message.text, escape=True)] if message.text else []
         )

inspect_ai/model/_generate_config.py CHANGED Viewed

@@ -75,6 +75,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+    reasoning_history: bool | None
+    """Include reasoning in chat message history sent to generate."""
 class GenerateConfig(BaseModel):
     """Base class for model generation configs."""
@@ -145,6 +148,9 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+    reasoning_history: bool | None = Field(default=None)
+    """Include reasoning in chat message history sent to generate."""
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":

inspect_ai/model/_model.py CHANGED Viewed

@@ -168,6 +168,10 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False
+    def has_reasoning_history(self) -> bool:
+        """Chat message assistant messages can include reasoning."""
+        return False
 class Model:
     """Model interface."""
@@ -302,6 +306,11 @@ class Model:
                 tools = []
             tool_choice = "none"
+        # handle reasoning history
+        input = resolve_reasoning_history(
+            input, config, self.api.has_reasoning_history()
+        )
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)
@@ -726,6 +735,71 @@ def simple_input_messages(
     return messages
+def resolve_reasoning_history(
+    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+) -> list[ChatMessage]:
+    # determine if we are including reasoning history
+    reasoning_history = config.reasoning_history is not False
+    # determine up front if we have any reasoning content
+    have_reasoning = any(
+        [
+            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            for m in messages
+        ]
+    )
+    if not have_reasoning:
+        return messages
+    # API asssistant message format directly supports reasoning history so we will:
+    #   (a) Remove reasoning content entirely if config says not to include it; or
+    #   (b) Leave the messages alone if config says to include it
+    if api_has_reasoning_history:
+        # remove reasoning history as per config
+        if not reasoning_history:
+            resolved_messages: list[ChatMessage] = []
+            for message in messages:
+                if isinstance(message, ChatMessageAssistant):
+                    resolved_messages.append(
+                        message.model_copy(update={"reasoning": None})
+                    )
+                else:
+                    resolved_messages.append(message)
+            return resolved_messages
+        # include reasoning history as per config
+        else:
+            return messages
+    # API can't represent reasoning natively so include <think> tags
+    elif reasoning_history:
+        resolved_messages = []
+        for message in messages:
+            if (
+                isinstance(message, ChatMessageAssistant)
+                and message.reasoning is not None
+            ):
+                message = deepcopy(message)
+                if isinstance(message.content, str):
+                    message.content = (
+                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
+                    )
+                else:
+                    message.content.insert(
+                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
+                    )
+                message.reasoning = None
+            resolved_messages.append(message)
+        return resolved_messages
+    # api doesn't handle reasoning and config says no reasoning_history, nothing to do
+    else:
+        return messages
 def resolve_tool_model_input(
     tdefs: list[ToolDef], messages: list[ChatMessage]
 ) -> list[ChatMessage]:

inspect_ai/model/_openai.py CHANGED Viewed

@@ -43,10 +43,18 @@ from ._chat_message import (
 from ._model_output import ModelUsage, StopReason, as_stop_reason
+def is_o_series(name: str) -> bool:
+    return is_o1(name) or is_o3(name)
 def is_o1(name: str) -> bool:
     return name.startswith("o1")
+def is_o3(name: str) -> bool:
+    return name.startswith("o3")
 def is_o1_full(name: str) -> bool:
     return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
@@ -55,10 +63,18 @@ def is_o1_mini(name: str) -> bool:
     return name.startswith("o1-mini")
+def is_o3_mini(name: str) -> bool:
+    return name.startswith("o3-mini")
 def is_o1_preview(name: str) -> bool:
     return name.startswith("o1-preview")
+def is_gpt(name: str) -> bool:
+    return name.startswith("gpt")
 def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
     return ChatCompletionMessageToolCall(
         type="function",
@@ -296,6 +312,14 @@ def chat_messages_from_openai(
             else:
                 content = [content_from_openai(c) for c in asst_content]
+            # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
+            # interfaces e.g. DeepSeek do include this field so we pluck it out)
+            reasoning = message.get("reasoning_content", None) or message.get(
+                "reasoning", None
+            )
+            if reasoning is not None:
+                reasoning = str(reasoning)
             # return message
             if "tool_calls" in message:
                 tool_calls: list[ToolCall] = []
@@ -306,7 +330,11 @@ def chat_messages_from_openai(
             else:
                 tool_calls = []
             chat_messages.append(
-                ChatMessageAssistant(content=content, tool_calls=tool_calls or None)
+                ChatMessageAssistant(
+                    content=content,
+                    tool_calls=tool_calls or None,
+                    reasoning=reasoning,
+                )
             )
         elif message["role"] == "tool":
             tool_content = message.get("content", None) or ""
@@ -357,10 +385,14 @@ def chat_message_assistant_from_openai(
     message: ChatCompletionMessage, tools: list[ToolInfo]
 ) -> ChatMessageAssistant:
     refusal = getattr(message, "refusal", None)
+    reasoning = getattr(message, "reasoning_content", None) or getattr(
+        message, "reasoning", None
+    )
     return ChatMessageAssistant(
         content=refusal or message.content or "",
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
+        reasoning=reasoning,
     )

inspect_ai/model/_providers/anthropic.py CHANGED Viewed

@@ -12,6 +12,7 @@ else:
 from anthropic import (
     APIConnectionError,
+    APIStatusError,
     AsyncAnthropic,
     AsyncAnthropicBedrock,
     AsyncAnthropicVertex,
@@ -218,6 +219,17 @@ class AnthropicAPI(ModelAPI):
         except BadRequestError as ex:
             return self.handle_bad_request(ex), model_call()
+        except APIStatusError as ex:
+            if ex.status_code == 413:
+                return ModelOutput.from_content(
+                    model=self.model_name,
+                    content=ex.message,
+                    stop_reason="model_length",
+                    error=ex.message,
+                ), model_call()
+            else:
+                raise ex
     def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
         params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
         if config.temperature is not None:

inspect_ai/model/_providers/groq.py CHANGED Viewed

@@ -294,8 +294,12 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
 def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
+    reasoning = getattr(message, "reasoning", None)
+    if reasoning is not None:
+        reasoning = str(reasoning)
     return ChatMessageAssistant(
         content=message.content or "",
         source="generate",
         tool_calls=chat_tool_calls(message, tools),
+        reasoning=reasoning,
     )

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -35,10 +35,12 @@ from .._model_output import (
     StopReason,
 )
 from .._openai import (
-    is_o1,
+    is_gpt,
     is_o1_full,
     is_o1_mini,
     is_o1_preview,
+    is_o3,
+    is_o_series,
     openai_chat_messages,
     openai_chat_tool_choice,
     openai_chat_tools,
@@ -140,8 +142,8 @@ class OpenAIAPI(ModelAPI):
     def is_azure(self) -> bool:
         return self.service == "azure"
-    def is_o1(self) -> bool:
-        return is_o1(self.model_name)
+    def is_o_series(self) -> bool:
+        return is_o_series(self.model_name)
     def is_o1_full(self) -> bool:
         return is_o1_full(self.model_name)
@@ -149,9 +151,15 @@ class OpenAIAPI(ModelAPI):
     def is_o1_mini(self) -> bool:
         return is_o1_mini(self.model_name)
+    def is_o3(self) -> bool:
+        return is_o3(self.model_name)
     def is_o1_preview(self) -> bool:
         return is_o1_preview(self.model_name)
+    def is_gpt(self) -> bool:
+        return is_gpt(self.model_name)
     async def generate(
         self,
         input: list[ChatMessage],
@@ -258,7 +266,7 @@ class OpenAIAPI(ModelAPI):
             model=self.model_name,
         )
         if config.max_tokens is not None:
-            if self.is_o1():
+            if self.is_o_series():
                 params["max_completion_tokens"] = config.max_tokens
             else:
                 params["max_tokens"] = config.max_tokens
@@ -273,10 +281,10 @@ class OpenAIAPI(ModelAPI):
         if config.seed is not None:
             params["seed"] = config.seed
         if config.temperature is not None:
-            if self.is_o1():
+            if self.is_o_series():
                 warn_once(
                     logger,
-                    "o1 models do not support the 'temperature' parameter (temperature is always 1).",
+                    "o series models do not support the 'temperature' parameter (temperature is always 1).",
                 )
             else:
                 params["temperature"] = config.temperature
@@ -293,9 +301,9 @@ class OpenAIAPI(ModelAPI):
             params["logprobs"] = config.logprobs
         if config.top_logprobs is not None:
             params["top_logprobs"] = config.top_logprobs
-        if tools and config.parallel_tool_calls is not None and not self.is_o1():
+        if tools and config.parallel_tool_calls is not None and not self.is_o_series():
             params["parallel_tool_calls"] = config.parallel_tool_calls
-        if config.reasoning_effort is not None and self.is_o1_full():
+        if config.reasoning_effort is not None and not self.is_gpt():
             params["reasoning_effort"] = config.reasoning_effort
         return params
@@ -312,7 +320,11 @@ class OpenAIAPI(ModelAPI):
         stop_reason: StopReason | None = None
         if e.code == "context_length_exceeded":
             stop_reason = "model_length"
-        elif e.code == "invalid_prompt":
+        elif (
+            e.code == "invalid_prompt"  # seems to happen for o1/o3
+            or e.code == "content_policy_violation"  # seems to happen for vision
+            or e.code == "content_filter"  # seems to happen on azure
+        ):
             stop_reason = "content_filter"
         if stop_reason:

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl