inspect-ai 0.3.99__py3-none-any.whl → 0.3.101__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (138)
  1. inspect_ai/_cli/eval.py +2 -1
  2. inspect_ai/_display/core/config.py +11 -5
  3. inspect_ai/_display/core/panel.py +66 -2
  4. inspect_ai/_display/core/textual.py +5 -2
  5. inspect_ai/_display/plain/display.py +1 -0
  6. inspect_ai/_display/rich/display.py +2 -2
  7. inspect_ai/_display/textual/widgets/transcript.py +37 -9
  8. inspect_ai/_eval/eval.py +13 -1
  9. inspect_ai/_eval/evalset.py +3 -2
  10. inspect_ai/_eval/run.py +2 -0
  11. inspect_ai/_eval/score.py +2 -4
  12. inspect_ai/_eval/task/log.py +3 -1
  13. inspect_ai/_eval/task/run.py +59 -81
  14. inspect_ai/_util/content.py +11 -6
  15. inspect_ai/_util/interrupt.py +2 -2
  16. inspect_ai/_util/text.py +7 -0
  17. inspect_ai/_util/working.py +8 -37
  18. inspect_ai/_view/__init__.py +0 -0
  19. inspect_ai/_view/schema.py +2 -1
  20. inspect_ai/_view/www/CLAUDE.md +15 -0
  21. inspect_ai/_view/www/dist/assets/index.css +307 -171
  22. inspect_ai/_view/www/dist/assets/index.js +24733 -21641
  23. inspect_ai/_view/www/log-schema.json +77 -3
  24. inspect_ai/_view/www/package.json +9 -5
  25. inspect_ai/_view/www/src/@types/log.d.ts +9 -0
  26. inspect_ai/_view/www/src/app/App.tsx +1 -15
  27. inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
  28. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
  29. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
  30. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +220 -205
  31. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
  32. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
  33. inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
  34. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
  35. inspect_ai/_view/www/src/app/routing/url.ts +84 -4
  36. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
  37. inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
  39. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +24 -17
  40. inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
  41. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
  42. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
  43. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
  44. inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
  45. inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
  46. inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
  47. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
  48. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
  49. inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
  50. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -2
  51. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
  52. inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
  53. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
  54. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
  55. inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
  56. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
  57. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
  58. inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
  59. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
  60. inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
  61. inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
  62. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
  63. inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
  64. inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
  65. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
  66. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
  67. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
  68. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
  69. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
  70. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
  71. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
  72. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
  73. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
  74. inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
  75. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
  76. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
  77. inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
  78. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
  79. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
  80. inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
  81. inspect_ai/_view/www/src/app/types.ts +5 -1
  82. inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
  83. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
  84. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
  85. inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
  86. inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
  87. inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
  88. inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
  89. inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
  90. inspect_ai/_view/www/src/state/hooks.ts +52 -2
  91. inspect_ai/_view/www/src/state/logSlice.ts +4 -3
  92. inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
  93. inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
  94. inspect_ai/_view/www/src/state/scrolling.ts +152 -0
  95. inspect_ai/_view/www/src/utils/attachments.ts +7 -0
  96. inspect_ai/_view/www/src/utils/python.ts +18 -0
  97. inspect_ai/_view/www/yarn.lock +290 -33
  98. inspect_ai/agent/_react.py +12 -7
  99. inspect_ai/agent/_run.py +2 -3
  100. inspect_ai/analysis/beta/__init__.py +2 -0
  101. inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
  102. inspect_ai/dataset/_sources/csv.py +2 -6
  103. inspect_ai/dataset/_sources/hf.py +2 -6
  104. inspect_ai/dataset/_sources/json.py +2 -6
  105. inspect_ai/dataset/_util.py +23 -0
  106. inspect_ai/log/_log.py +1 -1
  107. inspect_ai/log/_recorders/eval.py +4 -3
  108. inspect_ai/log/_recorders/file.py +2 -9
  109. inspect_ai/log/_recorders/json.py +1 -0
  110. inspect_ai/log/_recorders/recorder.py +1 -0
  111. inspect_ai/log/_transcript.py +1 -1
  112. inspect_ai/model/_call_tools.py +6 -2
  113. inspect_ai/model/_openai.py +1 -1
  114. inspect_ai/model/_openai_responses.py +85 -41
  115. inspect_ai/model/_openai_web_search.py +38 -0
  116. inspect_ai/model/_providers/azureai.py +72 -3
  117. inspect_ai/model/_providers/openai.py +4 -1
  118. inspect_ai/model/_providers/openai_responses.py +5 -1
  119. inspect_ai/scorer/_metric.py +1 -2
  120. inspect_ai/scorer/_reducer/reducer.py +1 -1
  121. inspect_ai/solver/_task_state.py +2 -2
  122. inspect_ai/tool/_tool.py +6 -2
  123. inspect_ai/tool/_tool_def.py +27 -4
  124. inspect_ai/tool/_tool_info.py +2 -0
  125. inspect_ai/tool/_tools/_web_search/_google.py +43 -15
  126. inspect_ai/tool/_tools/_web_search/_tavily.py +46 -13
  127. inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
  128. inspect_ai/util/__init__.py +4 -0
  129. inspect_ai/util/_json.py +3 -0
  130. inspect_ai/util/_limit.py +230 -20
  131. inspect_ai/util/_sandbox/docker/compose.py +20 -11
  132. inspect_ai/util/_span.py +1 -1
  133. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +3 -3
  134. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +138 -124
  135. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +1 -1
  136. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
  137. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
  138. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/dataset/_util.py CHANGED
@@ -13,6 +13,7 @@ from inspect_ai.model import (
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 
 from ._dataset import (
+    Dataset,
     DatasetRecord,
     FieldSpec,
     RecordToSample,
@@ -225,3 +226,25 @@ def read_files(files: Any | None) -> dict[str, str] | None:
             raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
     else:
         return None
+
+
+def shuffle_choices_if_requested(
+    dataset: Dataset, shuffle_choices: bool | int | None
+) -> None:
+    """
+    Shuffle the choices in the dataset if requested.
+
+    The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
+    and `hf_dataset` can be a boolean, an integer, or `None` (default).
+    If it is a boolean, it will shuffle the choices if the value is `True`,
+    and do nothing if it is `False`.
+    If it is an integer, it will shuffle the choices using the integer as the seed.
+    """
+    # Note that `isinstance(x, int)` returns True if x is True or False,
+    # so we need to check for both explicitly
+    if shuffle_choices is True:
+        dataset.shuffle_choices()
+    elif shuffle_choices is False:
+        pass
+    elif isinstance(shuffle_choices, int):
+        dataset.shuffle_choices(seed=shuffle_choices)
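The new helper centralizes choice shuffling for the dataset loaders. For example, loading a multiple-choice dataset with deterministic shuffling (a minimal sketch; the data.jsonl path is hypothetical):

from inspect_ai.dataset import json_dataset

# True shuffles with a random seed; an integer shuffles deterministically
# using that value as the seed; False or None leaves choice order untouched
dataset = json_dataset("data.jsonl", shuffle_choices=42)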
inspect_ai/log/_log.py CHANGED
@@ -165,7 +165,7 @@ class EvalSampleLimit(BaseModel):
     ]
     """The type of limit"""
 
-    limit: int
+    limit: float
     """The limit value"""
 
 
inspect_ai/log/_recorders/eval.py CHANGED
@@ -133,6 +133,7 @@ class EvalRecorder(FileRecorder):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog:
         # get the key and log
         key = self._log_file_key(eval)
@@ -174,7 +175,7 @@ class EvalRecorder(FileRecorder):
 
         # flush and write the results
         await log.flush()
-        return await log.close()
+        return await log.close(header_only)
 
     @classmethod
     @override
@@ -321,12 +322,12 @@ class ZipLogFile:
         # re-open zip file w/ self.temp_file pointer at end
         self._open()
 
-    async def close(self) -> EvalLog:
+    async def close(self, header_only: bool) -> EvalLog:
         async with self._lock:
             # read the log from the temp file then close it
             try:
                 self._temp_file.seek(0)
-                return _read_log(self._temp_file, self._file)
+                return _read_log(self._temp_file, self._file, header_only=header_only)
             finally:
                 self._temp_file.close()
                 if self._zip:
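The header_only flag lets finishing a log return an EvalLog without re-reading every sample body from the zip. On the reading side this mirrors the existing header_only parameter of read_eval_log; a minimal sketch (the log path is hypothetical):

from inspect_ai.log import read_eval_log

# header-only reads populate status/results/stats but skip sample bodies
log = read_eval_log("logs/2025-05-20T12-00-00_task.eval", header_only=True)
print(log.status, log.samples)  # samples stay unset on header-only reads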
inspect_ai/log/_recorders/file.py CHANGED
@@ -67,16 +67,9 @@ class FileRecorder(Recorder):
     async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
         # establish the log to read from (might be cached)
         eval_log = await cls._log_file_maybe_cached(location)
-
-        # throw if no samples
         if not eval_log.samples:
-            raise IndexError(f"No samples found in log {location}")
-
-        summaries: list[EvalSampleSummary] = []
-        for sample in eval_log.samples:
-            summaries.append(sample.summary())
-
-        return summaries
+            return []
+        return [sample.summary() for sample in eval_log.samples]
 
     @classmethod
     async def _log_file_maybe_cached(cls, location: str) -> EvalLog:
inspect_ai/log/_recorders/json.py CHANGED
@@ -96,6 +96,7 @@ class JSONRecorder(FileRecorder):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog:
         log = self.data[self._log_file_key(spec)]
         log.data.status = status
inspect_ai/log/_recorders/recorder.py CHANGED
@@ -46,6 +46,7 @@ class Recorder(abc.ABC):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog: ...
 
     @classmethod
inspect_ai/log/_transcript.py CHANGED
@@ -111,7 +111,7 @@ class SampleLimitEvent(BaseEvent):
     message: str
     """A message associated with this limit"""
 
-    limit: int | None = Field(default=None)
+    limit: float | None = Field(default=None)
    """The limit value (if any)"""
 
 
inspect_ai/model/_call_tools.py CHANGED
@@ -4,6 +4,7 @@ import types
 from copy import copy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
+from enum import EnumMeta
 from logging import getLogger
 from textwrap import dedent
 from types import UnionType
@@ -172,7 +173,7 @@ async def execute_tools(
         except LimitExceededError as ex:
             tool_error = ToolCallError(
                 "limit",
-                f"The tool exceeded its {ex.type} limit of {ex.limit}.",
+                f"The tool exceeded its {ex.type} limit of {ex.limit_str}.",
             )
         except ToolParsingError as ex:
             tool_error = ToolCallError("parsing", ex.message)
@@ -497,7 +498,7 @@ async def agent_handoff(
             ChatMessageUser(
                 content=(
                     f"The {agent_name} exceeded its {limit_error.type} limit of "
-                    f"{limit_error.limit}."
+                    f"{limit_error.limit_str}."
                 )
             )
         )
@@ -548,6 +549,7 @@ def tools_info(
                 name=tool.name,
                 description=tool.description,
                 parameters=tool.parameters,
+                options=tool.options,
             )
         )
     return tools_info
@@ -652,6 +654,8 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
             return type_hint(**dataclass_data)
         elif issubclass(type_hint, BaseModel):
             return type_hint(**input)
+        elif isinstance(type_hint, EnumMeta):
+            return type_hint(input)
         else:
             return input
     elif origin is list or origin is List:
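With the EnumMeta branch, tool arguments annotated as Enum subclasses are now coerced from the raw value the model supplies. A self-contained sketch of the same coercion (the Color enum is hypothetical):

from enum import Enum, EnumMeta

class Color(Enum):
    RED = "red"
    GREEN = "green"

type_hint = Color
assert isinstance(type_hint, EnumMeta)  # the check tool_param now performs
assert type_hint("red") is Color.RED    # calling the enum coerces the raw value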
inspect_ai/model/_openai.py CHANGED
@@ -594,7 +594,7 @@ def chat_choices_from_openai(
             stop_reason=as_stop_reason(choice.finish_reason),
             logprobs=(
                 Logprobs(**choice.logprobs.model_dump())
-                if choice.logprobs is not None
+                if choice.logprobs and choice.logprobs.content is not None
                 else None
            ),
        )
inspect_ai/model/_openai_responses.py CHANGED
@@ -1,6 +1,5 @@
 import json
-from itertools import chain
-from typing import TypedDict, cast
+from typing import Sequence, TypedDict, cast
 
 from openai.types.responses import (
     FunctionToolParam,
@@ -8,6 +7,8 @@ from openai.types.responses import (
     ResponseComputerToolCallParam,
     ResponseFunctionToolCall,
     ResponseFunctionToolCallParam,
+    ResponseFunctionWebSearch,
+    ResponseFunctionWebSearchParam,
     ResponseInputContentParam,
     ResponseInputImageParam,
     ResponseInputItemParam,
@@ -51,6 +52,7 @@ from inspect_ai.model._openai_computer_use import (
     maybe_computer_use_preview_tool,
     tool_call_from_openai_computer_tool_call,
 )
+from inspect_ai.model._openai_web_search import maybe_web_search_tool
 from inspect_ai.tool._tool_call import ToolCall
 from inspect_ai.tool._tool_choice import ToolChoice
 from inspect_ai.tool._tool_info import ToolInfo
@@ -160,9 +162,9 @@ def openai_responses_tool_choice(
 
 
 def openai_responses_tools(
-    tools: list[ToolInfo], config: GenerateConfig
+    tools: list[ToolInfo], model_name: str, config: GenerateConfig
 ) -> list[ToolParam]:
-    return [_tool_param_for_tool_info(tool, config) for tool in tools]
+    return [_tool_param_for_tool_info(tool, model_name, config) for tool in tools]
 
 
 def openai_responses_chat_choices(
@@ -174,6 +176,14 @@ def openai_responses_chat_choices(
     return [ChatCompletionChoice(message=message, stop_reason=stop_reason)]
 
 
+def is_native_tool_configured(
+    tools: Sequence[ToolInfo], model_name: str, config: GenerateConfig
+) -> bool:
+    return any(
+        _maybe_native_tool_param(tool, model_name, config) is not None for tool in tools
+    )
+
+
 # The next two function perform transformations between OpenAI types an Inspect
 # ChatMessageAssistant. Here is a diagram that helps visualize the transforms.
 # ┌───────────────────────────┐ ┌───────────────────────────┐ ┌───────────────────────────┐
@@ -207,7 +217,6 @@ def openai_responses_chat_choices(
 
 
 class _AssistantInternal(TypedDict):
-    output_message_id: str | None
     tool_message_ids: dict[str, str]
 
 
@@ -237,17 +246,17 @@ def _chat_message_assistant_from_openai_response(
     # collect output and tool calls
     message_content: list[Content] = []
     tool_calls: list[ToolCall] = []
-    internal = _AssistantInternal(output_message_id=None, tool_message_ids={})
+    internal = _AssistantInternal(tool_message_ids={})
     for output in response.output:
         match output:
             case ResponseOutputMessage(content=content, id=id):
-                assert internal["output_message_id"] is None, "Multiple message outputs"
-                internal["output_message_id"] = id
                 message_content.extend(
                     [
-                        ContentText(text=c.text)
+                        ContentText(text=c.text, internal={"id": id})
                         if isinstance(c, ResponseOutputText)
-                        else ContentText(text=c.refusal, refusal=True)
+                        else ContentText(
+                            text=c.refusal, refusal=True, internal={"id": id}
+                        )
                         for c in content
                     ]
                 )
@@ -277,6 +286,13 @@ def _chat_message_assistant_from_openai_response(
                 tool_calls.append(
                     tool_call_from_openai_computer_tool_call(output)
                 )
+            case ResponseFunctionWebSearch():
+                # We don't currently capture this since the model did the
+                # "tool call" internally. It's conceivable that could be
+                # forced to include it in `.internal` in the future, but
+                # for now we just ignore it.
+                # {"id":"ws_682cdcec3fa88198bc10b38fafefbd5e077e89e31fd4a3d5","status":"completed","type":"web_search_call"}
+                pass
             case _:
                 raise ValueError(f"Unexpected output type: {output.__class__}")
 
@@ -304,25 +320,39 @@ def _openai_input_items_from_chat_message_assistant(
     field of the `ChatMessageAssistant` to help it provide the proper id's the
     items in the returned list.
     """
-    (output_message_id, tool_message_ids) = _ids_from_assistant_internal(message)
+    tool_message_ids = _ids_from_assistant_internal(message)
 
     # we want to prevent yielding output messages in the case where we have an
     # 'internal' field (so the message came from the model API as opposed to
-    # being user synthesized) AND there is no output_message_id (indicating that
-    # when reading the message from the server we didn't find output). this could
-    # happen e.g. when a react() agent sets the output.completion in response
+    # being user synthesized) AND there are no ContentText items with message IDs
+    # (indicating that when reading the message from the server we didn't find output).
+    # this could happen e.g. when a react() agent sets the output.completion in response
     # to a submit() tool call
-    suppress_output_message = message.internal is not None and output_message_id is None
+    content_items: list[ContentText | ContentReasoning] = (
+        [ContentText(text=message.content)]
+        if isinstance(message.content, str)
+        else [
+            c for c in message.content if isinstance(c, ContentText | ContentReasoning)
+        ]
+    )
+    has_content_with_ids = any(
+        isinstance(c, ContentText)
+        and isinstance(c.internal, dict)
+        and "id" in c.internal
+        for c in content_items
+    )
+    suppress_output_message = message.internal is not None and not has_content_with_ids
 
     # if we are not storing messages on the server then blank these out
     if not store:
-        output_message_id = None
         tool_message_ids = {}
 
-    # items to return -- ensure we use a single output message (and just chain
-    # additional content on to it)
+    # items to return
     items: list[ResponseInputItemParam] = []
-    output_message: ResponseOutputMessageParam | None = None
+    # group content by message ID
+    messages_by_id: dict[
+        str | None, list[ResponseOutputTextParam | ResponseOutputRefusalParam]
+    ] = {}
 
     for content in (
         list[ContentText | ContentReasoning]([ContentText(text=message.content)])
@@ -352,6 +382,14 @@ def _openai_input_items_from_chat_message_assistant(
         if suppress_output_message:
             continue
 
+        # get the message ID from ContentText.modelJson
+        content_message_id: str | None = None
+        if isinstance(content.internal, dict) and "id" in content.internal:
+            id_value = content.internal["id"]
+            content_message_id = id_value if isinstance(id_value, str) else None
+        else:
+            content_message_id = None
+
         new_content = (
             ResponseOutputRefusalParam(type="refusal", refusal=text)
             if refusal
@@ -359,22 +397,24 @@ def _openai_input_items_from_chat_message_assistant(
                 type="output_text", text=text, annotations=[]
             )
         )
-        if output_message is None:
-            output_message = ResponseOutputMessageParam(
-                type="message",
-                role="assistant",
-                # this actually can be `None`, and it will in fact be `None` when the
-                # assistant message is synthesized by the scaffold as opposed to being
-                # replayed from the model (or when store=False)
-                id=output_message_id,  # type: ignore[typeddict-item]
-                content=[new_content],
-                status="completed",
-            )
-            items.append(output_message)
-        else:
-            output_message["content"] = chain(
-                output_message["content"], [new_content]
-            )
+
+        if content_message_id not in messages_by_id:
+            messages_by_id[content_message_id] = []
+        messages_by_id[content_message_id].append(new_content)
+
+    # create ResponseOutputMessage for each unique ID
+    for msg_id, content_list in messages_by_id.items():
+        output_message = ResponseOutputMessageParam(
+            type="message",
+            role="assistant",
+            # this actually can be `None`, and it will in fact be `None` when the
+            # assistant message is synthesized by the scaffold as opposed to being
+            # replayed from the model (or when store=False)
+            id=msg_id,  # type: ignore[typeddict-item]
+            content=content_list,
+            status="completed",
+        )
+        items.append(output_message)
 
     return items + _tool_call_items_from_assistant_message(message, tool_message_ids)
 
@@ -395,11 +435,13 @@ def _model_tool_call_for_internal(
 
 def _maybe_native_tool_param(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam | None:
     return (
         (
             maybe_computer_use_preview_tool(tool)
+            or maybe_web_search_tool(model_name, tool)
             # or self.text_editor_tool_param(tool)
             # or self.bash_tool_param(tool)
         )
@@ -442,32 +484,34 @@ def _tool_call_items_from_assistant_message(
 
 def _ids_from_assistant_internal(
     message: ChatMessageAssistant,
-) -> tuple[str | None, dict[str, str]]:
+) -> dict[str, str]:
     if message.internal is not None:
         assert isinstance(message.internal, dict), (
             "OpenAI ChatMessageAssistant internal must be an _AssistantInternal"
         )
         internal = cast(_AssistantInternal, message.internal)
-        return (internal["output_message_id"], internal["tool_message_ids"])
+        return internal["tool_message_ids"]
     else:
-        return None, {}
+        return {}
 
 
 _ResponseToolCallParam = (
-    ResponseFunctionToolCallParam | ResponseComputerToolCallParam
+    ResponseFunctionToolCallParam
+    | ResponseComputerToolCallParam
+    | ResponseFunctionWebSearchParam
     # | ResponseFileSearchToolCallParam
     # | ResponseFunctionToolCallParam
-    # | ResponseFunctionWebSearchParam
 )
 
 
 def _tool_param_for_tool_info(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam:
     # Use a native tool implementation when available. Otherwise, use the
     # standard tool implementation
-    return _maybe_native_tool_param(tool, config) or FunctionToolParam(
+    return _maybe_native_tool_param(tool, model_name, config) or FunctionToolParam(
         type="function",
         name=_responses_tool_alias(tool.name),
         description=tool.description,
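Net effect of the changes in this file: instead of chaining all assistant text onto a single output message, content is grouped by the server-assigned message ID carried in ContentText.internal, so multi-message Responses API outputs replay with their original IDs. A standalone sketch of the grouping idiom (plain dicts stand in for the OpenAI param types):

from collections import defaultdict

# hypothetical (message_id, text) pairs recovered from ContentText.internal;
# None marks locally synthesized content with no server id
contents = [("msg_a", "first"), ("msg_a", "second"), (None, "scaffold text")]

messages_by_id: dict[str | None, list[str]] = defaultdict(list)
for msg_id, text in contents:
    messages_by_id[msg_id].append(text)

# one output message per unique id, in insertion order
items = [{"id": msg_id, "content": parts} for msg_id, parts in messages_by_id.items()]
assert len(items) == 2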
inspect_ai/model/_openai_web_search.py ADDED
@@ -0,0 +1,38 @@
+from typing import cast
+
+from openai.types.responses import WebSearchTool, WebSearchToolParam
+
+from inspect_ai.tool._tool_info import ToolInfo
+
+COMPATIBLE_MODELS = ["gpt-4o", "gpt-4o-mini", "gpt-4.1"]
+
+
+def maybe_web_search_tool(model_name: str, tool: ToolInfo) -> WebSearchToolParam | None:
+    return (
+        _web_search_tool(tool.options["openai"])
+        if (
+            tool.name == "web_search"
+            and tool.options
+            and "openai" in tool.options
+            and model_name in COMPATIBLE_MODELS
+        )
+        else None
+    )
+
+
+def _web_search_tool(maybe_openai_options: object) -> WebSearchToolParam:
+    if maybe_openai_options is None:
+        maybe_openai_options = {}
+    elif not isinstance(maybe_openai_options, dict):
+        raise TypeError(
+            f"Expected a dictionary for openai_options, got {type(maybe_openai_options)}"
+        )
+    openai_options = (
+        WebSearchTool.model_validate(
+            {"type": "web_search_preview", **maybe_openai_options}
+        )
+        if maybe_openai_options
+        else WebSearchTool(type="web_search_preview")
+    )
+
+    return cast(WebSearchToolParam, openai_options.model_dump(exclude_none=True))
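This new module maps Inspect's web_search tool onto OpenAI's native web_search_preview tool for the models listed in COMPATIBLE_MODELS. The options supplied under tool.options["openai"] are validated against the OpenAI SDK's WebSearchTool model; a small sketch of that validation step (search_context_size is one of the SDK's documented fields, used here purely as an illustrative option):

from openai.types.responses import WebSearchTool

opts = WebSearchTool.model_validate(
    {"type": "web_search_preview", "search_context_size": "medium"}
)
print(opts.model_dump(exclude_none=True))
# {'type': 'web_search_preview', 'search_context_size': 'medium'}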
inspect_ai/model/_providers/azureai.py CHANGED
@@ -1,3 +1,4 @@
+import functools
 import json
 import os
 from copy import copy
@@ -151,7 +152,7 @@ class AzureAIAPI(ModelAPI):
 
         # prepare request
         request = dict(
-            messages=await chat_request_messages(input, handler),
+            messages=await chat_request_messages(input, handler, self.is_mistral()),
            **self.completion_params(config),
        )
        # newer versions of vllm reject requests with tools or tool_choice if the
@@ -280,9 +281,77 @@ class AzureAIAPI(ModelAPI):
 
 
 async def chat_request_messages(
-    messages: list[ChatMessage], handler: ChatAPIHandler | None
+    messages: list[ChatMessage],
+    handler: ChatAPIHandler | None,
+    is_mistral: bool = False,
+) -> list[ChatRequestMessage]:
+    chat_messages = [
+        await chat_request_message(message, handler) for message in messages
+    ]
+    if is_mistral:
+        chat_messages = functools.reduce(mistral_message_reducer, chat_messages, [])
+
+    return chat_messages
+
+
+def mistral_message_reducer(
+    messages: list[ChatRequestMessage],
+    message: ChatRequestMessage,
 ) -> list[ChatRequestMessage]:
-    return [await chat_request_message(message, handler) for message in messages]
+    """Fold any user messages found immediately after tool messages into the last tool message."""
+    if (
+        len(messages) > 0
+        and isinstance(messages[-1], ToolMessage)
+        and isinstance(message, UserMessage)
+    ):
+        messages[-1] = fold_user_message_into_tool_message(messages[-1], message)
+    else:
+        messages.append(message)
+
+    return messages
+
+
+def fold_user_message_into_tool_message(
+    tool_message: ToolMessage,
+    user_message: UserMessage,
+) -> ToolMessage:
+    def convert_content_items_to_string(list_content: list[ContentItem]) -> str:
+        if not all(
+            isinstance(item, (TextContentItem | ImageContentItem))
+            for item in list_content
+        ):
+            raise TypeError(
+                "Expected all items to be TextContentItem or ImageContentItem"
+            )
+
+        parts = []
+        for item in list_content:
+            if isinstance(item, TextContentItem):
+                parts.append(item.text)
+            elif isinstance(item, ImageContentItem):
+                parts.append(f"[Image: {item.image_url.url}]")
+            else:
+                raise ValueError("Unexpected content item type")
+        return "".join(parts)
+
+    def normalise_content(
+        content: str | list[ContentItem] | None,
+    ) -> str | None:
+        return (
+            None
+            if content is None
+            else convert_content_items_to_string(content)
+            if isinstance(content, list)
+            else content
+        )
+
+    tool_content = normalise_content(tool_message.content)
+    user_content = normalise_content(user_message.content)
+
+    return ToolMessage(
+        content=(tool_content or "") + (user_content or ""),
+        tool_call_id=tool_message.tool_call_id,
+    )
 
 
 async def chat_request_message(
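Because the reducer is applied with functools.reduce, messages fold left to right and a user message that immediately follows a tool message is absorbed into it (apparently an ordering requirement of Mistral models on Azure). A self-contained sketch of the same folding pattern with stand-in types (not the azure-ai-inference classes):

import functools
from dataclasses import dataclass

@dataclass
class ToolMsg:
    content: str

@dataclass
class UserMsg:
    content: str

def reducer(acc: list, msg: object) -> list:
    # mirror mistral_message_reducer: fold user-after-tool into the tool message
    if acc and isinstance(acc[-1], ToolMsg) and isinstance(msg, UserMsg):
        acc[-1] = ToolMsg(content=acc[-1].content + msg.content)
    else:
        acc.append(msg)
    return acc

msgs = [UserMsg("question"), ToolMsg("result: 42"), UserMsg(" please continue")]
folded = functools.reduce(reducer, msgs, [])
assert len(folded) == 2
assert folded[1].content == "result: 42 please continue"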
inspect_ai/model/_providers/openai.py CHANGED
@@ -42,6 +42,7 @@ from .._openai import (
     openai_media_filter,
     openai_should_retry,
 )
+from .._openai_responses import is_native_tool_configured
 from .openai_o1 import generate_o1
 from .util import environment_prerequisite_error, model_base_url
 
@@ -241,7 +242,9 @@ class OpenAIAPI(ModelAPI):
                 tools=tools,
                 **self.completion_params(config, False),
             )
-        elif self.responses_api:
+        elif self.responses_api or is_native_tool_configured(
+            tools, self.model_name, config
+        ):
             return await generate_responses(
                 client=self.client,
                 http_hooks=self._http_hooks,
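Note the routing consequence: configuring any native tool (currently computer use, or the OpenAI web search above) now forces generation through the Responses API even when responses_api was not explicitly enabled. A compact sketch of the branch logic with placeholder values:

# placeholder inputs for the branch shown above
responses_api = False          # user did not opt in to the Responses API
native_tool_configured = True  # e.g. web_search with an "openai" provider

use_responses_api = responses_api or native_tool_configured
assert use_responses_api  # native tools force the Responses API path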
inspect_ai/model/_providers/openai_responses.py CHANGED
@@ -59,7 +59,11 @@ async def generate_responses(
     )
 
     # prepare request (we do this so we can log the ModelCall)
-    tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
+    tool_params = (
+        openai_responses_tools(tools, model_name, config)
+        if len(tools) > 0
+        else NOT_GIVEN
+    )
     request = dict(
         input=await openai_responses_inputs(input, model_name, store),
         tools=tool_params,
inspect_ai/scorer/_metric.py CHANGED
@@ -7,7 +7,6 @@ from typing import (
     Protocol,
     Type,
     Union,
-    cast,
     overload,
     runtime_checkable,
 )
@@ -356,7 +355,7 @@ def metric(
         )
         return metric
 
-    return metric_register(cast(Callable[P, Metric], metric_wrapper), metric_name)
+    return metric_register(metric_wrapper, metric_name)
 
     # for decorators with an explicit name, one more wrapper for the name
     if isinstance(name, str):
inspect_ai/scorer/_reducer/reducer.py CHANGED
@@ -121,7 +121,7 @@ def pass_at(
     def reduce(scores: list[Score]) -> Score:
         def pass_at_k(values: list[float]) -> float:
             total = len(scores)
-            correct = sum(1 for v in values if v == value)
+            correct = sum(1 for v in values if v >= value)
             if total - correct < k:
                 return 1.0
             else:
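The fix makes an epoch count as correct when its score meets or exceeds the target value, rather than only when it is exactly equal. For reference, a standalone version of the unbiased pass@k estimator this reducer is based on (1 - C(n-c, k) / C(n, k), per the Codex paper):

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # n = total epochs, c = epochs scoring at/above the target value
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=10, c=3, k=5))  # ~0.917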
inspect_ai/solver/_task_state.py CHANGED
@@ -290,7 +290,7 @@ class TaskState:
         return self._tools
 
     @tools.setter
-    def tools(self, tools: list[Tool | ToolDef]) -> None:
+    def tools(self, tools: Sequence[Tool | ToolDef]) -> None:
         self._tools.clear()
         for tool in tools:
             self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
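Typing note on the setter: list is invariant, so a caller holding a list[Tool] could not assign it where list[Tool | ToolDef] is expected, while the covariant Sequence accepts it. A minimal illustration (the Tool/ToolDef stand-ins are hypothetical):

from typing import Sequence

class Tool: ...
class ToolDef: ...

def set_tools_list(tools: list[Tool | ToolDef]) -> None: ...
def set_tools_seq(tools: Sequence[Tool | ToolDef]) -> None: ...

my_tools: list[Tool] = [Tool()]
set_tools_list(my_tools)  # rejected by mypy: list is invariant
set_tools_seq(my_tools)   # accepted: Sequence is covariant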
@@ -353,7 +353,7 @@ class TaskState:
     def completed(self) -> bool:
         """Is the task completed.
 
-        Additionally, checks message and token limits and raises if they are exceeded, and also checks for an operator interrupt of the sample.
+        Additionally, checks for an operator interrupt of the sample.
         """
         from inspect_ai.log._samples import set_active_sample_total_messages
 