inspect-ai 0.3.93__py3-none-any.whl → 0.3.95__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +21 -12
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +51 -21
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +23 -22
- inspect_ai/_view/www/dist/assets/index.js +517 -204
- inspect_ai/_view/www/log-schema.json +375 -0
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +90 -12
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_as_solver.py +3 -1
- inspect_ai/agent/_as_tool.py +6 -4
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_handoff.py +5 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +6 -1
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +57 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
- inspect_ai/analysis/beta/_dataframe/util.py +157 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +10 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +1 -1
- inspect_ai/log/_log.py +21 -1
- inspect_ai/log/_samples.py +14 -17
- inspect_ai/log/_transcript.py +77 -35
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/model/_call_tools.py +44 -35
- inspect_ai/model/_model.py +51 -44
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/sglang.py +8 -2
- inspect_ai/model/_providers/vllm.py +6 -2
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +9 -23
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +7 -3
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_mcp.py +6 -5
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +114 -82
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
inspect_ai/log/_tree.py
ADDED
@@ -0,0 +1,118 @@
+from dataclasses import dataclass, field
+from logging import getLogger
+from typing import Iterable, Sequence, TypeAlias, Union
+
+from ._transcript import Event, SpanBeginEvent, SpanEndEvent
+
+logger = getLogger(__name__)
+
+EventNode: TypeAlias = Union["SpanNode", Event]
+"""Node in an event tree."""
+
+EventTree: TypeAlias = list[EventNode]
+"""Tree of events (has individual events and event spans)."""
+
+
+@dataclass
+class SpanNode:
+    """Event tree node representing a span of events."""
+
+    id: str
+    """Span id."""
+
+    parent_id: str | None
+    """Parent span id."""
+
+    type: str | None
+    """Optional 'type' field for span."""
+
+    name: str
+    """Span name."""
+
+    begin: SpanBeginEvent
+    """Span begin event."""
+
+    end: SpanEndEvent | None = None
+    """Span end event (if any)."""
+
+    children: list[EventNode] = field(default_factory=list)
+    """Children in the span."""
+
+
+def event_tree(events: Sequence[Event]) -> EventTree:
+    """Build a tree representation of a sequence of events.
+
+    Organize events hierarchically into event spans.
+
+    Args:
+        events: Sequence of `Event`.
+
+    Returns:
+        Event tree.
+    """
+    # Convert one flat list of (possibly interleaved) events into a *forest*
+    # (list of root-level items).
+
+    # Pre-create one node per span so we can attach events no matter when they
+    # arrive in the file. A single forward scan guarantees that the order of
+    # `children` inside every span reflects the order in which things appeared
+    # in the transcript.
+    nodes: dict[str, SpanNode] = {
+        ev.id: SpanNode(
+            id=ev.id, parent_id=ev.parent_id, type=ev.type, name=ev.name, begin=ev
+        )
+        for ev in events
+        if isinstance(ev, SpanBeginEvent)
+    }
+
+    roots: list[EventNode] = []
+
+    # Where should an event with `span_id` go?
+    def bucket(span_id: str | None) -> list[EventNode]:
+        if span_id and span_id in nodes:
+            return nodes[span_id].children
+        return roots  # root level
+
+    # Single pass in original order
+    for ev in events:
+        if isinstance(ev, SpanBeginEvent):  # span starts
+            bucket(ev.parent_id).append(nodes[ev.id])
+
+        elif isinstance(ev, SpanEndEvent):  # span ends
+            if n := nodes.get(ev.id):
+                n.end = ev
+            else:
+                logger.warning(f"Span end event (id: {ev.id}) with no span begin")
+
+        else:  # ordinary event
+            bucket(ev.span_id).append(ev)
+
+    return roots
+
+
+def event_sequence(tree: EventTree) -> Iterable[Event]:
+    """Flatten a span forest back into a properly ordered sequence.
+
+    Args:
+        tree: Event tree
+
+    Returns:
+        Sequence of events.
+    """
+    for item in tree:
+        if isinstance(item, SpanNode):
+            yield item.begin
+            yield from event_sequence(item.children)
+            if item.end:
+                yield item.end
+        else:
+            yield item
+
+
+def _print_event_tree(tree: EventTree, indent: str = "") -> None:
+    for item in tree:
+        if isinstance(item, SpanNode):
+            print(f"{indent}span ({item.type}): {item.name}")
+            _print_event_tree(item.children, f"{indent}  ")
+        else:
+            print(f"{indent}{item.event}")
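The single-pass bucketing in `event_tree` is worth a closer look: span nodes are pre-created so children can attach regardless of interleaving, and anything without a known `span_id` lands at the root. Here is a minimal standalone sketch of the same idea, using stand-in `Begin`/`End`/`Log` classes rather than the real `SpanBeginEvent`/`SpanEndEvent`/`Event` pydantic types:

```python
from dataclasses import dataclass

@dataclass
class Begin:
    id: str
    name: str
    parent_id: str | None = None

@dataclass
class End:
    id: str

@dataclass
class Log:
    text: str
    span_id: str | None = None

def tree(events: list) -> list:
    # pre-create one bucket per span so order of arrival doesn't matter
    nodes = {e.id: [e] for e in events if isinstance(e, Begin)}
    roots: list = []

    def bucket(span_id):
        # unknown or missing span ids fall back to the root level
        return nodes.get(span_id, roots) if span_id else roots

    for e in events:
        if isinstance(e, Begin):
            bucket(e.parent_id).append(nodes[e.id])  # attach span under its parent
        elif isinstance(e, End):
            pass  # the real code records the end event on the span node
        else:
            bucket(e.span_id).append(e)  # ordinary event
    return roots

events = [
    Begin("s1", "react"),
    Log("model output", span_id="s1"),
    Begin("s2", "bash", parent_id="s1"),
    Log("tool result", span_id="s2"),
    End("s2"),
    End("s1"),
    Log("score"),  # no span_id -> root level
]
print(tree(events))
# [[Begin('s1', ...), Log('model output', ...), [Begin('s2', ...), Log('tool result', ...)]], Log('score')]
```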
inspect_ai/model/_call_tools.py
CHANGED
@@ -39,6 +39,7 @@ from inspect_ai._util.content import (
     ContentText,
     ContentVideo,
 )
+from inspect_ai._util.exception import TerminateSampleError
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.logger import warn_once
 from inspect_ai._util.registry import registry_unqualified_name
@@ -61,6 +62,7 @@ from inspect_ai.tool._tool_params import ToolParams
 from inspect_ai.util import OutputLimitExceededError
 from inspect_ai.util._anyio import inner_exception
 from inspect_ai.util._limit import LimitExceededError, apply_limits
+from inspect_ai.util._span import span
 
 from ._chat_message import (
     ChatMessage,
@@ -109,26 +111,18 @@ async def execute_tools(
     """
     message = messages[-1]
     if isinstance(message, ChatMessageAssistant) and message.tool_calls:
-        from inspect_ai.log._transcript import (
-            ToolEvent,
-            Transcript,
-            init_transcript,
-            track_store_changes,
-            transcript,
-        )
+        from inspect_ai.log._transcript import ToolEvent, transcript
 
         tdefs = await tool_defs(tools)
 
         async def call_tool_task(
             call: ToolCall,
+            event: ToolEvent,
             conversation: list[ChatMessage],
             send_stream: MemoryObjectSendStream[
                 tuple[ExecuteToolsResult, ToolEvent, Exception | None]
             ],
         ) -> None:
-            # create a transcript for this call
-            init_transcript(Transcript(name=call.function))
-
             result: ToolResult = ""
             messages: list[ChatMessage] = []
             output: ModelOutput | None = None
@@ -136,15 +130,14 @@ async def execute_tools(
             tool_error: ToolCallError | None = None
             tool_exception: Exception | None = None
             try:
-                with track_store_changes():
-                    try:
-                        result, messages, output, agent = await call_tool(
-                            tdefs, message.text, call, conversation
-                        )
-                    # unwrap exception group
-                    except Exception as ex:
-                        inner_ex = inner_exception(ex)
-                        raise inner_ex.with_traceback(inner_ex.__traceback__)
+                try:
+                    result, messages, output, agent = await call_tool(
+                        tdefs, message.text, call, event, conversation
+                    )
+                # unwrap exception group
+                except Exception as ex:
+                    inner_ex = inner_exception(ex)
+                    raise inner_ex.with_traceback(inner_ex.__traceback__)
 
             except TimeoutError:
                 tool_error = ToolCallError(
@@ -227,7 +220,6 @@ async def execute_tools(
                     truncated=truncated,
                     view=call.view,
                     error=tool_error,
-                    events=list(transcript().events),
                     agent=agent,
                 )
 
@@ -270,7 +262,6 @@ async def execute_tools(
                 internal=call.internal,
                 pending=True,
             )
-            transcript()._event(event)
 
             # execute the tool call. if the operator cancels the
             # tool call then synthesize the appropriate message/event
@@ -280,7 +271,7 @@ async def execute_tools(
 
             result_exception = None
             async with anyio.create_task_group() as tg:
-                tg.start_soon(call_tool_task, call, messages, send_stream)
+                tg.start_soon(call_tool_task, call, event, messages, send_stream)
                 event._set_cancel_fn(tg.cancel_scope.cancel)
                 async with receive_stream:
                     (
@@ -306,7 +297,6 @@ async def execute_tools(
                         truncated=None,
                         view=call.view,
                         error=tool_message.error,
-                        events=[],
                     )
                     transcript().info(
                         f"Tool call '{call.function}' was cancelled by operator."
@@ -326,7 +316,6 @@ async def execute_tools(
                     result=result_event.result,
                     truncated=result_event.truncated,
                     error=result_event.error,
-                    events=result_event.events,
                     waiting_time=waiting_time_end - waiting_time_start,
                     agent=result_event.agent,
                     failed=True if result_exception else None,
@@ -347,19 +336,34 @@ async def execute_tools(
 
 
 async def call_tool(
-    tools: list[ToolDef], message: str, call: ToolCall, conversation: list[ChatMessage]
+    tools: list[ToolDef],
+    message: str,
+    call: ToolCall,
+    event: BaseModel,
+    conversation: list[ChatMessage],
 ) -> tuple[ToolResult, list[ChatMessage], ModelOutput | None, str | None]:
     from inspect_ai.agent._handoff import AgentTool
-    from inspect_ai.log._transcript import SampleLimitEvent, transcript
+    from inspect_ai.log._transcript import SampleLimitEvent, ToolEvent, transcript
+
+    # dodge circular import
+    assert isinstance(event, ToolEvent)
+
+    # this function is responsible for transcript events so that it can
+    # put them in the right enclosure (e.g. handoff/agent/tool). This
+    # means that if we throw early we need to do the enclosure when raising.
+    async def record_tool_parsing_error(error: str) -> Exception:
+        async with span(name=call.function, type="tool"):
+            transcript()._event(event)
+        return ToolParsingError(error)
 
     # if there was an error parsing the ToolCall, raise that
     if call.parse_error:
-        raise ToolParsingError(call.parse_error)
+        raise await record_tool_parsing_error(call.parse_error)
 
     # find the tool
     tool_def = next((tool for tool in tools if tool.name == call.function), None)
    if tool_def is None:
-        raise ToolParsingError(f"Tool {call.function} not found")
+        raise await record_tool_parsing_error(f"Tool {call.function} not found")
 
     # if we have a tool approver, apply it now
     from inspect_ai.approval._apply import apply_tool_approval
@@ -373,7 +377,7 @@ async def call_tool(
             transcript()._event(
                 SampleLimitEvent(type="operator", limit=1, message=message)
             )
-            raise LimitExceededError("operator", value=1, limit=1, message=message)
+            raise TerminateSampleError(message)
         else:
             raise ToolApprovalError(approval.explanation if approval else None)
     if approval and approval.modified:
@@ -382,7 +386,7 @@ async def call_tool(
     # validate the schema of the passed object
     validation_errors = validate_tool_input(call.arguments, tool_def.parameters)
     if validation_errors:
-        raise ToolParsingError(validation_errors)
+        raise await record_tool_parsing_error(validation_errors)
 
     # get arguments (with creation of dataclasses, pydantic objects, etc.)
     arguments = tool_params(call.arguments, tool_def.tool)
@@ -391,14 +395,18 @@ async def call_tool(
     with trace_action(
         logger, "Tool Call", format_function_call(tool_def.name, arguments, width=1000)
     ):
-        # agent tools get special handling
         if isinstance(tool_def.tool, AgentTool):
-            return await agent_handoff(tool_def, call, conversation)
+            async with span(tool_def.tool.name, type="handoff"):
+                async with span(name=call.function, type="tool"):
+                    transcript()._event(event)
+                return await agent_handoff(tool_def, call, conversation)
 
         # normal tool call
        else:
-            result: ToolResult = await tool_def.tool(**arguments)
-            return result, [], None, None
+            async with span(name=call.function, type="tool"):
+                transcript()._event(event)
+                result: ToolResult = await tool_def.tool(**arguments)
+                return result, [], None, None
@@ -463,7 +471,8 @@ async def agent_handoff(
     agent_state = AgentState(messages=copy(agent_conversation))
     try:
         with apply_limits(agent_tool.limits):
-            agent_state = await agent_tool.agent(agent_state, **arguments)
+            async with span(name=agent_name, type="agent"):
+                agent_state = await agent_tool.agent(agent_state, **arguments)
     except LimitExceededError as ex:
         limit_error = ex
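The through-line of these changes: `ToolEvent` no longer carries its own nested `events` list (note the removed `events=...` arguments and the dropped per-call `init_transcript`). Nesting is instead expressed by the `SpanBeginEvent`/`SpanEndEvent` pairs that the new `span()` context manager writes to the one shared transcript, which `event_tree` can later fold back into a hierarchy. A rough sketch of the pattern, with a plain list standing in for the transcript and without the ContextVar tracking of the current parent span that the real `inspect_ai/util/_span.py` needs:

```python
import asyncio
import uuid
from contextlib import asynccontextmanager

transcript: list[dict] = []  # stand-in for the sample's shared transcript

@asynccontextmanager
async def span(name: str, type: str | None = None):
    # emit a begin/end pair; everything recorded in between is "inside" the span
    span_id = uuid.uuid4().hex
    transcript.append({"event": "span_begin", "id": span_id, "name": name, "type": type})
    try:
        yield
    finally:
        transcript.append({"event": "span_end", "id": span_id})

async def demo() -> None:
    async with span(name="bash", type="tool"):
        transcript.append({"event": "tool", "function": "bash"})

asyncio.run(demo())
print([e["event"] for e in transcript])  # ['span_begin', 'tool', 'span_end']
```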
inspect_ai/model/_model.py
CHANGED
@@ -19,6 +19,7 @@ from typing import (
     cast,
 )
 
+from pydantic import BaseModel
 from pydantic_core import to_jsonable_python
 from tenacity import (
     RetryCallState,
@@ -402,36 +403,32 @@ class Model:
         start_time = datetime.now()
         working_start = sample_working_time()
         async with self._connection_concurrency(config):
-            from inspect_ai.log._samples import track_active_sample_retries
-
             # generate
-            with track_active_sample_retries():
-                output = await self._generate(
-                    input=input,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    config=config,
-                    cache=cache,
-                )
+            output, event = await self._generate(
+                input=input,
+                tools=tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )
 
             # update the most recent ModelEvent with the actual start/completed
             # times as well as a computation of working time (events are
             # created _after_ the call to _generate, potentially in response
             # to retries, so they need their timestamp updated so it accurately
             # reflects the full start/end time which we know here)
-            from inspect_ai.log._transcript import ModelEvent, transcript
-
-            last_model_event = transcript().find_last_event(ModelEvent)
-            if last_model_event:
-                last_model_event.timestamp = start_time
-                last_model_event.working_start = working_start
-                completed = datetime.now()
-                last_model_event.completed = completed
-                last_model_event.working_time = (
-                    output.time
-                    if output.time is not None
-                    else (completed - start_time).total_seconds()
-                )
+            from inspect_ai.log._transcript import ModelEvent
+
+            assert isinstance(event, ModelEvent)
+            event.timestamp = start_time
+            event.working_start = working_start
+            completed = datetime.now()
+            event.completed = completed
+            event.working_time = (
+                output.time
+                if output.time is not None
+                else (completed - start_time).total_seconds()
+            )
 
             # return output
             return output
@@ -492,9 +489,12 @@ class Model:
         tool_choice: ToolChoice | None,
         config: GenerateConfig,
         cache: bool | CachePolicy = False,
-    ) -> ModelOutput:
+    ) -> tuple[ModelOutput, BaseModel]:
+        from inspect_ai.log._samples import track_active_model_event
+        from inspect_ai.log._transcript import ModelEvent
+
         # default to 'auto' for tool_choice (same as underlying model apis)
-        tool_choice = tool_choice if tool_choice else "auto"
+        tool_choice = tool_choice if tool_choice is not None else "auto"
 
         # resolve top level tool source
         if isinstance(tools, ToolSource):
@@ -581,7 +581,10 @@ class Model:
             stop=stop,
             before_sleep=functools.partial(log_model_retry, self.api.model_name),
         )
-        async def generate() -> ModelOutput:
+        async def generate() -> tuple[ModelOutput, BaseModel]:
+            # type-checker can't see that we made sure tool_choice is not none in the outer frame
+            assert tool_choice is not None
+
             check_sample_interrupt()
 
             cache_entry: CacheEntry | None
@@ -602,7 +605,7 @@ class Model:
                 )
                 existing = cache_fetch(cache_entry)
                 if isinstance(existing, ModelOutput):
-                    self._record_model_interaction(
+                    _, event = self._record_model_interaction(
                         input=input,
                         tools=tools_info,
                         tool_choice=tool_choice,
@@ -611,7 +614,7 @@ class Model:
                         output=existing,
                         call=None,
                     )
-                    return existing
+                    return existing, event
                 else:
                     cache_entry = None
 
@@ -620,7 +623,7 @@ class Model:
 
             # record the interaction before the call to generate
             # (we'll update it with the results once we have them)
-            complete = self._record_model_interaction(
+            complete, event = self._record_model_interaction(
                 input=input,
                 tools=tools_info,
                 tool_choice=tool_choice,
@@ -631,12 +634,14 @@ class Model:
             with trace_action(logger, "Model", f"generate ({str(self)})"):
                 time_start = time.monotonic()
                 try:
-                    result = await self.api.generate(
-                        input=input,
-                        tools=tools_info,
-                        tool_choice=tool_choice,
-                        config=config,
-                    )
+                    assert isinstance(event, ModelEvent)
+                    with track_active_model_event(event):
+                        result = await self.api.generate(
+                            input=input,
+                            tools=tools_info,
+                            tool_choice=tool_choice,
+                            config=config,
+                        )
                 finally:
                     time_elapsed = time.monotonic() - time_start
 
@@ -686,18 +691,18 @@ class Model:
             if cache and cache_entry:
                 cache_store(entry=cache_entry, output=output)
 
-            return output
+            return output, event
 
         # call the model (this will do retries, etc., so report waiting time
         # as elapsed time - actual time for successful model call)
         time_start = time.monotonic()
-        model_output = await generate()
+        model_output, event = await generate()
         total_time = time.monotonic() - time_start
         if model_output.time:
            report_sample_waiting_time(total_time - model_output.time)
 
         # return results
-        return model_output
+        return model_output, event
 
     def should_retry(self, ex: BaseException) -> bool:
         if isinstance(ex, Exception):
@@ -769,7 +774,7 @@ class Model:
         cache: Literal["read", "write"] | None,
         output: ModelOutput | None = None,
         call: ModelCall | None = None,
-    ) -> Callable[[ModelOutput | Exception, ModelCall | None], None]:
+    ) -> tuple[Callable[[ModelOutput | Exception, ModelCall | None], None], BaseModel]:
         from inspect_ai.log._transcript import ModelEvent, transcript
 
         # create event and add it to the transcript
@@ -809,7 +814,7 @@ class Model:
         if output:
             complete(output, call)
 
-        return complete
+        return complete, event
 
 
 class ModelName:
@@ -1232,9 +1237,10 @@ def tool_result_images_as_user_message(
 
     Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
     """
-    init_accum: ImagesAccumulator = ([], [], [])
     chat_messages, user_message_content, tool_call_ids = functools.reduce(
-        tool_result_images_reducer, messages, init_accum
+        tool_result_images_reducer,
+        messages,
+        (list[ChatMessage](), list[Content](), list[str]()),
     )
     # if the last message was a tool result, we may need to flush the pending stuff here
     return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
@@ -1260,9 +1266,10 @@ def tool_result_images_reducer(
         and isinstance(message.content, list)
         and any([isinstance(c, ContentImage) for c in message.content])
     ):
-        init_accum: ImageContentAccumulator = ([], [])
         new_user_message_content, edited_tool_message_content = functools.reduce(
-            tool_result_image_content_reducer, message.content, init_accum
+            tool_result_image_content_reducer,
+            message.content,
+            (list[Content](), list[Content]()),
         )
 
         return (
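One detail in the two `functools.reduce` hunks above: the named `init_accum` variables gave the accumulator its type via an annotation, while the new code inlines initializers like `list[ChatMessage]()`. Calling a subscripted generic builds an ordinary empty list at runtime while still telling the type checker the element type. A small self-contained illustration of the idiom:

```python
import functools

# list[int]() is just list() at runtime, but type checkers read the element
# type from the subscript, so the reduce accumulator needs no separately
# annotated variable.
evens, odds = functools.reduce(
    lambda acc, n: (acc[0] + [n], acc[1]) if n % 2 == 0 else (acc[0], acc[1] + [n]),
    range(6),
    (list[int](), list[int]()),
)
print(evens, odds)  # [0, 2, 4] [1, 3, 5]
```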
inspect_ai/model/_openai_responses.py
CHANGED
@@ -184,24 +184,23 @@ def openai_responses_chat_choices(
[comment-only change: the ASCII diagram above openai_responses_chat_choices is redrawn to show how "reasoning" and "message" output items (id: "rs_bbbbbb"; id: "msg_ccccccc", role: "assistant") map to ContentText parts ("text1", "text2"), with the internal metadata now carrying reasoning_id: "rs_bbbbbb" and output_msg_id: "msg_ccccccc"; the diagram's column alignment is not recoverable in this view]
inspect_ai/model/_providers/anthropic.py
CHANGED
@@ -33,7 +33,10 @@ from anthropic.types import (
     ToolUseBlockParam,
     message_create_params,
 )
-from anthropic.types.beta import BetaToolComputerUse20250124Param
+from anthropic.types.beta import (
+    BetaToolComputerUse20250124Param,
+    BetaToolTextEditor20241022Param,
+)
 from pydantic import JsonValue
 from typing_extensions import override
 
@@ -218,6 +221,8 @@ class AnthropicAPI(ModelAPI):
         # tools are generally available for Claude 3.5 Sonnet (new) as well and
         # can be used without the computer use beta header.
         betas.append("computer-use-2025-01-24")
+        if any("20241022" in str(tool.get("type", "")) for tool in tools_param):
+            betas.append("computer-use-2024-10-22")
         if len(betas) > 0:
             extra_headers["anthropic-beta"] = ",".join(betas)
 
@@ -337,6 +342,15 @@ class AnthropicAPI(ModelAPI):
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, APIStatusError):
+            # for unknown reasons, anthropic does not always set status_code == 529
+            # for "overloaded_error" so we check for it explicitly
+            if (
+                isinstance(ex.body, dict)
+                and ex.body.get("error", {}).get("type", "") == "overloaded_error"
+            ):
+                return True
+
+            # standard http status code checking
             return is_retryable_http_status(ex.status_code)
         elif httpx_should_retry(ex):
             return True
@@ -545,7 +559,7 @@ class AnthropicAPI(ModelAPI):
 
     def text_editor_tool_param(
         self, tool: ToolInfo
-    ) -> ToolTextEditor20250124Param | None:
+    ) -> ToolTextEditor20250124Param | BetaToolTextEditor20241022Param | None:
         # check for compatible 'text editor' tool
         if tool.name == "text_editor" and (
             sorted(tool.parameters.properties.keys())
@@ -561,8 +575,14 @@ class AnthropicAPI(ModelAPI):
             ]
         )
     ):
-        return ToolTextEditor20250124Param(
-            type="text_editor_20250124", name="str_replace_editor"
+        return (
+            BetaToolTextEditor20241022Param(
+                type="text_editor_20241022", name="str_replace_editor"
+            )
+            if self.is_claude_3_5()
+            else ToolTextEditor20250124Param(
+                type="text_editor_20250124", name="str_replace_editor"
+            )
         )
     # not a text_editor tool
     else:
@@ -571,7 +591,10 @@ class AnthropicAPI(ModelAPI):
 
 # tools can be either a stock tool param or a special Anthropic native use tool param
 ToolParamDef = (
-    ToolParam | BetaToolComputerUse20250124Param | ToolTextEditor20250124Param
+    ToolParam
+    | BetaToolComputerUse20250124Param
+    | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
 )
@@ -580,6 +603,7 @@ def add_cache_control(
     | ToolParam
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
     | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
@@ -844,6 +868,7 @@ def _names_for_tool_call(
     """
     mappings = (
         (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
+        ("str_replace_editor", "text_editor_20241022", "text_editor"),
         ("str_replace_editor", "text_editor_20250124", "text_editor"),
         ("bash", "bash_20250124", "bash_session"),
     )
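On the `should_retry` change: Anthropic returns overload errors with the type nested under an `error` key in the response body, which is what the added check walks before falling back to status-code matching. A quick sketch of that predicate against synthetic bodies (the payloads here are illustrative):

```python
def is_overloaded(body: object) -> bool:
    # mirrors the nested .get() chain added in the diff above
    return (
        isinstance(body, dict)
        and body.get("error", {}).get("type", "") == "overloaded_error"
    )

print(is_overloaded({"type": "error", "error": {"type": "overloaded_error"}}))  # True
print(is_overloaded({"error": {"type": "rate_limit_error"}}))  # False
print(is_overloaded(None))  # False
```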