inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +31 -0
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +13 -20
- inspect_ai/_util/local_server.py +368 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +159 -146
- inspect_ai/_view/www/dist/assets/index.js +1020 -1061
- inspect_ai/_view/www/log-schema.json +4 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +3 -2
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +20 -12
- inspect_ai/agent/_as_tool.py +15 -3
- inspect_ai/agent/_handoff.py +8 -1
- inspect_ai/agent/_run.py +11 -3
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +0 -8
- inspect_ai/log/_transcript.py +7 -1
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +32 -12
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +21 -48
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_openai_responses.py +13 -1
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +241 -0
- inspect_ai/model/_providers/vllm.py +207 -400
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +2 -0
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +12 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
inspect_ai/log/_util.py
ADDED
@@ -0,0 +1,52 @@
+import textwrap
+from datetime import date, datetime, time
+from typing import Any
+
+from inspect_ai._util.content import (
+    ContentAudio,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+    ContentVideo,
+)
+from inspect_ai.model._chat_message import ChatMessage
+
+
+def text_input_only(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
+    # Clean the input of any images
+    if isinstance(inputs, list):
+        input: list[ChatMessage] = []
+        for message in inputs:
+            if not isinstance(message.content, str):
+                filtered_content: list[
+                    ContentText
+                    | ContentReasoning
+                    | ContentImage
+                    | ContentAudio
+                    | ContentVideo
+                ] = []
+                for content in message.content:
+                    if content.type == "text":
+                        filtered_content.append(content)
+                    else:
+                        filtered_content.append(
+                            ContentText(text=f"({content.type.capitalize()})")
+                        )
+                message.content = filtered_content
+                input.append(message)
+            else:
+                input.append(message)
+
+        return input
+    else:
+        return inputs
+
+
+def thin_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    thinned: dict[str, Any] = {}
+    for key, value in metadata.items():
+        if isinstance(value, int | float | bool | date | time | datetime):
+            thinned[key] = value
+        elif isinstance(value, str):
+            thinned[key] = textwrap.shorten(value, width=1024, placeholder="...")
+    return thinned
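Note: a minimal usage sketch of the helpers above (inspect-ai 0.3.93 installed; the sample metadata values are illustrative). thin_metadata() keeps primitive values, shortens long strings to at most 1024 characters with a "..." placeholder, and drops everything else; text_input_only() replaces non-text content with a placeholder such as "(Image)".

from inspect_ai.log._util import thin_metadata

metadata = {
    "attempts": 3,              # primitive: kept as-is
    "notes": "word " * 2000,    # long string: shortened with a "..." placeholder
    "raw_bytes": b"\x00\x01",   # neither primitive nor str: dropped
}
print(thin_metadata(metadata))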
inspect_ai/model/__init__.py
CHANGED
@@ -28,7 +28,11 @@ from ._chat_message import (
     ChatMessageUser,
 )
 from ._conversation import ModelConversation
-from ._generate_config import
+from ._generate_config import (
+    GenerateConfig,
+    GenerateConfigArgs,
+    ResponseSchema,
+)
 from ._model import (
     Model,
     ModelAPI,
inspect_ai/model/_call_tools.py
CHANGED
@@ -60,6 +60,7 @@ from inspect_ai.tool._tool_info import parse_docstring
 from inspect_ai.tool._tool_params import ToolParams
 from inspect_ai.util import OutputLimitExceededError
 from inspect_ai.util._anyio import inner_exception
+from inspect_ai.util._limit import LimitExceededError, apply_limits
 
 from ._chat_message import (
     ChatMessage,
@@ -171,10 +172,15 @@ async def execute_tools(
             tool_error = ToolCallError("is_a_directory", err)
         except OutputLimitExceededError as ex:
             tool_error = ToolCallError(
-                "
-                f"The tool output limit of {ex.limit_str}
+                "limit",
+                f"The tool exceeded its output limit of {ex.limit_str}.",
             )
             result = ex.truncated_output or ""
+        except LimitExceededError as ex:
+            tool_error = ToolCallError(
+                "limit",
+                f"The tool exceeded its {ex.type} limit of {ex.limit}.",
+            )
         except ToolParsingError as ex:
             tool_error = ToolCallError("parsing", ex.message)
         except ToolApprovalError as ex:
@@ -344,6 +350,7 @@ async def call_tool(
     tools: list[ToolDef], message: str, call: ToolCall, conversation: list[ChatMessage]
 ) -> tuple[ToolResult, list[ChatMessage], ModelOutput | None, str | None]:
     from inspect_ai.agent._handoff import AgentTool
+    from inspect_ai.log._transcript import SampleLimitEvent, transcript
 
     # if there was an error parsing the ToolCall, raise that
     if call.parse_error:
@@ -362,14 +369,11 @@
         )
         if not approved:
             if approval and approval.decision == "terminate":
-
-
-
-                    "operator",
-                    value=1,
-                    limit=1,
-                    message="Tool call approver requested termination.",
+                message = "Tool call approver requested termination."
+                transcript()._event(
+                    SampleLimitEvent(type="operator", limit=1, message=message)
                 )
+                raise LimitExceededError("operator", value=1, limit=1, message=message)
             else:
                 raise ToolApprovalError(approval.explanation if approval else None)
         if approval and approval.modified:
@@ -454,9 +458,14 @@ async def agent_handoff(
     arguments = tool_params(arguments, agent_tool.agent)
     del arguments["state"]
 
-    #
+    # run the agent with limits
+    limit_error: LimitExceededError | None = None
     agent_state = AgentState(messages=copy(agent_conversation))
-
+    try:
+        with apply_limits(agent_tool.limits):
+            agent_state = await agent_tool.agent(agent_state, **arguments)
+    except LimitExceededError as ex:
+        limit_error = ex
 
     # determine which messages are new and return only those (but exclude new
     # system messages as they an internal matter for the handed off to agent.
@@ -474,9 +483,20 @@
     if agent_tool.output_filter is not None:
         agent_messages = await agent_tool.output_filter(agent_messages)
 
+    if limit_error is not None:
+        agent_messages.append(
+            ChatMessageUser(
+                content=(
+                    f"The {agent_name} exceeded its {limit_error.type} limit of "
+                    f"{limit_error.limit}."
+                )
+            )
+        )
     # if we end with an assistant message then add a user message
     # so that the calling agent carries on
-
+    elif len(agent_messages) == 0 or isinstance(
+        agent_messages[-1], ChatMessageAssistant
+    ):
         agent_messages.append(
             ChatMessageUser(content=f"The {agent_name} agent has completed its work.")
         )
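Note: combined with the new inspect_ai/util/_limit.py, the apply_limits() call above means an agent handoff can carry its own limits; when one trips, the parent conversation receives a "...exceeded its ... limit..." user message rather than the sample erroring. A hedged sketch of the caller side (the limits= parameter name is inferred from agent_tool.limits in this diff, token_limit from the new util exports, and researcher is a placeholder agent):

from inspect_ai.agent import handoff
from inspect_ai.util import token_limit

# cap the handed-off agent at ~50k tokens; on overflow the calling agent sees a
# user message describing the exceeded limit and carries on
researcher_tool = handoff(researcher, limits=[token_limit(50_000)])  # researcher: placeholder agent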
inspect_ai/model/_generate_config.py
CHANGED
@@ -106,6 +106,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     response_schema: ResponseSchema | None
     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
 
+    extra_body: dict[str, Any] | None
+    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
 
 class GenerateConfig(BaseModel):
     """Model generation options."""
@@ -138,28 +141,28 @@ class GenerateConfig(BaseModel):
     """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
 
     frequency_penalty: float | None = Field(default=None)
-    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, and
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
     presence_penalty: float | None = Field(default=None)
-    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, and
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
     logit_bias: dict[int, float] | None = Field(default=None)
-    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and
+    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, Grok, and vLLM only."""
 
     seed: int | None = Field(default=None)
     """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
 
     top_k: int | None = Field(default=None)
-    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, and
+    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""
 
     num_choices: int | None = Field(default=None)
-    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and
+    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""
 
     logprobs: bool | None = Field(default=None)
-    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, and
+    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""
 
     top_logprobs: int | None = Field(default=None)
-    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, and
+    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""
 
     parallel_tool_calls: bool | None = Field(default=None)
     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
@@ -190,7 +193,10 @@ class GenerateConfig(BaseModel):
     """Include reasoning in chat message history sent to generate."""
 
     response_schema: ResponseSchema | None = Field(default=None)
-    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and
+    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, Mistral, vLLM, and SGLang only."""
+
+    extra_body: dict[str, Any] | None = Field(default=None)
+    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
 
     # migrate reasoning_history as a bool
     @model_validator(mode="before")
inspect_ai/model/_model.py
CHANGED
@@ -57,6 +57,11 @@ from inspect_ai.tool._tool import ToolSource
 from inspect_ai.tool._tool_call import ToolCallModelInputHints
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
+from inspect_ai.util._limit import (
+    check_message_limit,
+    check_token_limit,
+    record_model_usage,
+)
 
 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
 from ._call_tools import (
@@ -355,11 +360,15 @@ class Model:
         Returns:
            ModelOutput
         """
-        # if we are the default model then
-        # exists (raise an exception if it is exceeded)
+        # if we are the default model then update the displayed message count
         is_active_model = self == active_model()
         if is_active_model:
-
+            set_total_messages(input)
+
+        # check message limit, raise exception if we're already at the limit to prevent
+        # a wasteful generate()
+        conversation_length = len(input) if isinstance(input, list) else 1
+        check_message_limit(conversation_length, raise_for_equal=True)
 
         # base config for this model
         base_config = self.config
@@ -666,7 +675,7 @@
         # record usage
         if output.usage:
             # record usage
-
+            record_and_check_model_usage(f"{self}", output.usage)
 
             # send telemetry if its hooked up
             await send_telemetry(
@@ -1423,20 +1432,10 @@ _model_roles: ContextVar[dict[str, Model]] = ContextVar("model_roles", default={
 
 
 # shared contexts for asyncio tasks
-def
-    from inspect_ai.log._samples import
-        active_sample_message_limit,
-        set_active_sample_total_messages,
-    )
-    from inspect_ai.solver._limit import SampleLimitExceededError
+def set_total_messages(input: str | list[ChatMessage]) -> None:
+    from inspect_ai.log._samples import set_active_sample_total_messages
 
     total_messages = 1 if isinstance(input, str) else len(input)
-    message_limit = active_sample_message_limit()
-    if message_limit is not None:
-        if total_messages >= message_limit:
-            raise SampleLimitExceededError(
-                "message", value=total_messages, limit=message_limit
-            )
 
     # set total messages
     set_active_sample_total_messages(total_messages)
@@ -1450,16 +1449,13 @@ def init_sample_model_usage() -> None:
     sample_model_usage_context_var.set({})
 
 
-def
-    from inspect_ai.log._samples import
-        active_sample_token_limit,
-        set_active_sample_total_tokens,
-    )
-    from inspect_ai.solver._limit import SampleLimitExceededError
+def record_and_check_model_usage(model: str, usage: ModelUsage) -> None:
+    from inspect_ai.log._samples import set_active_sample_total_tokens
 
     # record usage
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
     set_model_usage(model, usage, model_usage_context_var.get(None))
+    record_model_usage(usage)
 
     # compute total tokens
     total_tokens = sample_total_tokens()
@@ -1467,38 +1463,15 @@ def record_model_usage(model: str, usage: ModelUsage) -> None:
     # update active sample
     set_active_sample_total_tokens(total_tokens)
 
-
-    token_limit = active_sample_token_limit()
-    if token_limit is not None:
-        if total_tokens > token_limit:
-            raise SampleLimitExceededError(
-                "token", value=total_tokens, limit=token_limit
-            )
+    check_token_limit()
 
 
 def set_model_usage(
     model: str, usage: ModelUsage, model_usage: dict[str, ModelUsage] | None
 ) -> None:
     if model_usage is not None:
-        total_usage
-
-            total_usage = ModelUsage()
-        total_usage.input_tokens += usage.input_tokens
-        total_usage.output_tokens += usage.output_tokens
-        total_usage.total_tokens += usage.total_tokens
-        if usage.input_tokens_cache_write is not None:
-            if total_usage.input_tokens_cache_write is None:
-                total_usage.input_tokens_cache_write = 0
-            total_usage.input_tokens_cache_write += usage.input_tokens_cache_write
-        if usage.input_tokens_cache_read is not None:
-            if total_usage.input_tokens_cache_read is None:
-                total_usage.input_tokens_cache_read = 0
-            total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
-        if usage.reasoning_tokens is not None:
-            if total_usage.reasoning_tokens is None:
-                total_usage.reasoning_tokens = 0
-            total_usage.reasoning_tokens += usage.reasoning_tokens
-
+        total_usage = model_usage.get(model, ModelUsage())
+        total_usage += usage
         model_usage[model] = total_usage
 
 
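Note: the raise_for_equal=True pre-check exists to fail before a wasteful generate(): if the conversation already holds as many messages as the limit allows, the assistant reply could not be appended anyway. A toy illustration of that semantics (not the library's implementation; the real check lives in the new inspect_ai/util/_limit.py):

def precheck_message_limit(conversation_len: int, limit: int | None) -> None:
    # raise when the conversation has already reached the limit, since the
    # upcoming generate() would only push it past the limit
    if limit is not None and conversation_len >= limit:
        raise RuntimeError(f"message limit of {limit} reached ({conversation_len} messages)")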
inspect_ai/model/_model_output.py
CHANGED
@@ -30,6 +30,31 @@ class ModelUsage(BaseModel):
     reasoning_tokens: int | None = Field(default=None)
     """Number of tokens used for reasoning."""
 
+    def __add__(self, other: "ModelUsage") -> "ModelUsage":
+        def optional_sum(a: int | None, b: int | None) -> int | None:
+            if a is not None and b is not None:
+                return a + b
+            if a is not None:
+                return a
+            if b is not None:
+                return b
+            return None
+
+        return ModelUsage(
+            input_tokens=self.input_tokens + other.input_tokens,
+            output_tokens=self.output_tokens + other.output_tokens,
+            total_tokens=self.total_tokens + other.total_tokens,
+            input_tokens_cache_write=optional_sum(
+                self.input_tokens_cache_write, other.input_tokens_cache_write
+            ),
+            input_tokens_cache_read=optional_sum(
+                self.input_tokens_cache_read, other.input_tokens_cache_read
+            ),
+            reasoning_tokens=optional_sum(
+                self.reasoning_tokens, other.reasoning_tokens
+            ),
+        )
+
 
 StopReason = Literal[
     "stop",
inspect_ai/model/_openai.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from itertools import chain
 from typing import TypedDict, cast
 
@@ -306,6 +307,14 @@ def _openai_input_items_from_chat_message_assistant(
     """
     (output_message_id, tool_message_ids) = _ids_from_assistant_internal(message)
 
+    # we want to prevent yielding output messages in the case where we have an
+    # 'internal' field (so the message came from the model API as opposed to
+    # being user synthesized) AND there is no output_message_id (indicating that
+    # when reading the message from the server we didn't find output). this could
+    # happen e.g. when a react() agent sets the output.completion in response
+    # to a submit() tool call
+    suppress_output_message = message.internal is not None and output_message_id is None
+
     # if we are not storing messages on the server then blank these out
     if not store:
         output_message_id = None
@@ -341,6 +350,9 @@
                 )
             )
         case ContentText(text=text, refusal=refusal):
+            if suppress_output_message:
+                continue
+
             new_content = (
                 ResponseOutputRefusalParam(type="refusal", refusal=text)
                 if refusal
@@ -415,7 +427,7 @@ def _tool_call_items_from_assistant_message(
         type="function_call",
         call_id=call.id,
         name=_responses_tool_alias(call.function),
-        arguments=call.
+        arguments=json.dumps(call.arguments),
     )
 
     # add id if available
inspect_ai/model/_providers/anthropic.py
CHANGED
@@ -26,7 +26,6 @@ from anthropic.types import (
     TextBlockParam,
     ThinkingBlock,
     ThinkingBlockParam,
-    ToolBash20250124Param,
     ToolParam,
     ToolResultBlockParam,
     ToolTextEditor20250124Param,
@@ -76,6 +75,7 @@ class AnthropicAPI(ModelAPI):
         base_url: str | None = None,
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
+        streaming: bool | Literal["auto"] = "auto",
         **model_args: Any,
     ):
         # extract any service prefix from model name
@@ -85,6 +85,9 @@ class AnthropicAPI(ModelAPI):
         else:
             self.service = None
 
+        # record steraming pref
+        self.streaming = streaming
+
         # collect generate model_args (then delete them so we can pass the rest on)
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
@@ -224,8 +227,13 @@ class AnthropicAPI(ModelAPI):
         if self.extra_body is not None:
             request["extra_body"] = self.extra_body
 
-        # make request (stream if we are using reasoning)
-
+        # make request (unless overrideen, stream if we are using reasoning)
+        streaming = (
+            self.is_using_thinking(config)
+            if self.streaming == "auto"
+            else self.streaming
+        )
+        if streaming:
             async with self.client.messages.stream(**request) as stream:
                 message = await stream.get_final_message()
         else:
@@ -489,11 +497,7 @@ class AnthropicAPI(ModelAPI):
         self, tool: ToolInfo, config: GenerateConfig
     ) -> Optional["ToolParamDef"]:
         return (
-            (
-                self.computer_use_tool_param(tool)
-                or self.text_editor_tool_param(tool)
-                or self.bash_tool_param(tool)
-            )
+            (self.computer_use_tool_param(tool) or self.text_editor_tool_param(tool))
             if config.internal_tools is not False
             else None
         )
@@ -564,23 +568,10 @@ class AnthropicAPI(ModelAPI):
         else:
             return None
 
-    def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
-        # check for compatible 'bash' tool
-        if tool.name == "bash_session" and (
-            sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
-        ):
-            return ToolBash20250124Param(type="bash_20250124", name="bash")
-        # not a bash tool
-        else:
-            return None
-
 
 # tools can be either a stock tool param or a special Anthropic native use tool param
 ToolParamDef = (
-    ToolParam
-    | BetaToolComputerUse20250124Param
-    | ToolTextEditor20250124Param
-    | ToolBash20250124Param
+    ToolParam | BetaToolComputerUse20250124Param | ToolTextEditor20250124Param
 )
 
 
@@ -589,7 +580,6 @@ def add_cache_control(
     | ToolParam
    | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
-    | ToolBash20250124Param
     | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
inspect_ai/model/_providers/openai_o1.py
CHANGED
@@ -211,8 +211,15 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
         This method has an interdependency with `input_with_tools()` (as that is the
         prompt that asks the model to use the <tool_call>...</tool_call> syntax)
         """
-        #
+        # define regex patterns
+        # NOTE: If you change either of these regex patterns, please update the other
+        # tool_call_regex extracts the JSON content (in curly braces) between tool call tags
         tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
+        # tool_call_content_regex matches the entire tool call block including tags for extracting
+        # the content outside of the tool call tags
+        tool_call_content_regex = rf"<{TOOL_CALL}>\s*\{{[\s\S]*?\}}\s*</{TOOL_CALL}>"
+
+        # extract tool calls
         tool_calls_content: list[str] = re.findall(tool_call_regex, response)
 
         # if there are tool calls proceed with parsing
@@ -226,7 +233,6 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
         ]
 
         # find other content that exists outside tool calls
-        tool_call_content_regex = rf"<{TOOL_CALL}>(?:.|\n)*?</{TOOL_CALL}>"
         other_content = re.split(tool_call_content_regex, response, flags=re.DOTALL)
         other_content = [
             str(content).strip()
inspect_ai/model/_providers/providers.py
CHANGED
@@ -136,10 +136,12 @@ def hf() -> type[ModelAPI]:
 
 @modelapi(name="vllm")
 def vllm() -> type[ModelAPI]:
-
-
-
-
+    # Only validate OpenAI compatibility (needed for the API interface)
+    validate_openai_client("vLLM API")
+
+    # Import VLLMAPI without checking for vllm package yet
+    # The actual vllm dependency will only be checked if needed to start a server
+    from .vllm import VLLMAPI
 
     return VLLMAPI
 
@@ -257,6 +259,18 @@ def mockllm() -> type[ModelAPI]:
     return MockLLM
 
 
+@modelapi(name="sglang")
+def sglang() -> type[ModelAPI]:
+    # Only validate OpenAI compatibility (needed for the API interface)
+    validate_openai_client("SGLang API")
+
+    # Import SGLangAPI without checking for sglang package yet
+    # The actual sglang dependency will only be checked if needed to start a server
+    from .sglang import SGLangAPI
+
+    return SGLangAPI
+
+
 @modelapi(name="none")
 def none() -> type[ModelAPI]:
     from .none import NoModel