inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0

inspect_ai/model/_model.py
@@ -19,6 +19,7 @@ from typing import (
     cast,
 )
 
+from pydantic import BaseModel
 from pydantic_core import to_jsonable_python
 from tenacity import (
     RetryCallState,
@@ -57,6 +58,11 @@ from inspect_ai.tool._tool import ToolSource
 from inspect_ai.tool._tool_call import ToolCallModelInputHints
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
+from inspect_ai.util._limit import (
+    check_message_limit,
+    check_token_limit,
+    record_model_usage,
+)
 
 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
 from ._call_tools import (
@@ -355,11 +361,15 @@ class Model:
         Returns:
            ModelOutput
         """
-        # if we are the default model then enforce message limit if it
-        # exists (raise an exception if it is exceeded)
+        # if we are the default model then update the displayed message count
         is_active_model = self == active_model()
         if is_active_model:
-            handle_sample_message_limit(input)
+            set_total_messages(input)
+
+        # check message limit, raise exception if we're already at the limit to prevent
+        # a wasteful generate()
+        conversation_length = len(input) if isinstance(input, list) else 1
+        check_message_limit(conversation_length, raise_for_equal=True)
 
         # base config for this model
         base_config = self.config
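Note: check_message_limit() and check_token_limit() come from the new inspect_ai/util/_limit.py module (file 110 in the list above). A minimal sketch of the semantics this hunk relies on, with an explicit limit parameter added purely for illustration (the real function reads the active limit from context state instead):

    # Sketch only: the real check_message_limit() lives in inspect_ai/util/_limit.py
    # and looks up the active limit itself; here the limit is passed in for clarity.
    class LimitExceededError(Exception):
        pass

    def check_message_limit(count: int, *, limit: int | None, raise_for_equal: bool) -> None:
        if limit is None:
            return
        # raise_for_equal=True treats "already at the limit" as exceeded, so Model.generate()
        # can bail out before paying for a completion whose reply could not be used anyway
        exceeded = count >= limit if raise_for_equal else count > limit
        if exceeded:
            raise LimitExceededError(f"message limit of {limit} exceeded (messages: {count})")

    try:
        check_message_limit(10, limit=10, raise_for_equal=True)
    except LimitExceededError as ex:
        print(ex)  # message limit of 10 exceeded (messages: 10)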
@@ -393,36 +403,32 @@ class Model:
         start_time = datetime.now()
         working_start = sample_working_time()
         async with self._connection_concurrency(config):
-            from inspect_ai.log._samples import track_active_sample_retries
-
             # generate
-            with track_active_sample_retries():
-                output = await self._generate(
-                    input=input,
-                    tools=tools,
-                    tool_choice=tool_choice,
-                    config=config,
-                    cache=cache,
-                )
+            output, event = await self._generate(
+                input=input,
+                tools=tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )
 
             # update the most recent ModelEvent with the actual start/completed
             # times as well as a computation of working time (events are
             # created _after_ the call to _generate, potentially in response
             # to retries, so they need their timestamp updated so it accurately
             # reflects the full start/end time which we know here)
-            from inspect_ai.log._transcript import ModelEvent, transcript
-
-            last_model_event = transcript().find_last_event(ModelEvent)
-            if last_model_event:
-                last_model_event.timestamp = start_time
-                last_model_event.working_start = working_start
-                completed = datetime.now()
-                last_model_event.completed = completed
-                last_model_event.working_time = (
-                    output.time
-                    if output.time is not None
-                    else (completed - start_time).total_seconds()
-                )
+            from inspect_ai.log._transcript import ModelEvent
+
+            assert isinstance(event, ModelEvent)
+            event.timestamp = start_time
+            event.working_start = working_start
+            completed = datetime.now()
+            event.completed = completed
+            event.working_time = (
+                output.time
+                if output.time is not None
+                else (completed - start_time).total_seconds()
+            )
 
             # return output
             return output
@@ -483,9 +489,12 @@ class Model:
         tool_choice: ToolChoice | None,
         config: GenerateConfig,
         cache: bool | CachePolicy = False,
-    ) -> ModelOutput:
+    ) -> tuple[ModelOutput, BaseModel]:
+        from inspect_ai.log._samples import track_active_model_event
+        from inspect_ai.log._transcript import ModelEvent
+
         # default to 'auto' for tool_choice (same as underlying model apis)
-        tool_choice = tool_choice if tool_choice else "auto"
+        tool_choice = tool_choice if tool_choice is not None else "auto"
 
         # resolve top level tool source
         if isinstance(tools, ToolSource):
@@ -572,7 +581,10 @@ class Model:
             stop=stop,
             before_sleep=functools.partial(log_model_retry, self.api.model_name),
         )
-        async def generate() -> ModelOutput:
+        async def generate() -> tuple[ModelOutput, BaseModel]:
+            # type-checker can't see that we made sure tool_choice is not none in the outer frame
+            assert tool_choice is not None
+
             check_sample_interrupt()
 
             cache_entry: CacheEntry | None
@@ -593,7 +605,7 @@ class Model:
                 )
                 existing = cache_fetch(cache_entry)
                 if isinstance(existing, ModelOutput):
-                    self._record_model_interaction(
+                    _, event = self._record_model_interaction(
                         input=input,
                         tools=tools_info,
                         tool_choice=tool_choice,
@@ -602,7 +614,7 @@ class Model:
                         output=existing,
                         call=None,
                     )
-                    return existing
+                    return existing, event
                 else:
                     cache_entry = None
 
@@ -611,7 +623,7 @@ class Model:
 
             # record the interaction before the call to generate
             # (we'll update it with the results once we have them)
-            complete = self._record_model_interaction(
+            complete, event = self._record_model_interaction(
                 input=input,
                 tools=tools_info,
                 tool_choice=tool_choice,
@@ -622,12 +634,14 @@ class Model:
             with trace_action(logger, "Model", f"generate ({str(self)})"):
                 time_start = time.monotonic()
                 try:
-                    result = await self.api.generate(
-                        input=input,
-                        tools=tools_info,
-                        tool_choice=tool_choice,
-                        config=config,
-                    )
+                    assert isinstance(event, ModelEvent)
+                    with track_active_model_event(event):
+                        result = await self.api.generate(
+                            input=input,
+                            tools=tools_info,
+                            tool_choice=tool_choice,
+                            config=config,
+                        )
                 finally:
                     time_elapsed = time.monotonic() - time_start
 
@@ -666,7 +680,7 @@ class Model:
             # record usage
             if output.usage:
                 # record usage
-                record_model_usage(f"{self}", output.usage)
+                record_and_check_model_usage(f"{self}", output.usage)
 
                 # send telemetry if its hooked up
                 await send_telemetry(
@@ -677,18 +691,18 @@ class Model:
             if cache and cache_entry:
                 cache_store(entry=cache_entry, output=output)
 
-            return output
+            return output, event
 
         # call the model (this will do retries, etc., so report waiting time
         # as elapsed time - actual time for successful model call)
         time_start = time.monotonic()
-        model_output = await generate()
+        model_output, event = await generate()
         total_time = time.monotonic() - time_start
         if model_output.time:
             report_sample_waiting_time(total_time - model_output.time)
 
         # return results
-        return model_output
+        return model_output, event
 
     def should_retry(self, ex: BaseException) -> bool:
         if isinstance(ex, Exception):
@@ -760,7 +774,7 @@ class Model:
         cache: Literal["read", "write"] | None,
         output: ModelOutput | None = None,
         call: ModelCall | None = None,
-    ) -> Callable[[ModelOutput | Exception, ModelCall | None], None]:
+    ) -> tuple[Callable[[ModelOutput | Exception, ModelCall | None], None], BaseModel]:
         from inspect_ai.log._transcript import ModelEvent, transcript
 
         # create event and add it to the transcript
@@ -800,7 +814,7 @@ class Model:
         if output:
             complete(output, call)
 
-        return complete
+        return complete, event
 
 
 class ModelName:
@@ -1423,20 +1437,10 @@ _model_roles: ContextVar[dict[str, Model]] = ContextVar("model_roles", default={
 
 
 # shared contexts for asyncio tasks
-def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
-    from inspect_ai.log._samples import (
-        active_sample_message_limit,
-        set_active_sample_total_messages,
-    )
-    from inspect_ai.solver._limit import SampleLimitExceededError
+def set_total_messages(input: str | list[ChatMessage]) -> None:
+    from inspect_ai.log._samples import set_active_sample_total_messages
 
     total_messages = 1 if isinstance(input, str) else len(input)
-    message_limit = active_sample_message_limit()
-    if message_limit is not None:
-        if total_messages >= message_limit:
-            raise SampleLimitExceededError(
-                "message", value=total_messages, limit=message_limit
-            )
 
     # set total messages
     set_active_sample_total_messages(total_messages)
@@ -1450,16 +1454,13 @@ def init_sample_model_usage() -> None:
     sample_model_usage_context_var.set({})
 
 
-def record_model_usage(model: str, usage: ModelUsage) -> None:
-    from inspect_ai.log._samples import (
-        active_sample_token_limit,
-        set_active_sample_total_tokens,
-    )
-    from inspect_ai.solver._limit import SampleLimitExceededError
+def record_and_check_model_usage(model: str, usage: ModelUsage) -> None:
+    from inspect_ai.log._samples import set_active_sample_total_tokens
 
     # record usage
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
     set_model_usage(model, usage, model_usage_context_var.get(None))
+    record_model_usage(usage)
 
     # compute total tokens
     total_tokens = sample_total_tokens()
@@ -1467,38 +1468,15 @@ def record_model_usage(model: str, usage: ModelUsage) -> None:
     # update active sample
     set_active_sample_total_tokens(total_tokens)
 
-    # check for token limit overflow and raise
-    token_limit = active_sample_token_limit()
-    if token_limit is not None:
-        if total_tokens > token_limit:
-            raise SampleLimitExceededError(
-                "token", value=total_tokens, limit=token_limit
-            )
+    check_token_limit()
 
 
 def set_model_usage(
     model: str, usage: ModelUsage, model_usage: dict[str, ModelUsage] | None
 ) -> None:
     if model_usage is not None:
-        total_usage: ModelUsage | None = model_usage.get(model, None)
-        if not total_usage:
-            total_usage = ModelUsage()
-        total_usage.input_tokens += usage.input_tokens
-        total_usage.output_tokens += usage.output_tokens
-        total_usage.total_tokens += usage.total_tokens
-        if usage.input_tokens_cache_write is not None:
-            if total_usage.input_tokens_cache_write is None:
-                total_usage.input_tokens_cache_write = 0
-            total_usage.input_tokens_cache_write += usage.input_tokens_cache_write
-        if usage.input_tokens_cache_read is not None:
-            if total_usage.input_tokens_cache_read is None:
-                total_usage.input_tokens_cache_read = 0
-            total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
-        if usage.reasoning_tokens is not None:
-            if total_usage.reasoning_tokens is None:
-                total_usage.reasoning_tokens = 0
-            total_usage.reasoning_tokens += usage.reasoning_tokens
-
+        total_usage = model_usage.get(model, ModelUsage())
+        total_usage += usage
         model_usage[model] = total_usage
 
 

inspect_ai/model/_model_output.py
@@ -30,6 +30,31 @@ class ModelUsage(BaseModel):
     reasoning_tokens: int | None = Field(default=None)
     """Number of tokens used for reasoning."""
 
+    def __add__(self, other: "ModelUsage") -> "ModelUsage":
+        def optional_sum(a: int | None, b: int | None) -> int | None:
+            if a is not None and b is not None:
+                return a + b
+            if a is not None:
+                return a
+            if b is not None:
+                return b
+            return None
+
+        return ModelUsage(
+            input_tokens=self.input_tokens + other.input_tokens,
+            output_tokens=self.output_tokens + other.output_tokens,
+            total_tokens=self.total_tokens + other.total_tokens,
+            input_tokens_cache_write=optional_sum(
+                self.input_tokens_cache_write, other.input_tokens_cache_write
+            ),
+            input_tokens_cache_read=optional_sum(
+                self.input_tokens_cache_read, other.input_tokens_cache_read
+            ),
+            reasoning_tokens=optional_sum(
+                self.reasoning_tokens, other.reasoning_tokens
+            ),
+        )
+
 
 StopReason = Literal[
     "stop",

inspect_ai/model/_openai.py
@@ -255,6 +255,8 @@ def openai_completion_params(
                 strict=config.response_schema.strict,
             ),
         )
+    if config.extra_body:
+        params["extra_body"] = config.extra_body
 
     return params
 
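This reads the new extra_body option on GenerateConfig (see the _generate_config.py entry in the file list) and forwards it verbatim to OpenAI-compatible endpoints. A hedged sketch of how it might be used; the body key shown is purely illustrative:

    from inspect_ai.model import GenerateConfig, get_model

    # forward a provider-specific request body parameter (illustrative key/value)
    model = get_model(
        "openai/gpt-4o-mini",
        config=GenerateConfig(extra_body={"my_vendor_param": "value"}),
    )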

inspect_ai/model/_providers/anthropic.py
@@ -26,7 +26,6 @@ from anthropic.types import (
     TextBlockParam,
     ThinkingBlock,
     ThinkingBlockParam,
-    ToolBash20250124Param,
     ToolParam,
     ToolResultBlockParam,
     ToolTextEditor20250124Param,
@@ -76,6 +75,7 @@ class AnthropicAPI(ModelAPI):
         base_url: str | None = None,
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
+        streaming: bool | Literal["auto"] = "auto",
         **model_args: Any,
     ):
         # extract any service prefix from model name
@@ -85,6 +85,9 @@ class AnthropicAPI(ModelAPI):
         else:
             self.service = None
 
+        # record streaming pref
+        self.streaming = streaming
+
         # collect generate model_args (then delete them so we can pass the rest on)
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
@@ -224,8 +227,13 @@ class AnthropicAPI(ModelAPI):
         if self.extra_body is not None:
             request["extra_body"] = self.extra_body
 
-        # make request (stream if we are using reasoning)
-        if self.is_using_thinking(config):
+        # make request (unless overridden, stream if we are using reasoning)
+        streaming = (
+            self.is_using_thinking(config)
+            if self.streaming == "auto"
+            else self.streaming
+        )
+        if streaming:
             async with self.client.messages.stream(**request) as stream:
                 message = await stream.get_final_message()
         else:
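Because streaming is now an __init__ parameter, it can be supplied as a model argument to pin streaming on or off instead of the "auto" behavior (stream only when extended thinking is enabled). A hedged example; the model name is illustrative:

    from inspect_ai.model import get_model

    # model args are forwarded to AnthropicAPI.__init__ (same mechanism as -M on the CLI)
    model = get_model("anthropic/claude-3-7-sonnet-latest", streaming=False)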
@@ -489,11 +497,7 @@ class AnthropicAPI(ModelAPI):
         self, tool: ToolInfo, config: GenerateConfig
     ) -> Optional["ToolParamDef"]:
         return (
-            (
-                self.computer_use_tool_param(tool)
-                or self.text_editor_tool_param(tool)
-                or self.bash_tool_param(tool)
-            )
+            (self.computer_use_tool_param(tool) or self.text_editor_tool_param(tool))
             if config.internal_tools is not False
             else None
         )
@@ -564,23 +568,10 @@ class AnthropicAPI(ModelAPI):
         else:
             return None
 
-    def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
-        # check for compatible 'bash' tool
-        if tool.name == "bash_session" and (
-            sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
-        ):
-            return ToolBash20250124Param(type="bash_20250124", name="bash")
-        # not a bash tool
-        else:
-            return None
-
 
 # tools can be either a stock tool param or a special Anthropic native use tool param
 ToolParamDef = (
-    ToolParam
-    | BetaToolComputerUse20250124Param
-    | ToolTextEditor20250124Param
-    | ToolBash20250124Param
+    ToolParam | BetaToolComputerUse20250124Param | ToolTextEditor20250124Param
 )
 
 
@@ -589,7 +580,6 @@ def add_cache_control(
     | ToolParam
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
-    | ToolBash20250124Param
     | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}

inspect_ai/model/_providers/hf.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import concurrent
 import concurrent.futures
 import copy
@@ -26,7 +28,12 @@ from transformers import ( # type: ignore
 from typing_extensions import override
 
 from inspect_ai._util.constants import DEFAULT_MAX_TOKENS
-from inspect_ai._util.content import ContentText
+from inspect_ai._util.content import (
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import ToolChoice, ToolInfo
 
@@ -85,6 +92,7 @@ class HuggingFaceAPI(ModelAPI):
         self.batch_size = collect_model_arg("batch_size")
         self.chat_template = collect_model_arg("chat_template")
         self.tokenizer_call_args = collect_model_arg("tokenizer_call_args")
+        self.enable_thinking = collect_model_arg("enable_thinking")
         if self.tokenizer_call_args is None:
             self.tokenizer_call_args = {}
 
@@ -263,6 +271,7 @@ class HuggingFaceAPI(ModelAPI):
         elif "qwen" in self.model_name.lower():
             hf_messages = inspect_tools_to_string(hf_messages)
 
+        hf_messages = message_content_to_string(hf_messages)
         # apply chat template
         if self.tokenizer.chat_template is not None:
             chat = self.tokenizer.apply_chat_template(
@@ -270,6 +279,7 @@ class HuggingFaceAPI(ModelAPI):
                 add_generation_prompt=True,
                 tokenize=False,
                 tools=tools_list if len(tools_list) > 0 else None,
+                enable_thinking=self.enable_thinking,  # not all models use this, check if it is supported
             )
         else:
             chat = ""
@@ -279,6 +289,22 @@ class HuggingFaceAPI(ModelAPI):
         return cast(str, chat)
 
 
+def message_content_to_string(messages: list[ChatMessage]) -> list[ChatMessage]:
+    """Convert list of content in `ChatMessageAssistant`, `ChatMessageUser` or `ChatMessageSystem` to a string."""
+    for message in messages:
+        if isinstance(message.content, list):
+            is_multimodal = any(
+                isinstance(item, ContentAudio | ContentImage | ContentVideo)
+                for item in message.content
+            )
+            if is_multimodal:
+                raise NotImplementedError(
+                    "HuggingFace provider does not support multimodal content, please provide text inputs only."
+                )
+            message.content = message.text
+    return messages
+
+
 def shorten_tool_id(messages: list[ChatMessage]) -> list[ChatMessage]:
     """Shorten the tool_call_id in the messages to the last 9 characters for Mistral."""
     for i, message in enumerate(messages):

inspect_ai/model/_providers/openai_o1.py
@@ -211,8 +211,15 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
         This method has an interdependency with `input_with_tools()` (as that is the
         prompt that asks the model to use the <tool_call>...</tool_call> syntax)
         """
-        # extract tool calls
+        # define regex patterns
+        # NOTE: If you change either of these regex patterns, please update the other
+        # tool_call_regex extracts the JSON content (in curly braces) between tool call tags
         tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
+        # tool_call_content_regex matches the entire tool call block including tags for extracting
+        # the content outside of the tool call tags
+        tool_call_content_regex = rf"<{TOOL_CALL}>\s*\{{[\s\S]*?\}}\s*</{TOOL_CALL}>"
+
+        # extract tool calls
         tool_calls_content: list[str] = re.findall(tool_call_regex, response)
 
         # if there are tool calls proceed with parsing
@@ -226,7 +233,6 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
             ]
 
             # find other content that exists outside tool calls
-            tool_call_content_regex = rf"<{TOOL_CALL}>(?:.|\n)*?</{TOOL_CALL}>"
             other_content = re.split(tool_call_content_regex, response, flags=re.DOTALL)
             other_content = [
                 str(content).strip()
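The two patterns are meant to stay in sync: tool_call_regex captures the JSON payload inside the tags, while tool_call_content_regex matches the whole tagged block so the surrounding assistant text can be recovered with re.split(). A standalone illustration, assuming TOOL_CALL is the literal tag name "tool_call":

    import re

    TOOL_CALL = "tool_call"  # assumption: the tag name used in the o1 handler prompt

    tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
    tool_call_content_regex = rf"<{TOOL_CALL}>\s*\{{[\s\S]*?\}}\s*</{TOOL_CALL}>"

    response = (
        "I'll list the files.\n"
        '<tool_call>\n{"name": "bash", "arguments": {"cmd": "ls"}}\n</tool_call>\n'
        "Let me know if you need anything else."
    )

    # JSON payloads inside the tags
    print(re.findall(tool_call_regex, response))
    # ['{"name": "bash", "arguments": {"cmd": "ls"}}']

    # assistant text outside the tags
    print([s.strip() for s in re.split(tool_call_content_regex, response) if s.strip()])
    # ["I'll list the files.", 'Let me know if you need anything else.']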

inspect_ai/model/_providers/providers.py
@@ -136,10 +136,12 @@ def hf() -> type[ModelAPI]:
 
 @modelapi(name="vllm")
 def vllm() -> type[ModelAPI]:
-    try:
-        from .vllm import VLLMAPI
-    except ImportError:
-        raise pip_dependency_error("vLLM Models", ["vllm"])
+    # Only validate OpenAI compatibility (needed for the API interface)
+    validate_openai_client("vLLM API")
+
+    # Import VLLMAPI without checking for vllm package yet
+    # The actual vllm dependency will only be checked if needed to start a server
+    from .vllm import VLLMAPI
 
     return VLLMAPI
 
@@ -257,6 +259,18 @@ def mockllm() -> type[ModelAPI]:
     return MockLLM
 
 
+@modelapi(name="sglang")
+def sglang() -> type[ModelAPI]:
+    # Only validate OpenAI compatibility (needed for the API interface)
+    validate_openai_client("SGLang API")
+
+    # Import SGLangAPI without checking for sglang package yet
+    # The actual sglang dependency will only be checked if needed to start a server
+    from .sglang import SGLangAPI
+
+    return SGLangAPI
+
+
 @modelapi(name="none")
 def none() -> type[ModelAPI]:
     from .none import NoModel
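Both the new sglang provider and the reworked vllm provider defer their heavy server dependency: as the comments above note, the package is only required if inspect has to start a local server itself. A hedged usage sketch; the model names and port are illustrative, and base_url/api_key are the standard model arguments for pointing a provider at an already-running OpenAI-compatible server:

    from inspect_ai.model import get_model

    # talk to an already-running OpenAI-compatible server (no sglang/vllm install needed)
    remote = get_model(
        "sglang/meta-llama/Llama-3.1-8B-Instruct",
        base_url="http://localhost:30000/v1",
        api_key="local",
    )

    # with no server configured, the provider will try to start one locally,
    # which is the point at which the sglang (or vllm) package itself is required
    local = get_model("vllm/meta-llama/Llama-3.1-8B-Instruct")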