PyPI - inspect-ai - Versions diffs - 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl - Mend

inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

inspect_ai/_eval/loader.py +1 -1
inspect_ai/_eval/task/run.py +12 -6
inspect_ai/_util/exception.py +4 -0
inspect_ai/_util/hash.py +39 -0
inspect_ai/_util/local_server.py +16 -0
inspect_ai/_util/path.py +22 -0
inspect_ai/_util/trace.py +1 -1
inspect_ai/_util/working.py +4 -0
inspect_ai/_view/www/dist/assets/index.css +9 -9
inspect_ai/_view/www/dist/assets/index.js +117 -120
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
inspect_ai/_view/www/src/app/types.ts +12 -2
inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
inspect_ai/_view/www/src/state/hooks.ts +19 -3
inspect_ai/_view/www/src/state/logSlice.ts +23 -5
inspect_ai/_view/www/yarn.lock +9 -9
inspect_ai/agent/_bridge/patch.py +1 -3
inspect_ai/agent/_types.py +1 -1
inspect_ai/analysis/__init__.py +0 -0
inspect_ai/analysis/beta/__init__.py +67 -0
inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
inspect_ai/analysis/beta/_dataframe/evals/table.py +177 -0
inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
inspect_ai/analysis/beta/_dataframe/events/columns.py +87 -0
inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
inspect_ai/analysis/beta/_dataframe/events/table.py +100 -0
inspect_ai/analysis/beta/_dataframe/extract.py +73 -0
inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
inspect_ai/analysis/beta/_dataframe/messages/table.py +79 -0
inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
inspect_ai/analysis/beta/_dataframe/record.py +377 -0
inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
inspect_ai/analysis/beta/_dataframe/samples/columns.py +77 -0
inspect_ai/analysis/beta/_dataframe/samples/extract.py +54 -0
inspect_ai/analysis/beta/_dataframe/samples/table.py +370 -0
inspect_ai/analysis/beta/_dataframe/util.py +160 -0
inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
inspect_ai/log/_file.py +10 -3
inspect_ai/log/_log.py +21 -1
inspect_ai/model/_call_tools.py +2 -1
inspect_ai/model/_model.py +6 -4
inspect_ai/model/_openai_responses.py +17 -18
inspect_ai/model/_providers/anthropic.py +30 -5
inspect_ai/model/_providers/providers.py +1 -1
inspect_ai/solver/_multiple_choice.py +4 -1
inspect_ai/solver/_task_state.py +8 -4
inspect_ai/tool/_mcp/_context.py +3 -5
inspect_ai/tool/_mcp/_sandbox.py +17 -14
inspect_ai/tool/_mcp/server.py +1 -1
inspect_ai/tool/_tools/_think.py +1 -1
inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
inspect_ai/util/_sandbox/events.py +3 -2
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +9 -2
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +75 -46
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0

inspect_ai/log/_file.py CHANGED Viewed

@@ -524,14 +524,21 @@ def manifest_eval_log_name(info: EvalLogInfo, log_dir: str, sep: str) -> str:
 def log_files_from_ls(
     ls: list[FileInfo],
-    formats: list[Literal["eval", "json"]] | None,
+    formats: list[Literal["eval", "json"]] | None = None,
     descending: bool = True,
+    sort: bool = True,
 ) -> list[EvalLogInfo]:
     extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
     return [
         log_file_info(file)
-        for file in sorted(
-            ls, key=lambda file: (file.mtime if file.mtime else 0), reverse=descending
+        for file in (
+            sorted(
+                ls,
+                key=lambda file: (file.mtime if file.mtime else 0),
+                reverse=descending,
+            )
+            if sort
+            else ls
         )
         if file.type == "file" and is_log_file(file.name, extensions)
     ]

inspect_ai/log/_log.py CHANGED Viewed

@@ -17,9 +17,11 @@ from pydantic import (
 )
 from rich.console import Console, RenderableType
 from rich.traceback import Traceback
+from shortuuid import uuid
-from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
+from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, DESERIALIZING, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
+from inspect_ai._util.hash import base57_id_hash
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
 from inspect_ai.dataset._dataset import MT, metadata_as
@@ -677,6 +679,9 @@ class EvalModelConfig(BaseModel):
 class EvalSpec(BaseModel):
     """Eval target and configuration."""
+    eval_id: str = Field(default_factory=str)
+    """Globally unique id for eval."""
     run_id: str = Field(default_factory=str)
     """Unique run id"""
@@ -757,6 +762,21 @@ class EvalSpec(BaseModel):
     # allow field model_args
     model_config = ConfigDict(protected_namespaces=())
+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+        # Generate eval_id if needed
+        if self.eval_id == "":
+            if is_deserializing:
+                # we want the eval_id to be stable across reads of the eval log so we compose it
+                # as a hash that matches the size/appearance of shortuuid-based uuids
+                self.eval_id = base57_id_hash(self.run_id + self.task_id + self.created)
+            else:
+                self.eval_id = uuid()
     @model_validator(mode="before")
     @classmethod
     def read_sandbox_spec(

inspect_ai/model/_call_tools.py CHANGED Viewed

@@ -39,6 +39,7 @@ from inspect_ai._util.content import (
     ContentText,
     ContentVideo,
 )
+from inspect_ai._util.exception import TerminateSampleError
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.logger import warn_once
 from inspect_ai._util.registry import registry_unqualified_name
@@ -376,7 +377,7 @@ async def call_tool(
             transcript()._event(
                 SampleLimitEvent(type="operator", limit=1, message=message)
             )
-            raise LimitExceededError("operator", value=1, limit=1, message=message)
+            raise TerminateSampleError(message)
         else:
             raise ToolApprovalError(approval.explanation if approval else None)
     if approval and approval.modified:

inspect_ai/model/_model.py CHANGED Viewed

@@ -1237,9 +1237,10 @@ def tool_result_images_as_user_message(
     Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
     """
-    init_accum: ImagesAccumulator = ([], [], [])
     chat_messages, user_message_content, tool_call_ids = functools.reduce(
-        tool_result_images_reducer, messages, init_accum
+        tool_result_images_reducer,
+        messages,
+        (list[ChatMessage](), list[Content](), list[str]()),
     )
     # if the last message was a tool result, we may need to flush the pending stuff here
     return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
@@ -1265,9 +1266,10 @@ def tool_result_images_reducer(
         and isinstance(message.content, list)
         and any([isinstance(c, ContentImage) for c in message.content])
     ):
-        init_accum: ImageContentAccumulator = ([], [])
         new_user_message_content, edited_tool_message_content = functools.reduce(
-            tool_result_image_content_reducer, message.content, init_accum
+            tool_result_image_content_reducer,
+            message.content,
+            (list[Content](), list[Content]()),
         )
         return (

inspect_ai/model/_openai_responses.py CHANGED Viewed

@@ -184,24 +184,23 @@ def openai_responses_chat_choices(
 # │ │ ┌───────────────────┐ │ │    │ │ ┌───────────────────┐ │ │    │ │ ┌───────────────────┐ │ │
 # │ │ │ type: "reasoning" │ │ │    │ │ │ ContentText       │ │ │    │ │ │ type: "reasoning" │ │ │
 # │ │ │ id: "rs_bbbbbb"   │ │ │    │ │ │ text: ""          │ │ │    │ │ │ id: "rs_bbbbbb"   │ │ │
-# │ │ │ summary: []       │ │ │    │ │ └───────────────────┘ │ │    │ │ │ summary: []       │ │ │
-# │ │ └───────────────────┘ │ │    │ │ ┌───────────────────┐ │ │    │ │ ┌───────────────────┐ │ │
-# │ │ ┌───────────────────┐ │ │    │ │ │ ContentText       │ │ │    │ │ │ type: "message"   │ │ │
-# │ │ │ type: "message"   │ │ │    │ │ │ text: "text1"     │ │ │    │ │ │ id: "msg_ccccccc" │ │ │
-# │ │ │ id: "msg_ccccccc" │ │ │    │ │ └───────────────────┘ │ │    │ │ │ role: "assistant" │ │ │
-# │ │ │ role: "assistant" │ │ │--->│ │ ┌───────────────────┐ │ │--->│ │ │ ┌───────────────┐ │ │ │
-# │ │ │ ┌───────────────┐ │ │ │    │ │ │ ContentText       │ │ │    │ │ │ │ Content       │ │ │ │
-# │ │ │ │ Content       │ │ │ │    │ │ │ text: "text2"     │ │ │    │ │ │ │ ┌───────────┐ │ │ │ │
-# │ │ │ │ ┌───────────┐ │ │ │ │    │ └───────────────────────┘ │    │ │ │ │ │"text1"    │ │ │ │ │
-# │ │ │ │ │"text1"    │ │ │ │ │    │ ┌───────────────────────┐ │    │ │ │ │ └───────────┘ │ │ │ │
-# │ │ │ │ └───────────┘ │ │ │ │    │ │ internal              │ │    │ │ │ │ ┌───────────┐ │ │ │ │
-# │ │ │ │ ┌───────────┐ │ │ │ │    │ │ ┌───────────────────┐ │ │    │ │ │ │ │ "text2"   │ │ │ │ │
-# │ │ │ │ │ "text2"   │ │ │ │ │    │ │ │ reasoning_id:     │ │ │    │ │ │ │ └───────────┘ │ │ │ │
-# │ │ │ │ └───────────┘ │ │ │ │    │ │ │ "rs_bbbbbb"       │ │ │    │ │ │ └───────────────┘ │ │ │
-# │ │ │ └───────────────┘ │ │ │    │ │ └───────────────────┘ │ │    │ │ └───────────────────┘ │ │
-# │ │ └───────────────────┘ │ │    │ │ ┌───────────────────┐ │ │    │ └───────────────────────┘ │
-# │ └───────────────────────┘ │    │ │ │ output_msg_id:    │ │ │    └───────────────────────────┘
-# └───────────────────────────┘    │ │ │ "msg_ccccccc"     │ │ │
+# │ │ │ summary: []       │ │ │    │ │ ├───────────────────┤ │ │    │ │ │ summary: []       │ │ │
+# │ │ ├───────────────────┤ │ │    │ │ │ ContentText       │ │ │    │ │ ├───────────────────┤ │ │
+# │ │ │ type: "message"   │ │ │    │ │ │ text: "text1"     │ │ │    │ │ │ type: "message"   │ │ │
+# │ │ │ id: "msg_ccccccc" │ │ │    │ │ ├───────────────────┤ │ │    │ │ │ id: "msg_ccccccc" │ │ │
+# │ │ │ role: "assistant" │ │ │    │ │ │ ContentText       │ │ │    │ │ │ role: "assistant" │ │ │
+# │ │ │ ┌───────────────┐ │ │ │ -> │ │ │ text: "text2"     │ │ │ -> │ │ │ ┌───────────────┐ │ │ │
+# │ │ │ │ Content       │ │ │ │    │ │ └───────────────────┘ │ │    │ │ │ │ Content       │ │ │ │
+# │ │ │ │ ┌───────────┐ │ │ │ │    │ └───────────────────────┘ │    │ │ │ │ ┌───────────┐ │ │ │ │
+# │ │ │ │ │"text1"    │ │ │ │ │    │ ┌───────────────────────┐ │    │ │ │ │ │"text1"    │ │ │ │ │
+# │ │ │ │ ├───────────┤ │ │ │ │    │ │ internal              │ │    │ │ │ │ ├───────────┤ │ │ │ │
+# │ │ │ │ │"text2"    │ │ │ │ │    │ │ ┌───────────────────┐ │ │    │ │ │ │ │"text2"    │ │ │ │ │
+# │ │ │ │ └───────────┘ │ │ │ │    │ │ │ reasoning_id:     │ │ │    │ │ │ │ └───────────┘ │ │ │ │
+# │ │ │ └───────────────┘ │ │ │    │ │ │ "rs_bbbbbb"       │ │ │    │ │ │ └───────────────┘ │ │ │
+# │ │ └───────────────────┘ │ │    │ │ └───────────────────┘ │ │    │ │ └───────────────────┘ │ │
+# │ └───────────────────────┘ │    │ │ ┌───────────────────┐ │ │    │ └───────────────────────┘ │
+# └───────────────────────────┘    │ │ │ output_msg_id:    │ │ │    └───────────────────────────┘
+#                                  │ │ │ "msg_ccccccc"     │ │ │
 #                                  │ │ └───────────────────┘ │ │
 #                                  │ └───────────────────────┘ │
 #                                  └───────────────────────────┘

inspect_ai/model/_providers/anthropic.py CHANGED Viewed

@@ -33,7 +33,10 @@ from anthropic.types import (
     ToolUseBlockParam,
     message_create_params,
 )
-from anthropic.types.beta import BetaToolComputerUse20250124Param
+from anthropic.types.beta import (
+    BetaToolComputerUse20250124Param,
+    BetaToolTextEditor20241022Param,
+)
 from pydantic import JsonValue
 from typing_extensions import override
@@ -218,6 +221,8 @@ class AnthropicAPI(ModelAPI):
                 # tools are generally available for Claude 3.5 Sonnet (new) as well and
                 # can be used without the computer use beta header.
                 betas.append("computer-use-2025-01-24")
+            if any("20241022" in str(tool.get("type", "")) for tool in tools_param):
+                betas.append("computer-use-2024-10-22")
             if len(betas) > 0:
                 extra_headers["anthropic-beta"] = ",".join(betas)
@@ -337,6 +342,15 @@ class AnthropicAPI(ModelAPI):
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, APIStatusError):
+            # for unknown reasons, anthropic does not always set status_code == 529
+            # for "overloaded_error" so we check for it explicitly
+            if (
+                isinstance(ex.body, dict)
+                and ex.body.get("error", {}).get("type", "") == "overloaded_error"
+            ):
+                return True
+            # standard http status code checking
             return is_retryable_http_status(ex.status_code)
         elif httpx_should_retry(ex):
             return True
@@ -545,7 +559,7 @@ class AnthropicAPI(ModelAPI):
     def text_editor_tool_param(
         self, tool: ToolInfo
-    ) -> Optional[ToolTextEditor20250124Param]:
+    ) -> ToolTextEditor20250124Param | BetaToolTextEditor20241022Param | None:
         # check for compatible 'text editor' tool
         if tool.name == "text_editor" and (
             sorted(tool.parameters.properties.keys())
@@ -561,8 +575,14 @@ class AnthropicAPI(ModelAPI):
                 ]
             )
         ):
-            return ToolTextEditor20250124Param(
-                type="text_editor_20250124", name="str_replace_editor"
+            return (
+                BetaToolTextEditor20241022Param(
+                    type="text_editor_20241022", name="str_replace_editor"
+                )
+                if self.is_claude_3_5()
+                else ToolTextEditor20250124Param(
+                    type="text_editor_20250124", name="str_replace_editor"
+                )
             )
         # not a text_editor tool
         else:
@@ -571,7 +591,10 @@ class AnthropicAPI(ModelAPI):
 # tools can be either a stock tool param or a special Anthropic native use tool param
 ToolParamDef = (
-    ToolParam | BetaToolComputerUse20250124Param | ToolTextEditor20250124Param
+    ToolParam
+    | BetaToolComputerUse20250124Param
+    | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
 )
@@ -580,6 +603,7 @@ def add_cache_control(
     | ToolParam
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
     | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
@@ -844,6 +868,7 @@ def _names_for_tool_call(
     """
     mappings = (
         (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
+        ("str_replace_editor", "text_editor_20241022", "text_editor"),
         ("str_replace_editor", "text_editor_20250124", "text_editor"),
         ("bash", "bash_20250124", "bash_session"),
     )

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -281,7 +281,7 @@ def none() -> type[ModelAPI]:
 def validate_openai_client(feature: str) -> None:
     FEATURE = feature
     PACKAGE = "openai"
-    MIN_VERSION = "1.75.0"
+    MIN_VERSION = "1.78.0"
     # verify we have the package
     try:

inspect_ai/solver/_multiple_choice.py CHANGED Viewed

@@ -200,6 +200,7 @@ def multiple_choice(
     template: str | None = None,
     cot: bool = False,
     multiple_correct: bool = False,
+    max_tokens: int | None = None,
     **kwargs: Unpack[DeprecatedArgs],
 ) -> Solver:
     """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
@@ -226,6 +227,8 @@ def multiple_choice(
         squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
         as `False` if there's exactly one correct answer from the choices
         available. NOTE: this has no effect if you provide a custom template.
+      max_tokens: Default `None`. Controls the number of tokens generated through the call
+        to generate().
       **kwargs (Any): Deprecated arguments for backward compatibility.
     #### Shuffling
@@ -282,7 +285,7 @@ def multiple_choice(
             template=str(template),
         )
-        state = await generate(state)
+        state = await generate(state, max_tokens=max_tokens)
         answers = parse_answers(state)
         if answers and answers.group(1):

inspect_ai/solver/_task_state.py CHANGED Viewed

@@ -138,7 +138,7 @@ class TaskState:
     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
     The `TaskState` is passed to and returned from each solver during a sample's
-    evaluation. It allows us to manipulated the message history, the tools
+    evaluation. It allows us to maintain the manipulated message history, the tools
     available to the model, the final output of the model, and whether the task
     is completed or has hit a limit.
     """
@@ -204,13 +204,17 @@ class TaskState:
         Convenience function for accessing the initial input from the `Sample` as a string.
         If the `input` is a `list[ChatMessage]`, this will return the text from
-        the first chat message
+        the last chat message
         """
         if isinstance(self._input, str):
             return self._input
         else:
             input = next(
-                (message.text for message in self._input if message.role == "user"),
+                (
+                    message.text
+                    for message in reversed(self._input)
+                    if message.role == "user"
+                ),
                 None,
             )
             if input:
@@ -231,7 +235,7 @@ class TaskState:
         write access to the user chat prompt. Raises an
         exception if there is no user prompt
         """
-        prompt = next((m for m in self.messages if m.role == "user"), None)
+        prompt = next((m for m in reversed(self.messages) if m.role == "user"), None)
         if prompt:
             return prompt
         else:

inspect_ai/tool/_mcp/_context.py CHANGED Viewed

@@ -2,13 +2,11 @@ from contextlib import _AsyncGeneratorContextManager
 from typing import TypeAlias
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
-from mcp.types import (
-    JSONRPCMessage,
-)
+from mcp.shared.message import SessionMessage
 MCPServerContext: TypeAlias = _AsyncGeneratorContextManager[
     tuple[
-        MemoryObjectReceiveStream[JSONRPCMessage | Exception],
-        MemoryObjectSendStream[JSONRPCMessage],
+        MemoryObjectReceiveStream[SessionMessage | Exception],
+        MemoryObjectSendStream[SessionMessage],
     ],
 ]

inspect_ai/tool/_mcp/_sandbox.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import TextIO
 import anyio
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
 from mcp import JSONRPCRequest, StdioServerParameters
+from mcp.shared.message import SessionMessage
 from mcp.types import JSONRPCMessage, JSONRPCNotification
 from inspect_ai.tool._tool_support_helpers import (
@@ -36,12 +37,12 @@ async def sandbox_client(  # type: ignore
     )
     # read_stream is remote process's stdout
-    read_stream: MemoryObjectReceiveStream[JSONRPCMessage | Exception]
-    read_stream_writer: MemoryObjectSendStream[JSONRPCMessage | Exception]
+    read_stream: MemoryObjectReceiveStream[SessionMessage | Exception]
+    read_stream_writer: MemoryObjectSendStream[SessionMessage | Exception]
     # write_stream is remote process's stdin
-    write_stream: MemoryObjectSendStream[JSONRPCMessage]
-    write_stream_reader: MemoryObjectReceiveStream[JSONRPCMessage]
+    write_stream: MemoryObjectSendStream[SessionMessage]
+    write_stream_reader: MemoryObjectReceiveStream[SessionMessage]
     read_stream_writer, read_stream = anyio.create_memory_object_stream(0)
     write_stream, write_stream_reader = anyio.create_memory_object_stream(0)
@@ -64,18 +65,20 @@ async def sandbox_client(  # type: ignore
             async with write_stream_reader:
                 # This reads messages until the stream is closed
                 async for message in write_stream_reader:
-                    root = message.root
+                    root = message.message.root
                     if isinstance(root, JSONRPCRequest):
                         await read_stream_writer.send(
-                            await exec_model_request(
-                                sandbox=sandbox_environment,
-                                method="mcp_send_request",
-                                params={
-                                    "session_id": session_id,
-                                    "request": root.model_dump(),
-                                },
-                                result_type=JSONRPCMessage,
-                                timeout=timeout,
+                            SessionMessage(
+                                message=await exec_model_request(
+                                    sandbox=sandbox_environment,
+                                    method="mcp_send_request",
+                                    params={
+                                        "session_id": session_id,
+                                        "request": root.model_dump(),
+                                    },
+                                    result_type=JSONRPCMessage,
+                                    timeout=timeout,
+                                )
                             )
                         )
                     elif isinstance(root, JSONRPCNotification):

inspect_ai/tool/_mcp/server.py CHANGED Viewed

@@ -102,7 +102,7 @@ def mcp_server_sandbox(
 def verfify_mcp_package() -> None:
     FEATURE = "MCP tools"
     PACKAGE = "mcp"
-    MIN_VERSION = "1.6.0"
+    MIN_VERSION = "1.8.0"
     # verify we have the package
     try:

inspect_ai/tool/_tools/_think.py CHANGED Viewed

@@ -41,7 +41,7 @@ def think(
 def think_tool_viewer() -> ToolCallViewer:
     def viewer(tool_call: ToolCall) -> ToolCallView:
         call = ToolCallContent(
-            format="markdown", content=tool_call.arguments["thought"]
+            format="markdown", content=tool_call.arguments.get("thought", "")
         )
         return ToolCallView(call=call)

inspect_ai/tool/_tools/_web_search/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from ._web_search import web_search
+__all__ = ["web_search"]

inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} RENAMED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Literal, Protocol, runtime_checkable
+from typing import Awaitable, Callable
 import anyio
 import httpx
@@ -16,8 +16,6 @@ from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.httpx import httpx_should_retry, log_httpx_retry_attempt
 from inspect_ai.util._concurrency import concurrency
-from .._tool import Tool, ToolResult, tool
 DEFAULT_RELEVANCE_PROMPT = """I am trying to answer the following question and need to find the most relevant information on the web. Please let me know if the following content is relevant to the question or not. You should just respond with "yes" or "no".
 Question: {question}
@@ -31,59 +29,35 @@ class SearchLink:
         self.snippet = snippet
-@runtime_checkable
-class SearchProvider(Protocol):
-    async def __call__(self, query: str, start_idx: int) -> list[SearchLink]: ...
-@tool
-def web_search(
-    provider: Literal["google"] = "google",
-    num_results: int = 3,
-    max_provider_calls: int = 3,
-    max_connections: int = 10,
-    model: str | None = None,
-) -> Tool:
-    """Web search tool.
-    A tool that can be registered for use by models to search the web. Use
-    the `use_tools()` solver to make the tool available (e.g. `use_tools(web_search())`))
-    A web search is conducted using the specified provider, the results are parsed for relevance
-    using the specified model, and the top 'num_results' relevant pages are returned.
-    See further documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-web-search>.
-    Args:
-      provider: Search provider (defaults to "google", currently
-        the only provider). Possible future providers include "brave" and "bing".
-      num_results: Number of web search result pages to return to the model.
-      max_provider_calls: Maximum number of search calls to make to the search provider.
-      max_connections: Maximum number of concurrent connections to API
-        endpoint of search provider.
-      model: Model used to parse web pages for relevance.
+def maybe_get_google_api_keys() -> tuple[str, str] | None:
+    """
+    Get Google API keys from environment variables.
     Returns:
-       A tool that can be registered for use by models to search the web.
+        tuple: A tuple containing the Google API key and the Google CSE ID.
     """
-    # get search client
-    client = httpx.AsyncClient()
+    google_api_key = os.environ.get("GOOGLE_CSE_API_KEY", None)
+    google_cse_id = os.environ.get("GOOGLE_CSE_ID", None)
+    return (google_api_key, google_cse_id) if google_api_key and google_cse_id else None
-    if provider == "google":
-        search_provider = google_search_provider(client)
-    else:
-        raise ValueError(
-            f"Provider {provider} not supported. Only 'google' is supported."
+def google_search_provider(
+    num_results: int,
+    max_provider_calls: int,
+    max_connections: int,
+    model: str | None,
+) -> Callable[[str], Awaitable[str | None]]:
+    keys = maybe_get_google_api_keys()
+    if not keys:
+        raise PrerequisiteError(
+            "GOOGLE_CSE_ID and/or GOOGLE_CSE_API_KEY not set in the environment. Please ensure these variables are defined to use Google Custom Search with the web_search tool.\n\nLearn more about the Google web search provider at https://inspect.aisi.org.uk/tools.html#google-provider"
         )
+    google_api_key, google_cse_id = keys
-    # resolve provider (only google for now)
-    async def execute(query: str) -> ToolResult:
-        """
-        Use the web_search tool to perform keyword searches of the web.
+    # Create the client within the provider
+    client = httpx.AsyncClient()
-        Args:
-            query (str): Search query.
-        """
+    async def search(query: str) -> str | None:
         # limit number of concurrent searches
         page_contents: list[str] = []
         urls: list[str] = []
@@ -92,8 +66,8 @@ def web_search(
         # Paginate through search results until we have successfully extracted num_results pages or we have reached max_provider_calls
         while len(page_contents) < num_results and search_calls < max_provider_calls:
-            async with concurrency(f"{provider}_web_search", max_connections):
-                links = await search_provider(query, start_idx=search_calls * 10)
+            async with concurrency("google_web_search", max_connections):
+                links = await _search(query, start_idx=search_calls * 10)
             async with anyio.create_task_group() as tg:
@@ -114,19 +88,39 @@ def web_search(
             search_calls += 1
         all_page_contents = "\n\n".join(page_contents)
-        if all_page_contents == "":
-            response: ToolResult = (
-                "I'm sorry, I couldn't find any relevant information on the web."
-            )
-        else:
-            response = (
-                "Here are your web search results. Please read them carefully as they may be useful later! "
-                + all_page_contents
-            )
+        return None if all_page_contents == "" else all_page_contents
-        return response
+    async def _search(query: str, start_idx: int) -> list[SearchLink]:
+        # List of allowed parameters can be found https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list
+        search_params = {
+            "q": query,
+            "key": google_api_key,
+            "cx": google_cse_id,
+            "start": start_idx,
+        }
+        search_url = "https://www.googleapis.com/customsearch/v1?" + "&".join(
+            [f"{key}={value}" for key, value in search_params.items()]
+        )
-    return execute
+        # retry up to 5 times over a period of up to 1 minute
+        @retry(
+            wait=wait_exponential_jitter(),
+            stop=stop_after_attempt(5) | stop_after_delay(60),
+            retry=retry_if_exception(httpx_should_retry),
+            before_sleep=log_httpx_retry_attempt(search_url),
+        )
+        async def execute_search() -> httpx.Response:
+            return await client.get(search_url)
+        result = await execute_search()
+        data = result.json()
+        if "items" in data:
+            return [SearchLink(item["link"], item["snippet"]) for item in data["items"]]
+        else:
+            return []
+    return search
 async def page_if_relevant(
@@ -183,44 +177,3 @@ async def page_if_relevant(
         return full_text
     else:
         return None
-def google_search_provider(client: httpx.AsyncClient) -> SearchProvider:
-    google_api_key = os.environ.get("GOOGLE_CSE_API_KEY", None)
-    google_cse_id = os.environ.get("GOOGLE_CSE_ID", None)
-    if not google_api_key or not google_cse_id:
-        raise PrerequisiteError(
-            "GOOGLE_CSE_ID and/or GOOGLE_CSE_API_KEY not set in the environment. Please ensure these variables are defined to use Google Custom Search with the web_search tool.\n\nLearn more about the Google web search provider at https://inspect.aisi.org.uk/tools.html#google-provider"
-        )
-    async def search(query: str, start_idx: int) -> list[SearchLink]:
-        # List of allowed parameters can be found https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list
-        search_params = {
-            "q": query,
-            "key": google_api_key,
-            "cx": google_cse_id,
-            "start": start_idx,
-        }
-        search_url = "https://www.googleapis.com/customsearch/v1?" + "&".join(
-            [f"{key}={value}" for key, value in search_params.items()]
-        )
-        # retry up to 5 times over a period of up to 1 minute
-        @retry(
-            wait=wait_exponential_jitter(),
-            stop=stop_after_attempt(5) | stop_after_delay(60),
-            retry=retry_if_exception(httpx_should_retry),
-            before_sleep=log_httpx_retry_attempt(search_url),
-        )
-        async def execute_search() -> httpx.Response:
-            return await client.get(search_url)
-        result = await execute_search()
-        data = result.json()
-        if "items" in data:
-            return [SearchLink(item["link"], item["snippet"]) for item in data["items"]]
-        else:
-            return []
-    return search

inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl

inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl