PyPI - inspect-ai - Versions diffs - 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl - Mend

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (149) hide show

inspect_ai/_cli/eval.py +27 -0
inspect_ai/_display/textual/widgets/samples.py +3 -3
inspect_ai/_display/textual/widgets/transcript.py +3 -29
inspect_ai/_eval/eval.py +19 -2
inspect_ai/_eval/evalset.py +4 -1
inspect_ai/_eval/run.py +41 -0
inspect_ai/_eval/task/generate.py +38 -44
inspect_ai/_eval/task/log.py +26 -28
inspect_ai/_eval/task/run.py +23 -27
inspect_ai/_util/answer.py +26 -0
inspect_ai/_util/constants.py +0 -1
inspect_ai/_util/local_server.py +398 -0
inspect_ai/_util/working.py +10 -4
inspect_ai/_view/www/dist/assets/index.css +173 -159
inspect_ai/_view/www/dist/assets/index.js +1417 -1142
inspect_ai/_view/www/log-schema.json +379 -3
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/@types/log.d.ts +93 -14
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
inspect_ai/_view/www/src/components/Card.css +0 -1
inspect_ai/_view/www/src/constants.ts +2 -0
inspect_ai/_view/www/src/utils/numeric.ts +17 -0
inspect_ai/agent/_agent.py +3 -3
inspect_ai/agent/_as_solver.py +22 -12
inspect_ai/agent/_as_tool.py +20 -6
inspect_ai/agent/_handoff.py +12 -1
inspect_ai/agent/_react.py +4 -3
inspect_ai/agent/_run.py +16 -3
inspect_ai/agent/_types.py +9 -0
inspect_ai/dataset/_dataset.py +6 -3
inspect_ai/log/__init__.py +14 -0
inspect_ai/log/_convert.py +4 -9
inspect_ai/log/_file.py +56 -0
inspect_ai/log/_log.py +99 -0
inspect_ai/log/_recorders/__init__.py +2 -0
inspect_ai/log/_recorders/buffer/database.py +12 -11
inspect_ai/log/_recorders/buffer/filestore.py +2 -2
inspect_ai/log/_recorders/buffer/types.py +2 -2
inspect_ai/log/_recorders/eval.py +20 -65
inspect_ai/log/_recorders/file.py +28 -6
inspect_ai/log/_recorders/recorder.py +7 -0
inspect_ai/log/_recorders/types.py +1 -23
inspect_ai/log/_samples.py +14 -25
inspect_ai/log/_transcript.py +84 -36
inspect_ai/log/_tree.py +118 -0
inspect_ai/log/_util.py +52 -0
inspect_ai/model/__init__.py +5 -1
inspect_ai/model/_call_tools.py +72 -44
inspect_ai/model/_generate_config.py +14 -8
inspect_ai/model/_model.py +66 -88
inspect_ai/model/_model_output.py +25 -0
inspect_ai/model/_openai.py +2 -0
inspect_ai/model/_providers/anthropic.py +13 -23
inspect_ai/model/_providers/hf.py +27 -1
inspect_ai/model/_providers/openai_o1.py +8 -2
inspect_ai/model/_providers/providers.py +18 -4
inspect_ai/model/_providers/sglang.py +247 -0
inspect_ai/model/_providers/vllm.py +211 -400
inspect_ai/scorer/_choice.py +1 -2
inspect_ai/solver/__init__.py +7 -2
inspect_ai/solver/_basic_agent.py +3 -10
inspect_ai/solver/_chain.py +1 -1
inspect_ai/solver/_fork.py +1 -1
inspect_ai/solver/_multiple_choice.py +5 -22
inspect_ai/solver/_plan.py +2 -2
inspect_ai/solver/_task_state.py +26 -88
inspect_ai/solver/_transcript.py +6 -7
inspect_ai/tool/_json_rpc_helpers.py +45 -17
inspect_ai/tool/_mcp/_mcp.py +8 -5
inspect_ai/tool/_mcp/_sandbox.py +8 -2
inspect_ai/tool/_mcp/server.py +3 -1
inspect_ai/tool/_tool_call.py +4 -1
inspect_ai/tool/_tool_support_helpers.py +51 -12
inspect_ai/tool/_tools/_bash_session.py +190 -68
inspect_ai/tool/_tools/_computer/_computer.py +25 -1
inspect_ai/tool/_tools/_execute.py +4 -1
inspect_ai/tool/_tools/_text_editor.py +4 -3
inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
inspect_ai/util/__init__.py +16 -0
inspect_ai/util/_anyio.py +11 -0
inspect_ai/util/_collect.py +50 -0
inspect_ai/util/_limit.py +393 -0
inspect_ai/util/_limited_conversation.py +57 -0
inspect_ai/util/_span.py +58 -0
inspect_ai/util/_subtask.py +27 -42
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
inspect_ai/_display/core/group.py +0 -79
inspect_ai/solver/_limit.py +0 -39
inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_computer/test_args.py +0 -151
/inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0

inspect_ai/solver/_basic_agent.py CHANGED Viewed

@@ -13,8 +13,8 @@ from inspect_ai.scorer._score import score
 from inspect_ai.solver._chain import chain
 from inspect_ai.tool._tool import Tool, ToolResult, tool
 from inspect_ai.tool._tool_with import tool_with
+from inspect_ai.util._limit import token_limit as create_token_limit
-from ._limit import SampleLimitExceededError
 from ._prompt import system_message
 from ._solver import Generate, Solver, solver
 from ._task_state import TaskState
@@ -172,14 +172,11 @@ def basic_agent(
             # (if there is no message_limit then default to 50)
             state.message_limit = message_limit or state.message_limit or 50
-            # resolve token limit
-            state.token_limit = token_limit or state.token_limit
             # track attempts
             attempts = 0
-            try:
-                # main loop (state.completed checks message_limit and token_limit)
+            with create_token_limit(token_limit):
+                # main loop
                 while not state.completed:
                     # generate output and append assistant message
                     state.output = await get_model().generate(
@@ -247,10 +244,6 @@ def basic_agent(
                     else:
                         state.messages.append(ChatMessageUser(content=continue_message))
-            # propagate current state along with sample limit exceeded
-            except SampleLimitExceededError as ex:
-                raise ex.with_state(state)
             return state
         return solve

inspect_ai/solver/_chain.py CHANGED Viewed

@@ -82,7 +82,7 @@ class Chain(Sequence[Solver], Solver):
         from ._transcript import solver_transcript
         for slv in self._solvers:
-            with solver_transcript(slv, state) as st:
+            async with solver_transcript(slv, state) as st:
                 state = await slv(state, generate)
                 st.complete(state)
             if state.completed:

inspect_ai/solver/_fork.py CHANGED Viewed

@@ -73,7 +73,7 @@ async def solver_subtask(state: TaskState, solver: Solver) -> TaskState:
     @subtask(name=name, store=state.store, type="fork", input=input)  # type: ignore
     async def solve() -> TaskState:
         if not isinstance(solver, Chain):
-            with solver_transcript(solver, state) as st:
+            async with solver_transcript(solver, state) as st:
                 new_state = await solver(state, generate)
                 st.complete(new_state)
             return new_state

inspect_ai/solver/_multiple_choice.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Match, TypedDict
 from typing_extensions import Unpack
+from inspect_ai._util.answer import answer_character, answer_index
 from inspect_ai._util.logger import warn_once
 from inspect_ai.util import resource
@@ -64,31 +65,13 @@ def answer_options(choices: Choices) -> str:
     indexes = list(range(len(choices)))
     return "\n".join(
-        [f"{chr(65 + i)}) {choices[j].value}" for i, j in enumerate(indexes)]
+        [f"{answer_character(i)}) {choices[j].value}" for i, j in enumerate(indexes)]
     )
-def answer_character(index: int) -> str:
-    r"""
-    Helper to go from array index to char, for example:
-        0 -> 'A', 1 -> 'B', etc
-    """
-    return chr(ord("A") + index)
-def answer_index(char: str) -> int:
-    r"""
-    Helper to go from char to array index, for example:
-        'A' -> 0, 'B' -> 1, etc
-    """
-    return ord(char.upper()) - ord("A")
 def prompt(question: str, choices: Choices, template: str) -> str:
     choices_text = answer_options(choices)
-    letters = ",".join(chr(65 + i) for i in range(len(choices)))
+    letters = ",".join(answer_character(i) for i in range(len(choices)))
     return template.format(
         choices=choices_text,
@@ -112,7 +95,7 @@ def parse_answers(state: TaskState) -> Match[str] | None:
     # In this case, we're looking for a single line which contains the expected
     # ANSWER: B,C string with only whitespace after it
     match = re.search(
-        r"(?i)^ANSWER\s*:\s*([A-Za-z ,]+)\s*(?:$|\n)",
+        r"(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n)",
         state.output.completion,
         flags=re.MULTILINE,
     )
@@ -121,7 +104,7 @@ def parse_answers(state: TaskState) -> Match[str] | None:
     # version for backward compatibility
     if match is None:
         return re.search(
-            r"(?i)ANSWER\s*:\s*([A-Za-z ,]+)(?:[^\w]|\n|$)", state.output.completion
+            r"(?i)ANSWER\s*:\s*([A-Za-z\d ,]+)(?:[^\w]|\n|$)", state.output.completion
         )
     else:
         return match

inspect_ai/solver/_plan.py CHANGED Viewed

@@ -102,7 +102,7 @@ class Plan(Solver):
             # execute steps
             for index, solver in enumerate(self.steps):
                 # run solver
-                with solver_transcript(solver, state) as st:
+                async with solver_transcript(solver, state) as st:
                     state = await solver(state, generate)
                     st.complete(state)
@@ -113,7 +113,7 @@ class Plan(Solver):
             # execute finish
             if self.finish:
-                with solver_transcript(self.finish, state) as st:
+                async with solver_transcript(self.finish, state) as st:
                     state = await self.finish(state, generate)
                     st.complete(state)
                 check_sample_interrupt()

inspect_ai/solver/_task_state.py CHANGED Viewed

@@ -2,9 +2,8 @@ from collections.abc import Sequence
 from contextvars import ContextVar
 from copy import deepcopy
 from dataclasses import dataclass
-from itertools import tee
 from random import Random
-from typing import Any, Iterable, SupportsIndex, Type, Union, cast, overload
+from typing import Any, Type, Union, cast, overload
 from pydantic_core import to_jsonable_python
 from shortuuid import uuid
@@ -18,12 +17,18 @@ from inspect_ai.model import (
     ModelOutput,
 )
 from inspect_ai.model._call_tools import tools_info
-from inspect_ai.model._chat_message import ChatMessageBase
 from inspect_ai.model._model import sample_total_tokens
 from inspect_ai.scorer._metric import Score
 from inspect_ai.scorer._target import Target
 from inspect_ai.tool import Tool, ToolChoice
 from inspect_ai.tool._tool_def import ToolDef
+from inspect_ai.util._limit import (
+    check_message_limit,
+    check_token_limit,
+)
+from inspect_ai.util._limit import message_limit as create_message_limit
+from inspect_ai.util._limit import token_limit as create_token_limit
+from inspect_ai.util._limited_conversation import ChatMessageList
 from inspect_ai.util._store import Store, store_jsonable
 from inspect_ai.util._store_model import SMT
@@ -159,11 +164,11 @@ class TaskState:
         self._input = input
         self._target = target
         self._metadata = metadata
-        self._messages: list[ChatMessage] = ChatMessageList(messages, self)
+        self._messages: list[ChatMessage] = ChatMessageList(messages)
         self._tools: list[Tool] = []
         self._output = output if output else ModelOutput(model=str(model))
-        self._message_limit = message_limit
-        self._token_limit = token_limit
+        self._message_limit = create_message_limit(message_limit)
+        self._token_limit = create_token_limit(token_limit)
         self._completed = completed
         self._store = Store()
         self._uuid = uuid()
@@ -254,7 +259,7 @@ class TaskState:
     @messages.setter
     def messages(self, messages: list[ChatMessage]) -> None:
-        self._messages = ChatMessageList(messages, self)
+        self._messages = ChatMessageList(messages)
     @property
     def output(self) -> ModelOutput:
@@ -302,12 +307,16 @@ class TaskState:
     @property
     def message_limit(self) -> int | None:
         """Limit on total messages allowed per conversation."""
-        return self._message_limit
+        return self._message_limit.limit
     @message_limit.setter
     def message_limit(self, messages: int | None) -> None:
-        """Set limit on total messages allowed per conversation."""
-        self._message_limit = messages
+        """Set limit on total messages allowed per conversation.
+        Also checks whether the current message count exceeds the new limit.
+        """
+        self._message_limit.limit = messages
+        check_message_limit(len(self.messages), raise_for_equal=False)
         from inspect_ai.log._samples import set_active_sample_message_limit
@@ -316,12 +325,16 @@ class TaskState:
     @property
     def token_limit(self) -> int | None:
         """Limit on total tokens allowed per conversation."""
-        return self._token_limit
+        return self._token_limit.limit
     @token_limit.setter
     def token_limit(self, tokens: int | None) -> None:
-        """Set limit on total tokens allowed per conversation."""
-        self._token_limit = tokens
+        """Set limit on total tokens allowed per conversation.
+        Also checks whether the current token usage exceeds the new limit.
+        """
+        self._token_limit.limit = tokens
+        check_token_limit()
         from inspect_ai.log._samples import set_active_sample_token_limit
@@ -340,24 +353,11 @@ class TaskState:
         """
         from inspect_ai.log._samples import set_active_sample_total_messages
-        from ._limit import SampleLimitExceededError
         # update messages
         set_active_sample_total_messages(len(self.messages))
         if self._completed:
             return True
-        elif self.message_limit and len(self.messages) >= self.message_limit:
-            raise SampleLimitExceededError(
-                "message",
-                value=len(self.messages),
-                limit=self.message_limit,
-                state=self,
-            )
-        elif self.token_limit and self.token_usage >= self.token_limit:
-            raise SampleLimitExceededError(
-                "token", value=self.token_usage, limit=self.token_limit, state=self
-            )
         else:
             check_sample_interrupt()
             return self._completed
@@ -445,65 +445,3 @@ def state_jsonable(state: TaskState | None = None) -> dict[str, Any]:
 def sample_jsonable(sample: Sample) -> dict[str, Any]:
     jsonable = to_jsonable_python(sample, exclude_none=True, fallback=lambda _x: None)
     return cast(dict[str, Any], deepcopy(jsonable))
-class ChatMessageList(list[ChatMessage]):
-    def __init__(self, iterable: Iterable[ChatMessage], parent_state: TaskState):
-        self.parent_state = parent_state
-        items, length = self._iterable_length(iterable)
-        self._check_size(length)
-        super().__init__(items)
-    def _check_size(self, additional_items: int = 1) -> None:
-        from inspect_ai.log._samples import active_sample_message_limit
-        from ._limit import SampleLimitExceededError
-        messages_limit = active_sample_message_limit()
-        if messages_limit is not None:
-            messages = len(self) + additional_items
-            if messages > messages_limit:
-                raise SampleLimitExceededError(
-                    "message",
-                    value=messages,
-                    limit=messages_limit,
-                    message=None,
-                    state=self.parent_state,
-                )
-    def append(self, item: ChatMessage) -> None:
-        self._check_size()
-        super().append(item)
-    def extend(self, items: Iterable[ChatMessage]) -> None:
-        items, length = self._iterable_length(items)
-        self._check_size(length)
-        super().extend(items)
-    def insert(self, index: SupportsIndex, item: ChatMessage) -> None:
-        self._check_size()
-        super().insert(index, item)
-    @overload
-    def __setitem__(self, index: SupportsIndex, item: ChatMessage) -> None: ...
-    @overload
-    def __setitem__(self, index: slice, item: Iterable[ChatMessage]) -> None: ...
-    def __setitem__(
-        self, index: SupportsIndex | slice, item: ChatMessage | Iterable[ChatMessage]
-    ) -> None:
-        if isinstance(index, slice) and not isinstance(item, ChatMessageBase):
-            item, length = self._iterable_length(item)
-            size_change = length - len(self[index])
-            if size_change > 0:
-                self._check_size(size_change)
-        super().__setitem__(index, item)  # type: ignore[assignment,index]
-    def _iterable_length(
-        self, items: Iterable[ChatMessage]
-    ) -> tuple[Iterable[ChatMessage], int]:
-        items, counter = tee(items)
-        length = sum(1 for _ in counter)
-        return items, length

inspect_ai/solver/_transcript.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import contextlib
-from typing import Iterator
+from typing import AsyncIterator
 from inspect_ai._util.json import json_changes
 from inspect_ai._util.registry import registry_log_name
+from inspect_ai.util._span import span
 from ._solver import Solver
 from ._task_state import TaskState, state_jsonable
@@ -22,12 +23,10 @@ class SolverTranscript:
             transcript()._event(StateEvent(changes=changes))
-@contextlib.contextmanager
-def solver_transcript(
+@contextlib.asynccontextmanager
+async def solver_transcript(
     solver: Solver, state: TaskState, name: str | None = None
-) -> Iterator[SolverTranscript]:
-    from inspect_ai.log._transcript import transcript
+) -> AsyncIterator[SolverTranscript]:
     name = registry_log_name(name or solver)
-    with transcript().step(name=name, type="solver"):
+    async with span(name=name, type="solver"):
         yield SolverTranscript(name, state)

inspect_ai/tool/_json_rpc_helpers.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Literal, Protocol, Type, TypeAlias, TypeVar
 from pydantic import BaseModel, RootModel
-from inspect_ai.tool._tool import ToolError
+from inspect_ai.tool._tool import ToolError, ToolParsingError
 class JSONRPCResponseBase(BaseModel):
@@ -70,6 +70,7 @@ async def exec_scalar_request(
     params: JSONRPCParamsType,
     result_type: Type[ScalarT],
     transport: JSONRPCTransport,
+    server_error_mapper: JSONRPCServerErrorMapper,
 ) -> ScalarT:
     """
     Execute a JSON-RPC command expecting a scalar result.
@@ -79,6 +80,7 @@ async def exec_scalar_request(
       params (JSONRPCParamsType): The parameters for the JSON-RPC method.
       result_type (Type[ScalarT]): The scalar type (str, int, float, bool, None) to validate the result against.
       transport (JSONRPCTransport): The transport callable to use for the RPC communication.
+      server_error_mapper (JSONRPCServerErrorMapper): A callable to map server specific JSON-RPC errors to exceptions.
     Returns:
       ScalarT: The scalar result of the JSON-RPC call.
@@ -88,7 +90,12 @@ async def exec_scalar_request(
       ToolParsingError: If the JSON-RPC response contains a specific error code indicating a parsing error.
       ValueError: If the result is not of the expected scalar type.
     """
-    rpc_result = await _exec_request(method=method, params=params, transport=transport)
+    rpc_result = await _exec_request(
+        method=method,
+        params=params,
+        transport=transport,
+        server_error_mapper=server_error_mapper,
+    )
     if (result_type is type(None) and rpc_result is not None) or not isinstance(
         rpc_result, result_type
     ):
@@ -101,6 +108,7 @@ async def exec_model_request(
     params: JSONRPCParamsType,
     result_type: Type[BaseModelT],
     transport: JSONRPCTransport,
+    server_error_mapper: JSONRPCServerErrorMapper | None = None,
 ) -> BaseModelT:
     """
     Execute a JSON-RPC command to a sandbox environment expecting a model result.
@@ -110,6 +118,7 @@ async def exec_model_request(
       params (JSONRPCParamsType): The parameters for the JSON-RPC method.
       result_type (Type[BaseModelT]): The Pydantic model class to validate and parse the result.
       transport (JSONRPCTransport): The transport callable to use for the RPC communication.
+      server_error_mapper (JSONRPCServerErrorMapper): A callable to map server specific JSON-RPC errors to exceptions.
     Returns:
       BaseModelT: The parsed and validated result of the JSON-RPC call.
@@ -119,7 +128,12 @@ async def exec_model_request(
       ToolParsingError: If the JSON-RPC response contains a specific error code indicating a parsing error.
       ValueError: If the result cannot be validated against the provided model class.
     """
-    rpc_result = await _exec_request(method=method, params=params, transport=transport)
+    rpc_result = await _exec_request(
+        method=method,
+        params=params,
+        transport=transport,
+        server_error_mapper=server_error_mapper,
+    )
     return result_type.model_validate(rpc_result, strict=True)
@@ -161,6 +175,7 @@ async def _exec_request(
     method: str,
     params: JSONRPCParamsType,
     transport: JSONRPCTransport,
+    server_error_mapper: JSONRPCServerErrorMapper | None = None,
 ) -> object:
     """Execute a request using the provided transport mechanism."""
     return parse_json_rpc_response(
@@ -171,6 +186,7 @@ async def _exec_request(
         ),
         method,
         params,
+        server_error_mapper,
     )
@@ -178,15 +194,16 @@ def parse_json_rpc_response(
     response_str: str,
     method: str,
     params: JSONRPCParamsType,
+    server_error_mapper: JSONRPCServerErrorMapper | None = None,
 ) -> object:
     """Validates the JSON RPC response and returns the result or raises a proper Inspect error."""
     match JSONRPCResponse.model_validate_json(response_str).root:
         case JSONRPCSuccessResponse(result=rpc_result):
             return rpc_result
-        case JSONRPCErrorResponse(
-            error=JSONRPCError(code=code, message=message, data=_)
-        ):
-            raise exception_for_rpc_response_error(code, message, method, params)
+        case JSONRPCErrorResponse(error=JSONRPCError(code=code, message=message)):
+            raise exception_for_rpc_response_error(
+                code, message, method, params, server_error_mapper
+            )
         case _:
             raise ValueError(
                 f"Unexpected JSON RPC response to request {_rpc_call_description(method, params)}: {response_str}"
@@ -220,16 +237,17 @@ def exception_for_rpc_response_error(
             if server_error_mapper
             else ToolError(message)
         )
+    elif code == -32602:  # (Invalid params)
+        # Even though the Inspect side does validation, it can't possibly be
+        # complete - especially for tools that have dynamic action dependant
+        # rules for optional/required params.
+        return ToolParsingError(message)
     elif code == -32603:
         return ToolError(message)
     else:
         # -32600 (Invalid Request)
         #   If we sent a bogus request, it's 100% a code bug.
         # -32601 (Method not found)
-        # -32602 (Invalid params)
-        #   These shouldn't be possible since Inspect did validation prior to
-        #   making the tool call. Because of that, these errors should not make
-        #   it back to the model, so choose RuntimeError.
         # -32700 (Parse error)
         #   shouldn't be seen in this flow since we're processing responses, and
         #   this is a request oriented error.
@@ -276,10 +294,20 @@ def create_json_rpc_request(
     is_notification: bool,
 ) -> str:
     return json.dumps(
-        {
-            "jsonrpc": "2.0",
-            "method": method,
-            **({"params": params} if params else {}),
-            **({"id": next(id_generator)} if not is_notification else {}),
-        }
+        remove_none_values(
+            {
+                "jsonrpc": "2.0",
+                "method": method,
+                **({"params": params} if params else {}),
+                **({"id": next(id_generator)} if not is_notification else {}),
+            }
+        )
     )
+def remove_none_values(obj: object) -> object:
+    if isinstance(obj, dict):
+        return {k: remove_none_values(v) for k, v in obj.items() if v is not None}
+    elif isinstance(obj, list):
+        return [remove_none_values(item) for item in obj if item is not None]
+    return obj

inspect_ai/tool/_mcp/_mcp.py CHANGED Viewed

@@ -61,16 +61,17 @@ class MCPServerImpl(MCPServer):
     ) -> list[Tool]:
         return await self._task_session()._list_tools(tools)
-    # create a separate MCPServer session per async task
-    _task_sessions: dict[int, "MCPServerSession"] = {}
+    # create a separate MCPServer session per async task / server name
+    _task_sessions: dict[str, "MCPServerSession"] = {}
     def _task_session(self) -> "MCPServerSession":
         task_id = anyio.get_current_task().id
-        if task_id not in self._task_sessions:
-            MCPServerImpl._task_sessions[task_id] = MCPServerSession(
+        session_key = f"{task_id}_{self._name}"
+        if session_key not in self._task_sessions:
+            MCPServerImpl._task_sessions[session_key] = MCPServerSession(
                 self._client, name=self._name, events=self._events
             )
-        return MCPServerImpl._task_sessions[task_id]
+        return MCPServerImpl._task_sessions[session_key]
 class MCPServerSession(MCPServer):
@@ -259,6 +260,7 @@ def create_server_sandbox(
     cwd: str | Path | None = None,
     env: dict[str, str] | None = None,
     sandbox: str | None = None,
+    timeout: int | None = None,
 ) -> MCPServer:
     # TODO: Confirm the lifetime concepts. By the time a request makes it to the
     # sandbox, it's going to need both a session id and a server "name".
@@ -272,6 +274,7 @@ def create_server_sandbox(
                 env=env,
             ),
             sandbox_name=sandbox,
+            timeout=timeout,
         ),
         name=name,
         events=False,

inspect_ai/tool/_mcp/_sandbox.py CHANGED Viewed

@@ -11,7 +11,7 @@ from inspect_ai.tool._tool_support_helpers import (
     exec_model_request,
     exec_notification,
     exec_scalar_request,
-    tool_container_sandbox,
+    tool_support_sandbox,
 )
 from ._context import MCPServerContext
@@ -28,8 +28,10 @@ async def sandbox_client(  # type: ignore
     *,
     sandbox_name: str | None = None,
     errlog: TextIO = sys.stderr,
+    timeout: int | None = None,  # default 180 seconds
 ) -> MCPServerContext:  # type: ignore
-    sandbox_environment = await tool_container_sandbox(
+    timeout = timeout or 180
+    (sandbox_environment, _) = await tool_support_sandbox(
         "mcp support", sandbox_name=sandbox_name
     )
@@ -49,6 +51,7 @@ async def sandbox_client(  # type: ignore
         method="mcp_launch_server",
         params={"server_params": server.model_dump()},
         result_type=int,
+        timeout=timeout,
     )
     async def stdout_reader() -> None:
@@ -72,6 +75,7 @@ async def sandbox_client(  # type: ignore
                                     "request": root.model_dump(),
                                 },
                                 result_type=JSONRPCMessage,
+                                timeout=timeout,
                             )
                         )
                     elif isinstance(root, JSONRPCNotification):
@@ -82,6 +86,7 @@ async def sandbox_client(  # type: ignore
                                 "session_id": session_id,
                                 "notification": root.model_dump(),
                             },
+                            timeout=timeout,
                         )
                     else:
                         assert False, f"Unexpected message type {message=}"
@@ -101,4 +106,5 @@ async def sandbox_client(  # type: ignore
                 method="mcp_kill_server",
                 params={"session_id": session_id},
                 result_type=type(None),
+                timeout=timeout,
             )

inspect_ai/tool/_mcp/server.py CHANGED Viewed

@@ -73,6 +73,7 @@ def mcp_server_sandbox(
     cwd: str | Path | None = None,
     env: dict[str, str] | None = None,
     sandbox: str | None = None,
+    timeout: int | None = None,
 ) -> MCPServer:
     """MCP Server (Sandbox).
@@ -87,6 +88,7 @@ def mcp_server_sandbox(
             "SHELL", "TERM", and "USER" for Posix-based systems).
         cwd: The working directory to use when spawning the process.
         sandbox: The sandbox to use when spawning the process.
+        timeout: Timeout (in seconds) for command.
     Returns:
         McpClient: Client for MCP Server
@@ -94,7 +96,7 @@ def mcp_server_sandbox(
     verfify_mcp_package()
     from ._mcp import create_server_sandbox
-    return create_server_sandbox(command, args, cwd, env, sandbox)
+    return create_server_sandbox(command, args, cwd, env, sandbox, timeout)
 def verfify_mcp_package() -> None:

inspect_ai/tool/_tool_call.py CHANGED Viewed

@@ -68,9 +68,12 @@ class ToolCallError:
         "permission",
         "file_not_found",
         "is_a_directory",
-        "output_limit",
+        "limit",
         "approval",
         "unknown",
+        # Retained for backward compatibility when loading logs created with an older
+        # version of inspect.
+        "output_limit",
     ]
     """Error type."""

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl