PyPI - inspect-ai - Versions diffs - 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl - Mend

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (149) hide show

inspect_ai/_cli/eval.py +27 -0
inspect_ai/_display/textual/widgets/samples.py +3 -3
inspect_ai/_display/textual/widgets/transcript.py +3 -29
inspect_ai/_eval/eval.py +19 -2
inspect_ai/_eval/evalset.py +4 -1
inspect_ai/_eval/run.py +41 -0
inspect_ai/_eval/task/generate.py +38 -44
inspect_ai/_eval/task/log.py +26 -28
inspect_ai/_eval/task/run.py +23 -27
inspect_ai/_util/answer.py +26 -0
inspect_ai/_util/constants.py +0 -1
inspect_ai/_util/local_server.py +398 -0
inspect_ai/_util/working.py +10 -4
inspect_ai/_view/www/dist/assets/index.css +173 -159
inspect_ai/_view/www/dist/assets/index.js +1417 -1142
inspect_ai/_view/www/log-schema.json +379 -3
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/@types/log.d.ts +93 -14
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
inspect_ai/_view/www/src/components/Card.css +0 -1
inspect_ai/_view/www/src/constants.ts +2 -0
inspect_ai/_view/www/src/utils/numeric.ts +17 -0
inspect_ai/agent/_agent.py +3 -3
inspect_ai/agent/_as_solver.py +22 -12
inspect_ai/agent/_as_tool.py +20 -6
inspect_ai/agent/_handoff.py +12 -1
inspect_ai/agent/_react.py +4 -3
inspect_ai/agent/_run.py +16 -3
inspect_ai/agent/_types.py +9 -0
inspect_ai/dataset/_dataset.py +6 -3
inspect_ai/log/__init__.py +14 -0
inspect_ai/log/_convert.py +4 -9
inspect_ai/log/_file.py +56 -0
inspect_ai/log/_log.py +99 -0
inspect_ai/log/_recorders/__init__.py +2 -0
inspect_ai/log/_recorders/buffer/database.py +12 -11
inspect_ai/log/_recorders/buffer/filestore.py +2 -2
inspect_ai/log/_recorders/buffer/types.py +2 -2
inspect_ai/log/_recorders/eval.py +20 -65
inspect_ai/log/_recorders/file.py +28 -6
inspect_ai/log/_recorders/recorder.py +7 -0
inspect_ai/log/_recorders/types.py +1 -23
inspect_ai/log/_samples.py +14 -25
inspect_ai/log/_transcript.py +84 -36
inspect_ai/log/_tree.py +118 -0
inspect_ai/log/_util.py +52 -0
inspect_ai/model/__init__.py +5 -1
inspect_ai/model/_call_tools.py +72 -44
inspect_ai/model/_generate_config.py +14 -8
inspect_ai/model/_model.py +66 -88
inspect_ai/model/_model_output.py +25 -0
inspect_ai/model/_openai.py +2 -0
inspect_ai/model/_providers/anthropic.py +13 -23
inspect_ai/model/_providers/hf.py +27 -1
inspect_ai/model/_providers/openai_o1.py +8 -2
inspect_ai/model/_providers/providers.py +18 -4
inspect_ai/model/_providers/sglang.py +247 -0
inspect_ai/model/_providers/vllm.py +211 -400
inspect_ai/scorer/_choice.py +1 -2
inspect_ai/solver/__init__.py +7 -2
inspect_ai/solver/_basic_agent.py +3 -10
inspect_ai/solver/_chain.py +1 -1
inspect_ai/solver/_fork.py +1 -1
inspect_ai/solver/_multiple_choice.py +5 -22
inspect_ai/solver/_plan.py +2 -2
inspect_ai/solver/_task_state.py +26 -88
inspect_ai/solver/_transcript.py +6 -7
inspect_ai/tool/_json_rpc_helpers.py +45 -17
inspect_ai/tool/_mcp/_mcp.py +8 -5
inspect_ai/tool/_mcp/_sandbox.py +8 -2
inspect_ai/tool/_mcp/server.py +3 -1
inspect_ai/tool/_tool_call.py +4 -1
inspect_ai/tool/_tool_support_helpers.py +51 -12
inspect_ai/tool/_tools/_bash_session.py +190 -68
inspect_ai/tool/_tools/_computer/_computer.py +25 -1
inspect_ai/tool/_tools/_execute.py +4 -1
inspect_ai/tool/_tools/_text_editor.py +4 -3
inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
inspect_ai/util/__init__.py +16 -0
inspect_ai/util/_anyio.py +11 -0
inspect_ai/util/_collect.py +50 -0
inspect_ai/util/_limit.py +393 -0
inspect_ai/util/_limited_conversation.py +57 -0
inspect_ai/util/_span.py +58 -0
inspect_ai/util/_subtask.py +27 -42
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
inspect_ai/_display/core/group.py +0 -79
inspect_ai/solver/_limit.py +0 -39
inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_computer/test_args.py +0 -151
/inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0

inspect_ai/tool/_tool_support_helpers.py CHANGED Viewed

@@ -7,13 +7,17 @@ It includes definitions for JSON-RPC request and response models, as well as fun
 from textwrap import dedent
 from typing import Type
+import semver
 from inspect_ai._util.error import PrerequisiteError
+from inspect_ai.tool._tool import ToolError
 from inspect_ai.util import sandbox_with
 from inspect_ai.util._sandbox.environment import SandboxEnvironment
 from ._json_rpc_helpers import (
     BaseModelT,
     JSONRPCParamsType,
+    JSONRPCServerErrorMapper,
     JSONRPCTransport,
     ScalarT,
     _rpc_call_description,
@@ -29,7 +33,7 @@ async def exec_scalar_request(
     method: str,
     params: JSONRPCParamsType,
     result_type: Type[ScalarT],
-    timeout: int | None = None,
+    timeout: int,
     user: str | None = None,
 ) -> ScalarT:
     return await scalar_request(
@@ -37,6 +41,7 @@ async def exec_scalar_request(
         params,
         result_type,
         transport=ToolSupportSandboxTransport(sandbox, timeout, user),
+        server_error_mapper=ToolSupportServerErrorMapper(),
     )
@@ -45,7 +50,7 @@ async def exec_model_request(
     method: str,
     params: JSONRPCParamsType,
     result_type: Type[BaseModelT],
-    timeout: int | None = None,
+    timeout: int,
     user: str | None = None,
 ) -> BaseModelT:
     return await model_request(
@@ -53,6 +58,7 @@ async def exec_model_request(
         params,
         result_type,
         transport=ToolSupportSandboxTransport(sandbox, timeout, user),
+        server_error_mapper=ToolSupportServerErrorMapper(),
     )
@@ -60,7 +66,7 @@ async def exec_notification(
     sandbox: SandboxEnvironment,
     method: str,
     params: JSONRPCParamsType,
-    timeout: int | None = None,
+    timeout: int,
     user: str | None = None,
 ) -> None:
     return await notification_helper(
@@ -68,19 +74,33 @@ async def exec_notification(
     )
+class ToolSupportServerErrorMapper(JSONRPCServerErrorMapper):
+    def __call__(
+        self, code: int, message: str, method: str, params: JSONRPCParamsType
+    ) -> Exception:
+        """Map `inspect-tool-support` defined custom codes to an exception."""
+        match code:
+            case -32099:  # This is a ToolException from the container
+                return ToolError(message)
+            case -32098:  # This is an unexpected exception inside the container
+                return RuntimeError(message)
+            case _:
+                return RuntimeError(message)
 class ToolSupportSandboxTransport(JSONRPCTransport):
     """
-    A transport callable that uses a sandbox for RPC communication.
+    A transport that uses a sandbox for RPC communication.
-    This class implements the TransportCallable protocol and encapsulates
-    the sandbox, timeout, and user parameters needed for sandbox-based
-    RPC communication.
+    This class implements the TransportCallable protocol and encapsulates the
+    sandbox, timeout, and user parameters needed for sandbox-based RPC
+    communication.
     """
     def __init__(
         self,
         sandbox: SandboxEnvironment,
-        timeout: int | None = None,
+        timeout: int,
         user: str | None = None,
     ):
         """
@@ -128,13 +148,32 @@ class ToolSupportSandboxTransport(JSONRPCTransport):
 SANDBOX_CLI = "inspect-tool-support"
 INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB = "aisiuk/inspect-tool-support"
+FIRST_PUBLISHED_VERSION = semver.Version.parse("0.1.6")
+MIN_SUPPORTED_VERSION = FIRST_PUBLISHED_VERSION
+MIN_NON_DEPRECATED_VERSION = semver.Version.parse("1.0.0")
+async def _get_sandbox_tool_support_version(
+    sandbox: SandboxEnvironment,
+) -> semver.Version:
+    try:
+        return semver.Version.parse(
+            await exec_scalar_request(sandbox, "version", {}, str, 5)
+        )
+    except RuntimeError as rte:
+        if "-32601" in str(rte):
+            # The container doesn't even have a version method. The first version
+            # published was 0.1.6, so we'll have to assume it was that old.
+            return FIRST_PUBLISHED_VERSION
+        raise rte
-async def tool_container_sandbox(
+async def tool_support_sandbox(
     tool_name: str, *, sandbox_name: str | None = None
-) -> SandboxEnvironment:
+) -> tuple[SandboxEnvironment, semver.Version]:
     if sb := await sandbox_with(SANDBOX_CLI, True, name=sandbox_name):
-        return sb
+        current_version = await _get_sandbox_tool_support_version(sb)
+        return (sb, current_version)
     # This sort of programmatic sentence building will not cut it if we ever
     # support other languages.
@@ -160,7 +199,7 @@ async def tool_container_sandbox(
 def create_sandbox_transport(
-    sandbox: SandboxEnvironment, timeout: int | None = None, user: str | None = None
+    sandbox: SandboxEnvironment, timeout: int, user: str | None = None
 ) -> JSONRPCTransport:
     """
     Create a transport callable that uses a sandbox for RPC communication.

inspect_ai/tool/_tools/_bash_session.py CHANGED Viewed

@@ -1,20 +1,27 @@
-from pydantic import BaseModel, Field, RootModel
+from textwrap import dedent
+from typing import Annotated, Literal
+from pydantic import BaseModel, Discriminator, Field, RootModel
+from semver import Version
 from shortuuid import uuid
+from inspect_ai._util.error import PrerequisiteError
 from inspect_ai.tool import ToolResult
-from inspect_ai.tool._tool_support_helpers import (
-    exec_model_request,
-    tool_container_sandbox,
-)
 from inspect_ai.util import StoreModel, store_as
+from inspect_ai.util._sandbox.environment import SandboxEnvironment
 from .._tool import Tool, ToolParsingError, tool
-from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
+from .._tool_support_helpers import (
+    exec_model_request,
+    exec_scalar_request,
+    tool_support_sandbox,
+)
 # These models are cloned from the container code. If/when we decide to create
 # a package that is shared between the inspect and tool-container codebases, we'll
 # just have to live with it.
 class NewSessionResult(BaseModel):
     session_name: str
@@ -23,106 +30,221 @@ class BashRestartResult(BaseModel):
     pass
-class BashCommandResult(BaseModel):
-    status: int
-    stdout: str
-    stderr: str
+class BashSessionStore(StoreModel):
+    session_id: str = Field(default_factory=str)
+    sandbox: SandboxEnvironment | None = Field(default=None)
-class BashResult(RootModel[BashRestartResult | BashCommandResult]):
-    pass
+# Action-specific parameter models
-class BashSessionStore(StoreModel):
-    session_id: str = Field(default_factory=str)
+class TypeParams(BaseModel):
+    action: Literal["type"] = "type"
+    input: str
-# custom viewer for bash
-def code_viewer(language: str, code_param: str) -> ToolCallViewer:
-    def viewer(tool_call: ToolCall) -> ToolCallView:
-        code = tool_call.arguments.get(code_param, None)
-        code = (code or tool_call.function).strip()
-        call = ToolCallContent(
-            title=language,
-            format="markdown",
-            content=f"```{language}\n" + code + "\n```\n",
-        )
-        return ToolCallView(call=call)
+class TypeSubmitParams(BaseModel):
+    action: Literal["type_submit"] = "type_submit"
+    input: str
+class RestartParams(BaseModel):
+    action: Literal["restart"] = "restart"
-    return viewer
+class ReadParams(BaseModel):
+    action: Literal["read"] = "read"
-@tool(viewer=code_viewer("bash", "command"))
-def bash_session(*, timeout: int | None = None, instance: str | None = uuid()) -> Tool:
-    """Bash shell session command execution tool.
+class InterruptParams(BaseModel):
+    action: Literal["interrupt"] = "interrupt"
-    Execute bash shell commands in a long running session using a sandbox environment (e.g. "docker").
+class BashSessionParams(
+    RootModel[
+        TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams
+    ]
+):
+    root: Annotated[
+        TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams,
+        Discriminator("action"),
+    ]
+DEFAULT_WAIT_FOR_OUTPUT = 30
+DEFAULT_IDLE_TIME = 0.5
+# this is how long we're willing to wait for the basic RPC call overhead.
+TRANSPORT_TIMEOUT = 5
+@tool()
+def bash_session(
+    *,
+    timeout: int | None = None,  # default is max_wait + 5 seconds
+    wait_for_output: int | None = None,  # default is 30 seconds
+    instance: str | None = uuid(),
+) -> Tool:
+    """Interactive bash shell session tool.
+    Interact with a bash shell in a long running session using a sandbox
+    environment (e.g. "docker"). This tool allows sending text to the shell,
+    which could be a command followed by a newline character or any other input
+    text such as the response to a password prompt.
     By default, a separate bash process is created within the sandbox for each
-    call to `bash_session()`. You can modify this behavior by passing `instance=None`
-    (which will result in a single bash process for the entire sample) or use other
-    `instance` values that implement another scheme).
+    call to `bash_session()`. You can modify this behavior by passing
+    `instance=None` (which will result in a single bash process for the entire
+    sample) or use other `instance` values that implement another scheme).
     See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.
     Args:
       timeout: Timeout (in seconds) for command.
+      wait_for_output: Maximum time (in seconds) to wait for output. If no
+          output is received within this period, the function will return an
+          empty string. The model may need to make multiple tool calls to obtain
+          all output from a given command.
       instance: Instance id (each unique instance id has its own bash process)
     Returns:
-      String with command output (stdout) or command error (stderr).
+      String with output from the shell.
     """
+    wait_for_output = wait_for_output or DEFAULT_WAIT_FOR_OUTPUT
+    min_timeout = wait_for_output + TRANSPORT_TIMEOUT
+    if timeout is None:
+        timeout = min_timeout
+    elif timeout < min_timeout:
+        raise ValueError(
+            f"Timeout must be at least {min_timeout} seconds, but got {timeout}."
+        )
     async def execute(
-        command: str | None = None,
-        restart: bool | None = None,
+        action: Literal["type", "type_submit", "restart", "read", "interrupt"],
+        input: str | None = None,
     ) -> ToolResult:
-        """
-        Use this function to execute bash commands.
+        r"""
+        Interact with a bash shell.
+        Interact with a bash shell by sending it input text and retrieving output
+        from it. There is no guarantee that all output will be returned in a
+        single call. Call this function multiple times to retrieve additional
+        output from the shell.
+        USAGE NOTES:
+        - Ensure that the shell is at a command prompt (typically when the
+          output ends in "$ " or "# ") before submitting a new command.
+        - Control characters must be sent as Unicode escape sequences (e.g., use
+          "\u0003" for Ctrl+C/ETX, "\u0004" for Ctrl+D/EOT). The literal string
+          "Ctrl+C" will not be interpreted as a control character.
+        - Use the "read" action to retrieve output from the shell without
+          sending any input. This is useful for long-running commands that
+          produce output over time. The "read" action will return any new output
+          since the last call.
+        - If a long-running command is in progress, additional input to execute
+          a new command will not be processed until the previous completes. To
+          abort a long-running command, use the "interrupt" action:
+          `bash_session(action="interrupt")`
+        Example use case:
+        - For a short-running command with a nominal amount of output, a single
+          call may suffice.
+          ```
+          bash_session(action="type_submit", input="echo foo") -> "foo\nuser@host:/# "
+          ```
+        - For a long-running command with output over time, multiple calls to are needed.
+          ```
+          bash_session(action="type_submit", input="tail -f /tmp/foo.log") -> <some output>
+          bash_session(action="read") -> <more output>
+          # Send interrupt (Ctrl+C)
+          bash_session(action="interrupt") -> "<final output>^Cuser@host:/# "
+          ```
+        - Interactive command awaiting more input from the user.
+          ```
+          bash_session(action="type_submit", input="ssh fred@foo.com") -> "foo.com's password: "
+          bash_session(action="type_submit", input="secret") -> "fred@foo.com:~$ "
+          ```
         Args:
-          command: The bash command to run. Required unless the tool is being restarted.
-          restart: Specifying true will restart this tool. Otherwise, leave this unspecified.
+          action: The action to execute:
+                - "type": Send input without a return key
+                - "type_submit": Send input followed by a return key
+                - "read": Read any new output without sending input
+                - "interrupt": Send a Ctrl+C (ETX character) to interrupt the current process
+                - "restart": Restart the bash session
+          input: The input to send to the shell.
+                Required for "type". Optional for "type_submit" actions. Must
+                not be provided for "restart", "read", or "interrupt" actions.
         Returns:
-          The output of the command.
+          The accumulated output of the shell.
         """
-        if not ((command is None) ^ (restart is None)):
-            raise ToolParsingError(
-                "Either 'command' or 'restart' must be specified, but not both."
-            )
-        params: dict[str, object] = {"command": command, "restart": restart}
+        # Validate parameters based on action
+        match action:
+            case "type":
+                if input is None:
+                    raise ToolParsingError(
+                        f"'input' is required for '{action}' action."
+                    )
+            case "restart" | "read" | "interrupt":
+                if input is not None:
+                    raise ToolParsingError(
+                        f"Do not provide 'input' with '{action}' action."
+                    )
-        sandbox = await tool_container_sandbox("bash session")
         store = store_as(BashSessionStore, instance=instance)
+        sandbox = await _get_sandbox(store)
         if not store.session_id:
             store.session_id = (
                 await exec_model_request(
-                    sandbox=sandbox,
-                    method="bash_session_new_session",
-                    params={},
-                    result_type=NewSessionResult,
-                    timeout=timeout,
+                    sandbox,
+                    "bash_session_new_session",
+                    {},
+                    NewSessionResult,
+                    TRANSPORT_TIMEOUT,
                 )
             ).session_name
-        params["session_name"] = store.session_id
+        timing: dict[str, object] = {
+            "wait_for_output": wait_for_output,
+            "idle_timeout": DEFAULT_IDLE_TIME,
+        }
+        action_specific: dict[str, dict[str, object]] = {
+            "type": {"input": input, **timing},
+            "type_submit": {"input": f"{input}\n", **timing},
+            "interrupt": {"input": "\u0003", **timing},
+            "read": timing,
+            "restart": {"restart": True},
+        }
+        result = await exec_scalar_request(
+            sandbox,
+            "bash_session",
+            {"session_name": store.session_id, **(action_specific[action])},
+            str,
+            timeout,
+        )
-        result = (
-            await exec_model_request(
-                sandbox=sandbox,
-                method="bash_session",
-                params=params,
-                result_type=BashResult,
-                timeout=timeout,
-            )
-        ).root
+        # Return the appropriate response
+        return (
+            "Bash session restarted."
+            if isinstance(result, BashRestartResult)
+            else result
+        )
-        if isinstance(result, BashRestartResult):
-            return "Bash session restarted."
+    return execute
-        # return output (including stderr if any)
-        return f"{result.stderr}\n{result.stdout}" if result.stderr else result.stdout
-    return execute
+async def _get_sandbox(store: BashSessionStore) -> SandboxEnvironment:
+    if not store.sandbox:
+        (sandbox, sandbox_version) = await tool_support_sandbox("bash session")
+        required_version = Version.parse("1.0.0")
+        if sandbox_version < required_version:
+            raise PrerequisiteError(
+                dedent(f"""
+                    The 'inspect-tool-support' version in your container is '{sandbox_version}'. The 'bash_session' tool requires version '{required_version}' or newer. Please update your container image to the latest version of 'inspect-tool-support'.
+                    """).strip()
+            )
+        store.sandbox = sandbox
+    return store.sandbox

inspect_ai/tool/_tools/_computer/_computer.py CHANGED Viewed

@@ -6,7 +6,31 @@ from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
 from inspect_ai.tool._tool_call import ToolCallModelInput, ToolCallModelInputHints
 from . import _common as common
-from ._resources.tool._constants import Action
+# this is duplicated from ._resources.tool._constants import Action
+# changes should be synchronized!
+Action = Literal[
+    "key",
+    "hold_key",
+    "type",
+    "cursor_position",
+    "mouse_move",
+    "left_mouse_down",
+    "left_mouse_up",
+    "left_click",
+    "left_click_drag",
+    "right_click",
+    "middle_click",
+    "back_click",
+    "forward_click",
+    "double_click",
+    "triple_click",
+    "scroll",
+    "wait",
+    "screenshot",
+]
 ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]

inspect_ai/tool/_tools/_execute.py CHANGED Viewed

@@ -96,7 +96,10 @@ def python(
           The output of the Python code.
         """
         result = await sandbox_env(sandbox).exec(
-            cmd=["python3"], input=code, timeout=timeout, user=user
+            cmd=["bash", "--login", "-c", "python3 -"],
+            input=code,
+            timeout=timeout,
+            user=user,
         )
         # return output (including stderr if any)
         output = ""

inspect_ai/tool/_tools/_text_editor.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pydantic import BaseModel, Discriminator, RootModel
 from inspect_ai.tool import ToolResult
 from inspect_ai.tool._tool_support_helpers import (
     exec_scalar_request,
-    tool_container_sandbox,
+    tool_support_sandbox,
 )
 from .._tool import Tool, tool
@@ -70,12 +70,13 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
     that a change made to a file by on Subtask will be visible to another Subtask.
     Args:
-      timeout: Timeout (in seconds) for command.
+      timeout: Timeout (in seconds) for command. Defaults to 180 if not provided.
       user: User to execute commands as.
     Returns:
       String with command output (stdout) or command error (stderr).
     """
+    timeout = timeout or 180
     async def execute(
         command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
@@ -101,7 +102,7 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
         Returns:
           The output of the command.
         """
-        sandbox = await tool_container_sandbox("editor")
+        (sandbox, _) = await tool_support_sandbox("editor")
         # Create a dictionary of the parameters
         params = {

inspect_ai/tool/_tools/_web_browser/_web_browser.py CHANGED Viewed

@@ -10,7 +10,7 @@ from inspect_ai.tool._tool_call import ToolCall, ToolCallContent, ToolCallView
 from inspect_ai.tool._tool_info import parse_tool_info
 from inspect_ai.tool._tool_support_helpers import (
     exec_model_request,
-    tool_container_sandbox,
+    tool_support_sandbox,
 )
 from inspect_ai.tool._tool_with import tool_with
 from inspect_ai.util._store_model import StoreModel, store_as
@@ -397,8 +397,10 @@ def web_browser_refresh(instance: str | None = None) -> Tool:
 async def _web_browser_cmd(
     tool_name: str, instance: str | None, params: dict[str, object]
 ) -> ToolResult:
+    # TODO: Is it worth it to plumb this down from the @tool?
+    timeout = 180
     try:
-        sandbox_env = await tool_container_sandbox("web browser")
+        (sandbox_env, _) = await tool_support_sandbox("web browser")
     except PrerequisiteError as e:
         # The user may have the old, incompatible, sandbox. If so, use that and
         # execute the old compatible code.
@@ -419,13 +421,18 @@ async def _web_browser_cmd(
                 method="web_new_session",
                 params={"headful": False},
                 result_type=NewSessionResult,
+                timeout=timeout,
             )
         ).session_name
     params["session_name"] = store.session_id
     crawler_result = await exec_model_request(
-        sandbox=sandbox_env, method=tool_name, params=params, result_type=CrawlerResult
+        sandbox=sandbox_env,
+        method=tool_name,
+        params=params,
+        result_type=CrawlerResult,
+        timeout=timeout,
     )
     if crawler_result.error and crawler_result.error.strip() != "":
         raise ToolError(crawler_result.error)

inspect_ai/util/__init__.py CHANGED Viewed

@@ -1,6 +1,14 @@
 from inspect_ai._util.registry import RegistryType, registry_create
 from inspect_ai._util.trace import trace_action, trace_message
+from inspect_ai.util._limit import (
+    Limit,
+    LimitExceededError,
+    apply_limits,
+    message_limit,
+    token_limit,
+)
+from ._collect import collect
 from ._concurrency import concurrency
 from ._console import input_screen
 from ._display import DisplayType, display_counter, display_type
@@ -21,6 +29,7 @@ from ._sandbox import (
     sandbox_with,
     sandboxenv,
 )
+from ._span import span
 from ._store import Store, store
 from ._store_model import StoreModel, store_as
 from ._subprocess import (
@@ -31,6 +40,7 @@ from ._subtask import Subtask, subtask
 from ._throttle import throttle
 __all__ = [
+    "apply_limits",
     "ExecResult",
     "concurrency",
     "DisplayType",
@@ -42,9 +52,12 @@ __all__ = [
     "JSONType",
     "JSONSchema",
     "json_schema",
+    "Limit",
+    "message_limit",
     "OutputLimitExceededError",
     "resource",
     "subprocess",
+    "LimitExceededError",
     "SandboxEnvironment",
     "SandboxEnvironmentConfigType",
     "SandboxEnvironmentLimits",
@@ -60,9 +73,12 @@ __all__ = [
     "store",
     "StoreModel",
     "store_as",
+    "span",
+    "collect",
     "Subtask",
     "subtask",
     "throttle",
+    "token_limit",
     "trace_action",
     "trace_message",
     "RegistryType",

inspect_ai/util/_anyio.py CHANGED Viewed

@@ -1,6 +1,10 @@
 import itertools
 import sys
+import anyio
+from inspect_ai._util._async import current_async_backend
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -36,3 +40,10 @@ def _flatten_exception(exc: Exception) -> list[Exception]:
     ]
     return maybe_this_exception + other_exceptions
+def safe_current_task_id() -> int | None:
+    if current_async_backend() is not None:
+        return anyio.get_current_task().id
+    else:
+        return None

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl