PyPI - inspect-ai - Versions diffs - 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl - Mend

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (370) hide show

inspect_ai/agent/_handoff.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Sequence
 from inspect_ai._util.registry import (
     RegistryInfo,
@@ -6,7 +6,8 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
     set_registry_info,
 )
-from inspect_ai.tool._tool import Tool, ToolResult
+from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
+from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
 from ._agent import Agent
@@ -86,7 +87,9 @@ class AgentTool(Tool):
         raise RuntimeError("AgentTool should not be called directly")
-def has_handoff(tools: list[Tool] | None) -> bool:
+def has_handoff(
+    tools: Sequence[Tool | ToolDef | ToolSource] | None,
+) -> bool:
     if tools:
         return any([isinstance(tool, AgentTool) for tool in tools])
     else:

inspect_ai/agent/_human/agent.py CHANGED Viewed

@@ -18,6 +18,7 @@ def human_cli(
     answer: bool | str = True,
     intermediate_scoring: bool = False,
     record_session: bool = True,
+    user: str | None = None,
 ) -> Agent:
     """Human CLI agent for tasks that run in a sandbox.
@@ -37,6 +38,7 @@ def human_cli(
           that the answer matches the expected format.
        intermediate_scoring: Allow the human agent to check their score while working.
        record_session: Record all user commands and outputs in the sandbox bash session.
+       user: User to login as. Defaults to the sandbox environment's default user.
     Returns:
        Agent: Human CLI agent.
@@ -48,7 +50,7 @@ def human_cli(
         async with agent_lock:
             # ensure that we have a sandbox to work with
             try:
-                connection = await sandbox().connection()
+                connection = await sandbox().connection(user=user)
             except ProcessLookupError:
                 raise RuntimeError("Human agent must run in a task with a sandbox.")
             except NotImplementedError:
@@ -66,13 +68,13 @@ def human_cli(
                     )
                     # install agent tools
-                    await install_human_agent(commands, record_session)
+                    await install_human_agent(user, commands, record_session)
                     # hookup the view ui
                     view.connect(connection)
                     # run sandbox service
-                    return await run_human_agent_service(state, commands, view)
+                    return await run_human_agent_service(user, state, commands, view)
             # support both fullscreen ui and fallback
             if display_type() == "full":

inspect_ai/agent/_human/install.py CHANGED Viewed

@@ -17,7 +17,9 @@ RECORD_SESSION_DIR = "/var/tmp/user-sessions"
 async def install_human_agent(
-    commands: list[HumanAgentCommand], record_session: bool
+    user: str | None,
+    commands: list[HumanAgentCommand],
+    record_session: bool,
 ) -> None:
     # see if we have already installed
     if not (await sandbox().exec(["mkdir", HUMAN_AGENT_DIR])).success:
@@ -35,7 +37,7 @@ async def install_human_agent(
     await checked_write_file(f"{INSTALL_DIR}/{BASHRC}", bash_rc, executable=True)
     # write and run installation script
-    install_sh = human_agent_install_sh()
+    install_sh = human_agent_install_sh(user)
     await checked_write_file(f"{INSTALL_DIR}/{INSTALL_SH}", install_sh, executable=True)
     await checked_exec(["bash", f"./{INSTALL_SH}"], cwd=INSTALL_DIR)
     await checked_exec(["rm", "-rf", INSTALL_DIR])
@@ -177,8 +179,8 @@ def human_agent_bashrc(commands: list[HumanAgentCommand], record_session: bool)
     INSTRUCTIONS = dedent("""
     if [ -z "$INSTRUCTIONS_SHOWN" ]; then
         export INSTRUCTIONS_SHOWN=1
-        task instructions > instructions.txt
-        cat instructions.txt
+        task instructions > ~/instructions.txt
+        cat ~/instructions.txt
     fi
     """).lstrip()
@@ -190,7 +192,7 @@ def human_agent_bashrc(commands: list[HumanAgentCommand], record_session: bool)
     return "\n".join([TERMINAL_CHECK, COMMANDS, RECORDING, INSTRUCTIONS, CLOCK])
-def human_agent_install_sh() -> str:
+def human_agent_install_sh(user: str | None) -> str:
     return dedent(f"""
     #!/usr/bin/env bash
@@ -201,8 +203,15 @@ def human_agent_install_sh() -> str:
     # copy command script
     cp {TASK_PY} $HUMAN_AGENT
-    # append to .bashrc
-    cat {BASHRC} >> ~/{BASHRC}
+    # get user's home directory
+    USER="{user or ""}"
+    if [ -z "$USER" ]; then
+        USER=$(whoami)
+    fi
+    USER_HOME=$(getent passwd $USER | cut -d: -f6)
+    # append to user's .bashrc
+    cat {BASHRC} >> $USER_HOME/{BASHRC}
     """)

inspect_ai/agent/_human/panel.py CHANGED Viewed

@@ -35,6 +35,7 @@ class HumanAgentPanel(InputPanel):
     VSCODE_LINKS_ID = "vscode-links"
     LOGIN_VSCODE_TERMINAL_ID = "login-vscode-terminal"
     LOGIN_VSCODE_WINDOW_ID = "login-vscode-window"
+    LOGIN_VSCODE_WINDOW_LABEL_ID = "login-vscode-window-label"
     COMMAND_INSTRUCTIONS_ID = "command-instructions"
     SANDBOX_COMMAND_ID = "sandbox-command"
@@ -88,7 +89,11 @@ class HumanAgentPanel(InputPanel):
                     markup=False,
                 )
                 with Horizontal(id=self.VSCODE_LINKS_ID):
-                    yield Label("Login:", classes=self.LINK_LABEL_CLASS)
+                    yield Label(
+                        "Login:",
+                        classes=self.LINK_LABEL_CLASS,
+                        id=self.LOGIN_VSCODE_WINDOW_LABEL_ID,
+                    )
                     yield VSCodeLink(
                         "VS Code Window",
                         id=self.LOGIN_VSCODE_WINDOW_ID,
@@ -146,6 +151,14 @@ class HumanAgentPanel(InputPanel):
             window_btn = cast(
                 VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_ID}")
             )
+            window_lbl = cast(
+                Label, self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_LABEL_ID}")
+            )
+            window_btn_and_lbl_display = (
+                vscode and connection.vscode_command is not None
+            )
+            window_btn.display = window_btn_and_lbl_display
+            window_lbl.display = window_btn_and_lbl_display
             if connection.vscode_command is not None:
                 window_btn.commands = [
                     VSCodeCommand(

inspect_ai/agent/_human/service.py CHANGED Viewed

@@ -10,7 +10,10 @@ from .view import HumanAgentView
 async def run_human_agent_service(
-    state: AgentState, commands: list[HumanAgentCommand], view: HumanAgentView | None
+    user: str | None,
+    state: AgentState,
+    commands: list[HumanAgentCommand],
+    view: HumanAgentView | None,
 ) -> AgentState:
     # initialise agent state
     instructions = "\n\n".join([message.text for message in state.messages]).strip()
@@ -39,6 +42,7 @@ async def run_human_agent_service(
         methods=methods,
         until=task_is_completed,
         sandbox=sandbox(),
+        user=user,
     )
     # set the answer if we have one

inspect_ai/agent/_react.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from logging import getLogger
-from typing import Literal, cast
+from typing import Literal, Sequence, cast
 from inspect_ai._util._async import is_callable_coroutine
 from inspect_ai.model._call_tools import execute_tools
@@ -13,9 +13,10 @@ from inspect_ai.model._chat_message import (
 from inspect_ai.model._model import Model, get_model
 from inspect_ai.model._trim import trim_messages
 from inspect_ai.scorer._score import score
-from inspect_ai.tool._tool import Tool, ToolResult, tool
+from inspect_ai.tool._mcp.connection import mcp_connection
+from inspect_ai.tool._tool import Tool, ToolResult, ToolSource, tool
+from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_info import parse_tool_info
-from inspect_ai.tool._tool_with import tool_with
 from ._agent import Agent, AgentState, agent, agent_with
 from ._filter import MessageFilter
@@ -37,7 +38,7 @@ def react(
     name: str | None = None,
     description: str | None = None,
     prompt: str | AgentPrompt | None = AgentPrompt(),
-    tools: list[Tool] | None = None,
+    tools: Sequence[Tool | ToolDef | ToolSource] | None = None,
     model: str | Model | Agent | None = None,
     attempts: int | AgentAttempts = 1,
     submit: AgentSubmit = AgentSubmit(),
@@ -88,6 +89,31 @@ def react(
     Returns:
         ReAct agent.
     """
+    # default submit tool
+    @tool(name="submit")
+    def default_submit_tool() -> Tool:
+        async def execute(answer: str) -> ToolResult:
+            """Submit an answer for evaluation.
+            Args:
+              answer (str): Submitted answer
+            """
+            return answer
+        return execute
+    # resolve tools
+    tools = list(tools) if tools is not None else []
+    # resolve submit tool
+    submit_tool = ToolDef(
+        submit.tool or default_submit_tool(),
+        name=submit.name,
+        description=submit.description,
+    )
+    tools.append(submit_tool)
     # resolve prompt / system message
     prompt = AgentPrompt(prompt) if isinstance(prompt, str) else prompt
     if prompt:
@@ -98,7 +124,7 @@ def react(
             prompt_lines.append(prompt.handoff_prompt)
         if prompt.assistant_prompt:
             prompt_lines.append(prompt.assistant_prompt)
-        prompt_content = "\n\n".join(prompt_lines).format(submit=submit.name)
+        prompt_content = "\n\n".join(prompt_lines).format(submit=submit_tool.name)
         system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
     else:
         system_message = None
@@ -106,151 +132,146 @@ def react(
     # resolve attempts
     attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
-    # submission tool
-    @tool
-    def submit_tool() -> Tool:
-        async def execute(answer: str) -> ToolResult:
-            """Submit an answer for evaluation.
-            Args:
-              answer (str): Submitted answer
-            """
-            return answer
-        return execute
-    # helper to extract a submitted answer
     def submission(tool_results: list[ChatMessage]) -> str | None:
         return next(
             (
                 result.text
                 for result in tool_results
                 if isinstance(result, ChatMessageTool)
-                and result.function == submit.name
+                and result.function == submit_tool.name
             ),
             None,
         )
-    # resolve tools
-    tools = tools or []
-    tools.append(tool_with(submit_tool(), submit.name, submit.description))
     async def execute(state: AgentState) -> AgentState:
-        # prepend system message if we have one
-        if system_message:
-            state.messages.insert(0, system_message)
-        # resolve overflow handling
-        if truncation == "auto":
-            overflow = cast(MessageFilter | None, trim_messages)
-        elif truncation == "disabled":
-            overflow = None
-        else:
-            overflow = truncation
-        # track attempts
-        attempt_count = 0
-        # main loop = will terminate after submit (subject to max_attempts)
-        # or if a message or token limit is hit
-        while True:
-            # generate output and append assistant message
-            state = await _agent_generate(model, state, tools)
-            # check for context window overflow
-            if state.output.stop_reason == "model_length":
-                from inspect_ai.log._transcript import transcript
-                if overflow is not None:
-                    previous_messages = state.messages[:-1]
-                    state.messages = await overflow(previous_messages)
-                    if len(state.messages) < len(previous_messages):
-                        transcript().info(
-                            "Agent exceeded model context window, truncating messages and continuing."
+        async with mcp_connection(tools):
+            # prepend system message if we have one
+            if system_message:
+                state.messages.insert(0, system_message)
+            # resolve overflow handling
+            if truncation == "auto":
+                overflow = cast(MessageFilter | None, trim_messages)
+            elif truncation == "disabled":
+                overflow = None
+            else:
+                overflow = truncation
+            # track attempts
+            attempt_count = 0
+            # main loop = will terminate after submit (subject to max_attempts)
+            # or if a message or token limit is hit
+            while True:
+                # generate output and append assistant message
+                state = await _agent_generate(model, state, tools)
+                # check for context window overflow
+                if state.output.stop_reason == "model_length":
+                    from inspect_ai.log._transcript import transcript
+                    if overflow is not None:
+                        previous_messages = state.messages[:-1]
+                        state.messages = await overflow(previous_messages)
+                        if len(state.messages) < len(previous_messages):
+                            transcript().info(
+                                "Agent exceeded model context window, truncating messages and continuing."
+                            )
+                            continue
+                    # no overflow policy or overflow didn't reduce conversation length
+                    transcript().info("Agent terminated: model context window exceeded")
+                    break
+                # resolve tool calls (if any)
+                if state.output.message.tool_calls:
+                    # call tool functions
+                    messages, output = await execute_tools(state.messages, tools)
+                    state.messages.extend(messages)
+                    if output:
+                        state.output = output
+                    # check for a submission
+                    answer = submission(messages)
+                    if answer is not None:
+                        # set the output to the answer for scoring
+                        state.output.completion = (
+                            f"{state.output.completion}\n\n{answer}".strip()
                         )
-                        continue
-                # no overflow policy or overflow didn't reduce conversation length
-                transcript().info("Agent terminated: model context window exceeded")
-                break
-            # resolve tool calls (if any)
-            if state.output.message.tool_calls:
-                # call tool functions
-                messages, output = await execute_tools(state.messages, tools)
-                state.messages.extend(messages)
-                if output:
-                    state.output = output
-                # check for a submission
-                answer = submission(messages)
-                if answer is not None:
-                    # set the output to the answer for scoring
-                    state.output.completion = (
-                        f"{state.output.completion}\n\n{answer}".strip()
-                    )
-                    # exit if we are at max_attempts
-                    attempt_count += 1
-                    if attempt_count >= attempts.attempts:
-                        break
+                        # exit if we are at max_attempts
+                        attempt_count += 1
+                        if attempt_count >= attempts.attempts:
+                            break
-                    # exit if the submission is successful
-                    answer_scores = await score(state)
-                    if attempts.score_value(answer_scores[0].value) == 1.0:
-                        break
+                        # exit if the submission is successful
+                        answer_scores = await score(state)
+                        if attempts.score_value(answer_scores[0].value) == 1.0:
+                            break
-                    # otherwise notify the model that it was incorrect and continue
-                    else:
-                        if callable(attempts.incorrect_message):
-                            if not is_callable_coroutine(attempts.incorrect_message):
-                                raise ValueError(
-                                    "The incorrect_message function must be async."
+                        # otherwise notify the model that it was incorrect and continue
+                        else:
+                            if callable(attempts.incorrect_message):
+                                if not is_callable_coroutine(
+                                    attempts.incorrect_message
+                                ):
+                                    raise ValueError(
+                                        "The incorrect_message function must be async."
+                                    )
+                                response_message: str = (
+                                    await attempts.incorrect_message(
+                                        state, answer_scores
+                                    )
                                 )
-                            response_message: str = await attempts.incorrect_message(
-                                state, answer_scores
+                            else:
+                                response_message = attempts.incorrect_message
+                            state.messages.append(
+                                ChatMessageUser(content=response_message)
                             )
-                        else:
-                            response_message = attempts.incorrect_message
-                        state.messages.append(ChatMessageUser(content=response_message))
-            # call the on_continue hook (if any)
-            if callable(on_continue):
-                if not is_callable_coroutine(on_continue):
-                    raise ValueError("The on_continue function must be async.")
-                do_continue = await cast(AgentContinue, on_continue)(state)
-                if do_continue is True:
-                    # if there were no tool calls we need to send back a user message
-                    if not state.output.message.tool_calls:
+                # call the on_continue hook (if any)
+                if callable(on_continue):
+                    if not is_callable_coroutine(on_continue):
+                        raise ValueError("The on_continue function must be async.")
+                    do_continue = await cast(AgentContinue, on_continue)(state)
+                    if do_continue is True:
+                        # if there were no tool calls we need to send back a user message
+                        if not state.output.message.tool_calls:
+                            state.messages.append(
+                                ChatMessageUser(
+                                    content=DEFAULT_CONTINUE_PROMPT.format(
+                                        submit=submit_tool.name
+                                    )
+                                )
+                            )
+                    elif isinstance(do_continue, str):
                         state.messages.append(
                             ChatMessageUser(
-                                content=DEFAULT_CONTINUE_PROMPT.format(
-                                    submit=submit.name
-                                )
+                                content=do_continue.format(submit=submit_tool.name)
                             )
                         )
-                elif isinstance(do_continue, str):
+                    else:  # do_continue is False
+                        break
+                # if there is no on_continue hook then add a user message if there were no tool calls
+                elif not state.output.message.tool_calls:
+                    continue_msg = (
+                        DEFAULT_CONTINUE_PROMPT
+                        if on_continue is None
+                        else str(on_continue)
+                    )
                     state.messages.append(
-                        ChatMessageUser(content=do_continue.format(submit=submit.name))
+                        ChatMessageUser(
+                            content=continue_msg.format(submit=submit_tool.name)
+                        )
                     )
-                else:  # do_continue is False
-                    break
-            # if there is no on_continue hook then add a user message if there were no tool calls
-            elif not state.output.message.tool_calls:
-                continue_msg = (
-                    DEFAULT_CONTINUE_PROMPT if on_continue is None else str(on_continue)
-                )
-                state.messages.append(
-                    ChatMessageUser(content=continue_msg.format(submit=submit.name))
-                )
-        # once we are complete, remove submit tool calls from the history
-        # (as they will potentially confuse parent agents who also have
-        # their own submit tools that they are 'watching' for)
-        state.messages = _remove_submit_tool(state.messages, submit.name)
-        return state
+            # once we are complete, remove submit tool calls from the history
+            # (as they will potentially confuse parent agents who also have
+            # their own submit tools that they are 'watching' for)
+            state.messages = _remove_submit_tool(state.messages, submit_tool.name)
+            return state
     if name is not None or description is not None:
         return agent_with(execute, name=name, description=description)
@@ -259,12 +280,24 @@ def react(
 async def _agent_generate(
-    model: str | Model | Agent | None, state: AgentState, tools: list[Tool]
+    model: str | Model | Agent | None,
+    state: AgentState,
+    tools: Sequence[Tool | ToolDef | ToolSource],
 ) -> AgentState:
     # convert model to agent
     if isinstance(model, str | Model) or model is None:
         model = _model_generate(model)
+    # resolve tools
+    resolved_tools: list[Tool] = []
+    for t in tools:
+        if isinstance(t, ToolSource):
+            resolved_tools.extend(await t.tools())
+        elif isinstance(t, ToolDef):
+            resolved_tools.append(t.as_tool())
+        else:
+            resolved_tools.append(t)
     # confirm we have a tools param
     agent_tool_info = parse_tool_info(model)
     if "tools" not in agent_tool_info.parameters.properties:
@@ -273,7 +306,7 @@ async def _agent_generate(
         )
     # call the agent
-    return await model(state, tools)
+    return await model(state, resolved_tools)
 def _model_generate(model: str | Model | None) -> Agent:

inspect_ai/agent/_types.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Awaitable, Callable, NamedTuple, TypeAlias
 from inspect_ai.agent._agent import AgentState
 from inspect_ai.scorer._metric import Score, ValueToFloat, value_to_float
+from inspect_ai.tool._tool import Tool
 DEFAULT_HANDOFF_PROMPT = """
 You are part of a multi-agent system designed to make agent coordination and
@@ -80,8 +81,18 @@ class AgentAttempts(NamedTuple):
 class AgentSubmit(NamedTuple):
     """Configure the submit tool of a react agent."""
-    name: str = "submit"
-    """Name for submit tool."""
+    name: str | None = None
+    """Name for submit tool (defaults to 'submit')."""
-    description: str = "Submit an answer for evaluation."
-    """Description of submit tool."""
+    description: str | None = None
+    """Description of submit tool (defaults to 'Submit an answer for evaluation')."""
+    tool: Tool | None = None
+    """Alternate implementation for submit tool.
+    The tool can provide its `name` and `description` internally,
+    or these values can be overriden by the `name` and `description`
+    fields in `AgentSubmit`
+    The tool should return the `answer` provided to it for scoring.
+    """

inspect_ai/approval/_policy.py CHANGED Viewed

@@ -2,7 +2,7 @@ import fnmatch
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator, cast
+from typing import Any, Generator
 from pydantic import BaseModel, Field, model_validator
@@ -140,7 +140,7 @@ def approval_policies_from_config(
     def create_approval_policy(
         name: str, tools: str | list[str], params: dict[str, Any] = {}
     ) -> ApprovalPolicy:
-        approver = cast(Approver, registry_create("approver", name, **params))
+        approver = registry_create("approver", name, **params)
         return ApprovalPolicy(approver, tools)
     # map config -> policy

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl