PyPI - inspect-ai - Versions diffs - 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl - Mend

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (219) hide show

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import os
+import socket
 from logging import getLogger
 from typing import Any
+import httpx
 from openai import (
+    DEFAULT_CONNECTION_LIMITS,
+    DEFAULT_TIMEOUT,
     APIConnectionError,
     APITimeoutError,
     AsyncAzureOpenAI,
@@ -102,6 +106,9 @@ class OpenAIAPI(ModelAPI):
                         ],
                     )
+        # create async http client
+        http_client = OpenAIAsyncHttpxClient()
         # azure client
         if self.is_azure():
             # resolve base_url
@@ -126,6 +133,7 @@ class OpenAIAPI(ModelAPI):
                 max_retries=(
                     config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
                 ),
+                http_client=http_client,
                 **model_args,
             )
         else:
@@ -135,6 +143,7 @@ class OpenAIAPI(ModelAPI):
                 max_retries=(
                     config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
                 ),
+                http_client=http_client,
                 **model_args,
             )
@@ -231,6 +240,16 @@ class OpenAIAPI(ModelAPI):
                     ModelUsage(
                         input_tokens=completion.usage.prompt_tokens,
                         output_tokens=completion.usage.completion_tokens,
+                        input_tokens_cache_read=(
+                            completion.usage.prompt_tokens_details.cached_tokens
+                            if completion.usage.prompt_tokens_details is not None
+                            else None  # openai only have cache read stats/pricing.
+                        ),
+                        reasoning_tokens=(
+                            completion.usage.completion_tokens_details.reasoning_tokens
+                            if completion.usage.completion_tokens_details is not None
+                            else None
+                        ),
                         total_tokens=completion.usage.total_tokens,
                     )
                     if completion.usage
@@ -250,10 +269,8 @@ class OpenAIAPI(ModelAPI):
     def is_rate_limit(self, ex: BaseException) -> bool:
         if isinstance(ex, RateLimitError):
             # Do not retry on these rate limit errors
-            if (
-                "Request too large" not in ex.message
-                and "You exceeded your current quota" not in ex.message
-            ):
+            # The quota exceeded one is related to monthly account quotas.
+            if "You exceeded your current quota" not in ex.message:
                 return True
         elif isinstance(
             ex, (APIConnectionError | APITimeoutError | InternalServerError)
@@ -342,3 +359,39 @@ class OpenAIAPI(ModelAPI):
             )
         else:
             return e
+class OpenAIAsyncHttpxClient(httpx.AsyncClient):
+    """Custom async client that deals better with long running Async requests.
+    Based on Anthropic DefaultAsyncHttpClient implementation that they
+    released along with Claude 3.7 as well as the OpenAI DefaultAsyncHttpxClient
+    """
+    def __init__(self, **kwargs: Any) -> None:
+        # This is based on the openai DefaultAsyncHttpxClient:
+        # https://github.com/openai/openai-python/commit/347363ed67a6a1611346427bb9ebe4becce53f7e
+        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)
+        kwargs.setdefault("limits", DEFAULT_CONNECTION_LIMITS)
+        kwargs.setdefault("follow_redirects", True)
+        # This is based on the anthrpopic changes for claude 3.7:
+        # https://github.com/anthropics/anthropic-sdk-python/commit/c5387e69e799f14e44006ea4e54fdf32f2f74393#diff-3acba71f89118b06b03f2ba9f782c49ceed5bb9f68d62727d929f1841b61d12bR1387-R1403
+        # set socket options to deal with long running reasoning requests
+        socket_options = [
+            (socket.SOL_SOCKET, socket.SO_KEEPALIVE, True),
+            (socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 60),
+            (socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 5),
+        ]
+        TCP_KEEPIDLE = getattr(socket, "TCP_KEEPIDLE", None)
+        if TCP_KEEPIDLE is not None:
+            socket_options.append((socket.IPPROTO_TCP, TCP_KEEPIDLE, 60))
+        kwargs["transport"] = httpx.AsyncHTTPTransport(
+            limits=DEFAULT_CONNECTION_LIMITS,
+            socket_options=socket_options,
+        )
+        super().__init__(**kwargs)

inspect_ai/model/_providers/openai_o1.py CHANGED Viewed

@@ -69,6 +69,16 @@ async def generate_o1(
         usage=ModelUsage(
             input_tokens=completion.usage.prompt_tokens,
             output_tokens=completion.usage.completion_tokens,
+            input_tokens_cache_read=(
+                completion.usage.prompt_tokens_details.cached_tokens
+                if completion.usage.prompt_tokens_details is not None
+                else None  # openai only have cache read stats/pricing.
+            ),
+            reasoning_tokens=(
+                completion.usage.completion_tokens_details.reasoning_tokens
+                if completion.usage.completion_tokens_details is not None
+                else None
+            ),
             total_tokens=completion.usage.total_tokens,
         )
         if completion.usage

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -48,7 +48,7 @@ def openai() -> type[ModelAPI]:
 def anthropic() -> type[ModelAPI]:
     FEATURE = "Anthropic API"
     PACKAGE = "anthropic"
-    MIN_VERSION = "0.29.0"
+    MIN_VERSION = "0.47.1"
     # verify we have the package
     try:

inspect_ai/model/_reasoning.py CHANGED Viewed

@@ -5,13 +5,26 @@ from typing import NamedTuple
 class ContentWithReasoning(NamedTuple):
     content: str
     reasoning: str
+    signature: str | None = None
+    redacted: bool = False
 def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
-    match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
+    # Match <think> tag with optional attributes
+    pattern = r'\s*<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>(.*)'
+    match = re.match(pattern, content, re.DOTALL)
     if match:
+        signature = match.group(1)  # This will be None if not present
+        redacted_value = match.group(2)  # This will be "true" or None
+        reasoning = match.group(3).strip()
+        content_text = match.group(4).strip()
         return ContentWithReasoning(
-            content=match.group(2).strip(), reasoning=match.group(1).strip()
+            content=content_text,
+            reasoning=reasoning,
+            signature=signature,
+            redacted=redacted_value == "true",
         )
     else:
         return None

inspect_ai/scorer/_model.py CHANGED Viewed

@@ -274,25 +274,29 @@ def chat_history(state: TaskState) -> str:
     # begin history with text of first message (it will come right after
     # 'Task' or 'Question' in the template)
-    history: list[str] = [messages[0].text]
-    # for subsequent messages present with e.g. Assistant: {message.text}
-    for message in messages[1:]:
-        if isinstance(message, ChatMessageUser):
-            history.append(f"User: {message.text}")
-        elif isinstance(message, ChatMessageAssistant):
-            assistant_message = [message.text] if message.text else []
-            if message.tool_calls:
-                assistant_message.extend(
-                    [
-                        format_function_call(tool_call.function, tool_call.arguments)
-                        for tool_call in message.tool_calls
-                    ]
+    history: list[str] = []
+    if len(messages) > 0:
+        history.append(messages[0].text)
+        # for subsequent messages present with e.g. Assistant: {message.text}
+        for message in messages[1:]:
+            if isinstance(message, ChatMessageUser):
+                history.append(f"User: {message.text}")
+            elif isinstance(message, ChatMessageAssistant):
+                assistant_message = [message.text] if message.text else []
+                if message.tool_calls:
+                    assistant_message.extend(
+                        [
+                            format_function_call(
+                                tool_call.function, tool_call.arguments
+                            )
+                            for tool_call in message.tool_calls
+                        ]
+                    )
+                history.append("Assistant: " + "\n\n".join(assistant_message))
+            elif isinstance(message, ChatMessageTool):
+                history.append(
+                    f"Tool ({message.function}): {message.tool_error or ''}{message.text}"
                 )
-            history.append("Assistant: " + "\n\n".join(assistant_message))
-        elif isinstance(message, ChatMessageTool):
-            history.append(
-                f"Tool ({message.function}): {message.tool_error or ''}{message.text}"
-            )
     return "\n\n".join(history)

inspect_ai/solver/_human_agent/agent.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import asyncio
+from typing import cast
 from inspect_ai.util import display_type, input_panel, sandbox
+from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
 from .._solver import Generate, Solver, solver
 from .._task_state import TaskState
@@ -56,19 +58,21 @@ def human_agent(
             # helper function to run the agent (called for fullscreen vs. fallback below)
             async def run_human_agent(view: HumanAgentView) -> TaskState:
-                # create agent commands
-                commands = human_agent_commands(
-                    state, answer, intermediate_scoring, record_session
-                )
+                sandbox_proxy = cast(SandboxEnvironmentProxy, sandbox())
+                with sandbox_proxy.no_events():
+                    # create agent commands
+                    commands = human_agent_commands(
+                        state, answer, intermediate_scoring, record_session
+                    )
-                # install agent tools
-                await install_human_agent(state, commands, record_session)
+                    # install agent tools
+                    await install_human_agent(state, commands, record_session)
-                # hookup the view ui
-                view.connect(connection)
+                    # hookup the view ui
+                    view.connect(connection)
-                # run sandbox service
-                return await run_human_agent_service(state, commands, view)
+                    # run sandbox service
+                    return await run_human_agent_service(state, commands, view)
             # support both fullscreen ui and fallback
             if display_type() == "full":

inspect_ai/solver/_human_agent/commands/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from .instructions import InstructionsCommand
 from .note import NoteCommand
 from .score import ScoreCommand
 from .status import StatusCommand
-from .submit import SubmitCommand, ValidateCommand
+from .submit import QuitCommand, SubmitCommand, ValidateCommand
 def human_agent_commands(
@@ -15,8 +15,12 @@ def human_agent_commands(
     intermediate_scoring: bool,
     record_session: bool,
 ) -> list[HumanAgentCommand]:
-    # base submit and validate
-    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+    # base submit, validate, and quit
+    commands = [
+        SubmitCommand(record_session),
+        ValidateCommand(answer),
+        QuitCommand(record_session),
+    ]
     # optional intermediate scoring
     if intermediate_scoring:

inspect_ai/solver/_human_agent/commands/submit.py CHANGED Viewed

@@ -16,22 +16,89 @@ from .command import HumanAgentCommand, call_human_agent
 logger = getLogger(__name__)
-class SubmitCommand(HumanAgentCommand):
+class SessionEndCommand(HumanAgentCommand):
     def __init__(self, record_session: bool):
         super().__init__()
         self._record_session = record_session
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    async def _read_session_logs(self) -> dict[str, str]:
+        # retreive session logs (don't fail)
+        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
+        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
+        if not result.success:
+            logger.warning(f"Error listing human agent session logs: {result.stderr}")
+            return {}
+        # read logs
+        session_logs: dict[str, str] = {}
+        for session_log in result.stdout.strip().splitlines():
+            try:
+                session_logs[session_log] = await sandbox().read_file(
+                    (sessions_dir / session_log).as_posix()
+                )
+            except Exception as ex:
+                logger.warning(f"Error reading human agent session log: {ex}")
+        return session_logs
+class QuitCommand(SessionEndCommand):
     @property
     def name(self) -> str:
-        return "submit"
+        return "quit"
     @property
     def description(self) -> str:
-        return "Submit your final answer for the task."
+        return "Quit the task without submitting an answer."
+    def cli(self, args: Namespace) -> None:
+        # verify that the user wants to proceed
+        action = "quit the task without submitting an answer (ending the exercise)"
+        while True:
+            response = (
+                input(
+                    f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                )
+                .lower()
+                .strip()
+            )
+            if response in ["yes", "y"]:
+                break
+            elif response in ["no", "n"]:
+                return
+            else:
+                print("Please enter yes or no.")
+        # thank the user!
+        print(
+            "\nThank you for working on this task!\n\n"
+            + "Your task will now be scored and you will be disconnected from this container.\n"
+        )
+        call_human_agent("quit")
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def submit() -> None:
+            if self._record_session:
+                state.logs = await self._read_session_logs()
+            state.running = False
+            state.answer = ""
+        return submit
+class SubmitCommand(SessionEndCommand):
     @property
-    def group(self) -> Literal[1, 2, 3]:
-        return 1
+    def name(self) -> str:
+        return "submit"
+    @property
+    def description(self) -> str:
+        return "Submit your final answer for the task."
     @property
     def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
@@ -55,10 +122,12 @@ class SubmitCommand(HumanAgentCommand):
         # verify that the user wants to proceed
         answer = call_args.get("answer", None)
         answer_text = f" '{answer}'" if answer else ""
+        action = f"end the task and submit{answer_text}"
         while True:
             response = (
                 input(
-                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                    f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
                 )
                 .lower()
                 .strip()
@@ -76,13 +145,10 @@ class SubmitCommand(HumanAgentCommand):
             + "Your task will now be scored and you will be disconnected from this container.\n"
         )
-        # submit the task
         call_human_agent("submit", **call_args)
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        async def submit(
-            answer: str | None, session_logs: dict[str, str] | None = None
-        ) -> None:
+        async def submit(answer: str) -> None:
             if self._record_session:
                 state.logs = await self._read_session_logs()
             state.running = False
@@ -90,26 +156,6 @@ class SubmitCommand(HumanAgentCommand):
         return submit
-    async def _read_session_logs(self) -> dict[str, str]:
-        # retreive session logs (don't fail)
-        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
-        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
-        if not result.success:
-            logger.warning(f"Error listing human agent session logs: {result.stderr}")
-            return {}
-        # read logs
-        session_logs: dict[str, str] = {}
-        for session_log in result.stdout.strip().splitlines():
-            try:
-                session_logs[session_log] = await sandbox().read_file(
-                    (sessions_dir / session_log).as_posix()
-                )
-            except Exception as ex:
-                logger.warning(f"Error reading human agent session log: {ex}")
-        return session_logs
 class ValidateCommand(HumanAgentCommand):
     def __init__(self, answer: bool | str) -> None:

inspect_ai/tool/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from inspect_ai._util.content import (
     Content,
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -41,6 +42,7 @@ __all__ = [
     "Content",
     "ContentAudio",
     "ContentImage",
+    "ContentReasoning",
     "ContentText",
     "ContentVideo",
     "ToolCall",

inspect_ai/tool/_tool.py CHANGED Viewed

@@ -14,6 +14,7 @@ from typing import (
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -35,10 +36,11 @@ ToolResult = (
     | float
     | bool
     | ContentText
+    | ContentReasoning
     | ContentImage
     | ContentAudio
     | ContentVideo
-    | list[ContentText | ContentImage | ContentAudio | ContentVideo]
+    | list[ContentText | ContentReasoning | ContentImage | ContentAudio | ContentVideo]
 )
 """Valid types for results from tool calls."""

inspect_ai/tool/_tools/_computer/_common.py CHANGED Viewed

@@ -11,19 +11,6 @@ from inspect_ai.tool import ToolError, ToolResult
 from inspect_ai.util._sandbox.context import sandbox_with
 from inspect_ai.util._sandbox.environment import SandboxEnvironment
-Action = Literal[
-    "key",
-    "type",
-    "mouse_move",
-    "left_click",
-    "left_click_drag",
-    "right_click",
-    "middle_click",
-    "double_click",
-    "screenshot",
-    "cursor_position",
-]
 class ToolExecResult(BaseModel):
     output: str | None = Field(default=None)
@@ -31,6 +18,122 @@ class ToolExecResult(BaseModel):
     base64_image: str | None = Field(default=None)
+async def cursor_position(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["cursor_position"], timeout=timeout)
+async def screenshot(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["screenshot"], timeout=timeout)
+async def wait(duration: int, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["wait", "--duration", f"{duration}"], timeout=timeout)
+async def mouse_move(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["mouse_move", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def left_mouse_down(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["left_mouse_down"], timeout=timeout)
+async def left_mouse_up(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["left_mouse_up"], timeout=timeout)
+async def left_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["left_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def left_click_drag(
+    start_coordinate: list[int], coordinate: list[int], timeout: int | None = None
+) -> ToolResult:
+    return await _send_cmd(
+        [
+            "left_click_drag",
+            "--start_coordinate",
+            f"{start_coordinate[0]}",
+            f"{start_coordinate[1]}",
+            "--coordinate",
+            f"{coordinate[0]}",
+            f"{coordinate[1]}",
+        ],
+        timeout=timeout,
+    )
+async def right_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["right_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def middle_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["middle_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def double_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["double_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def triple_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["triple_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
+        timeout=timeout,
+    )
+async def scroll(
+    scroll_amount: int,
+    scroll_direction: Literal["up", "down", "left", "right"],
+    coordinate: list[int] | None,
+    timeout: int | None = None,
+) -> ToolResult:
+    return await _send_cmd(
+        [
+            "scroll",
+            "--scroll_amount",
+            f"{scroll_amount}",
+            "--scroll_direction",
+            f"{scroll_direction}",
+        ]
+        + (
+            ["--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"]
+            if coordinate
+            else []
+        ),
+        timeout=timeout,
+    )
+async def press_key(key: str, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["key", "--text", key], timeout=timeout)
+async def hold_key(key: str, duration: int, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["hold_key", "--text", key, "--duration", f"{duration}"], timeout=timeout
+    )
+async def type(text: str, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["type", "--text", text], timeout=timeout)
 async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
     from inspect_ai.log._samples import sample_active
@@ -39,7 +142,7 @@ async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResul
     sample_id = sample.sample.id
     assert sample_id
-    cmd = ["python3", "/opt/inspect/tool/computer_tool.py", "--action"] + cmdTail
+    cmd = ["python3", "/opt/inspect/tool/computer_tool.py"] + cmdTail
     raw_exec_result = await (await computer_sandbox()).exec(cmd, timeout=timeout)
@@ -72,50 +175,6 @@ async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResul
     return "OK"
-async def cursor_position(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["cursor_position"], timeout=timeout)
-async def screenshot(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["screenshot"], timeout=timeout)
-async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(
-        ["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout
-    )
-async def left_click(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["left_click"], timeout=timeout)
-async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(
-        ["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout
-    )
-async def right_click(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["right_click"], timeout=timeout)
-async def middle_click(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["middle_click"], timeout=timeout)
-async def double_click(timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["double_click"], timeout=timeout)
-async def press_key(key: str, timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["key", "--text", key], timeout=timeout)
-async def type(text: str, timeout: int | None = None) -> ToolResult:
-    return await _send_cmd(["type", "--text", text], timeout=timeout)
 async def computer_sandbox() -> SandboxEnvironment:
     sb = await sandbox_with("/opt/inspect/tool/computer_tool.py")
     if sb:

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl