PyPI - inspect-ai - Versions diffs - 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl - Mend

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (242) hide show

inspect_ai/solver/_human_agent/agent.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import asyncio
+from typing import cast
 from inspect_ai.util import display_type, input_panel, sandbox
+from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
 from .._solver import Generate, Solver, solver
 from .._task_state import TaskState
@@ -56,19 +58,21 @@ def human_agent(
             # helper function to run the agent (called for fullscreen vs. fallback below)
             async def run_human_agent(view: HumanAgentView) -> TaskState:
-                # create agent commands
-                commands = human_agent_commands(
-                    state, answer, intermediate_scoring, record_session
-                )
+                sandbox_proxy = cast(SandboxEnvironmentProxy, sandbox())
+                with sandbox_proxy.no_events():
+                    # create agent commands
+                    commands = human_agent_commands(
+                        state, answer, intermediate_scoring, record_session
+                    )
-                # install agent tools
-                await install_human_agent(state, commands, record_session)
+                    # install agent tools
+                    await install_human_agent(state, commands, record_session)
-                # hookup the view ui
-                view.connect(connection)
+                    # hookup the view ui
+                    view.connect(connection)
-                # run sandbox service
-                return await run_human_agent_service(state, commands, view)
+                    # run sandbox service
+                    return await run_human_agent_service(state, commands, view)
             # support both fullscreen ui and fallback
             if display_type() == "full":

inspect_ai/solver/_human_agent/commands/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from .instructions import InstructionsCommand
 from .note import NoteCommand
 from .score import ScoreCommand
 from .status import StatusCommand
-from .submit import SubmitCommand, ValidateCommand
+from .submit import QuitCommand, SubmitCommand, ValidateCommand
 def human_agent_commands(
@@ -15,8 +15,12 @@ def human_agent_commands(
     intermediate_scoring: bool,
     record_session: bool,
 ) -> list[HumanAgentCommand]:
-    # base submit and validate
-    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+    # base submit, validate, and quit
+    commands = [
+        SubmitCommand(record_session),
+        ValidateCommand(answer),
+        QuitCommand(record_session),
+    ]
     # optional intermediate scoring
     if intermediate_scoring:

inspect_ai/solver/_human_agent/commands/submit.py CHANGED Viewed

@@ -16,22 +16,89 @@ from .command import HumanAgentCommand, call_human_agent
 logger = getLogger(__name__)
-class SubmitCommand(HumanAgentCommand):
+class SessionEndCommand(HumanAgentCommand):
     def __init__(self, record_session: bool):
         super().__init__()
         self._record_session = record_session
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    async def _read_session_logs(self) -> dict[str, str]:
+        # retreive session logs (don't fail)
+        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
+        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
+        if not result.success:
+            logger.warning(f"Error listing human agent session logs: {result.stderr}")
+            return {}
+        # read logs
+        session_logs: dict[str, str] = {}
+        for session_log in result.stdout.strip().splitlines():
+            try:
+                session_logs[session_log] = await sandbox().read_file(
+                    (sessions_dir / session_log).as_posix()
+                )
+            except Exception as ex:
+                logger.warning(f"Error reading human agent session log: {ex}")
+        return session_logs
+class QuitCommand(SessionEndCommand):
     @property
     def name(self) -> str:
-        return "submit"
+        return "quit"
     @property
     def description(self) -> str:
-        return "Submit your final answer for the task."
+        return "Quit the task without submitting an answer."
+    def cli(self, args: Namespace) -> None:
+        # verify that the user wants to proceed
+        action = "quit the task without submitting an answer (ending the exercise)"
+        while True:
+            response = (
+                input(
+                    f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                )
+                .lower()
+                .strip()
+            )
+            if response in ["yes", "y"]:
+                break
+            elif response in ["no", "n"]:
+                return
+            else:
+                print("Please enter yes or no.")
+        # thank the user!
+        print(
+            "\nThank you for working on this task!\n\n"
+            + "Your task will now be scored and you will be disconnected from this container.\n"
+        )
+        call_human_agent("quit")
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def submit() -> None:
+            if self._record_session:
+                state.logs = await self._read_session_logs()
+            state.running = False
+            state.answer = ""
+        return submit
+class SubmitCommand(SessionEndCommand):
     @property
-    def group(self) -> Literal[1, 2, 3]:
-        return 1
+    def name(self) -> str:
+        return "submit"
+    @property
+    def description(self) -> str:
+        return "Submit your final answer for the task."
     @property
     def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
@@ -55,10 +122,12 @@ class SubmitCommand(HumanAgentCommand):
         # verify that the user wants to proceed
         answer = call_args.get("answer", None)
         answer_text = f" '{answer}'" if answer else ""
+        action = f"end the task and submit{answer_text}"
         while True:
             response = (
                 input(
-                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                    f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
                 )
                 .lower()
                 .strip()
@@ -76,13 +145,10 @@ class SubmitCommand(HumanAgentCommand):
             + "Your task will now be scored and you will be disconnected from this container.\n"
         )
-        # submit the task
         call_human_agent("submit", **call_args)
     def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
-        async def submit(
-            answer: str | None, session_logs: dict[str, str] | None = None
-        ) -> None:
+        async def submit(answer: str) -> None:
             if self._record_session:
                 state.logs = await self._read_session_logs()
             state.running = False
@@ -90,26 +156,6 @@ class SubmitCommand(HumanAgentCommand):
         return submit
-    async def _read_session_logs(self) -> dict[str, str]:
-        # retreive session logs (don't fail)
-        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
-        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
-        if not result.success:
-            logger.warning(f"Error listing human agent session logs: {result.stderr}")
-            return {}
-        # read logs
-        session_logs: dict[str, str] = {}
-        for session_log in result.stdout.strip().splitlines():
-            try:
-                session_logs[session_log] = await sandbox().read_file(
-                    (sessions_dir / session_log).as_posix()
-                )
-            except Exception as ex:
-                logger.warning(f"Error reading human agent session log: {ex}")
-        return session_logs
 class ValidateCommand(HumanAgentCommand):
     def __init__(self, answer: bool | str) -> None:

inspect_ai/solver/_limit.py CHANGED Viewed

@@ -7,15 +7,15 @@ class SampleLimitExceededError(Exception):
     """Exception raised when a sample limit is exceeded.
     Args:
-       type (Literal["message", "time", "token", "operator"]): Type of limit exceeded.
-       value (int): Value compared to
-       limit (int): Limit applied.
+       type: Type of limit exceeded.
+       value: Value compared to
+       limit: Limit applied.
        message (str | None): Optional. Human readable message.
     """
     def __init__(
         self,
-        type: Literal["message", "time", "token", "operator", "custom"],
+        type: Literal["message", "time", "working", "token", "operator", "custom"],
         *,
         value: int,
         limit: int,

inspect_ai/solver/_plan.py CHANGED Viewed

@@ -118,9 +118,6 @@ class Plan(Solver):
                     st.complete(state)
                 check_sample_interrupt()
-            # mark completed
-            state.completed = True
         finally:
             # always do cleanup if we have one
             if self.cleanup:

inspect_ai/solver/_task_state.py CHANGED Viewed

@@ -7,6 +7,7 @@ from random import Random
 from typing import Any, Iterable, SupportsIndex, Type, Union, cast, overload
 from pydantic_core import to_jsonable_python
+from shortuuid import uuid
 from inspect_ai._util.interrupt import check_sample_interrupt
 from inspect_ai.dataset._dataset import MT, Sample, metadata_as
@@ -165,6 +166,7 @@ class TaskState:
         self._token_limit = token_limit
         self._completed = completed
         self._store = Store()
+        self._uuid = uuid()
         if choices:
             self.choices = Choices(choices)
@@ -373,6 +375,11 @@ class TaskState:
     scores: dict[str, Score] | None = None
     """Scores yielded by running task."""
+    @property
+    def uuid(self) -> str:
+        """Globally unique identifier for sample run."""
+        return self._uuid
     def metadata_as(self, metadata_cls: Type[MT]) -> MT:
         """Pydantic model interface to metadata.

inspect_ai/tool/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from inspect_ai._util.content import (
     Content,
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -41,6 +42,7 @@ __all__ = [
     "Content",
     "ContentAudio",
     "ContentImage",
+    "ContentReasoning",
     "ContentText",
     "ContentVideo",
     "ToolCall",

inspect_ai/tool/_tool.py CHANGED Viewed

@@ -14,6 +14,7 @@ from typing import (
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -35,10 +36,11 @@ ToolResult = (
     | float
     | bool
     | ContentText
+    | ContentReasoning
     | ContentImage
     | ContentAudio
     | ContentVideo
-    | list[ContentText | ContentImage | ContentAudio | ContentVideo]
+    | list[ContentText | ContentReasoning | ContentImage | ContentAudio | ContentVideo]
 )
 """Valid types for results from tool calls."""

inspect_ai/tool/_tools/_computer/_resources/tool/_run.py CHANGED Viewed

@@ -32,7 +32,7 @@ async def run(
             maybe_truncate(stdout.decode(), truncate_after=truncate_after),
             maybe_truncate(stderr.decode(), truncate_after=truncate_after),
         )
-    except asyncio.TimeoutError as exc:
+    except (TimeoutError, asyncio.TimeoutError) as exc:
         try:
             process.kill()
         except ProcessLookupError:

inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc ADDED Viewed

@@ -0,0 +1,8 @@
+[MASTER]
+; R - Refactorings
+; C - Convention
+; W - Warning
+; E - Error
+enable=C,R,W,E
+disable=R0903,C0114,C0115,C0116,C0301,C0411,C1804,C1805,W0120,W0511,E0401,E1101,E0611,E1128
+score=no

inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json ADDED Viewed

@@ -0,0 +1,24 @@
+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "type": "debugpy",
+      "request": "launch",
+      "name": "Debug Web Server",
+      "program": "${workspaceFolder}/web_server.py"
+    },
+    {
+      "type": "debugpy",
+      "request": "launch",
+      "name": "Debug Web Client interactive mode",
+      "program": "${workspaceFolder}/web_client.py"
+    },
+    {
+      "type": "debugpy",
+      "request": "launch",
+      "name": "Debug Web Client w/arguments",
+      "program": "${workspaceFolder}/web_client.py",
+      "args": ["${command:pickArgs}"]
+    }
+  ]
+}

inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+  "cSpell.words": [
+    "activedescendant",
+    "describedby",
+    "domcontentloaded",
+    "figcaption",
+    "flowto",
+    "framenavigated",
+    "headful",
+    "idref",
+    "jsonrpcclient",
+    "jsonrpcserver",
+    "keepalive",
+    "keyshortcuts",
+    "labelfor",
+    "labelledby",
+    "labelwrapped",
+    "multiselectable",
+    "Rects",
+    "roledescription",
+    "rubyannotation",
+    "tablecaption",
+    "valuetext"
+  ]
+}

inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile CHANGED Viewed

@@ -8,16 +8,15 @@ RUN apt-get update
 RUN pip install --upgrade pip
+RUN pip install playwright jsonrpcclient jsonrpcserver httpx aiohttp pillow pydantic tenacity
 # Install playwright
-RUN pip install playwright
 RUN playwright install
 RUN playwright install-deps
-# Install other dependancies
-RUN pip install dm-env-rpc pillow bs4 lxml
 # Copy Python files alongside the Dockerfile
-COPY *.py ./
+COPY . .
 # Run the server
-CMD ["python3", "/app/web_browser/web_server.py"]
+CMD ["python3", "/app/web_browser/web_server.py"]
+# CMD ["tail", "-f", "/dev/null"]

inspect_ai/tool/_tools/_web_browser/_resources/README.md CHANGED Viewed

@@ -1,7 +1,6 @@
 ## Headless Browser Tool
-This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
+This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
 ### Usage
@@ -37,27 +36,27 @@ The result will be printed out in _stdout_ in the following format:
 ```
 # Inside the Docker container
-error: <an ERROR message if one occured>
+error: <an ERROR message if one occurred>
 info: <general info about the container>
 web_url: <the URL of the page the browser is currently at>
 web_at: <accessibility tree of the visible elements of the page>
-```
+```
 ### Design
 The following diagram describes the design and the intended usage of the tool:
-![diagram](images/usage_diagram.png)
+![diagram](images/usage_diagram.svg)
 The tool consists of the following components:
-* [WebServer](web_server.py) - a server which launches a stateful session with the headless chromium browser and interracts with it through the [Playwright API](https://playwright.dev/python/docs/intro) upon receiving client commands. The server components are:
-  * _dm_env_servicer.py_ - an implementation for the gRPC Service based on [dm_env_rpc protocol](https://github.com/google-deepmind/dm_env_rpc).
-  * _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
-  * _playwright_crawler.py_ - a wrapper over the sync Playwright API.
+- [WebServer](web_server.py) - a server which launches a stateful session with the headless chromium browser and interacts with it through the [Playwright API](https://playwright.dev/python/docs/intro) upon receiving client commands. The server components are:
+  - _dm_env_servicer.py_ - an implementation for the gRPC Service based on [dm_env_rpc protocol](https://github.com/google-deepmind/dm_env_rpc).
+  - _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
+  - _playwright_crawler.py_ - a wrapper over the sync Playwright API.
-* [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
+- [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
   1. creates a connection with the server;
   2. sends user command to the server;
   3. receives the response in the form of observations and prints them to stdout;

inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py ADDED Viewed

@@ -0,0 +1,71 @@
+from functools import reduce
+from typing import Iterable, TypedDict
+from accessibility_tree_node import AccessibilityTreeNode
+from cdp.a11y import AXNode, AXNodeId
+from cdp.dom_snapshot import DOMSnapshot, create_snapshot_context
+from rectangle import Rectangle
+_AccType = tuple[
+    AXNode | None,
+    dict[AXNodeId, AXNode],
+]
+class AccessibilityTree(TypedDict):
+    root: AccessibilityTreeNode
+    nodes: dict[AXNodeId, AccessibilityTreeNode]
+def create_accessibility_tree(
+    *,
+    ax_nodes: Iterable[AXNode],
+    dom_snapshot: DOMSnapshot,
+    device_scale_factor: float,
+    window_bounds: Rectangle,
+) -> AccessibilityTree | None:
+    """
+    Creates an accessibility tree from the given Chrome DevTools Protocol AX nodes and DOM snapshot.
+    Args:
+      ax_nodes (Iterable[AXNode]): An iterable of AXNode objects representing the accessibility nodes.
+      dom_snapshot (DOMSnapshot): A snapshot of the DOM at the time of accessibility tree creation.
+      device_scale_factor (float): The scale factor of the device.
+      window_bounds (Bounds): The bounds of the window.
+    Returns:
+      AccessibilityTree: The accessibility tree.
+    """
+    # first make a dict of AXNodeId's to AXNode's and find the root on the way
+    def reducer(acc: _AccType, ax_node: AXNode) -> _AccType:
+        root_node, nodes = acc
+        nodes[ax_node.nodeId] = ax_node
+        return (
+            # TODO: What do we want for multiple roots?
+            root_node or (ax_node if ax_node.parentId is None else None),
+            nodes,
+        )
+    initial_acc: _AccType = (None, {})  # The inference engine is weak
+    root_node, nodes = reduce(reducer, ax_nodes, initial_acc)
+    if not root_node:
+        return None
+    # Now create the AccessibilityTreeNode hierarchy
+    snapshot_context = create_snapshot_context(dom_snapshot)
+    all_accessibility_tree_nodes: dict[AXNodeId, AccessibilityTreeNode] = {}
+    return AccessibilityTree(
+        root=AccessibilityTreeNode(
+            ax_node=root_node,
+            ax_nodes=nodes,
+            parent=None,
+            all_accessibility_tree_nodes=all_accessibility_tree_nodes,
+            snapshot_context=snapshot_context,
+            device_scale_factor=device_scale_factor,
+            window_bounds=window_bounds,
+        ),
+        nodes=all_accessibility_tree_nodes,
+    )

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl