inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +7 -3
  3. inspect_ai/_cli/eval.py +17 -2
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +4 -3
  6. inspect_ai/_display/core/config.py +3 -3
  7. inspect_ai/_display/core/panel.py +7 -3
  8. inspect_ai/_display/plain/__init__.py +0 -0
  9. inspect_ai/_display/plain/display.py +203 -0
  10. inspect_ai/_display/rich/display.py +4 -9
  11. inspect_ai/_display/textual/app.py +4 -1
  12. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  13. inspect_ai/_display/textual/widgets/samples.py +119 -16
  14. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  15. inspect_ai/_eval/eval.py +32 -20
  16. inspect_ai/_eval/evalset.py +7 -5
  17. inspect_ai/_eval/score.py +1 -0
  18. inspect_ai/_eval/task/__init__.py +2 -2
  19. inspect_ai/_eval/task/images.py +40 -25
  20. inspect_ai/_eval/task/results.py +50 -22
  21. inspect_ai/_eval/task/run.py +180 -124
  22. inspect_ai/_eval/task/sandbox.py +10 -5
  23. inspect_ai/_eval/task/task.py +140 -25
  24. inspect_ai/_util/constants.py +2 -0
  25. inspect_ai/_util/content.py +23 -1
  26. inspect_ai/_util/images.py +20 -17
  27. inspect_ai/_util/kvstore.py +73 -0
  28. inspect_ai/_util/notgiven.py +18 -0
  29. inspect_ai/_util/port_names.py +61 -0
  30. inspect_ai/_util/text.py +23 -0
  31. inspect_ai/_util/thread.py +5 -0
  32. inspect_ai/_view/www/App.css +31 -1
  33. inspect_ai/_view/www/dist/assets/index.css +31 -1
  34. inspect_ai/_view/www/dist/assets/index.js +25375 -1846
  35. inspect_ai/_view/www/log-schema.json +129 -15
  36. inspect_ai/_view/www/package.json +2 -0
  37. inspect_ai/_view/www/src/App.mjs +8 -10
  38. inspect_ai/_view/www/src/Types.mjs +0 -1
  39. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  40. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  41. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  42. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  43. inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
  44. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  45. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  46. inspect_ai/_view/www/src/index.js +75 -2
  47. inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
  48. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
  49. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  50. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  51. inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
  52. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  53. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
  54. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
  55. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  56. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  57. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  58. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  59. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  60. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  61. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  62. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  63. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  64. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  65. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  66. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  67. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  68. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  69. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  70. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  71. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  72. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  73. inspect_ai/_view/www/src/types/log.d.ts +62 -27
  74. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  75. inspect_ai/_view/www/src/utils/Json.mjs +12 -6
  76. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
  77. inspect_ai/_view/www/vite.config.js +7 -0
  78. inspect_ai/_view/www/yarn.lock +116 -0
  79. inspect_ai/approval/_human/__init__.py +0 -0
  80. inspect_ai/approval/_human/util.py +2 -2
  81. inspect_ai/approval/_policy.py +12 -6
  82. inspect_ai/dataset/_sources/csv.py +2 -1
  83. inspect_ai/dataset/_sources/json.py +2 -1
  84. inspect_ai/dataset/_sources/util.py +15 -7
  85. inspect_ai/log/_condense.py +11 -1
  86. inspect_ai/log/_log.py +3 -6
  87. inspect_ai/log/_recorders/eval.py +19 -8
  88. inspect_ai/log/_samples.py +26 -5
  89. inspect_ai/log/_transcript.py +32 -2
  90. inspect_ai/model/__init__.py +10 -2
  91. inspect_ai/model/_call_tools.py +59 -12
  92. inspect_ai/model/_chat_message.py +2 -4
  93. inspect_ai/model/_conversation.py +61 -0
  94. inspect_ai/model/_generate_config.py +10 -4
  95. inspect_ai/model/_model.py +117 -18
  96. inspect_ai/model/_model_output.py +7 -2
  97. inspect_ai/model/_providers/anthropic.py +109 -51
  98. inspect_ai/model/_providers/azureai.py +26 -24
  99. inspect_ai/model/_providers/bedrock.py +43 -44
  100. inspect_ai/model/_providers/google.py +121 -58
  101. inspect_ai/model/_providers/groq.py +7 -5
  102. inspect_ai/model/_providers/hf.py +11 -6
  103. inspect_ai/model/_providers/mistral.py +17 -20
  104. inspect_ai/model/_providers/openai.py +32 -21
  105. inspect_ai/model/_providers/openai_o1.py +9 -8
  106. inspect_ai/model/_providers/providers.py +1 -1
  107. inspect_ai/model/_providers/together.py +8 -8
  108. inspect_ai/model/_providers/vertex.py +18 -8
  109. inspect_ai/scorer/__init__.py +13 -2
  110. inspect_ai/scorer/_metrics/__init__.py +2 -2
  111. inspect_ai/scorer/_metrics/std.py +3 -3
  112. inspect_ai/scorer/_reducer/reducer.py +1 -1
  113. inspect_ai/scorer/_scorer.py +2 -2
  114. inspect_ai/solver/__init__.py +2 -5
  115. inspect_ai/solver/_prompt.py +35 -5
  116. inspect_ai/solver/_task_state.py +80 -38
  117. inspect_ai/tool/__init__.py +11 -1
  118. inspect_ai/tool/_tool.py +21 -3
  119. inspect_ai/tool/_tool_call.py +10 -0
  120. inspect_ai/tool/_tool_def.py +16 -5
  121. inspect_ai/tool/_tool_with.py +21 -4
  122. inspect_ai/tool/beta/__init__.py +5 -0
  123. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  124. inspect_ai/tool/beta/_computer/_common.py +133 -0
  125. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  126. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  127. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  128. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  129. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  130. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  131. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  132. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  133. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  134. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  135. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  136. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  137. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  138. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  139. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  140. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  141. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  142. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  143. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  144. inspect_ai/util/__init__.py +2 -3
  145. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  146. inspect_ai/util/_display.py +14 -4
  147. inspect_ai/util/_limit.py +26 -0
  148. inspect_ai/util/_sandbox/context.py +12 -13
  149. inspect_ai/util/_sandbox/docker/compose.py +24 -11
  150. inspect_ai/util/_sandbox/docker/docker.py +84 -14
  151. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  152. inspect_ai/util/_sandbox/environment.py +27 -1
  153. inspect_ai/util/_sandbox/local.py +1 -0
  154. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
  155. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
  156. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  157. inspect_ai/model/_trace.py +0 -48
  158. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
  159. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
  160. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
  161. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,18 @@
1
#!/bin/bash
set -e

# Clear marker files that may be left over from a previous start.
rm -f /tmp/.X${DISPLAY_NUM}-lock
rm -f /tmp/xfce_started

# Start the desktop stack in dependency order; `set -e` aborts the
# entrypoint as soon as one of the startup scripts fails.
for service in xvfb xfce x11vnc novnc; do
    /opt/inspect/entrypoint/${service}_startup.sh
done

# Hand control over to the CMD of the derived image, if there is one.
echo "Executing CMD from derived Dockerfile: $@"
exec "$@"

# Reached only when `exec` did not replace this shell (a bare `exec` with
# no arguments is a no-op): block forever so the container stays up.
tail -f /dev/null
@@ -0,0 +1,20 @@
1
#!/bin/bash
echo "starting noVNC"

# Start noVNC with explicit websocket settings.
# websockify serves the noVNC web client on port 6080 and proxies it to the
# VNC server on localhost:5900; all of its output goes to /tmp/novnc.log.
websockify \
    --web=/usr/share/novnc/ \
    6080 localhost:5900 \
    > /tmp/novnc.log 2>&1 &

# Wait (up to 10 one-second polls) for the listening socket to appear.
timeout=10
while [ $timeout -gt 0 ]; do
    if netstat -tuln | grep -q ":6080 "; then
        break
    fi
    sleep 1
    ((timeout--))
done

# The loop only reaches 0 when the port never came up. Report the failure
# (mirroring x11vnc_startup.sh) instead of claiming success.
if [ $timeout -eq 0 ]; then
    echo "noVNC failed to start, log output:" >&2
    cat /tmp/novnc.log >&2
    exit 1
fi

echo "noVNC started successfully"
@@ -0,0 +1,48 @@
1
#!/bin/bash
echo "starting vnc"

# Run x11vnc in a background subshell; only its stderr is captured.
(
    x11vnc -display $DISPLAY \
        -forever \
        -shared \
        -wait 50 \
        -cursor most \
        -cursor arrow \
        -rfbport 5900 \
        -nopw \
        2>/tmp/x11vnc_stderr.log
) &

x11vnc_pid=$!

# Poll once per second, at most 10 times, until port 5900 is listening.
timeout=10
until [ $timeout -le 0 ] || netstat -tuln | grep -q ":5900 "; do
    sleep 1
    ((timeout--))
done

# A counter of 0 means every poll failed: dump the captured stderr and bail.
if [ $timeout -eq 0 ]; then
    echo "x11vnc failed to start, stderr output:" >&2
    cat /tmp/x11vnc_stderr.log >&2
    exit 1
fi

# Startup succeeded; empty the log so a later crash report starts clean.
: > /tmp/x11vnc_stderr.log

# Background watchdog: `kill -0` only probes whether the PID still exists.
# Once it is gone, print whatever stderr was captured and re-run this script.
(
    while kill -0 $x11vnc_pid 2>/dev/null; do
        sleep 5
    done
    echo "x11vnc process crashed, restarting..." >&2
    if [ -f /tmp/x11vnc_stderr.log ]; then
        echo "x11vnc stderr output:" >&2
        cat /tmp/x11vnc_stderr.log >&2
        rm /tmp/x11vnc_stderr.log
    fi
    exec "$0"
) &
@@ -0,0 +1,13 @@
1
#!/bin/bash

echo "starting XFCE4"
startxfce4 &

# Block until the session manager shows up in the process table.
until pgrep -x "xfce4-session" > /dev/null; do
    echo "Waiting for XFCE4 to start..."
    sleep 1
done

echo "XFCE4 is fully started!"
# Marker file: entrypoint.sh removes it on start and computer_tool.py waits
# for it before running any action.
touch /tmp/xfce_started
13
+
@@ -0,0 +1,48 @@
1
#!/bin/bash
set -e  # Exit on error

DPI=96
RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24

# Succeeds (status 0) when the X lock file for this display exists, which is
# taken to mean Xvfb is already running.
check_xvfb_running() {
    [ -e /tmp/.X${DISPLAY_NUM}-lock ]
}

# Polls xdpyinfo every 0.1s until it succeeds; fails once more than
# $timeout seconds have elapsed.
wait_for_xvfb() {
    local timeout=10
    local start_time=$(date +%s)
    until xdpyinfo >/dev/null 2>&1; do
        if [ $(($(date +%s) - start_time)) -gt $timeout ]; then
            echo "Xvfb failed to start within $timeout seconds" >&2
            return 1
        fi
        sleep 0.1
    done
    return 0
}

# Nothing to do when a server already owns the display.
if check_xvfb_running; then
    echo "Xvfb is already running on display ${DISPLAY}"
    exit 0
fi

# Launch the virtual framebuffer in the background and remember its PID.
Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix &
XVFB_PID=$!

# Give up (and clean up the server process) if it never becomes reachable.
if ! wait_for_xvfb; then
    echo "Xvfb failed to start"
    kill $XVFB_PID
    exit 1
fi

echo "Xvfb started successfully on display ${DISPLAY}"
echo "Xvfb PID: $XVFB_PID"
@@ -0,0 +1,10 @@
1
+ [Desktop Entry]
2
+ Version=1.0
3
+ Type=Application
4
+ Name=Firefox Web Browser
5
+ Comment=Browse the World Wide Web
6
+ Exec=firefox-esr %u
7
+ Icon=firefox-esr
8
+ Path=
9
+ Terminal=false
10
+ StartupNotify=true
@@ -0,0 +1,10 @@
1
+ [Desktop Entry]
2
+ Version=1.0
3
+ Type=Application
4
+ Name=Visual Studio Code
5
+ Comment=Code Editing. Redefined.
6
+ Exec=/usr/share/code/code %F
7
+ Icon=vscode
8
+ Path=
9
+ Terminal=false
10
+ StartupNotify=false
@@ -0,0 +1,10 @@
1
+ [Desktop Entry]
2
+ Version=1.0
3
+ Type=Application
4
+ Name=XPaint
5
+ Comment=Xpaint painting application
6
+ Exec=xpaint
7
+ Icon=xpaint
8
+ Path=
9
+ Terminal=false
10
+ StartupNotify=false
@@ -0,0 +1,22 @@
1
+ import logging
2
+
3
+
4
def setup_logger(level=logging.INFO):
    """
    Return the "computer_tool" logger, which emits all of its output to PID 1's stdout.

    This makes it so that logging from invocations of the computer_tool cli show up in `docker logs` output.

    The file handler is created only when the logger has no handler yet, so
    repeated calls neither reopen ``/proc/1/fd/1`` nor leave an unused open
    handler behind.

    Args:
        level: Logging level applied to the logger and, when one is created,
            to its handler.

    Returns:
        The configured ``logging.Logger`` named "computer_tool".
    """
    new_logger = logging.getLogger("computer_tool")
    new_logger.setLevel(level)

    if not new_logger.handlers:
        # /proc/1/fd/1 is the stdout of the container's init process.
        stdout_handler = logging.FileHandler("/proc/1/fd/1", mode="w")
        stdout_handler.setLevel(level)
        stdout_handler.setFormatter(
            logging.Formatter("%(name)s(pid=%(process)d) - %(levelname)s - %(message)s")
        )
        new_logger.addHandler(stdout_handler)

    return new_logger
@@ -0,0 +1,42 @@
1
+ """Utility to run shell commands asynchronously with a timeout."""
2
+
3
+ import asyncio
4
+
5
+ TRUNCATED_MESSAGE: str = "<response clipped><NOTE>To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for.</NOTE>"
6
+ MAX_RESPONSE_LEN: int = 16000
7
+
8
+
9
+ def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN):
10
+ """Truncate content and append a notice if content exceeds the specified length."""
11
+ return (
12
+ content
13
+ if not truncate_after or len(content) <= truncate_after
14
+ else content[:truncate_after] + TRUNCATED_MESSAGE
15
+ )
16
+
17
+
18
async def run(
    cmd: str,
    timeout: float | None = 120.0,  # seconds
    truncate_after: int | None = MAX_RESPONSE_LEN,
):
    """Run a shell command asynchronously with a timeout.

    Args:
        cmd: Command line handed to the shell.
        timeout: Seconds to wait for completion; ``None`` waits indefinitely.
        truncate_after: Maximum length of each returned stream (see
            `maybe_truncate`); ``None`` disables clipping.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple; the streams are decoded to
        ``str`` and possibly clipped.

    Raises:
        TimeoutError: If the command does not finish within `timeout`. The
            child is killed and reaped before the error is raised.
    """
    process = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    try:
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
    except asyncio.TimeoutError as exc:
        try:
            process.kill()
        except ProcessLookupError:
            # The process already exited on its own; nothing left to kill.
            pass
        else:
            # Reap the killed child so it does not linger as a zombie.
            await process.wait()
        raise TimeoutError(
            f"Command '{cmd}' timed out after {timeout} seconds"
        ) from exc

    return (
        process.returncode or 0,
        maybe_truncate(stdout.decode(), truncate_after=truncate_after),
        maybe_truncate(stderr.decode(), truncate_after=truncate_after),
    )
@@ -0,0 +1,33 @@
1
+ from dataclasses import dataclass, fields, replace
2
+
3
+
4
+ @dataclass(kw_only=True, frozen=True)
5
+ class ToolResult:
6
+ """Represents the result of a tool execution."""
7
+
8
+ output: str | None = None
9
+ error: str | None = None
10
+ base64_image: str | None = None
11
+
12
+ def __bool__(self):
13
+ return any(getattr(self, field.name) for field in fields(self))
14
+
15
+ def __add__(self, other: "ToolResult"):
16
+ def combine_fields(
17
+ field: str | None, other_field: str | None, concatenate: bool = True
18
+ ):
19
+ if field and other_field:
20
+ if concatenate:
21
+ return field + other_field
22
+ raise ValueError("Cannot combine tool results")
23
+ return field or other_field
24
+
25
+ return ToolResult(
26
+ output=combine_fields(self.output, other.output),
27
+ error=combine_fields(self.error, other.error),
28
+ base64_image=combine_fields(self.base64_image, other.base64_image, False),
29
+ )
30
+
31
+ def replace(self, **kwargs):
32
+ """Returns a new ToolResult with the given fields replaced."""
33
+ return replace(self, **kwargs)
@@ -0,0 +1,262 @@
1
+ """Based on https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py"""
2
+
3
+ import asyncio
4
+ import base64
5
+ import logging
6
+ import os
7
+ import shlex
8
+ from pathlib import Path
9
+ from typing import Literal, TypedDict
10
+ from uuid import uuid4
11
+
12
+ from _run import run
13
+ from _tool_result import ToolResult
14
+
15
# Directory where screenshot files are written.
OUTPUT_DIR = "/tmp/outputs"

# Value passed to `xdotool type --delay`.
TYPING_DELAY_MS = 12
# Text to type is split into pieces of at most this many characters, one
# `xdotool type` invocation per piece.
TYPING_GROUP_SIZE = 50

# Palette sizes accepted for the `-colors` option of the screenshot conversion.
ColorCount = Literal[4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4]

# Every action name the client understands.
Action = Literal[
    "key", "type",
    "mouse_move", "left_click", "left_click_drag",
    "right_click", "middle_click", "double_click",
    "screenshot", "cursor_position",
]
34
+
35
+
36
class ToolError(Exception):
    """Error raised when a tool action is rejected or cannot be completed."""

    def __init__(self, message):
        # BaseException already records the argument in `args`; passing it on
        # explicitly keeps that visible, and `message` stays available too.
        super().__init__(message)
        self.message = message
39
+
40
+
41
class Resolution(TypedDict):
    """Width and height of a display, in pixels."""

    width: int
    height: int


# sizes above XGA/WXGA are not recommended (see README.md)
# scale down to one of these targets if X11Client._scaling_enabled is set
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": {"width": 1024, "height": 768},  # 4:3
    "WXGA": {"width": 1280, "height": 800},  # 16:10
    "FWXGA": {"width": 1366, "height": 768},  # ~16:9
}
53
+
54
+
55
+ ScalingSource = Literal["computer", "api"]
56
+
57
+
58
+ class ComputerToolOptions(TypedDict):
59
+ display_height_px: int
60
+ display_width_px: int
61
+ display_number: int | None
62
+
63
+
64
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split `s` into consecutive pieces of at most `chunk_size` characters."""
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        pieces.append(s[start : start + chunk_size])
    return pieces
66
+
67
+
68
class X11Client:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.

    The tool parameters are defined by Anthropic and are not editable.

    Input is driven through `xdotool`; screenshots are taken with `scrot` and
    resized with `convert`. Screen size and display number are read from the
    WIDTH, HEIGHT and DISPLAY_NUM environment variables.
    """

    width: int
    height: int
    display_num: int | None
    # TODO: Complete plumbing this or remove it
    color_count: ColorCount | None = 256

    # Seconds to wait after an action before capturing the follow-up screenshot.
    _screenshot_delay = 2.0
    # When set, coordinates and screenshots are scaled to a MAX_SCALING_TARGETS entry.
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Display size (after scaling) and display number."""
        width, height = self.scale_coordinates("computer", self.width, self.height)
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            # Prefixed to shell commands so they target the right X display.
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        self.xdotool = f"{self._display_prefix}xdotool"

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Perform one action and return its ToolResult.

        Args:
            action: One of the `Action` names.
            text: Required for "key" and "type"; rejected for every other action.
            coordinate: Required for "mouse_move" and "left_click_drag" as a
                two-element list or tuple of non-negative ints (API
                coordinate space); rejected for every other action.
            **kwargs: Accepted and ignored.

        Raises:
            ToolError: On invalid arguments, an unknown action, or when the
                cursor position cannot be read.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            # Accept both list (what argparse produces) and tuple (what the
            # signature advertises).
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates("api", coordinate[0], coordinate[1])

            if action == "mouse_move":
                return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
            elif action == "left_click_drag":
                return await self.shell(
                    f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
                )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError takes a positional message; the former `output=`
                # keyword raised TypeError instead of ToolError.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                return await self.shell(
                    f"{self.xdotool} key -- {' '.join(shlex.quote(part) for part in text.split())}"
                )
            elif action == "type":
                # Type in groups, then take a single screenshot at the end.
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                    results.append(await self.shell(cmd, take_screenshot=False))

                screenshot_base64 = await self.take_screenshot_after_delay()
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                result = await self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                # The values are parsed out of `X=<n>` / `Y=<n>` lines.
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError) as exc:
                    # Surface the command's own error text instead of a bare
                    # IndexError/ValueError when the output is missing or malformed.
                    raise ToolError(
                        f"Failed to read cursor position: {result.error or output}"
                    ) from exc
                x, y = self.scale_coordinates("computer", raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")
            else:
                click_arg = {
                    "left_click": "1",
                    "right_click": "3",
                    "middle_click": "2",
                    "double_click": "--repeat 2 --delay 300 1",
                }[action]
                return await self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image.

        Raises:
            ToolError: If no screenshot file was produced.
        """
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        result = await self.shell(
            f"{self._display_prefix}scrot --silent -p {path}", take_screenshot=False
        )
        if self._scaling_enabled:
            # Resize in place to the scaled size (and reduce the palette when
            # color_count is set).
            x, y = self.scale_coordinates("computer", self.width, self.height)
            convert_cmd = f"convert {path} -resize {x}x{y}!"
            if self.color_count is not None:
                convert_cmd += f" -colors {self.color_count}"
            convert_cmd += f" {path}"
            await self.shell(convert_cmd, take_screenshot=False)

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        logging.debug(f"running shell command {command}")
        _, stdout, stderr = await run(command)
        logging.debug(f"shell command returned stdout: {stdout}, stderr: {stderr}")
        return ToolResult(
            output=stdout,
            error=stderr,
            base64_image=(await self.take_screenshot_after_delay())
            if take_screenshot
            else None,
        )

    async def take_screenshot_after_delay(self) -> str:
        """Sleep for `_screenshot_delay` seconds, then return a fresh base64 screenshot."""
        # delay to let things settle before taking a screenshot
        await asyncio.sleep(self._screenshot_delay)
        return (await self.screenshot()).base64_image

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        Args:
            source: "api" scales the values up (divides by the scaling
                factors); "computer" scales them down (multiplies).
            x: Horizontal value to convert.
            y: Vertical value to convert.

        Returns:
            The ``(x, y)`` pair, unchanged when scaling is disabled or no
            target resolution applies.

        Raises:
            ToolError: If `source` is "api" and a value exceeds the screen size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                # Only the first target with a matching aspect ratio is considered.
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == "api":
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
@@ -0,0 +1,85 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import logging
5
+ import os
6
+ import sys
7
+ import time
8
+
9
+ from _logger import setup_logger
10
+ from _tool_result import ToolResult
11
+ from _x11_client import X11Client
12
+
13
+ # This is a bit sketchy. We really want to use relative imports here. Using absolute imports
14
+ # works at runtime, but it prevents intellisense from working. However, when this folder is
15
+ # copied to the container, by default relative imports won't work if this file is launched
16
+ # normally. To overcome this, two things need to happen:
17
+ # 1. PYTHONPATH must be set to the parent of the container folder. `PYTHONPATH=/opt`
18
+ # 2. The program must be launched with the -m flag. `python3 -m computer_tool.computer_tool`
19
+ #
20
+ # TODO: There's got to be a cleaner way.
21
+
22
+ my_logger = setup_logger(logging.INFO)
23
+
24
+
25
def main():
    """CLI entry point: run one action and print its result as JSON on stdout.

    Any exception is logged, echoed to stderr, and turned into exit status 1.
    """
    try:
        args = parse_arguments()
        my_logger.info(f"({args})")
        result = asyncio.run(execute_action(args))

        payload = {
            key: getattr(result, key) for key in ("output", "error", "base64_image")
        }
        print(json.dumps(payload))
        my_logger.debug("SUCCESS")
    except Exception as e:
        failure = f"An error occurred: {e}"
        my_logger.warning(failure)
        print(failure, file=sys.stderr)
        sys.exit(1)
45
+
46
+
47
def parse_arguments():
    """Parse the command line: `--action` (required), `--text`, `--coordinate X Y`."""
    cli = argparse.ArgumentParser(description="Execute computer tool action")
    option_specs = (
        ("--action", {"type": str, "required": True, "help": "Action to perform"}),
        ("--text", {"type": str, "help": "Optional text parameter"}),
        (
            "--coordinate",
            {
                "type": int,
                "nargs": 2,
                "help": "Optional coordinate parameter as a list of two integers",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
58
+
59
+
60
async def execute_action(args) -> ToolResult:
    """Wait for the desktop to be ready, then run the requested action."""
    # Nothing works until X11 is up; the marker file signals that XFCE has started.
    await wait_for_file("/tmp/xfce_started")

    client = X11Client()
    # An absent --coordinate is forwarded as None.
    coordinate = args.coordinate or None
    return await client(action=args.action, text=args.text, coordinate=coordinate)
70
+
71
+
72
async def wait_for_file(file_path, check_interval=1):
    """Return once `file_path` exists, polling every `check_interval` seconds.

    Returns immediately (without logging) when the file is already there;
    otherwise logs the start of the wait and how long it took.
    """
    if os.path.exists(file_path):
        return

    my_logger.info(f"Waiting for {file_path}")
    began = time.time()
    while not os.path.exists(file_path):
        await asyncio.sleep(check_interval)
    waited = time.time() - began
    my_logger.info(f"Done waiting for {file_path} after {waited:.1f} seconds")
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()
@@ -3,6 +3,7 @@ from inspect_ai._util.trace import trace_action, trace_message
3
3
  from ._concurrency import concurrency
4
4
  from ._console import input_screen
5
5
  from ._display import DisplayType, display_type
6
+ from ._limit import SampleLimitExceededError
6
7
  from ._panel import InputPanel, input_panel
7
8
  from ._resource import resource
8
9
  from ._sandbox import (
@@ -26,7 +27,6 @@ from ._subprocess import (
26
27
  )
27
28
  from ._subtask import Subtask, subtask
28
29
  from ._throttle import throttle
29
- from ._trace import trace_enabled, trace_panel
30
30
 
31
31
  __all__ = [
32
32
  "ExecResult",
@@ -37,6 +37,7 @@ __all__ = [
37
37
  "input_panel",
38
38
  "input_screen",
39
39
  "OutputLimitExceededError",
40
+ "SampleLimitExceededError",
40
41
  "resource",
41
42
  "subprocess",
42
43
  "SandboxEnvironment",
@@ -56,8 +57,6 @@ __all__ = [
56
57
  "Subtask",
57
58
  "subtask",
58
59
  "throttle",
59
- "trace_enabled",
60
- "trace_panel",
61
60
  "trace_action",
62
61
  "trace_message",
63
62
  ]