inspect-ai 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +3 -1
- inspect_ai/_cli/eval.py +15 -9
- inspect_ai/_display/core/active.py +4 -1
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +0 -5
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +79 -12
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +10 -1
- inspect_ai/_eval/loader.py +79 -19
- inspect_ai/_eval/registry.py +6 -0
- inspect_ai/_eval/score.py +3 -1
- inspect_ai/_eval/task/results.py +51 -22
- inspect_ai/_eval/task/run.py +47 -13
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25498 -2044
- inspect_ai/_view/www/log-schema.json +32 -2
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +14 -16
- inspect_ai/_view/www/src/Types.mjs +1 -2
- inspect_ai/_view/www/src/api/Types.ts +133 -0
- inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
- inspect_ai/_view/www/src/api/api-http.ts +219 -0
- inspect_ai/_view/www/src/api/api-shared.ts +47 -0
- inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
- inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
- inspect_ai/_view/www/src/api/index.ts +51 -0
- inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +77 -4
- inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
- inspect_ai/_view/www/src/navbar/Navbar.mjs +4 -1
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +19 -10
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +19 -49
- inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -26
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +14 -11
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +13 -2
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +13 -9
- inspect_ai/_view/www/src/utils/vscode.ts +36 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +11 -5
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/manager.py +1 -1
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_samples.py +16 -0
- inspect_ai/log/_transcript.py +4 -1
- inspect_ai/model/_call_tools.py +59 -0
- inspect_ai/model/_conversation.py +16 -7
- inspect_ai/model/_generate_config.py +12 -12
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +22 -2
- inspect_ai/model/_openai.py +383 -0
- inspect_ai/model/_providers/anthropic.py +152 -55
- inspect_ai/model/_providers/azureai.py +21 -21
- inspect_ai/model/_providers/bedrock.py +37 -40
- inspect_ai/model/_providers/goodfire.py +248 -0
- inspect_ai/model/_providers/google.py +46 -54
- inspect_ai/model/_providers/groq.py +7 -3
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +13 -12
- inspect_ai/model/_providers/openai.py +51 -218
- inspect_ai/model/_providers/openai_o1.py +11 -12
- inspect_ai/model/_providers/providers.py +23 -1
- inspect_ai/model/_providers/together.py +12 -12
- inspect_ai/model/_providers/util/__init__.py +2 -3
- inspect_ai/model/_providers/util/hf_handler.py +1 -1
- inspect_ai/model/_providers/util/llama31.py +1 -1
- inspect_ai/model/_providers/util/util.py +0 -76
- inspect_ai/model/_providers/vertex.py +1 -4
- inspect_ai/scorer/_metric.py +3 -0
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +4 -3
- inspect_ai/solver/__init__.py +4 -5
- inspect_ai/solver/_basic_agent.py +1 -1
- inspect_ai/solver/_bridge/__init__.py +3 -0
- inspect_ai/solver/_bridge/bridge.py +100 -0
- inspect_ai/solver/_bridge/patch.py +170 -0
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_solver.py +6 -0
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +12 -1
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_display.py +5 -0
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/docker/docker.py +64 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
- inspect_ai/util/_sandbox/environment.py +14 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +159 -126
- inspect_ai/_view/www/src/api/Types.mjs +0 -117
- inspect_ai/_view/www/src/api/api-http.mjs +0 -300
- inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
- inspect_ai/_view/www/src/api/index.mjs +0 -49
- inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,262 @@
|
|
1
|
+
"""Based on https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py"""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import base64
|
5
|
+
import logging
|
6
|
+
import os
|
7
|
+
import shlex
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import Literal, TypedDict
|
10
|
+
from uuid import uuid4
|
11
|
+
|
12
|
+
from _run import run
|
13
|
+
from _tool_result import ToolResult
|
14
|
+
|
15
|
+
# Directory that screenshot files are written to.
OUTPUT_DIR = "/tmp/outputs"

# `xdotool type` settings: per-keystroke delay (ms) and characters sent per invocation.
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50

# Palette sizes that may be passed to `convert -colors` when post-processing screenshots.
ColorCount = Literal[4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4]

# Every action name understood by X11Client.__call__.
Action = Literal[
    # keyboard
    "key", "type",
    # pointer movement
    "mouse_move", "left_click", "left_click_drag",
    # other buttons / gestures
    "right_click", "middle_click", "double_click",
    # queries
    "screenshot", "cursor_position",
]
|
34
|
+
|
35
|
+
|
36
|
+
class ToolError(Exception):
    """Raised when a tool action is invalid or cannot be carried out.

    Args:
        message: Human readable description of the failure. It is stored on
            ``self.message`` and is also what ``str(error)`` returns.
    """

    def __init__(self, message):
        # Forward the message explicitly so str(error) is populated even when
        # the exception is constructed with ``message=...`` as a keyword.
        super().__init__(message)
        self.message = message
|
39
|
+
|
40
|
+
|
41
|
+
# Pixel dimensions (width x height) of a display or of a scaling target.
Resolution = TypedDict("Resolution", {"width": int, "height": int})
|
44
|
+
|
45
|
+
|
46
|
+
# Sizes above XGA/WXGA are not recommended (see README.md).
# When X11Client._scaling_enabled is set, coordinates and screenshots are scaled
# down to the first of these targets whose aspect ratio matches the display.
MAX_SCALING_TARGETS: dict[str, Resolution] = dict(
    XGA=Resolution(width=1024, height=768),  # 4:3
    WXGA=Resolution(width=1280, height=800),  # 16:10
    FWXGA=Resolution(width=1366, height=768),  # ~16:9
)


# Which coordinate space a value comes from: the real display ("computer")
# or the scaled space used by the API ("api").
ScalingSource = Literal["computer", "api"]
|
56
|
+
|
57
|
+
|
58
|
+
class ComputerToolOptions(TypedDict):
|
59
|
+
display_height_px: int
|
60
|
+
display_width_px: int
|
61
|
+
display_number: int | None
|
62
|
+
|
63
|
+
|
64
|
+
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split *s* into consecutive pieces of at most *chunk_size* characters."""
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        pieces.append(s[start : start + chunk_size])
    return pieces
|
66
|
+
|
67
|
+
|
68
|
+
class X11Client:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.

    Keyboard and mouse input is injected with ``xdotool``; screenshots are
    captured with ``scrot`` and, when scaling is enabled, post-processed with
    ``convert``. The display size is read from the ``WIDTH`` / ``HEIGHT``
    environment variables and the X display number from ``DISPLAY_NUM``.

    The tool parameters are defined by Anthropic and are not editable.
    """

    width: int
    height: int
    display_num: int | None
    # TODO: Complete plumbing this or remove it
    color_count: ColorCount | None = 256

    # seconds to sleep before capturing the screenshot that follows an action
    _screenshot_delay = 2.0
    # when set, coordinates and screenshots are scaled to a MAX_SCALING_TARGETS entry
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Return the display size (after scaling) and the display number."""
        width, height = self.scale_coordinates("computer", self.width, self.height)
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        """Read WIDTH, HEIGHT and DISPLAY_NUM from the environment."""
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        # every xdotool command line is prefixed with the DISPLAY assignment (if any)
        self.xdotool = f"{self._display_prefix}xdotool"

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Validate the arguments for *action* and execute it.

        Args:
            action: The action to perform.
            text: Key combination (``key``) or text to type (``type``);
                rejected for every other action.
            coordinate: Target ``(x, y)`` in API coordinates for ``mouse_move``
                and ``left_click_drag``; rejected for every other action.
                Either a tuple or a list of two non-negative ints is accepted.
            **kwargs: Ignored.

        Returns:
            The ToolResult of the executed command(s).

        Raises:
            ToolError: If the arguments are invalid for *action*, the action is
                unknown, or the command output cannot be interpreted.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            # accept the annotated tuple as well as a list (e.g. parsed CLI arguments)
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates("api", coordinate[0], coordinate[1])

            if action == "mouse_move":
                return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
            elif action == "left_click_drag":
                return await self.shell(
                    f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
                )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError only takes a positional message
                raise ToolError(f"{text} must be a string")

            if action == "key":
                return await self.shell(
                    f"{self.xdotool} key -- {' '.join(shlex.quote(part) for part in text.split())}"
                )
            elif action == "type":
                # type in groups, then take a single screenshot at the end
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                    results.append(await self.shell(cmd, take_screenshot=False))

                screenshot_base64 = await self.take_screenshot_after_delay()
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                result = await self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                # the position is read from the "X=<n>" and "Y=<n>" lines of the output
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError):
                    raise ToolError(
                        f"Failed to read cursor position: {result.error}"
                    ) from None
                x, y = self.scale_coordinates("computer", raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")
            else:
                # arguments handed to `xdotool click` for each click action
                click_arg = {
                    "left_click": "1",
                    "right_click": "3",
                    "middle_click": "2",
                    "double_click": "--repeat 2 --delay 300 1",
                }[action]
                return await self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image.

        Raises:
            ToolError: If no screenshot file was produced.
        """
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        result = await self.shell(
            f"{self._display_prefix}scrot --silent -p {path}", take_screenshot=False
        )
        if self._scaling_enabled:
            # resize in place to the scaled size, optionally reducing the palette
            x, y = self.scale_coordinates("computer", self.width, self.height)
            convert_cmd = f"convert {path} -resize {x}x{y}!"
            if self.color_count is not None:
                convert_cmd += f" -colors {self.color_count}"
            convert_cmd += f" {path}"
            await self.shell(convert_cmd, take_screenshot=False)

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        logging.debug(f"running shell command {command}")
        _, stdout, stderr = await run(command)
        logging.debug(f"shell command returned stdout: {stdout}, stderr: {stderr}")
        return ToolResult(
            output=stdout,
            error=stderr,
            base64_image=(await self.take_screenshot_after_delay())
            if take_screenshot
            else None,
        )

    async def take_screenshot_after_delay(self) -> str:
        """Sleep for ``_screenshot_delay`` seconds, then return a base64 screenshot."""
        # delay to let things settle before taking a screenshot
        await asyncio.sleep(self._screenshot_delay)
        return (await self.screenshot()).base64_image

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        Args:
            source: ``"api"`` to convert API coordinates up to display
                coordinates, ``"computer"`` to convert display coordinates
                down to API coordinates.
            x: Horizontal coordinate (or width).
            y: Vertical coordinate (or height).

        Returns:
            The converted ``(x, y)``; unchanged when scaling is disabled or no
            target in MAX_SCALING_TARGETS applies.

        Raises:
            ToolError: If ``source`` is ``"api"`` and the coordinates exceed
                the display size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == "api":
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
|
@@ -0,0 +1,85 @@
|
|
1
|
+
import argparse
|
2
|
+
import asyncio
|
3
|
+
import json
|
4
|
+
import logging
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
import time
|
8
|
+
|
9
|
+
from _logger import setup_logger
|
10
|
+
from _tool_result import ToolResult
|
11
|
+
from _x11_client import X11Client
|
12
|
+
|
13
|
+
# This is a bit sketchy. We really want to use relative imports here. Using absolute imports
|
14
|
+
# works at runtime, but it prevents intellisense from working. However, when this folder is
|
15
|
+
# copied to the container, by default relative imports won't work if this file is launched
|
16
|
+
# normally. To overcome this, two things need to happen:
|
17
|
+
# 1. PYTHONPATH must be set to the parent of the container folder. `PYTHONPATH=/opt`
|
18
|
+
# 2. The program must be launched with the -m flag. `python3 -m computer_tool.computer_tool`
|
19
|
+
#
|
20
|
+
# TODO: There's got to be a cleaner way.
|
21
|
+
|
22
|
+
my_logger = setup_logger(logging.INFO)
|
23
|
+
|
24
|
+
|
25
|
+
def main():
    """Run the action requested on the command line and print its result as JSON.

    On any exception the error is logged, written to stderr, and the process
    exits with status 1.
    """
    try:
        args = parse_arguments()
        my_logger.info(f"({args})")
        result = asyncio.run(execute_action(args))

        payload = {
            "output": result.output,
            "error": result.error,
            "base64_image": result.base64_image,
        }
        print(json.dumps(payload))
        my_logger.debug("SUCCESS")
    except Exception as e:
        my_logger.warning(f"An error occurred: {e}")
        print(f"An error occurred: {e}", file=sys.stderr)
        sys.exit(1)
|
45
|
+
|
46
|
+
|
47
|
+
def parse_arguments():
    """Parse ``--action`` (required), ``--text`` and ``--coordinate`` from the command line."""
    arg_parser = argparse.ArgumentParser(description="Execute computer tool action")

    # (flag, add_argument keyword options) for each supported option
    option_specs = (
        ("--action", {"type": str, "required": True, "help": "Action to perform"}),
        ("--text", {"type": str, "help": "Optional text parameter"}),
        (
            "--coordinate",
            {
                "type": int,
                "nargs": 2,
                "help": "Optional coordinate parameter as a list of two integers",
            },
        ),
    )
    for flag, options in option_specs:
        arg_parser.add_argument(flag, **options)

    return arg_parser.parse_args()
|
58
|
+
|
59
|
+
|
60
|
+
async def execute_action(args) -> ToolResult:
    """Wait for the desktop to come up, then run the parsed action on an X11Client."""
    # we can't do anything until X11 is ready to go.
    await wait_for_file("/tmp/xfce_started")

    # a missing / empty coordinate is passed on as None
    coordinate = args.coordinate or None
    client = X11Client()
    return await client(action=args.action, text=args.text, coordinate=coordinate)
|
70
|
+
|
71
|
+
|
72
|
+
async def wait_for_file(file_path, check_interval=1):
    """Return once *file_path* exists, polling every *check_interval* seconds.

    Returns immediately (without logging) when the path already exists.
    """
    if os.path.exists(file_path):
        return

    my_logger.info(f"Waiting for {file_path}")
    start_time = time.time()
    while True:
        if os.path.exists(file_path):
            break
        await asyncio.sleep(check_interval)
    my_logger.info(
        f"Done waiting for {file_path} after {time.time() - start_time:.1f} seconds"
    )
|
82
|
+
|
83
|
+
|
84
|
+
if __name__ == "__main__":
|
85
|
+
main()
|
File without changes
|
inspect_ai/util/__init__.py
CHANGED
@@ -3,6 +3,7 @@ from inspect_ai._util.trace import trace_action, trace_message
|
|
3
3
|
from ._concurrency import concurrency
|
4
4
|
from ._console import input_screen
|
5
5
|
from ._display import DisplayType, display_type
|
6
|
+
from ._limit import SampleLimitExceededError
|
6
7
|
from ._panel import InputPanel, input_panel
|
7
8
|
from ._resource import resource
|
8
9
|
from ._sandbox import (
|
@@ -36,6 +37,7 @@ __all__ = [
|
|
36
37
|
"input_panel",
|
37
38
|
"input_screen",
|
38
39
|
"OutputLimitExceededError",
|
40
|
+
"SampleLimitExceededError",
|
39
41
|
"resource",
|
40
42
|
"subprocess",
|
41
43
|
"SandboxEnvironment",
|
inspect_ai/util/_display.py
CHANGED
@@ -0,0 +1,26 @@
|
|
1
|
+
from typing import Literal
|
2
|
+
|
3
|
+
|
4
|
+
class SampleLimitExceededError(Exception):
|
5
|
+
"""Exception raised when a sample limit is exceeded.
|
6
|
+
|
7
|
+
Args:
|
8
|
+
type (Literal["message", "time", "token", "operator"]): Type of limit exceeded.
|
9
|
+
value (int): Value compared to
|
10
|
+
limit (int): Limit applied.
|
11
|
+
message (str | None): Optional. Human readable message.
|
12
|
+
"""
|
13
|
+
|
14
|
+
def __init__(
|
15
|
+
self,
|
16
|
+
type: Literal["message", "time", "token", "operator", "custom"],
|
17
|
+
*,
|
18
|
+
value: int,
|
19
|
+
limit: int,
|
20
|
+
message: str | None = None,
|
21
|
+
) -> None:
|
22
|
+
self.type = type
|
23
|
+
self.value = value
|
24
|
+
self.limit = limit
|
25
|
+
self.message = f"Exceeded {type} limit: {limit:,}"
|
26
|
+
super().__init__(message)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import errno
|
2
|
+
import json
|
2
3
|
import os
|
3
4
|
import tempfile
|
4
5
|
from logging import getLogger
|
@@ -7,9 +8,11 @@ from typing import Literal, Union, cast, overload
|
|
7
8
|
|
8
9
|
from typing_extensions import override
|
9
10
|
|
10
|
-
from inspect_ai.util._subprocess import ExecResult
|
11
|
+
from inspect_ai.util._subprocess import ExecResult, subprocess
|
11
12
|
|
12
13
|
from ..environment import (
|
14
|
+
HostMapping,
|
15
|
+
PortMapping,
|
13
16
|
SandboxConnection,
|
14
17
|
SandboxEnvironment,
|
15
18
|
SandboxEnvironmentConfigType,
|
@@ -439,6 +442,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
439
442
|
"remote-containers.attachToRunningContainer",
|
440
443
|
container,
|
441
444
|
],
|
445
|
+
ports=await get_ports_info(container),
|
442
446
|
container=container,
|
443
447
|
)
|
444
448
|
# error (not currently running)
|
@@ -468,3 +472,62 @@ async def container_working_dir(
|
|
468
472
|
+ f"{result.stderr}"
|
469
473
|
)
|
470
474
|
return default
|
475
|
+
|
476
|
+
|
477
|
+
async def get_ports_info(container: str) -> list[PortMapping] | None:
    """Return the port mappings reported by ``docker inspect`` for *container*.

    Returns None when the docker command times out.

    Raises:
        RuntimeError: If ``docker inspect`` does not succeed (carries its stderr).
    """
    inspect_cmd = [
        "docker",
        "inspect",
        container,
        "--format",
        "{{json .NetworkSettings.Ports}}",
    ]
    try:
        result = await subprocess(inspect_cmd, timeout=60)
    # It's currently a policy decision to let docker timeouts be silent.
    except TimeoutError:
        return None

    if not result.success:
        raise RuntimeError(result.stderr)

    return parse_docker_inspect_ports(result.stdout)
|
498
|
+
|
499
|
+
|
500
|
+
def parse_docker_inspect_ports(json_str: str) -> list[PortMapping] | None:
    """
    Parses the JSON output from `docker inspect {container_name} --format='{{json .NetworkSettings.Ports}}'` to extract port mappings.

    Args:
      json_str (str): A JSON string representing the `NetworkSettings.Ports` output of `docker inspect`. e.g.
        ```
        {
          "5900/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54023"}],
          "8080/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54024"}]
        }
        ```
        Empty output, `null` and `{}` are treated as "no port mappings".

    Returns:
      list[PortMapping] | None: A list of PortMapping objects if any port mappings are found,
      otherwise None.
    """
    # nothing printed at all: treat like "no ports" rather than a JSON error
    if not json_str.strip():
        return None

    data = json.loads(json_str)
    # `null` (no port table) or `{}` (empty port table)
    if not data:
        return None

    port_mappings = []
    for port_protocol, mappings in data.items():
        # a port with a null entry has no host mappings to report
        if mappings is None:
            continue
        container_port, protocol = port_protocol.split("/")
        host_mappings = [
            HostMapping(host_ip=mapping["HostIp"], host_port=int(mapping["HostPort"]))
            for mapping in mappings
        ]
        port_mappings.append(
            PortMapping(
                container_port=int(container_port),
                protocol=protocol,
                mappings=host_mappings,
            )
        )
    return port_mappings if port_mappings else None
|
@@ -6,13 +6,15 @@ from inspect_ai.util._subprocess import subprocess
|
|
6
6
|
INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB = "aisiuk/inspect-web-browser-tool"
|
7
7
|
|
8
8
|
INSPECT_WEB_BROWSER_IMAGE = "inspect_web_browser"
|
9
|
+
INSPECT_COMPUTER_IMAGE = "inspect-computer-tool"
|
9
10
|
|
10
11
|
INTERNAL_IMAGES = {
|
11
12
|
INSPECT_WEB_BROWSER_IMAGE: PKG_PATH
|
12
13
|
/ "tool"
|
13
14
|
/ "_tools"
|
14
15
|
/ "_web_browser"
|
15
|
-
/ "_resources"
|
16
|
+
/ "_resources",
|
17
|
+
INSPECT_COMPUTER_IMAGE: PKG_PATH / "tool" / "beta" / "_computer" / "_resources",
|
16
18
|
}
|
17
19
|
|
18
20
|
|
@@ -57,7 +57,7 @@ async def validate_docker_compose(
|
|
57
57
|
version: str = DOCKER_COMPOSE_REQUIRED_VERSION,
|
58
58
|
) -> None:
|
59
59
|
def parse_version(stdout: str) -> semver.Version:
|
60
|
-
version = json.loads(stdout)["version"].removeprefix("v")
|
60
|
+
version = json.loads(stdout)["version"].removeprefix("v").split("+")[0]
|
61
61
|
return semver.Version.parse(version)
|
62
62
|
|
63
63
|
await validate_version(
|
@@ -28,6 +28,17 @@ SampleCleanup = Callable[
|
|
28
28
|
]
|
29
29
|
|
30
30
|
|
31
|
+
# One host-side endpoint (IP address + port) that a container port is mapped to.
class HostMapping(BaseModel):
    # host interface address
    host_ip: str
    # port number on the host
    host_port: int
|
34
|
+
|
35
|
+
|
36
|
+
# A container port together with the host endpoints it is mapped to.
class PortMapping(BaseModel):
    # port number inside the container
    container_port: int
    # transport protocol of the port
    protocol: Literal["tcp", "udp"]
    # host endpoints for this container port
    mappings: list[HostMapping]
|
40
|
+
|
41
|
+
|
31
42
|
class SandboxConnection(BaseModel):
|
32
43
|
"""Information required to connect to sandbox."""
|
33
44
|
|
@@ -40,6 +51,9 @@ class SandboxConnection(BaseModel):
|
|
40
51
|
vscode_command: list[Any] | None = Field(default=None)
|
41
52
|
"""Optional vscode command (+args) to connect to sandbox."""
|
42
53
|
|
54
|
+
ports: list[PortMapping] | None = Field(default=None)
|
55
|
+
"""Optional list of port mappings into container"""
|
56
|
+
|
43
57
|
container: str | None = Field(default=None)
|
44
58
|
"""Optional container name (does not apply to all sandboxes)."""
|
45
59
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.58
|
3
|
+
Version: 0.3.60
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -54,6 +54,7 @@ Requires-Dist: aioboto3; extra == "dev"
|
|
54
54
|
Requires-Dist: azure-ai-inference; extra == "dev"
|
55
55
|
Requires-Dist: google-cloud-aiplatform; extra == "dev"
|
56
56
|
Requires-Dist: google-generativeai; extra == "dev"
|
57
|
+
Requires-Dist: goodfire; extra == "dev"
|
57
58
|
Requires-Dist: groq; extra == "dev"
|
58
59
|
Requires-Dist: ipython; extra == "dev"
|
59
60
|
Requires-Dist: mistralai; extra == "dev"
|
@@ -67,7 +68,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
67
68
|
Requires-Dist: pytest-cov; extra == "dev"
|
68
69
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
69
70
|
Requires-Dist: pytest-xdist; extra == "dev"
|
70
|
-
Requires-Dist: ruff==0.9.
|
71
|
+
Requires-Dist: ruff==0.9.3; extra == "dev"
|
71
72
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
72
73
|
Requires-Dist: types-PyYAML; extra == "dev"
|
73
74
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|