inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +7 -3
- inspect_ai/_cli/eval.py +17 -2
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +4 -3
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +4 -9
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +119 -16
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/score.py +1 -0
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/results.py +50 -22
- inspect_ai/_eval/task/run.py +180 -124
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25375 -1846
- inspect_ai/_view/www/log-schema.json +129 -15
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +8 -10
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +75 -2
- inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +62 -27
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/Json.mjs +12 -6
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_recorders/eval.py +19 -8
- inspect_ai/log/_samples.py +26 -5
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +59 -12
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/_conversation.py +61 -0
- inspect_ai/model/_generate_config.py +10 -4
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +7 -2
- inspect_ai/model/_providers/anthropic.py +109 -51
- inspect_ai/model/_providers/azureai.py +26 -24
- inspect_ai/model/_providers/bedrock.py +43 -44
- inspect_ai/model/_providers/google.py +121 -58
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +17 -20
- inspect_ai/model/_providers/openai.py +32 -21
- inspect_ai/model/_providers/openai_o1.py +9 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +8 -8
- inspect_ai/model/_providers/vertex.py +18 -8
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +2 -2
- inspect_ai/solver/__init__.py +2 -5
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +11 -1
- inspect_ai/tool/_tool.py +21 -3
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -11
- inspect_ai/util/_sandbox/docker/docker.py +84 -14
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/environment.py +27 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/model/_trace.py +0 -48
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,3 @@
|
|
1
|
-
from contextvars import ContextVar
|
2
|
-
|
3
1
|
from rich import print
|
4
2
|
from rich.console import RenderableType
|
5
3
|
from rich.text import Text
|
@@ -7,12 +5,7 @@ from rich.text import Text
|
|
7
5
|
from inspect_ai._util.transcript import transcript_panel
|
8
6
|
|
9
7
|
|
10
|
-
def
|
11
|
-
"""Is trace mode currently enabled."""
|
12
|
-
return _trace.get(None) is True
|
13
|
-
|
14
|
-
|
15
|
-
def trace_panel(
|
8
|
+
def conversation_panel(
|
16
9
|
title: str,
|
17
10
|
*,
|
18
11
|
subtitle: str | None = None,
|
@@ -20,8 +13,8 @@ def trace_panel(
|
|
20
13
|
) -> None:
|
21
14
|
"""Trace content into a standard trace panel display.
|
22
15
|
|
23
|
-
Typically you would call `
|
24
|
-
|
16
|
+
Typically you would call `display_type() == "conversation"` to confirm that
|
17
|
+
we are in conversation mode before calling `conversation_panel()`.
|
25
18
|
|
26
19
|
Args:
|
27
20
|
title (str): Panel title.
|
@@ -32,10 +25,3 @@ def trace_panel(
|
|
32
25
|
transcript_panel(title, subtitle, content),
|
33
26
|
Text(),
|
34
27
|
)
|
35
|
-
|
36
|
-
|
37
|
-
def init_trace(trace: bool | None) -> None:
|
38
|
-
_trace.set(trace)
|
39
|
-
|
40
|
-
|
41
|
-
_trace: ContextVar[bool | None] = ContextVar("_trace_mode")
|
inspect_ai/util/_display.py
CHANGED
@@ -3,10 +3,11 @@ from logging import getLogger
|
|
3
3
|
from typing import Literal
|
4
4
|
|
5
5
|
from inspect_ai._util.constants import DEFAULT_DISPLAY
|
6
|
+
from inspect_ai._util.thread import is_main_thread
|
6
7
|
|
7
8
|
logger = getLogger(__name__)
|
8
9
|
|
9
|
-
DisplayType = Literal["full", "rich", "plain", "none"]
|
10
|
+
DisplayType = Literal["full", "conversation", "rich", "plain", "none"]
|
10
11
|
"""Console display type."""
|
11
12
|
|
12
13
|
|
@@ -15,15 +16,24 @@ _display_type: DisplayType | None = None
|
|
15
16
|
|
16
17
|
def init_display_type(display: str | None = None) -> DisplayType:
|
17
18
|
global _display_type
|
18
|
-
global _display_metrics
|
19
19
|
display = (
|
20
20
|
display or os.environ.get("INSPECT_DISPLAY", DEFAULT_DISPLAY).lower().strip()
|
21
21
|
)
|
22
|
+
|
23
|
+
# if we are on a background thread then throttle down to "plain"
|
24
|
+
# ("full" requires textual which cannot run in a background thread
|
25
|
+
# b/c it calls the Python signal function; "rich" assumes exclusive
|
26
|
+
# display access which may not be the case for threads)
|
27
|
+
if display in ["full", "rich"] and not is_main_thread():
|
28
|
+
display = "plain"
|
29
|
+
|
22
30
|
match display:
|
23
|
-
case "full" | "rich" | "plain" | "none":
|
31
|
+
case "full" | "conversation" | "rich" | "plain" | "none":
|
24
32
|
_display_type = display
|
25
33
|
case _:
|
26
|
-
logger.warning(
|
34
|
+
logger.warning(
|
35
|
+
f"Unknown display type '{display}' (setting display to 'full')"
|
36
|
+
)
|
27
37
|
_display_type = "full"
|
28
38
|
return _display_type
|
29
39
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from typing import Literal
|
2
|
+
|
3
|
+
|
4
|
+
class SampleLimitExceededError(Exception):
|
5
|
+
"""Exception raised when a sample limit is exceeded.
|
6
|
+
|
7
|
+
Args:
|
8
|
+
type (Literal["message", "time", "token", "operator"]): Type of limit exceeded.
|
9
|
+
value (int): Value compared to
|
10
|
+
limit (int): Limit applied.
|
11
|
+
message (str | None): Optional. Human readable message.
|
12
|
+
"""
|
13
|
+
|
14
|
+
def __init__(
|
15
|
+
self,
|
16
|
+
type: Literal["message", "time", "token", "operator", "custom"],
|
17
|
+
*,
|
18
|
+
value: int,
|
19
|
+
limit: int,
|
20
|
+
message: str | None = None,
|
21
|
+
) -> None:
|
22
|
+
self.type = type
|
23
|
+
self.value = value
|
24
|
+
self.limit = limit
|
25
|
+
self.message = f"Exceeded {type} limit: {limit:,}"
|
26
|
+
super().__init__(message)
|
@@ -4,6 +4,8 @@ from typing import Any, NoReturn, cast
|
|
4
4
|
|
5
5
|
from shortuuid import uuid
|
6
6
|
|
7
|
+
from inspect_ai._util.constants import SANDBOX_SETUP_TIMEOUT
|
8
|
+
|
7
9
|
from .environment import (
|
8
10
|
SampleCleanup,
|
9
11
|
SampleInit,
|
@@ -193,23 +195,20 @@ async def setup_sandbox_environment(
|
|
193
195
|
setup_file = f"/tmp/{uuid()}"
|
194
196
|
await env.write_file(setup_file, setup)
|
195
197
|
|
196
|
-
#
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
)
|
204
|
-
|
198
|
+
# execute and then remove setup script (don't retry it on timeout
|
199
|
+
# in case it is not idempotent)
|
200
|
+
try:
|
201
|
+
await env.exec(["chmod", "+x", setup_file], timeout=30)
|
202
|
+
result = await env.exec(
|
203
|
+
["env", setup_file], timeout=SANDBOX_SETUP_TIMEOUT, timeout_retry=False
|
204
|
+
)
|
205
205
|
if not result.success:
|
206
206
|
raise RuntimeError(
|
207
207
|
f"Failed to execute setup script for sample: {result.stderr}"
|
208
208
|
)
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
await exec(["rm", setup_file])
|
209
|
+
await env.exec(["rm", setup_file], timeout=30)
|
210
|
+
except TimeoutError:
|
211
|
+
raise RuntimeError("Timed out executing setup command in sandbox")
|
213
212
|
|
214
213
|
|
215
214
|
def default_sandbox_environment(
|
@@ -25,16 +25,17 @@ COMPOSE_WAIT = "120"
|
|
25
25
|
|
26
26
|
|
27
27
|
async def compose_up(project: ComposeProject) -> None:
|
28
|
-
# Start the environment
|
29
|
-
|
28
|
+
# Start the environment. Note that we don't check the result because docker will
|
29
|
+
# return a non-zero exit code for services that exit (even successfully) when
|
30
|
+
# passing the --wait flag (see https://github.com/docker/compose/issues/10596).
|
31
|
+
# In practice, we will catch any errors when calling compose_check_running()
|
32
|
+
# immediately after we call compose_up().
|
33
|
+
await compose_command(
|
30
34
|
["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT],
|
31
35
|
project=project,
|
32
36
|
# wait up to 5 minutes for container to go up (compose wait + 3 minutes)
|
33
37
|
timeout=300,
|
34
38
|
)
|
35
|
-
if not result.success:
|
36
|
-
msg = f"Failed to start docker services for {project.config}: {result.stderr}"
|
37
|
-
raise RuntimeError(msg)
|
38
39
|
|
39
40
|
|
40
41
|
async def compose_down(project: ComposeProject, quiet: bool = True) -> None:
|
@@ -91,14 +92,21 @@ async def compose_cp(
|
|
91
92
|
raise RuntimeError(msg)
|
92
93
|
|
93
94
|
|
94
|
-
async def compose_check_running(services: list[str], project: ComposeProject) -> None:
|
95
|
+
async def compose_check_running(
|
96
|
+
services: list[str], project: ComposeProject
|
97
|
+
) -> list[str]:
|
95
98
|
# Check to ensure that the status of containers is healthy
|
96
99
|
running_services = await compose_ps(project=project, status="running")
|
97
|
-
|
98
|
-
|
100
|
+
exited_services = await compose_ps(project=project, status="exited")
|
101
|
+
successful_services = running_services + [
|
102
|
+
service for service in exited_services if service["ExitCode"] == 0
|
103
|
+
]
|
104
|
+
|
105
|
+
if len(successful_services) > 0:
|
106
|
+
if len(successful_services) != len(services):
|
99
107
|
unhealthy_services = services
|
100
|
-
for
|
101
|
-
unhealthy_services.remove(
|
108
|
+
for successful_service in successful_services:
|
109
|
+
unhealthy_services.remove(successful_service["Service"])
|
102
110
|
|
103
111
|
msg = (
|
104
112
|
"One or more docker containers failed to start from "
|
@@ -108,6 +116,8 @@ async def compose_check_running(services: list[str], project: ComposeProject) ->
|
|
108
116
|
else:
|
109
117
|
raise RuntimeError("No services started")
|
110
118
|
|
119
|
+
return [service["Service"] for service in running_services]
|
120
|
+
|
111
121
|
|
112
122
|
async def compose_ps(
|
113
123
|
project: ComposeProject,
|
@@ -166,6 +176,7 @@ async def compose_exec(
|
|
166
176
|
*,
|
167
177
|
project: ComposeProject,
|
168
178
|
timeout: int | None,
|
179
|
+
timeout_retry: bool = True,
|
169
180
|
input: str | bytes | None = None,
|
170
181
|
output_limit: int | None = None,
|
171
182
|
) -> ExecResult[str]:
|
@@ -173,6 +184,7 @@ async def compose_exec(
|
|
173
184
|
["exec"] + command,
|
174
185
|
project=project,
|
175
186
|
timeout=timeout,
|
187
|
+
timeout_retry=timeout_retry,
|
176
188
|
input=input,
|
177
189
|
forward_env=False,
|
178
190
|
output_limit=output_limit,
|
@@ -258,6 +270,7 @@ async def compose_command(
|
|
258
270
|
*,
|
259
271
|
project: ComposeProject,
|
260
272
|
timeout: int | None,
|
273
|
+
timeout_retry: bool = True,
|
261
274
|
input: str | bytes | None = None,
|
262
275
|
cwd: str | Path | None = None,
|
263
276
|
forward_env: bool = True,
|
@@ -325,7 +338,7 @@ async def compose_command(
|
|
325
338
|
return await run_command(command_timeout)
|
326
339
|
except TimeoutError:
|
327
340
|
retries += 1
|
328
|
-
if retries <= MAX_RETRIES:
|
341
|
+
if timeout_retry and (retries <= MAX_RETRIES):
|
329
342
|
logger.info(
|
330
343
|
f"Retrying docker compose command: {shlex.join(compose_command)}"
|
331
344
|
)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import errno
|
2
|
+
import json
|
2
3
|
import os
|
3
4
|
import tempfile
|
4
5
|
from logging import getLogger
|
@@ -7,9 +8,11 @@ from typing import Literal, Union, cast, overload
|
|
7
8
|
|
8
9
|
from typing_extensions import override
|
9
10
|
|
10
|
-
from inspect_ai.util._subprocess import ExecResult
|
11
|
+
from inspect_ai.util._subprocess import ExecResult, subprocess
|
11
12
|
|
12
13
|
from ..environment import (
|
14
|
+
HostMapping,
|
15
|
+
PortMapping,
|
13
16
|
SandboxConnection,
|
14
17
|
SandboxEnvironment,
|
15
18
|
SandboxEnvironmentConfigType,
|
@@ -138,28 +141,31 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
138
141
|
# start the services
|
139
142
|
await compose_up(project)
|
140
143
|
|
144
|
+
# check to ensure that the services are running
|
145
|
+
running_services = await compose_check_running(
|
146
|
+
list(services.keys()), project=project
|
147
|
+
)
|
148
|
+
|
141
149
|
# note that the project is running
|
142
150
|
project_startup(project)
|
143
151
|
|
144
|
-
#
|
145
|
-
await compose_check_running(list(services.keys()), project=project)
|
146
|
-
|
147
|
-
# create sandbox environments
|
152
|
+
# create sandbox environments for all running services
|
148
153
|
default_service: str | None = None
|
149
154
|
environments: dict[str, SandboxEnvironment] = {}
|
150
155
|
for service, service_info in services.items():
|
151
|
-
|
152
|
-
|
156
|
+
if service in running_services:
|
157
|
+
# update the project w/ the working directory
|
158
|
+
working_dir = await container_working_dir(service, project)
|
153
159
|
|
154
|
-
|
155
|
-
|
160
|
+
# create the docker sandbox environemnt
|
161
|
+
docker_env = DockerSandboxEnvironment(service, project, working_dir)
|
156
162
|
|
157
|
-
|
158
|
-
|
159
|
-
|
163
|
+
# save reference to default service if requested
|
164
|
+
if service_info.get("x-default", False):
|
165
|
+
default_service = service
|
160
166
|
|
161
|
-
|
162
|
-
|
167
|
+
# record service => environment
|
168
|
+
environments[service] = docker_env
|
163
169
|
|
164
170
|
# confirm that we have a 'default' environemnt
|
165
171
|
if environments.get("default", None) is None and default_service is None:
|
@@ -225,6 +231,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
225
231
|
env: dict[str, str] = {},
|
226
232
|
user: str | None = None,
|
227
233
|
timeout: int | None = None,
|
234
|
+
timeout_retry: bool = True,
|
228
235
|
) -> ExecResult[str]:
|
229
236
|
# additional args
|
230
237
|
args = []
|
@@ -251,6 +258,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
251
258
|
args + [self._service] + cmd,
|
252
259
|
project=self._project,
|
253
260
|
timeout=timeout,
|
261
|
+
timeout_retry=timeout_retry,
|
254
262
|
input=input,
|
255
263
|
output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
|
256
264
|
)
|
@@ -428,11 +436,14 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
428
436
|
# return container connection
|
429
437
|
if container:
|
430
438
|
return SandboxConnection(
|
439
|
+
type="docker",
|
431
440
|
command=f"docker exec -it {container} bash -l",
|
432
441
|
vscode_command=[
|
433
442
|
"remote-containers.attachToRunningContainer",
|
434
443
|
container,
|
435
444
|
],
|
445
|
+
ports=await get_ports_info(container),
|
446
|
+
container=container,
|
436
447
|
)
|
437
448
|
# error (not currently running)
|
438
449
|
else:
|
@@ -461,3 +472,62 @@ async def container_working_dir(
|
|
461
472
|
+ f"{result.stderr}"
|
462
473
|
)
|
463
474
|
return default
|
475
|
+
|
476
|
+
|
477
|
+
async def get_ports_info(container: str) -> list[PortMapping] | None:
|
478
|
+
try:
|
479
|
+
result = await subprocess(
|
480
|
+
[
|
481
|
+
"docker",
|
482
|
+
"inspect",
|
483
|
+
container,
|
484
|
+
"--format",
|
485
|
+
"{{json .NetworkSettings.Ports}}",
|
486
|
+
],
|
487
|
+
timeout=60,
|
488
|
+
)
|
489
|
+
|
490
|
+
if not result.success:
|
491
|
+
raise RuntimeError(result.stderr)
|
492
|
+
|
493
|
+
return parse_docker_inspect_ports(result.stdout)
|
494
|
+
|
495
|
+
# It's currently a policy decision to let docker timeouts to be silent.
|
496
|
+
except TimeoutError:
|
497
|
+
return None
|
498
|
+
|
499
|
+
|
500
|
+
def parse_docker_inspect_ports(json_str: str) -> list[PortMapping] | None:
|
501
|
+
"""
|
502
|
+
Parses the JSON output from `docker inspect {container_name} --format='{{json .NetworkSettings.Ports}}'` to extract port mappings.
|
503
|
+
|
504
|
+
Args:
|
505
|
+
json_str (str): A JSON string representing the `NetworkSettings.Ports` output of `docker inspect`. e.g.
|
506
|
+
```
|
507
|
+
{
|
508
|
+
"5900/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54023"}],
|
509
|
+
"8080/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54024"}]
|
510
|
+
}
|
511
|
+
```
|
512
|
+
|
513
|
+
Returns:
|
514
|
+
list[PortMapping] | None: A list of PortMapping objects if any port mappings are found,
|
515
|
+
otherwise None.
|
516
|
+
"""
|
517
|
+
data = json.loads(json_str)
|
518
|
+
port_mappings = []
|
519
|
+
for port_protocol, mappings in data.items():
|
520
|
+
if mappings is None:
|
521
|
+
continue
|
522
|
+
container_port, protocol = port_protocol.split("/")
|
523
|
+
host_mappings = [
|
524
|
+
HostMapping(host_ip=mapping["HostIp"], host_port=int(mapping["HostPort"]))
|
525
|
+
for mapping in mappings
|
526
|
+
]
|
527
|
+
port_mapping = PortMapping(
|
528
|
+
container_port=int(container_port),
|
529
|
+
protocol=protocol,
|
530
|
+
mappings=host_mappings,
|
531
|
+
)
|
532
|
+
port_mappings.append(port_mapping)
|
533
|
+
return port_mappings if port_mappings else None
|
@@ -6,13 +6,15 @@ from inspect_ai.util._subprocess import subprocess
|
|
6
6
|
INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB = "aisiuk/inspect-web-browser-tool"
|
7
7
|
|
8
8
|
INSPECT_WEB_BROWSER_IMAGE = "inspect_web_browser"
|
9
|
+
INSPECT_COMPUTER_IMAGE = "inspect-computer-tool"
|
9
10
|
|
10
11
|
INTERNAL_IMAGES = {
|
11
12
|
INSPECT_WEB_BROWSER_IMAGE: PKG_PATH
|
12
13
|
/ "tool"
|
13
14
|
/ "_tools"
|
14
15
|
/ "_web_browser"
|
15
|
-
/ "_resources"
|
16
|
+
/ "_resources",
|
17
|
+
INSPECT_COMPUTER_IMAGE: PKG_PATH / "tool" / "beta" / "_computer" / "_resources",
|
16
18
|
}
|
17
19
|
|
18
20
|
|
@@ -28,15 +28,35 @@ SampleCleanup = Callable[
|
|
28
28
|
]
|
29
29
|
|
30
30
|
|
31
|
+
class HostMapping(BaseModel):
|
32
|
+
host_ip: str
|
33
|
+
host_port: int
|
34
|
+
|
35
|
+
|
36
|
+
class PortMapping(BaseModel):
|
37
|
+
container_port: int
|
38
|
+
protocol: Literal["tcp", "udp"]
|
39
|
+
mappings: list[HostMapping]
|
40
|
+
|
41
|
+
|
31
42
|
class SandboxConnection(BaseModel):
|
32
43
|
"""Information required to connect to sandbox."""
|
33
44
|
|
45
|
+
type: str
|
46
|
+
"""Sandbox type name (e.g. 'docker', 'local', etc.)"""
|
47
|
+
|
34
48
|
command: str
|
35
49
|
"""Shell command to connect to sandbox."""
|
36
50
|
|
37
51
|
vscode_command: list[Any] | None = Field(default=None)
|
38
52
|
"""Optional vscode command (+args) to connect to sandbox."""
|
39
53
|
|
54
|
+
ports: list[PortMapping] | None = Field(default=None)
|
55
|
+
"""Optional list of port mappings into container"""
|
56
|
+
|
57
|
+
container: str | None = Field(default=None)
|
58
|
+
"""Optional container name (does not apply to all sandboxes)."""
|
59
|
+
|
40
60
|
|
41
61
|
class SandboxEnvironment(abc.ABC):
|
42
62
|
"""Environment for executing arbitrary code from tools.
|
@@ -139,6 +159,7 @@ class SandboxEnvironment(abc.ABC):
|
|
139
159
|
env: dict[str, str] = {},
|
140
160
|
user: str | None = None,
|
141
161
|
timeout: int | None = None,
|
162
|
+
timeout_retry: bool = True,
|
142
163
|
) -> ExecResult[str]:
|
143
164
|
"""Execute a command within a sandbox environment.
|
144
165
|
|
@@ -155,12 +176,17 @@ class SandboxEnvironment(abc.ABC):
|
|
155
176
|
env (dict[str,str]): Environment variables for execution.
|
156
177
|
user (str | None): Optional username or UID to run the command as.
|
157
178
|
timeout (int | None): Optional execution timeout (seconds).
|
179
|
+
timeout_retry (bool): Retry the command in the case that it times out.
|
180
|
+
Commands will be retried up to twice, with a timeout of no greater
|
181
|
+
than 60 seconds for the first retry and 30 for the second.
|
182
|
+
|
158
183
|
|
159
184
|
Returns:
|
160
185
|
Execution result (status code, stderr/stdout, etc.)
|
161
186
|
|
162
187
|
Raises:
|
163
|
-
TimeoutError: If the specified `timeout` expires
|
188
|
+
TimeoutError: If the specified `timeout` expires
|
189
|
+
(and `timeout_retry` attempts also timeout).
|
164
190
|
UnicodeDecodeError: If an error occurs while
|
165
191
|
decoding the command output.
|
166
192
|
PermissionError: If the user does not have
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.57
|
3
|
+
Version: 0.3.59
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
67
67
|
Requires-Dist: pytest-cov; extra == "dev"
|
68
68
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
69
69
|
Requires-Dist: pytest-xdist; extra == "dev"
|
70
|
-
Requires-Dist: ruff==0.9.
|
70
|
+
Requires-Dist: ruff==0.9.2; extra == "dev"
|
71
71
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
72
72
|
Requires-Dist: types-PyYAML; extra == "dev"
|
73
73
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|