inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +7 -3
  3. inspect_ai/_cli/eval.py +17 -2
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +4 -3
  6. inspect_ai/_display/core/config.py +3 -3
  7. inspect_ai/_display/core/panel.py +7 -3
  8. inspect_ai/_display/plain/__init__.py +0 -0
  9. inspect_ai/_display/plain/display.py +203 -0
  10. inspect_ai/_display/rich/display.py +4 -9
  11. inspect_ai/_display/textual/app.py +4 -1
  12. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  13. inspect_ai/_display/textual/widgets/samples.py +119 -16
  14. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  15. inspect_ai/_eval/eval.py +32 -20
  16. inspect_ai/_eval/evalset.py +7 -5
  17. inspect_ai/_eval/score.py +1 -0
  18. inspect_ai/_eval/task/__init__.py +2 -2
  19. inspect_ai/_eval/task/images.py +40 -25
  20. inspect_ai/_eval/task/results.py +50 -22
  21. inspect_ai/_eval/task/run.py +180 -124
  22. inspect_ai/_eval/task/sandbox.py +10 -5
  23. inspect_ai/_eval/task/task.py +140 -25
  24. inspect_ai/_util/constants.py +2 -0
  25. inspect_ai/_util/content.py +23 -1
  26. inspect_ai/_util/images.py +20 -17
  27. inspect_ai/_util/kvstore.py +73 -0
  28. inspect_ai/_util/notgiven.py +18 -0
  29. inspect_ai/_util/port_names.py +61 -0
  30. inspect_ai/_util/text.py +23 -0
  31. inspect_ai/_util/thread.py +5 -0
  32. inspect_ai/_view/www/App.css +31 -1
  33. inspect_ai/_view/www/dist/assets/index.css +31 -1
  34. inspect_ai/_view/www/dist/assets/index.js +25375 -1846
  35. inspect_ai/_view/www/log-schema.json +129 -15
  36. inspect_ai/_view/www/package.json +2 -0
  37. inspect_ai/_view/www/src/App.mjs +8 -10
  38. inspect_ai/_view/www/src/Types.mjs +0 -1
  39. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  40. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  41. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  42. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  43. inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
  44. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  45. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  46. inspect_ai/_view/www/src/index.js +75 -2
  47. inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
  48. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
  49. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  50. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  51. inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
  52. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  53. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
  54. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
  55. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  56. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  57. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  58. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  59. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  60. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  61. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  62. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  63. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  64. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  65. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  66. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  67. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  68. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  69. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  70. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  71. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  72. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  73. inspect_ai/_view/www/src/types/log.d.ts +62 -27
  74. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  75. inspect_ai/_view/www/src/utils/Json.mjs +12 -6
  76. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
  77. inspect_ai/_view/www/vite.config.js +7 -0
  78. inspect_ai/_view/www/yarn.lock +116 -0
  79. inspect_ai/approval/_human/__init__.py +0 -0
  80. inspect_ai/approval/_human/util.py +2 -2
  81. inspect_ai/approval/_policy.py +12 -6
  82. inspect_ai/dataset/_sources/csv.py +2 -1
  83. inspect_ai/dataset/_sources/json.py +2 -1
  84. inspect_ai/dataset/_sources/util.py +15 -7
  85. inspect_ai/log/_condense.py +11 -1
  86. inspect_ai/log/_log.py +3 -6
  87. inspect_ai/log/_recorders/eval.py +19 -8
  88. inspect_ai/log/_samples.py +26 -5
  89. inspect_ai/log/_transcript.py +32 -2
  90. inspect_ai/model/__init__.py +10 -2
  91. inspect_ai/model/_call_tools.py +59 -12
  92. inspect_ai/model/_chat_message.py +2 -4
  93. inspect_ai/model/_conversation.py +61 -0
  94. inspect_ai/model/_generate_config.py +10 -4
  95. inspect_ai/model/_model.py +117 -18
  96. inspect_ai/model/_model_output.py +7 -2
  97. inspect_ai/model/_providers/anthropic.py +109 -51
  98. inspect_ai/model/_providers/azureai.py +26 -24
  99. inspect_ai/model/_providers/bedrock.py +43 -44
  100. inspect_ai/model/_providers/google.py +121 -58
  101. inspect_ai/model/_providers/groq.py +7 -5
  102. inspect_ai/model/_providers/hf.py +11 -6
  103. inspect_ai/model/_providers/mistral.py +17 -20
  104. inspect_ai/model/_providers/openai.py +32 -21
  105. inspect_ai/model/_providers/openai_o1.py +9 -8
  106. inspect_ai/model/_providers/providers.py +1 -1
  107. inspect_ai/model/_providers/together.py +8 -8
  108. inspect_ai/model/_providers/vertex.py +18 -8
  109. inspect_ai/scorer/__init__.py +13 -2
  110. inspect_ai/scorer/_metrics/__init__.py +2 -2
  111. inspect_ai/scorer/_metrics/std.py +3 -3
  112. inspect_ai/scorer/_reducer/reducer.py +1 -1
  113. inspect_ai/scorer/_scorer.py +2 -2
  114. inspect_ai/solver/__init__.py +2 -5
  115. inspect_ai/solver/_prompt.py +35 -5
  116. inspect_ai/solver/_task_state.py +80 -38
  117. inspect_ai/tool/__init__.py +11 -1
  118. inspect_ai/tool/_tool.py +21 -3
  119. inspect_ai/tool/_tool_call.py +10 -0
  120. inspect_ai/tool/_tool_def.py +16 -5
  121. inspect_ai/tool/_tool_with.py +21 -4
  122. inspect_ai/tool/beta/__init__.py +5 -0
  123. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  124. inspect_ai/tool/beta/_computer/_common.py +133 -0
  125. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  126. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  127. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  128. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  129. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  130. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  131. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  132. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  133. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  134. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  135. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  136. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  137. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  138. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  139. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  140. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  141. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  142. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  143. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  144. inspect_ai/util/__init__.py +2 -3
  145. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  146. inspect_ai/util/_display.py +14 -4
  147. inspect_ai/util/_limit.py +26 -0
  148. inspect_ai/util/_sandbox/context.py +12 -13
  149. inspect_ai/util/_sandbox/docker/compose.py +24 -11
  150. inspect_ai/util/_sandbox/docker/docker.py +84 -14
  151. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  152. inspect_ai/util/_sandbox/environment.py +27 -1
  153. inspect_ai/util/_sandbox/local.py +1 -0
  154. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
  155. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
  156. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  157. inspect_ai/model/_trace.py +0 -48
  158. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
  159. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
  160. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
  161. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,3 @@
1
- from contextvars import ContextVar
2
-
3
1
  from rich import print
4
2
  from rich.console import RenderableType
5
3
  from rich.text import Text
@@ -7,12 +5,7 @@ from rich.text import Text
7
5
  from inspect_ai._util.transcript import transcript_panel
8
6
 
9
7
 
10
- def trace_enabled() -> bool:
11
- """Is trace mode currently enabled."""
12
- return _trace.get(None) is True
13
-
14
-
15
- def trace_panel(
8
+ def conversation_panel(
16
9
  title: str,
17
10
  *,
18
11
  subtitle: str | None = None,
@@ -20,8 +13,8 @@ def trace_panel(
20
13
  ) -> None:
21
14
  """Trace content into a standard trace panel display.
22
15
 
23
- Typically you would call `trace_enabled()` to confirm that trace mode
24
- is enabled before calling `trace_panel()`.
16
+ Typically you would check `display_type() == "conversation"` to confirm that
17
+ we are in conversation mode before calling `conversation_panel()`.
25
18
 
26
19
  Args:
27
20
  title (str): Panel title.
@@ -32,10 +25,3 @@ def trace_panel(
32
25
  transcript_panel(title, subtitle, content),
33
26
  Text(),
34
27
  )
35
-
36
-
37
- def init_trace(trace: bool | None) -> None:
38
- _trace.set(trace)
39
-
40
-
41
- _trace: ContextVar[bool | None] = ContextVar("_trace_mode")
@@ -3,10 +3,11 @@ from logging import getLogger
3
3
  from typing import Literal
4
4
 
5
5
  from inspect_ai._util.constants import DEFAULT_DISPLAY
6
+ from inspect_ai._util.thread import is_main_thread
6
7
 
7
8
  logger = getLogger(__name__)
8
9
 
9
- DisplayType = Literal["full", "rich", "plain", "none"]
10
+ DisplayType = Literal["full", "conversation", "rich", "plain", "none"]
10
11
  """Console display type."""
11
12
 
12
13
 
@@ -15,15 +16,24 @@ _display_type: DisplayType | None = None
15
16
 
16
17
  def init_display_type(display: str | None = None) -> DisplayType:
17
18
  global _display_type
18
- global _display_metrics
19
19
  display = (
20
20
  display or os.environ.get("INSPECT_DISPLAY", DEFAULT_DISPLAY).lower().strip()
21
21
  )
22
+
23
+ # if we are on a background thread then throttle down to "plain"
24
+ # ("full" requires textual which cannot run in a background thread
25
+ # b/c it calls the Python signal function; "rich" assumes exclusive
26
+ # display access which may not be the case for threads)
27
+ if display in ["full", "rich"] and not is_main_thread():
28
+ display = "plain"
29
+
22
30
  match display:
23
- case "full" | "rich" | "plain" | "none":
31
+ case "full" | "conversation" | "rich" | "plain" | "none":
24
32
  _display_type = display
25
33
  case _:
26
- logger.warning(f"Unknown display type '{display}'")
34
+ logger.warning(
35
+ f"Unknown display type '{display}' (setting display to 'full')"
36
+ )
27
37
  _display_type = "full"
28
38
  return _display_type
29
39
 
@@ -0,0 +1,26 @@
1
+ from typing import Literal
2
+
3
+
4
+ class SampleLimitExceededError(Exception):
5
+ """Exception raised when a sample limit is exceeded.
6
+
7
+ Args:
8
+ type (Literal["message", "time", "token", "operator", "custom"]): Type of limit exceeded.
9
+ value (int): Value compared to the limit.
10
+ limit (int): Limit applied.
11
+ message (str | None): Optional. Human readable message.
12
+ """
13
+
14
+ def __init__(
15
+ self,
16
+ type: Literal["message", "time", "token", "operator", "custom"],
17
+ *,
18
+ value: int,
19
+ limit: int,
20
+ message: str | None = None,
21
+ ) -> None:
22
+ self.type = type
23
+ self.value = value
24
+ self.limit = limit
25
+ self.message = f"Exceeded {type} limit: {limit:,}"
26
+ super().__init__(message)
@@ -4,6 +4,8 @@ from typing import Any, NoReturn, cast
4
4
 
5
5
  from shortuuid import uuid
6
6
 
7
+ from inspect_ai._util.constants import SANDBOX_SETUP_TIMEOUT
8
+
7
9
  from .environment import (
8
10
  SampleCleanup,
9
11
  SampleInit,
@@ -193,23 +195,20 @@ async def setup_sandbox_environment(
193
195
  setup_file = f"/tmp/{uuid()}"
194
196
  await env.write_file(setup_file, setup)
195
197
 
196
- # chmod, execute, and remove
197
- async def exec(cmd: list[str]) -> None:
198
- try:
199
- result = await env.exec(cmd, timeout=30)
200
- except TimeoutError:
201
- raise RuntimeError(
202
- f"Timed out executing command {' '.join(cmd)} in sandbox"
203
- )
204
-
198
+ # execute and then remove setup script (don't retry it on timeout
199
+ # in case it is not idempotent)
200
+ try:
201
+ await env.exec(["chmod", "+x", setup_file], timeout=30)
202
+ result = await env.exec(
203
+ ["env", setup_file], timeout=SANDBOX_SETUP_TIMEOUT, timeout_retry=False
204
+ )
205
205
  if not result.success:
206
206
  raise RuntimeError(
207
207
  f"Failed to execute setup script for sample: {result.stderr}"
208
208
  )
209
-
210
- await exec(["chmod", "+x", setup_file])
211
- await exec(["env", setup_file])
212
- await exec(["rm", setup_file])
209
+ await env.exec(["rm", setup_file], timeout=30)
210
+ except TimeoutError:
211
+ raise RuntimeError("Timed out executing setup command in sandbox")
213
212
 
214
213
 
215
214
  def default_sandbox_environment(
@@ -25,16 +25,17 @@ COMPOSE_WAIT = "120"
25
25
 
26
26
 
27
27
  async def compose_up(project: ComposeProject) -> None:
28
- # Start the environment
29
- result = await compose_command(
28
+ # Start the environment. Note that we don't check the result because docker will
29
+ # return a non-zero exit code for services that exit (even successfully) when
30
+ # passing the --wait flag (see https://github.com/docker/compose/issues/10596).
31
+ # In practice, we will catch any errors when calling compose_check_running()
32
+ # immediately after we call compose_up().
33
+ await compose_command(
30
34
  ["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT],
31
35
  project=project,
32
36
  # wait up to 5 minutes for container to go up (compose wait + 3 minutes)
33
37
  timeout=300,
34
38
  )
35
- if not result.success:
36
- msg = f"Failed to start docker services for {project.config}: {result.stderr}"
37
- raise RuntimeError(msg)
38
39
 
39
40
 
40
41
  async def compose_down(project: ComposeProject, quiet: bool = True) -> None:
@@ -91,14 +92,21 @@ async def compose_cp(
91
92
  raise RuntimeError(msg)
92
93
 
93
94
 
94
- async def compose_check_running(services: list[str], project: ComposeProject) -> None:
95
+ async def compose_check_running(
96
+ services: list[str], project: ComposeProject
97
+ ) -> list[str]:
95
98
  # Check to ensure that the status of containers is healthy
96
99
  running_services = await compose_ps(project=project, status="running")
97
- if len(running_services) > 0:
98
- if len(running_services) != len(services):
100
+ exited_services = await compose_ps(project=project, status="exited")
101
+ successful_services = running_services + [
102
+ service for service in exited_services if service["ExitCode"] == 0
103
+ ]
104
+
105
+ if len(successful_services) > 0:
106
+ if len(successful_services) != len(services):
99
107
  unhealthy_services = services
100
- for running_service in running_services:
101
- unhealthy_services.remove(running_service["Service"])
108
+ for successful_service in successful_services:
109
+ unhealthy_services.remove(successful_service["Service"])
102
110
 
103
111
  msg = (
104
112
  "One or more docker containers failed to start from "
@@ -108,6 +116,8 @@ async def compose_check_running(services: list[str], project: ComposeProject) ->
108
116
  else:
109
117
  raise RuntimeError("No services started")
110
118
 
119
+ return [service["Service"] for service in running_services]
120
+
111
121
 
112
122
  async def compose_ps(
113
123
  project: ComposeProject,
@@ -166,6 +176,7 @@ async def compose_exec(
166
176
  *,
167
177
  project: ComposeProject,
168
178
  timeout: int | None,
179
+ timeout_retry: bool = True,
169
180
  input: str | bytes | None = None,
170
181
  output_limit: int | None = None,
171
182
  ) -> ExecResult[str]:
@@ -173,6 +184,7 @@ async def compose_exec(
173
184
  ["exec"] + command,
174
185
  project=project,
175
186
  timeout=timeout,
187
+ timeout_retry=timeout_retry,
176
188
  input=input,
177
189
  forward_env=False,
178
190
  output_limit=output_limit,
@@ -258,6 +270,7 @@ async def compose_command(
258
270
  *,
259
271
  project: ComposeProject,
260
272
  timeout: int | None,
273
+ timeout_retry: bool = True,
261
274
  input: str | bytes | None = None,
262
275
  cwd: str | Path | None = None,
263
276
  forward_env: bool = True,
@@ -325,7 +338,7 @@ async def compose_command(
325
338
  return await run_command(command_timeout)
326
339
  except TimeoutError:
327
340
  retries += 1
328
- if retries <= MAX_RETRIES:
341
+ if timeout_retry and (retries <= MAX_RETRIES):
329
342
  logger.info(
330
343
  f"Retrying docker compose command: {shlex.join(compose_command)}"
331
344
  )
@@ -1,4 +1,5 @@
1
1
  import errno
2
+ import json
2
3
  import os
3
4
  import tempfile
4
5
  from logging import getLogger
@@ -7,9 +8,11 @@ from typing import Literal, Union, cast, overload
7
8
 
8
9
  from typing_extensions import override
9
10
 
10
- from inspect_ai.util._subprocess import ExecResult
11
+ from inspect_ai.util._subprocess import ExecResult, subprocess
11
12
 
12
13
  from ..environment import (
14
+ HostMapping,
15
+ PortMapping,
13
16
  SandboxConnection,
14
17
  SandboxEnvironment,
15
18
  SandboxEnvironmentConfigType,
@@ -138,28 +141,31 @@ class DockerSandboxEnvironment(SandboxEnvironment):
138
141
  # start the services
139
142
  await compose_up(project)
140
143
 
144
+ # check to ensure that the services are running
145
+ running_services = await compose_check_running(
146
+ list(services.keys()), project=project
147
+ )
148
+
141
149
  # note that the project is running
142
150
  project_startup(project)
143
151
 
144
- # check to ensure that the services are running
145
- await compose_check_running(list(services.keys()), project=project)
146
-
147
- # create sandbox environments
152
+ # create sandbox environments for all running services
148
153
  default_service: str | None = None
149
154
  environments: dict[str, SandboxEnvironment] = {}
150
155
  for service, service_info in services.items():
151
- # update the project w/ the working directory
152
- working_dir = await container_working_dir(service, project)
156
+ if service in running_services:
157
+ # update the project w/ the working directory
158
+ working_dir = await container_working_dir(service, project)
153
159
 
154
- # create the docker sandbox environment
155
- docker_env = DockerSandboxEnvironment(service, project, working_dir)
160
+ # create the docker sandbox environment
161
+ docker_env = DockerSandboxEnvironment(service, project, working_dir)
156
162
 
157
- # save reference to default service if requested
158
- if service_info.get("x-default", False):
159
- default_service = service
163
+ # save reference to default service if requested
164
+ if service_info.get("x-default", False):
165
+ default_service = service
160
166
 
161
- # record service => environment
162
- environments[service] = docker_env
167
+ # record service => environment
168
+ environments[service] = docker_env
163
169
 
164
170
  # confirm that we have a 'default' environment
165
171
  if environments.get("default", None) is None and default_service is None:
@@ -225,6 +231,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
225
231
  env: dict[str, str] = {},
226
232
  user: str | None = None,
227
233
  timeout: int | None = None,
234
+ timeout_retry: bool = True,
228
235
  ) -> ExecResult[str]:
229
236
  # additional args
230
237
  args = []
@@ -251,6 +258,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
251
258
  args + [self._service] + cmd,
252
259
  project=self._project,
253
260
  timeout=timeout,
261
+ timeout_retry=timeout_retry,
254
262
  input=input,
255
263
  output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
256
264
  )
@@ -428,11 +436,14 @@ class DockerSandboxEnvironment(SandboxEnvironment):
428
436
  # return container connection
429
437
  if container:
430
438
  return SandboxConnection(
439
+ type="docker",
431
440
  command=f"docker exec -it {container} bash -l",
432
441
  vscode_command=[
433
442
  "remote-containers.attachToRunningContainer",
434
443
  container,
435
444
  ],
445
+ ports=await get_ports_info(container),
446
+ container=container,
436
447
  )
437
448
  # error (not currently running)
438
449
  else:
@@ -461,3 +472,62 @@ async def container_working_dir(
461
472
  + f"{result.stderr}"
462
473
  )
463
474
  return default
475
+
476
+
477
+ async def get_ports_info(container: str) -> list[PortMapping] | None:
478
+ try:
479
+ result = await subprocess(
480
+ [
481
+ "docker",
482
+ "inspect",
483
+ container,
484
+ "--format",
485
+ "{{json .NetworkSettings.Ports}}",
486
+ ],
487
+ timeout=60,
488
+ )
489
+
490
+ if not result.success:
491
+ raise RuntimeError(result.stderr)
492
+
493
+ return parse_docker_inspect_ports(result.stdout)
494
+
495
+ # It's currently a policy decision to let docker timeouts be silent.
496
+ except TimeoutError:
497
+ return None
498
+
499
+
500
+ def parse_docker_inspect_ports(json_str: str) -> list[PortMapping] | None:
501
+ """
502
+ Parses the JSON output from `docker inspect {container_name} --format='{{json .NetworkSettings.Ports}}'` to extract port mappings.
503
+
504
+ Args:
505
+ json_str (str): A JSON string representing the `NetworkSettings.Ports` output of `docker inspect`. e.g.
506
+ ```
507
+ {
508
+ "5900/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54023"}],
509
+ "8080/tcp": [{"HostIp": "0.0.0.0", "HostPort": "54024"}]
510
+ }
511
+ ```
512
+
513
+ Returns:
514
+ list[PortMapping] | None: A list of PortMapping objects if any port mappings are found,
515
+ otherwise None.
516
+ """
517
+ data = json.loads(json_str)
518
+ port_mappings = []
519
+ for port_protocol, mappings in data.items():
520
+ if mappings is None:
521
+ continue
522
+ container_port, protocol = port_protocol.split("/")
523
+ host_mappings = [
524
+ HostMapping(host_ip=mapping["HostIp"], host_port=int(mapping["HostPort"]))
525
+ for mapping in mappings
526
+ ]
527
+ port_mapping = PortMapping(
528
+ container_port=int(container_port),
529
+ protocol=protocol,
530
+ mappings=host_mappings,
531
+ )
532
+ port_mappings.append(port_mapping)
533
+ return port_mappings if port_mappings else None
@@ -6,13 +6,15 @@ from inspect_ai.util._subprocess import subprocess
6
6
  INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB = "aisiuk/inspect-web-browser-tool"
7
7
 
8
8
  INSPECT_WEB_BROWSER_IMAGE = "inspect_web_browser"
9
+ INSPECT_COMPUTER_IMAGE = "inspect-computer-tool"
9
10
 
10
11
  INTERNAL_IMAGES = {
11
12
  INSPECT_WEB_BROWSER_IMAGE: PKG_PATH
12
13
  / "tool"
13
14
  / "_tools"
14
15
  / "_web_browser"
15
- / "_resources"
16
+ / "_resources",
17
+ INSPECT_COMPUTER_IMAGE: PKG_PATH / "tool" / "beta" / "_computer" / "_resources",
16
18
  }
17
19
 
18
20
 
@@ -28,15 +28,35 @@ SampleCleanup = Callable[
28
28
  ]
29
29
 
30
30
 
31
+ class HostMapping(BaseModel):
32
+ host_ip: str
33
+ host_port: int
34
+
35
+
36
+ class PortMapping(BaseModel):
37
+ container_port: int
38
+ protocol: Literal["tcp", "udp"]
39
+ mappings: list[HostMapping]
40
+
41
+
31
42
  class SandboxConnection(BaseModel):
32
43
  """Information required to connect to sandbox."""
33
44
 
45
+ type: str
46
+ """Sandbox type name (e.g. 'docker', 'local', etc.)"""
47
+
34
48
  command: str
35
49
  """Shell command to connect to sandbox."""
36
50
 
37
51
  vscode_command: list[Any] | None = Field(default=None)
38
52
  """Optional vscode command (+args) to connect to sandbox."""
39
53
 
54
+ ports: list[PortMapping] | None = Field(default=None)
55
+ """Optional list of port mappings into container"""
56
+
57
+ container: str | None = Field(default=None)
58
+ """Optional container name (does not apply to all sandboxes)."""
59
+
40
60
 
41
61
  class SandboxEnvironment(abc.ABC):
42
62
  """Environment for executing arbitrary code from tools.
@@ -139,6 +159,7 @@ class SandboxEnvironment(abc.ABC):
139
159
  env: dict[str, str] = {},
140
160
  user: str | None = None,
141
161
  timeout: int | None = None,
162
+ timeout_retry: bool = True,
142
163
  ) -> ExecResult[str]:
143
164
  """Execute a command within a sandbox environment.
144
165
 
@@ -155,12 +176,17 @@ class SandboxEnvironment(abc.ABC):
155
176
  env (dict[str,str]): Environment variables for execution.
156
177
  user (str | None): Optional username or UID to run the command as.
157
178
  timeout (int | None): Optional execution timeout (seconds).
179
+ timeout_retry (bool): Retry the command in the case that it times out.
180
+ Commands will be retried up to twice, with a timeout of no greater
181
+ than 60 seconds for the first retry and 30 for the second.
182
+
158
183
 
159
184
  Returns:
160
185
  Execution result (status code, stderr/stdout, etc.)
161
186
 
162
187
  Raises:
163
- TimeoutError: If the specified `timeout` expires.
188
+ TimeoutError: If the specified `timeout` expires
189
+ (and `timeout_retry` attempts also timeout).
164
190
  UnicodeDecodeError: If an error occurs while
165
191
  decoding the command output.
166
192
  PermissionError: If the user does not have
@@ -55,6 +55,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
55
55
  env: dict[str, str] = {},
56
56
  user: str | None = None,
57
57
  timeout: int | None = None,
58
+ timeout_retry: bool = True,
58
59
  ) -> ExecResult[str]:
59
60
  if user is not None:
60
61
  warnings.warn(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: inspect_ai
3
- Version: 0.3.57
3
+ Version: 0.3.59
4
4
  Summary: Framework for large language model evaluations
5
5
  Author: UK AI Safety Institute
6
6
  License: MIT License
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
67
67
  Requires-Dist: pytest-cov; extra == "dev"
68
68
  Requires-Dist: pytest-dotenv; extra == "dev"
69
69
  Requires-Dist: pytest-xdist; extra == "dev"
70
- Requires-Dist: ruff==0.9.0; extra == "dev"
70
+ Requires-Dist: ruff==0.9.2; extra == "dev"
71
71
  Requires-Dist: textual-dev>=0.86.2; extra == "dev"
72
72
  Requires-Dist: types-PyYAML; extra == "dev"
73
73
  Requires-Dist: types-beautifulsoup4; extra == "dev"