inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. inspect_ai/_cli/eval.py +55 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/core/progress.py +9 -3
  5. inspect_ai/_display/core/results.py +8 -4
  6. inspect_ai/_display/textual/app.py +5 -1
  7. inspect_ai/_display/textual/widgets/task_detail.py +3 -0
  8. inspect_ai/_display/textual/widgets/tasks.py +97 -6
  9. inspect_ai/_eval/eval.py +33 -0
  10. inspect_ai/_eval/evalset.py +4 -0
  11. inspect_ai/_eval/registry.py +2 -2
  12. inspect_ai/_eval/task/images.py +4 -14
  13. inspect_ai/_eval/task/results.py +22 -4
  14. inspect_ai/_eval/task/run.py +40 -20
  15. inspect_ai/_eval/task/sandbox.py +72 -43
  16. inspect_ai/_eval/task/task.py +4 -0
  17. inspect_ai/_eval/task/util.py +2 -0
  18. inspect_ai/_util/constants.py +3 -3
  19. inspect_ai/_util/display.py +1 -0
  20. inspect_ai/_util/logger.py +34 -8
  21. inspect_ai/_util/trace.py +275 -0
  22. inspect_ai/_view/www/App.css +13 -0
  23. inspect_ai/_view/www/dist/assets/index.css +13 -0
  24. inspect_ai/_view/www/dist/assets/index.js +80 -43
  25. inspect_ai/_view/www/src/App.mjs +31 -6
  26. inspect_ai/_view/www/src/Types.mjs +6 -0
  27. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  28. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  29. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  30. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  31. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  32. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  33. inspect_ai/log/_log.py +6 -0
  34. inspect_ai/log/_message.py +2 -2
  35. inspect_ai/log/_recorders/eval.py +8 -18
  36. inspect_ai/log/_recorders/json.py +19 -17
  37. inspect_ai/model/_cache.py +22 -16
  38. inspect_ai/model/_call_tools.py +9 -1
  39. inspect_ai/model/_generate_config.py +8 -2
  40. inspect_ai/model/_model.py +11 -12
  41. inspect_ai/model/_providers/azureai.py +1 -1
  42. inspect_ai/model/_providers/bedrock.py +18 -2
  43. inspect_ai/model/_providers/hf.py +1 -1
  44. inspect_ai/model/_providers/openai.py +32 -8
  45. inspect_ai/model/_providers/providers.py +1 -1
  46. inspect_ai/model/_providers/vllm.py +1 -1
  47. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  48. inspect_ai/util/_sandbox/context.py +7 -3
  49. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  50. inspect_ai/util/_sandbox/docker/config.py +8 -10
  51. inspect_ai/util/_sandbox/docker/docker.py +20 -16
  52. inspect_ai/util/_sandbox/docker/util.py +3 -9
  53. inspect_ai/util/_sandbox/environment.py +7 -2
  54. inspect_ai/util/_sandbox/limits.py +1 -1
  55. inspect_ai/util/_sandbox/local.py +8 -9
  56. inspect_ai/util/_sandbox/service.py +17 -7
  57. inspect_ai/util/_subprocess.py +6 -1
  58. inspect_ai/util/_subtask.py +8 -2
  59. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
  60. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
  61. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
  62. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
  63. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
  64. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/util/_sandbox/docker/docker.py +20 -16

@@ -5,7 +5,6 @@ from logging import getLogger
 from pathlib import Path, PurePosixPath
 from typing import Literal, Union, cast, overload
 
-import aiofiles
 from typing_extensions import override
 
 from inspect_ai.util._subprocess import ExecResult
@@ -43,7 +42,7 @@ from .compose import (
 from .config import CONFIG_FILES, DOCKERFILE
 from .internal import build_internal_image, is_internal_image
 from .prereqs import validate_prereqs
-from .util import ComposeProject, sandbox_log, task_project_name
+from .util import ComposeProject, task_project_name
 
 logger = getLogger(__name__)
 
@@ -54,6 +53,11 @@ class DockerSandboxEnvironment(SandboxEnvironment):
     def config_files(cls) -> list[str]:
         return CONFIG_FILES + [DOCKERFILE]
 
+    @classmethod
+    def default_concurrency(cls) -> int | None:
+        count = os.cpu_count() or 1
+        return 2 * count
+
     @classmethod
     async def task_init(
         cls, task_name: str, config: SandboxEnvironmentConfigType | None
@@ -109,8 +113,6 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         config: SandboxEnvironmentConfigType | None,
         metadata: dict[str, str],
     ) -> dict[str, SandboxEnvironment]:
-        sandbox_log("setup")
-
         # create environment variables for sample metadata
         env: dict[str, str] = {}
         if isinstance(config, str) and Path(config).exists():
@@ -260,7 +262,9 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
     @override
     async def write_file(self, file: str, contents: str | bytes) -> None:
-        sandbox_log(f"write_file: {file}")
+        # exec function w/ timeout
+        async def exec(cmd: list[str]) -> ExecResult[str]:
+            return await self.exec(cmd, timeout=60)
 
         # resolve relative file paths
         file = self.container_file(file)
@@ -307,8 +311,8 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         local_tmpfile.close()  # this will also delete the file
 
         if not hasattr(self, "_docker_user"):
-            uid = (await self.exec(["id", "-u"])).stdout.strip()
-            gid = (await self.exec(["id", "-g"])).stdout.strip()
+            uid = (await exec(["id", "-u"])).stdout.strip()
+            gid = (await exec(["id", "-g"])).stdout.strip()
             self._docker_user = (uid, gid)
 
         await compose_command(
@@ -327,7 +331,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         parent = PurePosixPath(file).parent
 
         # We do these steps in a shell script for efficiency to avoid round-trips to docker.
-        res_cp = await self.exec(
+        res_cp = await exec(
             [
                 "sh",
                 "-e",
@@ -342,7 +346,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
         if res_cp.returncode != 0:
             if "Permission denied" in res_cp.stderr:
-                ls_result = await self.exec(["ls", "-la", "."])
+                ls_result = await exec(["ls", "-la", "."])
                 error_string = f"Permission was denied. Error details: {res_cp.stderr}; ls -la: {ls_result.stdout}; {self._docker_user=}"
                 raise PermissionError(error_string)
             elif (
@@ -363,8 +367,6 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
     @override
     async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]:
-        sandbox_log(f"read_file: {file}")
-
         # Write the contents to a temp file
         with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
             # resolve relative file paths
@@ -403,11 +405,11 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
             # read and return w/ appropriate encoding
             if text:
-                async with aiofiles.open(dest_file, "r", encoding="utf-8") as f:
-                    return await f.read()
+                with open(dest_file, "r", encoding="utf-8") as f:
+                    return f.read()
             else:
-                async with aiofiles.open(dest_file, "rb") as f:
-                    return await f.read()
+                with open(dest_file, "rb") as f:
+                    return f.read()
 
     @override
     async def connection(self) -> SandboxConnection:
@@ -445,7 +447,9 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 async def container_working_dir(
     service: str, project: ComposeProject, default: str = "/"
 ) -> str:
-    result = await compose_exec([service, "sh", "-c", "pwd"], project)
+    result = await compose_exec(
+        [service, "sh", "-c", "pwd"], timeout=60, project=project
+    )
     if result.success:
         return result.stdout.strip()
     else:
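The docker provider's file operations now go through a local `exec` helper that passes `timeout=60` to `SandboxEnvironment.exec()`, which raises `TimeoutError` when a container command hangs. A minimal sketch of handling that timeout from eval code (the helper function below is illustrative, not part of the package):

```python
# Illustrative sketch: call into the sandbox with a bounded timeout, mirroring
# the timeout=60 wrapper added around docker file operations above. Assumes it
# runs inside an Inspect solver or tool where sandbox() is available.
from inspect_ai.util import sandbox


async def container_pwd(default: str = "/") -> str:
    try:
        result = await sandbox().exec(["sh", "-c", "pwd"], timeout=60)
    except TimeoutError:
        # exec() raises TimeoutError when the command exceeds its timeout
        return default
    return result.stdout.strip() if result.success else default
```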
inspect_ai/util/_sandbox/docker/util.py +3 -9

@@ -5,8 +5,6 @@ from pathlib import Path
 
 from shortuuid import uuid
 
-from inspect_ai._util.constants import SANDBOX
-
 from ..environment import SandboxEnvironmentConfigType
 from .config import (
     COMPOSE_DOCKERFILE_YAML,
@@ -41,7 +39,7 @@ class ComposeProject:
 
         # if its a Dockerfile, then config is the auto-generated .compose.yaml
         if config_path and is_dockerfile(config_path.name):
-            config = await auto_compose_file(
+            config = auto_compose_file(
                 COMPOSE_DOCKERFILE_YAML, config_path.parent.as_posix()
             )
 
@@ -51,12 +49,12 @@ class ComposeProject:
 
         # no config passed, look for 'auto-config' (compose.yaml, Dockerfile, etc.)
         else:
-            config = await resolve_compose_file()
+            config = resolve_compose_file()
 
         # this could be a cleanup where docker has tracked a .compose.yaml file
         # as part of its ConfigFiles and passed it back to us -- we in the
         # meantime have cleaned it up so we re-create it here as required
-        await ensure_auto_compose_file(config)
+        ensure_auto_compose_file(config)
 
         # return project
         return ComposeProject(name, config, env)
@@ -94,7 +92,3 @@ inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{22}$"
 
 def is_inspect_project(name: str) -> bool:
     return re.match(inspect_project_pattern, name) is not None
-
-
-def sandbox_log(msg: str) -> None:
-    logger.log(SANDBOX, f"DOCKER: {msg}")
inspect_ai/util/_sandbox/environment.py +7 -2

@@ -53,6 +53,11 @@ class SandboxEnvironment(abc.ABC):
         """Standard config files for this provider (used for automatic discovery)"""
         return []
 
+    @classmethod
+    def default_concurrency(cls) -> int | None:
+        """Default max_sandboxes for this provider (`None` means no maximum)"""
+        return None
+
     @classmethod
     async def task_init(
         cls, task_name: str, config: SandboxEnvironmentConfigType | None
@@ -143,7 +148,7 @@ class SandboxEnvironment(abc.ABC):
         The current working directory for execution will be the per-sample
         filesystem context.
 
-        Each output stream (stdout and stderr) is limited to 1 MiB. If exceeded, an
+        Each output stream (stdout and stderr) is limited to 10 MiB. If exceeded, an
         `OutputLimitExceededError` will be raised.
 
         Args:
@@ -164,7 +169,7 @@ class SandboxEnvironment(abc.ABC):
           PermissionError: If the user does not have
             permission to execute the command.
           OutputLimitExceededError: If an output stream
-            exceeds the 1 MiB limit.
+            exceeds the 10 MiB limit.
         """
         ...
 
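`default_concurrency()` is a new hook on the `SandboxEnvironment` base class that lets a provider suggest a default value for `max_sandboxes` (the Docker provider diff earlier returns two sandboxes per CPU). A sketch of a custom provider overriding it; the class name is hypothetical and the provider's other required methods are omitted:

```python
# Hypothetical provider fragment showing the new default_concurrency() hook;
# a real provider would also implement exec(), read_file(), write_file(), etc.
import os

from inspect_ai.util import SandboxEnvironment


class MyCloudSandbox(SandboxEnvironment):
    @classmethod
    def default_concurrency(cls) -> int | None:
        # allow two concurrent sandboxes per host CPU, like the Docker provider
        return 2 * (os.cpu_count() or 1)
```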
inspect_ai/util/_sandbox/limits.py +1 -1

@@ -29,7 +29,7 @@ def verify_exec_result_size(exec_result: ExecResult[str]) -> None:
     """Verify the size of the output streams in an `ExecResult`.
 
     Raises:
-        OutputLimitExceededError: If an output stream exceeds the 1 MiB limit.
+        OutputLimitExceededError: If an output stream exceeds the limit.
     """
     limit = SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE
     stdout_truncated = truncate_string_to_bytes(exec_result.stdout, limit)
inspect_ai/util/_sandbox/local.py +8 -9

@@ -3,7 +3,6 @@ import warnings
 from pathlib import Path
 from typing import Literal, Union, cast, overload
 
-import aiofiles
 from typing_extensions import override
 
 from .._subprocess import ExecResult, subprocess
@@ -85,11 +84,11 @@ class LocalSandboxEnvironment(SandboxEnvironment):
         Path(file).parent.mkdir(parents=True, exist_ok=True)
 
         if isinstance(contents, str):
-            async with aiofiles.open(file, "w", encoding="utf-8") as f:
-                await f.write(contents)
+            with open(file, "w", encoding="utf-8") as f:
+                f.write(contents)
         else:
-            async with aiofiles.open(file, "wb") as f:
-                await f.write(contents)
+            with open(file, "wb") as f:
+                f.write(contents)
 
     @overload
     async def read_file(self, file: str, text: Literal[True] = True) -> str: ...
@@ -102,11 +101,11 @@ class LocalSandboxEnvironment(SandboxEnvironment):
         file = self._resolve_file(file)
         verify_read_file_size(file)
         if text:
-            async with aiofiles.open(file, "r", encoding="utf-8") as f:
-                return await f.read()
+            with open(file, "r", encoding="utf-8") as f:
+                return f.read()
         else:
-            async with aiofiles.open(file, "rb") as f:
-                return await f.read()
+            with open(file, "rb") as f:
+                return f.read()
 
     def _resolve_file(self, file: str) -> str:
         path = Path(file)
inspect_ai/util/_sandbox/service.py +17 -7

@@ -10,6 +10,8 @@ from typing import (
 
 from pydantic import JsonValue
 
+from inspect_ai.util._subprocess import ExecResult
+
 from .environment import SandboxEnvironment
 
 REQUESTS_DIR = "requests"
@@ -129,9 +131,9 @@ class SandboxService:
         """Handle all pending service requests."""
         # list pending requests
         list_requests = f"ls -1 {self._requests_dir}/*.json"
-        result = await self._sandbox.exec(["bash", "-c", list_requests])
+        result = await self._exec(["bash", "-c", list_requests])
 
-        # process reqests
+        # process requests
         if result.success:
             request_files = result.stdout.strip().splitlines()
             if request_files:
@@ -142,7 +144,7 @@ class SandboxService:
     async def _handle_request(self, request_file: str) -> None:
         # read request
         read_request = f"cat {request_file}"
-        result = await self._sandbox.exec(["bash", "-c", read_request])
+        result = await self._exec(["bash", "-c", read_request])
         if not result.success:
             raise RuntimeError(
                 f"Error reading request for service {self._name}: '{read_request}' ({result.stderr})"
@@ -181,7 +183,7 @@ class SandboxService:
         await self._write_text_file(response_path, json.dumps(response_data))
 
         # remove request file
-        exec_rm = await self._sandbox.exec(["rm", "-f", request_file])
+        exec_rm = await self._exec(["rm", "-f", request_file])
         if not exec_rm.success:
             raise RuntimeError(
                 f"Error removing request file '{request_file}': {exec_rm.stderr}"
@@ -215,8 +217,8 @@ class SandboxService:
 
     async def _create_rpc_dir(self, name: str) -> str:
         rpc_dir = PurePosixPath(self._service_dir, name).as_posix()
-        result = await self._sandbox.exec(["rm", "-rf", rpc_dir])
-        result = await self._sandbox.exec(["mkdir", "-p", rpc_dir])
+        result = await self._exec(["rm", "-rf", rpc_dir])
+        result = await self._exec(["mkdir", "-p", rpc_dir])
        if not result.success:
             raise RuntimeError(
                 f"Error creating rpc directory '{name}' for sandbox '{self._name}': {result.stderr}"
@@ -224,11 +226,19 @@ class SandboxService:
         return rpc_dir
 
     async def _write_text_file(self, file: str, contents: str) -> None:
-        result = await self._sandbox.exec(["tee", "--", file], input=contents)
+        result = await self._exec(["tee", "--", file], input=contents)
         if not result.success:
             msg = f"Failed to write file '{file}' into container: {result.stderr}"
             raise RuntimeError(msg)
 
+    async def _exec(self, cmd: list[str], input: str | None = None) -> ExecResult[str]:
+        try:
+            return await self._sandbox.exec(cmd, input=input, timeout=30)
+        except TimeoutError:
+            raise RuntimeError(
+                f"Timed out executing command {' '.join(cmd)} in sandbox"
+            )
+
     def _generate_client(self) -> str:
         return dedent(f"""
             from typing import Any
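`SandboxService` now routes all of its container commands through a `_exec()` helper that applies a 30-second timeout and converts `TimeoutError` into a `RuntimeError` naming the command. The same pattern in isolation, as a hedged sketch (the helper name and message below are illustrative):

```python
# Generic sketch of the timeout-to-RuntimeError pattern used by
# SandboxService._exec above; the helper name is illustrative.
import asyncio
from typing import Awaitable, TypeVar

T = TypeVar("T")


async def bounded(operation: Awaitable[T], seconds: float, what: str) -> T:
    try:
        return await asyncio.wait_for(operation, timeout=seconds)
    except asyncio.TimeoutError:
        raise RuntimeError(f"Timed out after {seconds}s while {what}") from None
```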
inspect_ai/util/_subprocess.py +6 -1

@@ -1,5 +1,6 @@
 import asyncio
 import os
+import shlex
 import sys
 from asyncio.subprocess import Process
 from contextvars import ContextVar
@@ -8,6 +9,8 @@ from logging import getLogger
 from pathlib import Path
 from typing import AsyncGenerator, Generic, Literal, TypeVar, Union, cast, overload
 
+from inspect_ai._util.trace import trace_action
+
 from ._concurrency import concurrency
 
 logger = getLogger(__name__)
@@ -217,7 +220,9 @@ async def subprocess(
 
     # run command
     async with concurrency("subprocesses", max_subprocesses_context_var.get()):
-        return await run_command_timeout()
+        message = args if isinstance(args, str) else shlex.join(args)
+        with trace_action(logger, "Subprocess", message):
+            return await run_command_timeout()
 
 
 def init_max_subprocesses(max_subprocesses: int | None = None) -> None:
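`subprocess()` now wraps command execution in `trace_action()` from the new `inspect_ai._util.trace` module, so each command's start, completion, or failure is recorded in the trace log surfaced by the new trace CLI (`inspect_ai/_cli/trace.py` in the file list above). A minimal sketch of the same pattern applied to other awaited work (`trace_action` is an internal API and the traced coroutine below is a hypothetical placeholder):

```python
# Sketch: wrap an awaited operation in trace_action() so its start/end (and any
# exception) appear in Inspect's trace log. fetch_dataset() is a placeholder.
from logging import getLogger

from inspect_ai._util.trace import trace_action

logger = getLogger(__name__)


async def fetch_dataset(url: str) -> bytes:
    return b""  # placeholder for real I/O


async def load_data(url: str) -> bytes:
    with trace_action(logger, "Fetch", url):
        return await fetch_dataset(url)
```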
inspect_ai/util/_subtask.py +8 -2

@@ -1,6 +1,7 @@
 import asyncio
 import inspect
 from functools import wraps
+from logging import getLogger
 from typing import (
     Any,
     Callable,
@@ -13,6 +14,7 @@ from typing import (
 
 from inspect_ai._util._async import is_callable_coroutine
 from inspect_ai._util.content import Content
+from inspect_ai._util.trace import trace_action
 from inspect_ai.util._store import Store, dict_jsonable, init_subtask_store
 
 SubtaskResult = str | int | float | bool | list[Content]
@@ -20,6 +22,9 @@ SubtaskResult = str | int | float | bool | list[Content]
 RT = TypeVar("RT", SubtaskResult, Any)
 
 
+logger = getLogger(__name__)
+
+
 @runtime_checkable
 class Subtask(Protocol):
     """Subtask with distinct `Store` and `Transcript`.
@@ -118,8 +123,9 @@ def subtask(
             init_subtask(subtask_name, store if store else Store())
 
             # run the subtask
-            with track_store_changes():  # type: ignore
-                result = await func(*args, **kwargs)
+            with trace_action(logger, "Subtask", subtask_name):
+                with track_store_changes():  # type: ignore
+                    result = await func(*args, **kwargs)
 
             # return result and event
             return result, list(transcript().events)
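Subtask execution is now wrapped in `trace_action(logger, "Subtask", subtask_name)`, so every `@subtask` invocation shows up in the trace log alongside subprocess activity. A minimal sketch of a subtask whose run would be traced this way (the body is purely illustrative):

```python
# Minimal subtask sketch; its execution is what the trace_action wrapper above
# now records. The word-count logic is just an illustrative body.
from inspect_ai.util import subtask


@subtask
async def word_count(text: str) -> int:
    return len(text.split())
```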
{inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: inspect_ai
-Version: 0.3.52
+Version: 0.3.54
 Summary: Framework for large language model evaluations
 Author: UK AI Safety Institute
 License: MIT License
@@ -20,7 +20,6 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: aiofiles
 Requires-Dist: aiohttp>=3.9.0
 Requires-Dist: anyio>=4.4.0
 Requires-Dist: beautifulsoup4
@@ -71,7 +70,6 @@ Requires-Dist: pytest-xdist; extra == "dev"
 Requires-Dist: ruff==0.8.3; extra == "dev"
 Requires-Dist: textual-dev>=0.86.2; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: types-aiofiles; extra == "dev"
 Requires-Dist: types-beautifulsoup4; extra == "dev"
 Requires-Dist: types-aioboto3; extra == "dev"
 Requires-Dist: types-boto3; extra == "dev"
@@ -98,22 +96,22 @@ To get started with Inspect, please see the documentation at <https://inspect.ai
 
 ***
 
-
-
 To work on development of Inspect, clone the repository and install with the `-e` flag and `[dev]` optional dependencies:
 
 ```bash
-$ git clone https://github.com/UKGovernmentBEIS/inspect_ai.git
-$ cd inspect_ai
-$ pip install -e ".[dev]"
+git clone https://github.com/UKGovernmentBEIS/inspect_ai.git
+cd inspect_ai
+pip install -e ".[dev]"
 ```
 
 Optionally install pre-commit hooks via
+
 ```bash
 make hooks
 ```
 
 Run linting, formatting, and tests via
+
 ```bash
 make check
 make test