inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +4 -2
  3. inspect_ai/_cli/eval.py +2 -0
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +0 -2
  6. inspect_ai/_display/core/panel.py +1 -1
  7. inspect_ai/_display/rich/display.py +4 -4
  8. inspect_ai/_display/textual/app.py +4 -1
  9. inspect_ai/_display/textual/widgets/samples.py +41 -5
  10. inspect_ai/_eval/eval.py +32 -20
  11. inspect_ai/_eval/evalset.py +7 -5
  12. inspect_ai/_eval/run.py +16 -11
  13. inspect_ai/_eval/task/__init__.py +2 -2
  14. inspect_ai/_eval/task/images.py +40 -25
  15. inspect_ai/_eval/task/run.py +141 -119
  16. inspect_ai/_eval/task/task.py +140 -25
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/content.py +23 -1
  19. inspect_ai/_util/datetime.py +1 -1
  20. inspect_ai/_util/deprecation.py +1 -1
  21. inspect_ai/_util/images.py +20 -17
  22. inspect_ai/_util/json.py +11 -1
  23. inspect_ai/_util/kvstore.py +73 -0
  24. inspect_ai/_util/logger.py +2 -1
  25. inspect_ai/_util/notgiven.py +18 -0
  26. inspect_ai/_util/thread.py +5 -0
  27. inspect_ai/_util/trace.py +39 -3
  28. inspect_ai/_util/transcript.py +36 -7
  29. inspect_ai/_view/www/.prettierrc.js +12 -0
  30. inspect_ai/_view/www/dist/assets/index.js +322 -226
  31. inspect_ai/_view/www/log-schema.json +221 -138
  32. inspect_ai/_view/www/src/App.mjs +18 -9
  33. inspect_ai/_view/www/src/Types.mjs +0 -1
  34. inspect_ai/_view/www/src/api/Types.mjs +15 -4
  35. inspect_ai/_view/www/src/api/api-http.mjs +2 -0
  36. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
  37. inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
  38. inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
  39. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  40. inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
  41. inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
  42. inspect_ai/_view/www/src/components/Tools.mjs +18 -3
  43. inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
  44. inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
  45. inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
  46. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
  47. inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
  48. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
  49. inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
  50. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
  51. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
  52. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
  53. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
  54. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
  55. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
  56. inspect_ai/_view/www/src/types/log.d.ts +53 -35
  57. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  58. inspect_ai/approval/_human/util.py +2 -2
  59. inspect_ai/dataset/_sources/csv.py +2 -1
  60. inspect_ai/dataset/_sources/json.py +2 -1
  61. inspect_ai/dataset/_sources/util.py +15 -7
  62. inspect_ai/log/_condense.py +11 -1
  63. inspect_ai/log/_log.py +27 -5
  64. inspect_ai/log/_recorders/eval.py +21 -8
  65. inspect_ai/log/_samples.py +10 -5
  66. inspect_ai/log/_transcript.py +28 -1
  67. inspect_ai/model/__init__.py +10 -2
  68. inspect_ai/model/_call_tools.py +82 -17
  69. inspect_ai/model/_chat_message.py +2 -4
  70. inspect_ai/model/{_trace.py → _conversation.py} +9 -8
  71. inspect_ai/model/_model.py +2 -2
  72. inspect_ai/model/_providers/anthropic.py +9 -7
  73. inspect_ai/model/_providers/azureai.py +6 -4
  74. inspect_ai/model/_providers/bedrock.py +6 -4
  75. inspect_ai/model/_providers/google.py +103 -14
  76. inspect_ai/model/_providers/groq.py +7 -5
  77. inspect_ai/model/_providers/hf.py +11 -6
  78. inspect_ai/model/_providers/mistral.py +6 -9
  79. inspect_ai/model/_providers/openai.py +34 -8
  80. inspect_ai/model/_providers/openai_o1.py +10 -12
  81. inspect_ai/model/_providers/vertex.py +17 -4
  82. inspect_ai/scorer/__init__.py +13 -2
  83. inspect_ai/scorer/_metrics/__init__.py +2 -2
  84. inspect_ai/scorer/_metrics/std.py +3 -3
  85. inspect_ai/tool/__init__.py +9 -1
  86. inspect_ai/tool/_tool.py +9 -2
  87. inspect_ai/tool/_tool_info.py +2 -1
  88. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
  89. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
  90. inspect_ai/util/__init__.py +4 -3
  91. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  92. inspect_ai/util/_display.py +14 -4
  93. inspect_ai/util/_sandbox/context.py +12 -13
  94. inspect_ai/util/_sandbox/docker/compose.py +24 -13
  95. inspect_ai/util/_sandbox/docker/docker.py +20 -13
  96. inspect_ai/util/_sandbox/docker/util.py +2 -1
  97. inspect_ai/util/_sandbox/environment.py +13 -1
  98. inspect_ai/util/_sandbox/local.py +1 -0
  99. inspect_ai/util/_sandbox/self_check.py +18 -18
  100. inspect_ai/util/_store.py +2 -2
  101. inspect_ai/util/_subprocess.py +3 -3
  102. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
  103. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
  104. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
  105. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
  106. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
  107. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
@@ -138,28 +138,31 @@ class DockerSandboxEnvironment(SandboxEnvironment):
138
138
  # start the services
139
139
  await compose_up(project)
140
140
 
141
+ # check to ensure that the services are running
142
+ running_services = await compose_check_running(
143
+ list(services.keys()), project=project
144
+ )
145
+
141
146
  # note that the project is running
142
147
  project_startup(project)
143
148
 
144
- # check to ensure that the services are running
145
- await compose_check_running(list(services.keys()), project=project)
146
-
147
- # create sandbox environments
149
+ # create sandbox environments for all running services
148
150
  default_service: str | None = None
149
151
  environments: dict[str, SandboxEnvironment] = {}
150
152
  for service, service_info in services.items():
151
- # update the project w/ the working directory
152
- working_dir = await container_working_dir(service, project)
153
+ if service in running_services:
154
+ # update the project w/ the working directory
155
+ working_dir = await container_working_dir(service, project)
153
156
 
154
- # create the docker sandbox environemnt
155
- docker_env = DockerSandboxEnvironment(service, project, working_dir)
157
+ # create the docker sandbox environemnt
158
+ docker_env = DockerSandboxEnvironment(service, project, working_dir)
156
159
 
157
- # save reference to default service if requested
158
- if service_info.get("x-default", False):
159
- default_service = service
160
+ # save reference to default service if requested
161
+ if service_info.get("x-default", False):
162
+ default_service = service
160
163
 
161
- # record service => environment
162
- environments[service] = docker_env
164
+ # record service => environment
165
+ environments[service] = docker_env
163
166
 
164
167
  # confirm that we have a 'default' environemnt
165
168
  if environments.get("default", None) is None and default_service is None:
@@ -225,6 +228,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
225
228
  env: dict[str, str] = {},
226
229
  user: str | None = None,
227
230
  timeout: int | None = None,
231
+ timeout_retry: bool = True,
228
232
  ) -> ExecResult[str]:
229
233
  # additional args
230
234
  args = []
@@ -251,6 +255,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
251
255
  args + [self._service] + cmd,
252
256
  project=self._project,
253
257
  timeout=timeout,
258
+ timeout_retry=timeout_retry,
254
259
  input=input,
255
260
  output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
256
261
  )
@@ -428,11 +433,13 @@ class DockerSandboxEnvironment(SandboxEnvironment):
428
433
  # return container connection
429
434
  if container:
430
435
  return SandboxConnection(
436
+ type="docker",
431
437
  command=f"docker exec -it {container} bash -l",
432
438
  vscode_command=[
433
439
  "remote-containers.attachToRunningContainer",
434
440
  container,
435
441
  ],
442
+ container=container,
436
443
  )
437
444
  # error (not currently running)
438
445
  else:
@@ -84,7 +84,8 @@ def task_project_name(task: str) -> str:
84
84
  if len(task) == 0:
85
85
  task = "task"
86
86
 
87
- return f"inspect-{task[:12]}-i{uuid().lower()[:6]}"
87
+ # _- breaks docker project name constraints so we strip trailing underscores.
88
+ return f"inspect-{task[:12].rstrip('_')}-i{uuid().lower()[:6]}"
88
89
 
89
90
 
90
91
  inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{6,}$"
@@ -31,12 +31,18 @@ SampleCleanup = Callable[
31
31
  class SandboxConnection(BaseModel):
32
32
  """Information required to connect to sandbox."""
33
33
 
34
+ type: str
35
+ """Sandbox type name (e.g. 'docker', 'local', etc.)"""
36
+
34
37
  command: str
35
38
  """Shell command to connect to sandbox."""
36
39
 
37
40
  vscode_command: list[Any] | None = Field(default=None)
38
41
  """Optional vscode command (+args) to connect to sandbox."""
39
42
 
43
+ container: str | None = Field(default=None)
44
+ """Optional container name (does not apply to all sandboxes)."""
45
+
40
46
 
41
47
  class SandboxEnvironment(abc.ABC):
42
48
  """Environment for executing arbitrary code from tools.
@@ -139,6 +145,7 @@ class SandboxEnvironment(abc.ABC):
139
145
  env: dict[str, str] = {},
140
146
  user: str | None = None,
141
147
  timeout: int | None = None,
148
+ timeout_retry: bool = True,
142
149
  ) -> ExecResult[str]:
143
150
  """Execute a command within a sandbox environment.
144
151
 
@@ -155,12 +162,17 @@ class SandboxEnvironment(abc.ABC):
155
162
  env (dict[str,str]): Environment variables for execution.
156
163
  user (str | None): Optional username or UID to run the command as.
157
164
  timeout (int | None): Optional execution timeout (seconds).
165
+ timeout_retry (bool): Retry the command in the case that it times out.
166
+ Commands will be retried up to twice, with a timeout of no greater
167
+ than 60 seconds for the first retry and 30 for the second.
168
+
158
169
 
159
170
  Returns:
160
171
  Execution result (status code, stderr/stdout, etc.)
161
172
 
162
173
  Raises:
163
- TimeoutError: If the specified `timeout` expires.
174
+ TimeoutError: If the specified `timeout` expires
175
+ (and `timeout_retry` attempts also timeout).
164
176
  UnicodeDecodeError: If an error occurs while
165
177
  decoding the command output.
166
178
  PermissionError: If the user does not have
@@ -55,6 +55,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
55
55
  env: dict[str, str] = {},
56
56
  user: str | None = None,
57
57
  timeout: int | None = None,
58
+ timeout_retry: bool = True,
58
59
  ) -> ExecResult[str]:
59
60
  if user is not None:
60
61
  warnings.warn(
@@ -75,9 +75,9 @@ async def test_read_and_write_file_text(sandbox_env: SandboxEnvironment) -> None
75
75
  written_file_string = await sandbox_env.read_file(
76
76
  "test_read_and_write_file_text.file", text=True
77
77
  )
78
- assert (
79
- "great #content\nincluding newlines" == written_file_string
80
- ), f"unexpected content: [{written_file_string}]"
78
+ assert "great #content\nincluding newlines" == written_file_string, (
79
+ f"unexpected content: [{written_file_string}]"
80
+ )
81
81
  await _cleanup_file(sandbox_env, "test_read_and_write_file_text.file")
82
82
 
83
83
 
@@ -219,9 +219,9 @@ async def test_exec_output(sandbox_env: SandboxEnvironment) -> None:
219
219
  exec_result = await sandbox_env.exec(["sh", "-c", "echo foo; echo bar"])
220
220
  expected = "foo\nbar\n"
221
221
  # in the assertion message, we show the actual bytes to help debug newline issues
222
- assert (
223
- exec_result.stdout == expected
224
- ), f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
222
+ assert exec_result.stdout == expected, (
223
+ f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
224
+ )
225
225
 
226
226
 
227
227
  async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None:
@@ -248,13 +248,13 @@ async def test_exec_as_user(sandbox_env: SandboxEnvironment) -> None:
248
248
 
249
249
  # Test exec as different users
250
250
  root_result = await sandbox_env.exec(["whoami"], user="root")
251
- assert (
252
- root_result.stdout.strip() == "root"
253
- ), f"Expected 'root', got '{root_result.stdout.strip()}'"
251
+ assert root_result.stdout.strip() == "root", (
252
+ f"Expected 'root', got '{root_result.stdout.strip()}'"
253
+ )
254
254
  myuser_result = await sandbox_env.exec(["whoami"], user=username)
255
- assert (
256
- myuser_result.stdout.strip() == username
257
- ), f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
255
+ assert myuser_result.stdout.strip() == username, (
256
+ f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
257
+ )
258
258
  finally:
259
259
  # Clean up
260
260
  await sandbox_env.exec(["userdel", "-r", username], user="root")
@@ -266,9 +266,9 @@ async def test_exec_as_nonexistent_user(sandbox_env: SandboxEnvironment) -> None
266
266
  expected_error = (
267
267
  "unable to find user nonexistent: no matching entries in passwd file"
268
268
  )
269
- assert (
270
- expected_error in result.stdout
271
- ), f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
269
+ assert expected_error in result.stdout, (
270
+ f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
271
+ )
272
272
 
273
273
 
274
274
  async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None:
@@ -291,9 +291,9 @@ async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None:
291
291
  file_path = cwd_subdirectory + "/" + file_name
292
292
  await sandbox_env.write_file(file_path, "ls me plz")
293
293
  current_dir_contents = (await sandbox_env.exec(["ls"], cwd=cwd_subdirectory)).stdout
294
- assert (
295
- file_name in current_dir_contents
296
- ), f"{file_name} not found in {current_dir_contents}"
294
+ assert file_name in current_dir_contents, (
295
+ f"{file_name} not found in {current_dir_contents}"
296
+ )
297
297
  await _cleanup_file(sandbox_env, file_path)
298
298
 
299
299
 
inspect_ai/util/_store.py CHANGED
@@ -34,8 +34,8 @@ class Store:
34
34
  inheriting from Pydantic `BaseModel`)
35
35
  """
36
36
 
37
- def __init__(self) -> None:
38
- self._data: dict[str, Any] = {}
37
+ def __init__(self, data: dict[str, Any] | None = None) -> None:
38
+ self._data = deepcopy(data) if data else {}
39
39
 
40
40
  @overload
41
41
  def get(self, key: str, default: None = None) -> Any: ...
@@ -101,9 +101,9 @@ async def subprocess(
101
101
  input = input.encode() if isinstance(input, str) else input
102
102
 
103
103
  # function to run command (we may or may not run it w/ concurrency)
104
- async def run_command() -> (
105
- AsyncGenerator[Union[Process, ExecResult[str], ExecResult[bytes]], None]
106
- ):
104
+ async def run_command() -> AsyncGenerator[
105
+ Union[Process, ExecResult[str], ExecResult[bytes]], None
106
+ ]:
107
107
  if isinstance(args, str):
108
108
  proc = await asyncio.create_subprocess_shell(
109
109
  args,
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: inspect_ai
3
- Version: 0.3.56
3
+ Version: 0.3.58
4
4
  Summary: Framework for large language model evaluations
5
5
  Author: UK AI Safety Institute
6
6
  License: MIT License
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
67
67
  Requires-Dist: pytest-cov; extra == "dev"
68
68
  Requires-Dist: pytest-dotenv; extra == "dev"
69
69
  Requires-Dist: pytest-xdist; extra == "dev"
70
- Requires-Dist: ruff==0.8.4; extra == "dev"
70
+ Requires-Dist: ruff==0.9.1; extra == "dev"
71
71
  Requires-Dist: textual-dev>=0.86.2; extra == "dev"
72
72
  Requires-Dist: types-PyYAML; extra == "dev"
73
73
  Requires-Dist: types-beautifulsoup4; extra == "dev"