inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,17 @@ ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
13
13
 
14
14
  @tool
15
15
  def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool:
16
+ """Desktop computer tool.
17
+
18
+ See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-computer>.
19
+
20
+ Args:
21
+ max_screenshots: The maximum number of screenshots to play
22
+ back to the model as input. Defaults to 1 (set to `None` to have no limit).
23
+ timeout: Timeout in seconds for computer tool actions.
24
+ Defaults to 180 (set to `None` for no timeout).
25
+ """
26
+
16
27
  async def execute(
17
28
  action: Action,
18
29
  text: str | None = None,
@@ -1,4 +1,4 @@
1
- from inspect_ai.util import sandbox
1
+ from inspect_ai.util import sandbox as sandbox_env
2
2
 
3
3
  from .._tool import Tool, tool
4
4
  from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
@@ -20,14 +20,17 @@ def code_viewer(language: str, code_param: str) -> ToolCallViewer:
20
20
 
21
21
 
22
22
  @tool(viewer=code_viewer("bash", "cmd"))
23
- def bash(timeout: int | None = None, user: str | None = None) -> Tool:
23
+ def bash(
24
+ timeout: int | None = None, user: str | None = None, sandbox: str | None = None
25
+ ) -> Tool:
24
26
  """Bash shell command execution tool.
25
27
 
26
28
  Execute bash shell commands using a sandbox environment (e.g. "docker").
27
29
 
28
30
  Args:
29
- timeout (int | None): Timeout (in seconds) for command.
30
- user (str | None): User to execute commands as.
31
+ timeout: Timeout (in seconds) for command.
32
+ user: User to execute commands as.
33
+ sandbox: Optional sandbox environment name.
31
34
 
32
35
  Returns:
33
36
  String with command output (stdout) or command error (stderr).
@@ -44,7 +47,7 @@ def bash(timeout: int | None = None, user: str | None = None) -> Tool:
44
47
  The output of the command.
45
48
  """
46
49
  # execute the command
47
- result = await sandbox().exec(
50
+ result = await sandbox_env(sandbox).exec(
48
51
  cmd=["bash", "--login", "-c", cmd], timeout=timeout, user=user
49
52
  )
50
53
  # return output (including stderr if any)
@@ -57,14 +60,17 @@ def bash(timeout: int | None = None, user: str | None = None) -> Tool:
57
60
 
58
61
 
59
62
  @tool(viewer=code_viewer("python", "code"))
60
- def python(timeout: int | None = None, user: str | None = None) -> Tool:
63
+ def python(
64
+ timeout: int | None = None, user: str | None = None, sandbox: str | None = None
65
+ ) -> Tool:
61
66
  """Python code execution tool.
62
67
 
63
68
  Execute Python code using a sandbox environment (e.g. "docker").
64
69
 
65
70
  Args:
66
- timeout (int | None): Timeout (in seconds) for command.
67
- user (str | None): User to execute commands as.
71
+ timeout: Timeout (in seconds) for command.
72
+ user: User to execute commands as.
73
+ sandbox: Optional sandbox environment name.
68
74
 
69
75
  Returns:
70
76
  String with command output (stdout) or command error (stderr).
@@ -89,7 +95,7 @@ def python(timeout: int | None = None, user: str | None = None) -> Tool:
89
95
  Returns:
90
96
  The output of the Python code.
91
97
  """
92
- result = await sandbox().exec(
98
+ result = await sandbox_env(sandbox).exec(
93
99
  cmd=["python3"], input=code, timeout=timeout, user=user
94
100
  )
95
101
  # return output (including stderr if any)
@@ -40,7 +40,7 @@ The result will be printed out in _stdout_ in the following format:
40
40
  error: <an ERROR message if one occurred>
41
41
  info: <general info about the container>
42
42
  web_url: <the URL of the page the browser is currently at>
43
- wen_at: <accessibility tree of the visible elements of the page>
43
+ web_at: <accessibility tree of the visible elements of the page>
44
44
  ```
45
45
 
46
46
 
@@ -57,7 +57,7 @@ The tool consists of the following components:
57
57
  * _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
58
58
  * _playwright_crawler.py_ - a wrapper over the sync Playwright API.
59
59
 
60
- * [WebClient](web_client.py) - a simple stateless client to interract with the server. When launched, the client:
60
+ * [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
61
61
  1. creates a connection with the server;
62
62
  2. sends user command to the server;
63
63
  3. receives the response in the form of observations and prints them to stdout;
@@ -16,10 +16,12 @@ from inspect_ai.util._store_model import StoreModel, store_as
16
16
  def web_browser(interactive: bool = True) -> list[Tool]:
17
17
  """Tools used for web browser navigation.
18
18
 
19
+ See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-browser>.
20
+
19
21
  Args:
20
- interactive (bool): Provide interactive tools (enable
21
- clicking, typing, and submitting forms). Defaults
22
- to True.
22
+ interactive: Provide interactive tools (enable
23
+ clicking, typing, and submitting forms). Defaults
24
+ to True.
23
25
 
24
26
  Returns:
25
27
  List of tools used for web browser navigation.
@@ -41,14 +41,16 @@ def web_search(
41
41
  A web search is conducted using the specified provider, the results are parsed for relevance
42
42
  using the specified model, and the top 'num_results' relevant pages are returned.
43
43
 
44
+ See further documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-search>.
45
+
44
46
  Args:
45
- provider (Literal["google"]): Search provider (defaults to "google", currently
47
+ provider: Search provider (defaults to "google", currently
46
48
  the only provider). Possible future providers include "brave" and "bing".
47
- num_results (int): Number of web search result pages to return to the model.
48
- max_provider_calls (int): Maximum number of search calls to make to the search provider.
49
- max_connections (int): Maximum number of concurrent connections to API
49
+ num_results: Number of web search result pages to return to the model.
50
+ max_provider_calls: Maximum number of search calls to make to the search provider.
51
+ max_connections: Maximum number of concurrent connections to API
50
52
  endpoint of search provider.
51
- model (str | Model): Model used to parse web pages for relevance.
53
+ model: Model used to parse web pages for relevance.
52
54
 
53
55
  Returns:
54
56
  A tool that can be registered for use by models to search the web.
@@ -23,12 +23,12 @@ def concurrency(
23
23
  for launching subprocesses is handled via the `subprocess` function.
24
24
 
25
25
  Args:
26
- name (str): Name for concurrency context. This serves as the
26
+ name: Name for concurrency context. This serves as the
27
27
  display name for the context, and also the unique context
28
28
  key (if the `key` parameter is omitted)
29
- concurrency (int): Maximum number of coroutines that can
29
+ concurrency: Maximum number of coroutines that can
30
30
  enter the context.
31
- key (str | None): Unique context key for this context. Optional.
31
+ key: Unique context key for this context. Optional.
32
32
  Used if the unique key isn't human readable -- e.g. includes
33
33
  api tokens or account ids so that the more readable `name`
34
34
  can be presented to users e.g. in console UI.
inspect_ai/util/_panel.py CHANGED
@@ -5,6 +5,8 @@ from typing_extensions import Self
5
5
 
6
6
 
7
7
  class InputPanel(Container):
8
+ """Base class for Inspect input panels."""
9
+
8
10
  DEFAULT_TITLE = "Panel"
9
11
 
10
12
  DEFAULT_CLASSES = "task-input-panel"
@@ -33,18 +33,18 @@ def resource(
33
33
  `resource("templates/prompt.txt", type="file")`
34
34
 
35
35
  Args:
36
- resource (str): Path to local or remote (e.g. s3://)
37
- resource, or for `type="auto"` (the default),
38
- a string containing the literal resource value.
39
- type (Literal["auto", "file"]): For "auto" (the default),
40
- interpret the resource as a literal string if its not
41
- a valid path. For "file", always interpret it as
42
- a file path.
43
- fs_options (dict[str, Any]): Optional. Additional
44
- arguments to pass through to the `fsspec` filesystem
45
- provider (e.g. `S3FileSystem`). Use `{"anon": True }`
46
- if you are accessing a public S3 bucket with no
47
- credentials.
36
+ resource: Path to local or remote (e.g. s3://)
37
+ resource, or for `type="auto"` (the default),
38
+ a string containing the literal resource value.
39
+ type: For "auto" (the default),
40
+ interpret the resource as a literal string if it's not
41
+ a valid path. For "file", always interpret it as
42
+ a file path.
43
+ fs_options: Optional. Additional
44
+ arguments to pass through to the `fsspec` filesystem
45
+ provider (e.g. `S3FileSystem`). Use `{"anon": True }`
46
+ if you are accessing a public S3 bucket with no
47
+ credentials.
48
48
 
49
49
  Returns:
50
50
  Text content of resource.
@@ -3,12 +3,13 @@ import os
3
3
  import shlex
4
4
  from logging import getLogger
5
5
  from pathlib import Path
6
- from typing import Any, Literal, TypedDict, cast
6
+ from typing import Any, Literal, cast
7
7
 
8
8
  import yaml
9
9
  from pydantic import BaseModel
10
10
 
11
11
  from inspect_ai._util.error import PrerequisiteError
12
+ from inspect_ai._util.trace import trace_message
12
13
  from inspect_ai.util._display import display_type
13
14
  from inspect_ai.util._subprocess import ExecResult, subprocess
14
15
 
@@ -16,26 +17,39 @@ from .prereqs import (
16
17
  DOCKER_COMPOSE_REQUIRED_VERSION_PULL_POLICY,
17
18
  validate_docker_compose,
18
19
  )
20
+ from .service import ComposeService, services_healthcheck_time
19
21
  from .util import ComposeProject, is_inspect_project
20
22
 
21
23
  logger = getLogger(__name__)
22
24
 
23
25
  # How long to wait for compose environment to pass a health check
24
- COMPOSE_WAIT = "120"
26
+ COMPOSE_WAIT = 120
25
27
 
26
28
 
27
- async def compose_up(project: ComposeProject) -> None:
29
+ async def compose_up(
30
+ project: ComposeProject, services: dict[str, ComposeService]
31
+ ) -> None:
32
+ # compute the maximum amount of time we will wait for services to come up
33
+ up_command = ["up", "--detach", "--wait"]
34
+
35
+ # are there healthchecks in the service definitions? if so then peg our timeout
36
+ # at the maximum total wait time. otherwise, pick a reasonable default
37
+ healthcheck_time = services_healthcheck_time(services)
38
+ if healthcheck_time > 0:
39
+ timeout: int = healthcheck_time
40
+ trace_message(logger, "Docker", "Docker services heathcheck timeout: {timeout}")
41
+ else:
42
+ timeout = COMPOSE_WAIT
43
+
44
+ # align global wait timeout to maximum healthcheck timeout
45
+ up_command.extend(["--wait-timeout", str(timeout + 1)])
46
+
28
47
  # Start the environment. Note that we don't check the result because docker will
29
48
  # return a non-zero exit code for services that exit (even successfully) when
30
49
  # passing the --wait flag (see https://github.com/docker/compose/issues/10596).
31
50
  # In practice, we will catch any errors when calling compose_check_running()
32
51
  # immediately after we call compose_up().
33
- await compose_command(
34
- ["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT],
35
- project=project,
36
- # wait up to 5 minutes for container to go up (compose wait + 3 minutes)
37
- timeout=300,
38
- )
52
+ await compose_command(up_command, project=project, timeout=timeout)
39
53
 
40
54
 
41
55
  async def compose_down(project: ComposeProject, quiet: bool = True) -> None:
@@ -191,17 +205,6 @@ async def compose_exec(
191
205
  )
192
206
 
193
207
 
194
- ComposeService = TypedDict(
195
- "ComposeService",
196
- {
197
- "image": str | None,
198
- "build": str | None,
199
- "x-default": bool | None,
200
- "x-local": bool | None,
201
- },
202
- )
203
-
204
-
205
208
  async def compose_services(project: ComposeProject) -> dict[str, ComposeService]:
206
209
  result = await compose_command(["config"], project=project, timeout=60)
207
210
  if not result.success:
@@ -42,7 +42,8 @@ def find_compose_file(parent: str = "") -> str | None:
42
42
 
43
43
 
44
44
  def is_dockerfile(file: str) -> bool:
45
- return os.path.basename(file) == DOCKERFILE
45
+ path = Path(file)
46
+ return path.stem == DOCKERFILE or path.suffix == f".{DOCKERFILE}"
46
47
 
47
48
 
48
49
  def has_dockerfile(parent: str = "") -> bool:
@@ -9,6 +9,7 @@ from typing import Literal, Union, cast, overload
9
9
 
10
10
  from typing_extensions import override
11
11
 
12
+ from inspect_ai._util.error import PrerequisiteError
12
13
  from inspect_ai.util._subprocess import ExecResult, subprocess
13
14
 
14
15
  from ..environment import (
@@ -85,6 +86,14 @@ class DockerSandboxEnvironment(SandboxEnvironment):
85
86
 
86
87
  services = await compose_services(project)
87
88
  for name, service in services.items():
89
+ # if the service has an explicit container_name then
90
+ # error (as this won't work w/ epochs > 1)
91
+ container_name = service.get("container_name", None)
92
+ if container_name:
93
+ raise PrerequisiteError(
94
+ f"ERROR: Docker service '{name}' includes an explicitly configured container_name ('{container_name}'). This is not permitted, as container names should be provisioned by Docker compose and an explicit container_name will not work with epochs > 1."
95
+ )
96
+
88
97
  # build internal images
89
98
  image = service.get("image", None)
90
99
  if image and is_internal_image(image):
@@ -139,7 +148,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
139
148
  services = await compose_services(project)
140
149
 
141
150
  # start the services
142
- await compose_up(project)
151
+ await compose_up(project, services)
143
152
 
144
153
  # check to ensure that the services are running
145
154
  running_services = await compose_check_running(
@@ -0,0 +1,100 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import TypedDict
4
+
5
+
6
+ class ComposeServiceHealthcheck(TypedDict, total=False):
7
+ start_period: str
8
+ interval: str
9
+ retries: int
10
+ timeout: str
11
+
12
+
13
+ ComposeService = TypedDict(
14
+ "ComposeService",
15
+ {
16
+ "image": str,
17
+ "build": str,
18
+ "container_name": str,
19
+ "x-default": bool,
20
+ "x-local": bool,
21
+ "healthcheck": ComposeServiceHealthcheck,
22
+ },
23
+ total=False,
24
+ )
25
+
26
+
27
+ def services_healthcheck_time(services: dict[str, ComposeService]) -> int:
28
+ max_time = 0
29
+
30
+ for _, service in services.items():
31
+ service_time = service_healthcheck_time(service)
32
+ max_time = max(max_time, service_time)
33
+
34
+ return max_time
35
+
36
+
37
+ def service_healthcheck_time(service: ComposeService) -> int:
38
+ """
39
+ Calculate the maximum time a single service's healthcheck could take.
40
+
41
+ The total time is:
42
+ (retries * (interval + timeout))
43
+
44
+ Default values (from Docker documentation):
45
+ - retries: 3
46
+ - interval: 30s
47
+ - timeout: 30s
48
+ """
49
+ healthcheck = service.get("healthcheck", None)
50
+ if healthcheck is None:
51
+ return 0
52
+
53
+ # Parse duration strings with defaults
54
+ retries = healthcheck.get("retries", 3)
55
+ interval = parse_duration(healthcheck.get("interval", "30s"))
56
+ timeout = parse_duration(healthcheck.get("timeout", "30s"))
57
+
58
+ # Calculate total time in seconds
59
+ total_time = retries * (interval.seconds + timeout.seconds)
60
+
61
+ return int(total_time)
62
+
63
+
64
+ @dataclass
65
+ class Duration:
66
+ nanoseconds: int
67
+
68
+ @property
69
+ def seconds(self) -> float:
70
+ return self.nanoseconds / 1_000_000_000
71
+
72
+
73
+ def parse_duration(duration_str: str) -> Duration:
74
+ """Parse a Docker compose style duration string."""
75
+ if not duration_str:
76
+ return Duration(0)
77
+
78
+ units = {
79
+ "ns": 1,
80
+ "us": 1_000,
81
+ "ms": 1_000_000,
82
+ "s": 1_000_000_000,
83
+ "m": 60_000_000_000,
84
+ "h": 3_600_000_000_000,
85
+ }
86
+
87
+ duration_str = "".join(duration_str.split())
88
+ pattern = re.compile(r"(\d+)([a-z]+)")
89
+ matches = pattern.findall(duration_str)
90
+
91
+ if not matches:
92
+ raise ValueError(f"Invalid duration format: {duration_str}")
93
+
94
+ total_nanoseconds = 0
95
+ for number, unit in matches:
96
+ if unit not in units:
97
+ raise ValueError(f"Invalid unit: {unit}")
98
+ total_nanoseconds += int(number) * units[unit]
99
+
100
+ return Duration(total_nanoseconds)
@@ -65,91 +65,6 @@ class SandboxEnvironment(abc.ABC):
65
65
  filesystem context to copy samples files into and resolve relative paths to.
66
66
  """
67
67
 
68
- @classmethod
69
- def config_files(cls) -> list[str]:
70
- """Standard config files for this provider (used for automatic discovery)"""
71
- return []
72
-
73
- @classmethod
74
- def default_concurrency(cls) -> int | None:
75
- """Default max_sandboxes for this provider (`None` means no maximum)"""
76
- return None
77
-
78
- @classmethod
79
- async def task_init(
80
- cls, task_name: str, config: SandboxEnvironmentConfigType | None
81
- ) -> None:
82
- """Called at task startup initialize resources.
83
-
84
- Args:
85
- task_name (str): Name of task using the sandbox environment.
86
- config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
87
- """
88
- pass
89
-
90
- @classmethod
91
- async def sample_init(
92
- cls,
93
- task_name: str,
94
- config: SandboxEnvironmentConfigType | None,
95
- metadata: dict[str, str],
96
- ) -> dict[str, "SandboxEnvironment"]:
97
- """Initialize sandbox environments for a sample.
98
-
99
- Args:
100
- task_name (str): Name of task using the sandbox environment.
101
- config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
102
- metadata (dict[str,str]): Sample `metadata` field
103
-
104
- Returns:
105
- Dictionary of named sandbox environments. The environment which represents
106
- the default environment (resolved by `sandbox("default")` or `sandbox()`) must
107
- be the first key/value pair in the dictionary.
108
- """
109
- return {}
110
-
111
- @classmethod
112
- @abc.abstractmethod
113
- async def sample_cleanup(
114
- cls,
115
- task_name: str,
116
- config: SandboxEnvironmentConfigType | None,
117
- environments: dict[str, "SandboxEnvironment"],
118
- interrupted: bool,
119
- ) -> None:
120
- """Cleanup sandbox environments.
121
-
122
- Args:
123
- task_name (str): Name of task using the sandbox environment.
124
- config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
125
- environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample.
126
- interrupted (bool): Was the task interrupted by an error or cancellation
127
- """
128
- ...
129
-
130
- @classmethod
131
- async def task_cleanup(
132
- cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
133
- ) -> None:
134
- """Called at task exit as a last chance to cleanup resources.
135
-
136
- Args:
137
- task_name (str): Name of task using the sandbox environment.
138
- config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
139
- cleanup (bool): Whether to actually cleanup environment resources
140
- (False if `--no-sandbox-cleanup` was specified)
141
- """
142
- pass
143
-
144
- @classmethod
145
- async def cli_cleanup(cls, id: str | None) -> None:
146
- """Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup).
147
-
148
- Args:
149
- id (str | None): Optional ID to limit scope of cleanup.
150
- """
151
- pass
152
-
153
68
  @abc.abstractmethod
154
69
  async def exec(
155
70
  self,
@@ -170,13 +85,13 @@ class SandboxEnvironment(abc.ABC):
170
85
  `OutputLimitExceededError` will be raised.
171
86
 
172
87
  Args:
173
- cmd (str | list[str]): Command or command and arguments to execute.
174
- input (str | bytes | None): Standard input (optional).
175
- cwd (str | None): Current working dir (optional). If relative, will be relative to the per-sample filesystem context.
176
- env (dict[str,str]): Environment variables for execution.
177
- user (str | None): Optional username or UID to run the command as.
178
- timeout (int | None): Optional execution timeout (seconds).
179
- timeout_retry (bool): Retry the command in the case that it times out.
88
+ cmd: Command or command and arguments to execute.
89
+ input: Standard input (optional).
90
+ cwd: Current working dir (optional). If relative, will be relative to the per-sample filesystem context.
91
+ env: Environment variables for execution.
92
+ user: Optional username or UID to run the command as.
93
+ timeout: Optional execution timeout (seconds).
94
+ timeout_retry: Retry the command in the case that it times out.
180
95
  Commands will be retried up to twice, with a timeout of no greater
181
96
  than 60 seconds for the first retry and 30 for the second.
182
97
 
@@ -204,9 +119,9 @@ class SandboxEnvironment(abc.ABC):
204
119
  should be automatically created.
205
120
 
206
121
  Args:
207
- file (str): Path to file (relative file paths will resolve to the
122
+ file: Path to file (relative file paths will resolve to the
208
123
  per-sample working directory).
209
- contents (str | bytes): Text or binary file contents.
124
+ contents: Text or binary file contents.
210
125
 
211
126
  Raises:
212
127
  PermissionError: If the current user does not have permission to
@@ -233,9 +148,9 @@ class SandboxEnvironment(abc.ABC):
233
148
  to specifying `newline=""` in a call to the Python `open()` function.
234
149
 
235
150
  Args:
236
- file (str): Path to file (relative file paths will resolve to the
151
+ file: Path to file (relative file paths will resolve to the
237
152
  per-sample working directory).
238
- text (bool): Read as a utf-8 encoded text file.
153
+ text: Read as a utf-8 encoded text file.
239
154
 
240
155
  Returns:
241
156
  Contents of file (as str or bytes for binary files)
@@ -265,6 +180,91 @@ class SandboxEnvironment(abc.ABC):
265
180
  """
266
181
  raise NotImplementedError("connection not implemented")
267
182
 
183
+ @classmethod
184
+ def config_files(cls) -> list[str]:
185
+ """Standard config files for this provider (used for automatic discovery)"""
186
+ return []
187
+
188
+ @classmethod
189
+ def default_concurrency(cls) -> int | None:
190
+ """Default max_sandboxes for this provider (`None` means no maximum)"""
191
+ return None
192
+
193
+ @classmethod
194
+ async def task_init(
195
+ cls, task_name: str, config: SandboxEnvironmentConfigType | None
196
+ ) -> None:
197
+ """Called at task startup to initialize resources.
198
+
199
+ Args:
200
+ task_name: Name of task using the sandbox environment.
201
+ config: Implementation defined configuration (optional).
202
+ """
203
+ pass
204
+
205
+ @classmethod
206
+ async def sample_init(
207
+ cls,
208
+ task_name: str,
209
+ config: SandboxEnvironmentConfigType | None,
210
+ metadata: dict[str, str],
211
+ ) -> dict[str, "SandboxEnvironment"]:
212
+ """Initialize sandbox environments for a sample.
213
+
214
+ Args:
215
+ task_name: Name of task using the sandbox environment.
216
+ config: Implementation defined configuration (optional).
217
+ metadata: Sample `metadata` field
218
+
219
+ Returns:
220
+ Dictionary of named sandbox environments. The environment which represents
221
+ the default environment (resolved by `sandbox("default")` or `sandbox()`) must
222
+ be the first key/value pair in the dictionary.
223
+ """
224
+ return {}
225
+
226
+ @classmethod
227
+ @abc.abstractmethod
228
+ async def sample_cleanup(
229
+ cls,
230
+ task_name: str,
231
+ config: SandboxEnvironmentConfigType | None,
232
+ environments: dict[str, "SandboxEnvironment"],
233
+ interrupted: bool,
234
+ ) -> None:
235
+ """Cleanup sandbox environments.
236
+
237
+ Args:
238
+ task_name: Name of task using the sandbox environment.
239
+ config: Implementation defined configuration (optional).
240
+ environments: Sandbox environments created for this sample.
241
+ interrupted: Was the task interrupted by an error or cancellation
242
+ """
243
+ ...
244
+
245
+ @classmethod
246
+ async def task_cleanup(
247
+ cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
248
+ ) -> None:
249
+ """Called at task exit as a last chance to cleanup resources.
250
+
251
+ Args:
252
+ task_name: Name of task using the sandbox environment.
253
+ config: Implementation defined configuration (optional).
254
+ cleanup: Whether to actually cleanup environment resources
255
+ (False if `--no-sandbox-cleanup` was specified)
256
+ """
257
+ pass
258
+
259
+ @classmethod
260
+ async def cli_cleanup(cls, id: str | None) -> None:
261
+ """Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup).
262
+
263
+ Args:
264
+ id: Optional ID to limit scope of cleanup.
265
+ """
266
+ pass
267
+
268
268
 
269
269
  @dataclass
270
270
  class SandboxEnvironments:
@@ -284,7 +284,10 @@ class SandboxEnvironmentSpec(NamedTuple):
284
284
  """Specification of a SandboxEnvironment."""
285
285
 
286
286
  type: str
287
+ """Sandbox type (e.g. 'local', 'docker')"""
288
+
287
289
  config: SandboxEnvironmentConfigType | None = None
290
+ """Sandbox configuration (filename or config object)."""
288
291
 
289
292
 
290
293
  SandboxEnvironmentConfigType = BaseModel | str