inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,17 @@ ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
|
|
13
13
|
|
14
14
|
@tool
|
15
15
|
def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool:
|
16
|
+
"""Desktop computer tool.
|
17
|
+
|
18
|
+
See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-computer>.
|
19
|
+
|
20
|
+
Args:
|
21
|
+
max_screenshots: The maximum number of screenshots to play
|
22
|
+
back to the model as input. Defaults to 1 (set to `None` to have no limit).
|
23
|
+
timeout: Timeout in seconds for computer tool actions.
|
24
|
+
Defaults to 180 (set to `None` for no timeout).
|
25
|
+
"""
|
26
|
+
|
16
27
|
async def execute(
|
17
28
|
action: Action,
|
18
29
|
text: str | None = None,
|
@@ -1,4 +1,4 @@
|
|
1
|
-
from inspect_ai.util import sandbox
|
1
|
+
from inspect_ai.util import sandbox as sandbox_env
|
2
2
|
|
3
3
|
from .._tool import Tool, tool
|
4
4
|
from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
|
@@ -20,14 +20,17 @@ def code_viewer(language: str, code_param: str) -> ToolCallViewer:
|
|
20
20
|
|
21
21
|
|
22
22
|
@tool(viewer=code_viewer("bash", "cmd"))
|
23
|
-
def bash(
|
23
|
+
def bash(
|
24
|
+
timeout: int | None = None, user: str | None = None, sandbox: str | None = None
|
25
|
+
) -> Tool:
|
24
26
|
"""Bash shell command execution tool.
|
25
27
|
|
26
28
|
Execute bash shell commands using a sandbox environment (e.g. "docker").
|
27
29
|
|
28
30
|
Args:
|
29
|
-
timeout
|
30
|
-
user
|
31
|
+
timeout: Timeout (in seconds) for command.
|
32
|
+
user: User to execute commands as.
|
33
|
+
sandbox: Optional sandbox environment name.
|
31
34
|
|
32
35
|
Returns:
|
33
36
|
String with command output (stdout) or command error (stderr).
|
@@ -44,7 +47,7 @@ def bash(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
44
47
|
The output of the command.
|
45
48
|
"""
|
46
49
|
# execute the command
|
47
|
-
result = await sandbox
|
50
|
+
result = await sandbox_env(sandbox).exec(
|
48
51
|
cmd=["bash", "--login", "-c", cmd], timeout=timeout, user=user
|
49
52
|
)
|
50
53
|
# return output (including stderr if any)
|
@@ -57,14 +60,17 @@ def bash(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
57
60
|
|
58
61
|
|
59
62
|
@tool(viewer=code_viewer("python", "code"))
|
60
|
-
def python(
|
63
|
+
def python(
|
64
|
+
timeout: int | None = None, user: str | None = None, sandbox: str | None = None
|
65
|
+
) -> Tool:
|
61
66
|
"""Python code execution tool.
|
62
67
|
|
63
68
|
Execute Python code using a sandbox environment (e.g. "docker").
|
64
69
|
|
65
70
|
Args:
|
66
|
-
timeout
|
67
|
-
user
|
71
|
+
timeout: Timeout (in seconds) for command.
|
72
|
+
user: User to execute commands as.
|
73
|
+
sandbox: Optional sandbox environment name.
|
68
74
|
|
69
75
|
Returns:
|
70
76
|
String with command output (stdout) or command error (stderr).
|
@@ -89,7 +95,7 @@ def python(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
89
95
|
Returns:
|
90
96
|
The output of the Python code.
|
91
97
|
"""
|
92
|
-
result = await sandbox
|
98
|
+
result = await sandbox_env(sandbox).exec(
|
93
99
|
cmd=["python3"], input=code, timeout=timeout, user=user
|
94
100
|
)
|
95
101
|
# return output (including stderr if any)
|
@@ -40,7 +40,7 @@ The result will be printed out in _stdout_ in the following format:
|
|
40
40
|
error: <an ERROR message if one occurred>
|
41
41
|
info: <general info about the container>
|
42
42
|
web_url: <the URL of the page the browser is currently at>
|
43
|
-
|
43
|
+
web_at: <accessibility tree of the visible elements of the page>
|
44
44
|
```
|
45
45
|
|
46
46
|
|
@@ -57,7 +57,7 @@ The tool consists of the following components:
|
|
57
57
|
* _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
|
58
58
|
* _playwright_crawler.py_ - a wrapper over the sync Playwright API.
|
59
59
|
|
60
|
-
* [WebClient](web_client.py) - a simple stateless client to
|
60
|
+
* [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
|
61
61
|
1. creates a connection with the server;
|
62
62
|
2. sends user command to the server;
|
63
63
|
3. receives the response in the form of observations and prints them to stdout;
|
@@ -16,10 +16,12 @@ from inspect_ai.util._store_model import StoreModel, store_as
|
|
16
16
|
def web_browser(interactive: bool = True) -> list[Tool]:
|
17
17
|
"""Tools used for web browser navigation.
|
18
18
|
|
19
|
+
See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-browser>.
|
20
|
+
|
19
21
|
Args:
|
20
|
-
interactive
|
21
|
-
|
22
|
-
|
22
|
+
interactive: Provide interactive tools (enable
|
23
|
+
clicking, typing, and submitting forms). Defaults
|
24
|
+
to True.
|
23
25
|
|
24
26
|
Returns:
|
25
27
|
List of tools used for web browser navigation.
|
@@ -41,14 +41,16 @@ def web_search(
|
|
41
41
|
A web search is conducted using the specified provider, the results are parsed for relevance
|
42
42
|
using the specified model, and the top 'num_results' relevant pages are returned.
|
43
43
|
|
44
|
+
See further documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-search>.
|
45
|
+
|
44
46
|
Args:
|
45
|
-
provider
|
47
|
+
provider: Search provider (defaults to "google", currently
|
46
48
|
the only provider). Possible future providers include "brave" and "bing".
|
47
|
-
num_results
|
48
|
-
max_provider_calls
|
49
|
-
max_connections
|
49
|
+
num_results: Number of web search result pages to return to the model.
|
50
|
+
max_provider_calls: Maximum number of search calls to make to the search provider.
|
51
|
+
max_connections: Maximum number of concurrent connections to API
|
50
52
|
endpoint of search provider.
|
51
|
-
model
|
53
|
+
model: Model used to parse web pages for relevance.
|
52
54
|
|
53
55
|
Returns:
|
54
56
|
A tool that can be registered for use by models to search the web.
|
inspect_ai/util/_concurrency.py
CHANGED
@@ -23,12 +23,12 @@ def concurrency(
|
|
23
23
|
for launching subprocesses is handled via the `subprocess` function.
|
24
24
|
|
25
25
|
Args:
|
26
|
-
name
|
26
|
+
name: Name for concurrency context. This serves as the
|
27
27
|
display name for the context, and also the unique context
|
28
28
|
key (if the `key` parameter is omitted)
|
29
|
-
concurrency
|
29
|
+
concurrency: Maximum number of coroutines that can
|
30
30
|
enter the context.
|
31
|
-
key
|
31
|
+
key: Unique context key for this context. Optional.
|
32
32
|
Used if the unique key isn't human readable -- e.g. includes
|
33
33
|
api tokens or account ids so that the more readable `name`
|
34
34
|
can be presented to users e.g. in console UI.
|
inspect_ai/util/_panel.py
CHANGED
inspect_ai/util/_resource.py
CHANGED
@@ -33,18 +33,18 @@ def resource(
|
|
33
33
|
`resource("templates/prompt.txt", type="file")`
|
34
34
|
|
35
35
|
Args:
|
36
|
-
resource
|
37
|
-
|
38
|
-
|
39
|
-
type
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
fs_options
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
36
|
+
resource: Path to local or remote (e.g. s3://)
|
37
|
+
resource, or for `type="auto"` (the default),
|
38
|
+
a string containing the literal resource value.
|
39
|
+
type: For "auto" (the default),
|
40
|
+
interpret the resource as a literal string if it's not
|
41
|
+
a valid path. For "file", always interpret it as
|
42
|
+
a file path.
|
43
|
+
fs_options: Optional. Additional
|
44
|
+
arguments to pass through to the `fsspec` filesystem
|
45
|
+
provider (e.g. `S3FileSystem`). Use `{"anon": True }`
|
46
|
+
if you are accessing a public S3 bucket with no
|
47
|
+
credentials.
|
48
48
|
|
49
49
|
Returns:
|
50
50
|
Text content of resource.
|
@@ -3,12 +3,13 @@ import os
|
|
3
3
|
import shlex
|
4
4
|
from logging import getLogger
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Any, Literal,
|
6
|
+
from typing import Any, Literal, cast
|
7
7
|
|
8
8
|
import yaml
|
9
9
|
from pydantic import BaseModel
|
10
10
|
|
11
11
|
from inspect_ai._util.error import PrerequisiteError
|
12
|
+
from inspect_ai._util.trace import trace_message
|
12
13
|
from inspect_ai.util._display import display_type
|
13
14
|
from inspect_ai.util._subprocess import ExecResult, subprocess
|
14
15
|
|
@@ -16,26 +17,39 @@ from .prereqs import (
|
|
16
17
|
DOCKER_COMPOSE_REQUIRED_VERSION_PULL_POLICY,
|
17
18
|
validate_docker_compose,
|
18
19
|
)
|
20
|
+
from .service import ComposeService, services_healthcheck_time
|
19
21
|
from .util import ComposeProject, is_inspect_project
|
20
22
|
|
21
23
|
logger = getLogger(__name__)
|
22
24
|
|
23
25
|
# How long to wait for compose environment to pass a health check
|
24
|
-
COMPOSE_WAIT =
|
26
|
+
COMPOSE_WAIT = 120
|
25
27
|
|
26
28
|
|
27
|
-
async def compose_up(
|
29
|
+
async def compose_up(
|
30
|
+
project: ComposeProject, services: dict[str, ComposeService]
|
31
|
+
) -> None:
|
32
|
+
# compute the maximum amount of time we will wait for services to come up
|
33
|
+
up_command = ["up", "--detach", "--wait"]
|
34
|
+
|
35
|
+
# are there healthchecks in the service definitions? if so then peg our timeout
|
36
|
+
# at the maximum total wait time. otherwise, pick a reasonable default
|
37
|
+
healthcheck_time = services_healthcheck_time(services)
|
38
|
+
if healthcheck_time > 0:
|
39
|
+
timeout: int = healthcheck_time
|
40
|
+
trace_message(logger, "Docker", "Docker services heathcheck timeout: {timeout}")
|
41
|
+
else:
|
42
|
+
timeout = COMPOSE_WAIT
|
43
|
+
|
44
|
+
# align global wait timeout to maximum healthcheck timeout
|
45
|
+
up_command.extend(["--wait-timeout", str(timeout + 1)])
|
46
|
+
|
28
47
|
# Start the environment. Note that we don't check the result because docker will
|
29
48
|
# return a non-zero exit code for services that exit (even successfully) when
|
30
49
|
# passing the --wait flag (see https://github.com/docker/compose/issues/10596).
|
31
50
|
# In practice, we will catch any errors when calling compose_check_running()
|
32
51
|
# immediately after we call compose_up().
|
33
|
-
await compose_command(
|
34
|
-
["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT],
|
35
|
-
project=project,
|
36
|
-
# wait up to 5 minutes for container to go up (compose wait + 3 minutes)
|
37
|
-
timeout=300,
|
38
|
-
)
|
52
|
+
await compose_command(up_command, project=project, timeout=timeout)
|
39
53
|
|
40
54
|
|
41
55
|
async def compose_down(project: ComposeProject, quiet: bool = True) -> None:
|
@@ -191,17 +205,6 @@ async def compose_exec(
|
|
191
205
|
)
|
192
206
|
|
193
207
|
|
194
|
-
ComposeService = TypedDict(
|
195
|
-
"ComposeService",
|
196
|
-
{
|
197
|
-
"image": str | None,
|
198
|
-
"build": str | None,
|
199
|
-
"x-default": bool | None,
|
200
|
-
"x-local": bool | None,
|
201
|
-
},
|
202
|
-
)
|
203
|
-
|
204
|
-
|
205
208
|
async def compose_services(project: ComposeProject) -> dict[str, ComposeService]:
|
206
209
|
result = await compose_command(["config"], project=project, timeout=60)
|
207
210
|
if not result.success:
|
@@ -42,7 +42,8 @@ def find_compose_file(parent: str = "") -> str | None:
|
|
42
42
|
|
43
43
|
|
44
44
|
def is_dockerfile(file: str) -> bool:
|
45
|
-
|
45
|
+
path = Path(file)
|
46
|
+
return path.stem == DOCKERFILE or path.suffix == f".{DOCKERFILE}"
|
46
47
|
|
47
48
|
|
48
49
|
def has_dockerfile(parent: str = "") -> bool:
|
@@ -9,6 +9,7 @@ from typing import Literal, Union, cast, overload
|
|
9
9
|
|
10
10
|
from typing_extensions import override
|
11
11
|
|
12
|
+
from inspect_ai._util.error import PrerequisiteError
|
12
13
|
from inspect_ai.util._subprocess import ExecResult, subprocess
|
13
14
|
|
14
15
|
from ..environment import (
|
@@ -85,6 +86,14 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
85
86
|
|
86
87
|
services = await compose_services(project)
|
87
88
|
for name, service in services.items():
|
89
|
+
# if the service has an explicit container_name then
|
90
|
+
# error (as this won't work w/ epochs > 1)
|
91
|
+
container_name = service.get("container_name", None)
|
92
|
+
if container_name:
|
93
|
+
raise PrerequisiteError(
|
94
|
+
f"ERROR: Docker service '{name}' includes an explicitly configured container_name ('{container_name}'). This is not permitted, as container names should be provisioned by Docker compose and an explicit container_name will not work with epochs > 1."
|
95
|
+
)
|
96
|
+
|
88
97
|
# build internal images
|
89
98
|
image = service.get("image", None)
|
90
99
|
if image and is_internal_image(image):
|
@@ -139,7 +148,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
139
148
|
services = await compose_services(project)
|
140
149
|
|
141
150
|
# start the services
|
142
|
-
await compose_up(project)
|
151
|
+
await compose_up(project, services)
|
143
152
|
|
144
153
|
# check to ensure that the services are running
|
145
154
|
running_services = await compose_check_running(
|
@@ -0,0 +1,100 @@
|
|
1
|
+
import re
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from typing import TypedDict
|
4
|
+
|
5
|
+
|
6
|
+
class ComposeServiceHealthcheck(TypedDict, total=False):
|
7
|
+
start_period: str
|
8
|
+
interval: str
|
9
|
+
retries: int
|
10
|
+
timeout: str
|
11
|
+
|
12
|
+
|
13
|
+
ComposeService = TypedDict(
|
14
|
+
"ComposeService",
|
15
|
+
{
|
16
|
+
"image": str,
|
17
|
+
"build": str,
|
18
|
+
"container_name": str,
|
19
|
+
"x-default": bool,
|
20
|
+
"x-local": bool,
|
21
|
+
"healthcheck": ComposeServiceHealthcheck,
|
22
|
+
},
|
23
|
+
total=False,
|
24
|
+
)
|
25
|
+
|
26
|
+
|
27
|
+
def services_healthcheck_time(services: dict[str, ComposeService]) -> int:
|
28
|
+
max_time = 0
|
29
|
+
|
30
|
+
for _, service in services.items():
|
31
|
+
service_time = service_healthcheck_time(service)
|
32
|
+
max_time = max(max_time, service_time)
|
33
|
+
|
34
|
+
return max_time
|
35
|
+
|
36
|
+
|
37
|
+
def service_healthcheck_time(service: ComposeService) -> int:
|
38
|
+
"""
|
39
|
+
Calculate the maximum time a single service's healthcheck could take.
|
40
|
+
|
41
|
+
The total time is:
|
42
|
+
(retries * (interval + timeout))
|
43
|
+
|
44
|
+
Default values (from Docker documentation):
|
45
|
+
- retries: 3
|
46
|
+
- interval: 30s
|
47
|
+
- timeout: 30s
|
48
|
+
"""
|
49
|
+
healthcheck = service.get("healthcheck", None)
|
50
|
+
if healthcheck is None:
|
51
|
+
return 0
|
52
|
+
|
53
|
+
# Parse duration strings with defaults
|
54
|
+
retries = healthcheck.get("retries", 3)
|
55
|
+
interval = parse_duration(healthcheck.get("interval", "30s"))
|
56
|
+
timeout = parse_duration(healthcheck.get("timeout", "30s"))
|
57
|
+
|
58
|
+
# Calculate total time in seconds
|
59
|
+
total_time = retries * (interval.seconds + timeout.seconds)
|
60
|
+
|
61
|
+
return int(total_time)
|
62
|
+
|
63
|
+
|
64
|
+
@dataclass
|
65
|
+
class Duration:
|
66
|
+
nanoseconds: int
|
67
|
+
|
68
|
+
@property
|
69
|
+
def seconds(self) -> float:
|
70
|
+
return self.nanoseconds / 1_000_000_000
|
71
|
+
|
72
|
+
|
73
|
+
def parse_duration(duration_str: str) -> Duration:
|
74
|
+
"""Parse a Docker compose style duration string."""
|
75
|
+
if not duration_str:
|
76
|
+
return Duration(0)
|
77
|
+
|
78
|
+
units = {
|
79
|
+
"ns": 1,
|
80
|
+
"us": 1_000,
|
81
|
+
"ms": 1_000_000,
|
82
|
+
"s": 1_000_000_000,
|
83
|
+
"m": 60_000_000_000,
|
84
|
+
"h": 3_600_000_000_000,
|
85
|
+
}
|
86
|
+
|
87
|
+
duration_str = "".join(duration_str.split())
|
88
|
+
pattern = re.compile(r"(\d+)([a-z]+)")
|
89
|
+
matches = pattern.findall(duration_str)
|
90
|
+
|
91
|
+
if not matches:
|
92
|
+
raise ValueError(f"Invalid duration format: {duration_str}")
|
93
|
+
|
94
|
+
total_nanoseconds = 0
|
95
|
+
for number, unit in matches:
|
96
|
+
if unit not in units:
|
97
|
+
raise ValueError(f"Invalid unit: {unit}")
|
98
|
+
total_nanoseconds += int(number) * units[unit]
|
99
|
+
|
100
|
+
return Duration(total_nanoseconds)
|
@@ -65,91 +65,6 @@ class SandboxEnvironment(abc.ABC):
|
|
65
65
|
filesystem context to copy samples files into and resolve relative paths to.
|
66
66
|
"""
|
67
67
|
|
68
|
-
@classmethod
|
69
|
-
def config_files(cls) -> list[str]:
|
70
|
-
"""Standard config files for this provider (used for automatic discovery)"""
|
71
|
-
return []
|
72
|
-
|
73
|
-
@classmethod
|
74
|
-
def default_concurrency(cls) -> int | None:
|
75
|
-
"""Default max_sandboxes for this provider (`None` means no maximum)"""
|
76
|
-
return None
|
77
|
-
|
78
|
-
@classmethod
|
79
|
-
async def task_init(
|
80
|
-
cls, task_name: str, config: SandboxEnvironmentConfigType | None
|
81
|
-
) -> None:
|
82
|
-
"""Called at task startup initialize resources.
|
83
|
-
|
84
|
-
Args:
|
85
|
-
task_name (str): Name of task using the sandbox environment.
|
86
|
-
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
87
|
-
"""
|
88
|
-
pass
|
89
|
-
|
90
|
-
@classmethod
|
91
|
-
async def sample_init(
|
92
|
-
cls,
|
93
|
-
task_name: str,
|
94
|
-
config: SandboxEnvironmentConfigType | None,
|
95
|
-
metadata: dict[str, str],
|
96
|
-
) -> dict[str, "SandboxEnvironment"]:
|
97
|
-
"""Initialize sandbox environments for a sample.
|
98
|
-
|
99
|
-
Args:
|
100
|
-
task_name (str): Name of task using the sandbox environment.
|
101
|
-
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
102
|
-
metadata (dict[str,str]): Sample `metadata` field
|
103
|
-
|
104
|
-
Returns:
|
105
|
-
Dictionary of named sandbox environments. The environment which represents
|
106
|
-
the default environment (resolved by `sandbox("default")` or `sandbox()`) must
|
107
|
-
be the first key/value pair in the dictionary.
|
108
|
-
"""
|
109
|
-
return {}
|
110
|
-
|
111
|
-
@classmethod
|
112
|
-
@abc.abstractmethod
|
113
|
-
async def sample_cleanup(
|
114
|
-
cls,
|
115
|
-
task_name: str,
|
116
|
-
config: SandboxEnvironmentConfigType | None,
|
117
|
-
environments: dict[str, "SandboxEnvironment"],
|
118
|
-
interrupted: bool,
|
119
|
-
) -> None:
|
120
|
-
"""Cleanup sandbox environments.
|
121
|
-
|
122
|
-
Args:
|
123
|
-
task_name (str): Name of task using the sandbox environment.
|
124
|
-
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
125
|
-
environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample.
|
126
|
-
interrupted (bool): Was the task interrupted by an error or cancellation
|
127
|
-
"""
|
128
|
-
...
|
129
|
-
|
130
|
-
@classmethod
|
131
|
-
async def task_cleanup(
|
132
|
-
cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
|
133
|
-
) -> None:
|
134
|
-
"""Called at task exit as a last chance to cleanup resources.
|
135
|
-
|
136
|
-
Args:
|
137
|
-
task_name (str): Name of task using the sandbox environment.
|
138
|
-
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
139
|
-
cleanup (bool): Whether to actually cleanup environment resources
|
140
|
-
(False if `--no-sandbox-cleanup` was specified)
|
141
|
-
"""
|
142
|
-
pass
|
143
|
-
|
144
|
-
@classmethod
|
145
|
-
async def cli_cleanup(cls, id: str | None) -> None:
|
146
|
-
"""Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup).
|
147
|
-
|
148
|
-
Args:
|
149
|
-
id (str | None): Optional ID to limit scope of cleanup.
|
150
|
-
"""
|
151
|
-
pass
|
152
|
-
|
153
68
|
@abc.abstractmethod
|
154
69
|
async def exec(
|
155
70
|
self,
|
@@ -170,13 +85,13 @@ class SandboxEnvironment(abc.ABC):
|
|
170
85
|
`OutputLimitExceededError` will be raised.
|
171
86
|
|
172
87
|
Args:
|
173
|
-
cmd
|
174
|
-
input
|
175
|
-
cwd
|
176
|
-
env
|
177
|
-
user
|
178
|
-
timeout
|
179
|
-
timeout_retry
|
88
|
+
cmd: Command or command and arguments to execute.
|
89
|
+
input: Standard input (optional).
|
90
|
+
cwd: Current working dir (optional). If relative, will be relative to the per-sample filesystem context.
|
91
|
+
env: Environment variables for execution.
|
92
|
+
user: Optional username or UID to run the command as.
|
93
|
+
timeout: Optional execution timeout (seconds).
|
94
|
+
timeout_retry: Retry the command in the case that it times out.
|
180
95
|
Commands will be retried up to twice, with a timeout of no greater
|
181
96
|
than 60 seconds for the first retry and 30 for the second.
|
182
97
|
|
@@ -204,9 +119,9 @@ class SandboxEnvironment(abc.ABC):
|
|
204
119
|
should be automatically created.
|
205
120
|
|
206
121
|
Args:
|
207
|
-
file
|
122
|
+
file: Path to file (relative file paths will resolve to the
|
208
123
|
per-sample working directory).
|
209
|
-
contents
|
124
|
+
contents: Text or binary file contents.
|
210
125
|
|
211
126
|
Raises:
|
212
127
|
PermissionError: If the current user does not have permission to
|
@@ -233,9 +148,9 @@ class SandboxEnvironment(abc.ABC):
|
|
233
148
|
to specifying `newline=""` in a call to the Python `open()` function.
|
234
149
|
|
235
150
|
Args:
|
236
|
-
file
|
151
|
+
file: Path to file (relative file paths will resolve to the
|
237
152
|
per-sample working directory).
|
238
|
-
text
|
153
|
+
text: Read as a utf-8 encoded text file.
|
239
154
|
|
240
155
|
Returns:
|
241
156
|
Contents of file (as str or bytes for binary files)
|
@@ -265,6 +180,91 @@ class SandboxEnvironment(abc.ABC):
|
|
265
180
|
"""
|
266
181
|
raise NotImplementedError("connection not implemented")
|
267
182
|
|
183
|
+
@classmethod
|
184
|
+
def config_files(cls) -> list[str]:
|
185
|
+
"""Standard config files for this provider (used for automatic discovery)"""
|
186
|
+
return []
|
187
|
+
|
188
|
+
@classmethod
|
189
|
+
def default_concurrency(cls) -> int | None:
|
190
|
+
"""Default max_sandboxes for this provider (`None` means no maximum)"""
|
191
|
+
return None
|
192
|
+
|
193
|
+
@classmethod
|
194
|
+
async def task_init(
|
195
|
+
cls, task_name: str, config: SandboxEnvironmentConfigType | None
|
196
|
+
) -> None:
|
197
|
+
"""Called at task startup to initialize resources.
|
198
|
+
|
199
|
+
Args:
|
200
|
+
task_name: Name of task using the sandbox environment.
|
201
|
+
config: Implementation defined configuration (optional).
|
202
|
+
"""
|
203
|
+
pass
|
204
|
+
|
205
|
+
@classmethod
|
206
|
+
async def sample_init(
|
207
|
+
cls,
|
208
|
+
task_name: str,
|
209
|
+
config: SandboxEnvironmentConfigType | None,
|
210
|
+
metadata: dict[str, str],
|
211
|
+
) -> dict[str, "SandboxEnvironment"]:
|
212
|
+
"""Initialize sandbox environments for a sample.
|
213
|
+
|
214
|
+
Args:
|
215
|
+
task_name: Name of task using the sandbox environment.
|
216
|
+
config: Implementation defined configuration (optional).
|
217
|
+
metadata: Sample `metadata` field
|
218
|
+
|
219
|
+
Returns:
|
220
|
+
Dictionary of named sandbox environments. The environment which represents
|
221
|
+
the default environment (resolved by `sandbox("default")` or `sandbox()`) must
|
222
|
+
be the first key/value pair in the dictionary.
|
223
|
+
"""
|
224
|
+
return {}
|
225
|
+
|
226
|
+
@classmethod
|
227
|
+
@abc.abstractmethod
|
228
|
+
async def sample_cleanup(
|
229
|
+
cls,
|
230
|
+
task_name: str,
|
231
|
+
config: SandboxEnvironmentConfigType | None,
|
232
|
+
environments: dict[str, "SandboxEnvironment"],
|
233
|
+
interrupted: bool,
|
234
|
+
) -> None:
|
235
|
+
"""Cleanup sandbox environments.
|
236
|
+
|
237
|
+
Args:
|
238
|
+
task_name: Name of task using the sandbox environment.
|
239
|
+
config: Implementation defined configuration (optional).
|
240
|
+
environments: Sandbox environments created for this sample.
|
241
|
+
interrupted: Was the task interrupted by an error or cancellation
|
242
|
+
"""
|
243
|
+
...
|
244
|
+
|
245
|
+
@classmethod
|
246
|
+
async def task_cleanup(
|
247
|
+
cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
|
248
|
+
) -> None:
|
249
|
+
"""Called at task exit as a last chance to cleanup resources.
|
250
|
+
|
251
|
+
Args:
|
252
|
+
task_name: Name of task using the sandbox environment.
|
253
|
+
config: Implementation defined configuration (optional).
|
254
|
+
cleanup: Whether to actually cleanup environment resources
|
255
|
+
(False if `--no-sandbox-cleanup` was specified)
|
256
|
+
"""
|
257
|
+
pass
|
258
|
+
|
259
|
+
@classmethod
|
260
|
+
async def cli_cleanup(cls, id: str | None) -> None:
|
261
|
+
"""Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup).
|
262
|
+
|
263
|
+
Args:
|
264
|
+
id: Optional ID to limit scope of cleanup.
|
265
|
+
"""
|
266
|
+
pass
|
267
|
+
|
268
268
|
|
269
269
|
@dataclass
|
270
270
|
class SandboxEnvironments:
|
@@ -284,7 +284,10 @@ class SandboxEnvironmentSpec(NamedTuple):
|
|
284
284
|
"""Specification of a SandboxEnvironment."""
|
285
285
|
|
286
286
|
type: str
|
287
|
+
"""Sandbox type (e.g. 'local', 'docker')"""
|
288
|
+
|
287
289
|
config: SandboxEnvironmentConfigType | None = None
|
290
|
+
"""Sandbox configuration (filename or config object)."""
|
288
291
|
|
289
292
|
|
290
293
|
SandboxEnvironmentConfigType = BaseModel | str
|