inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/logger.py +14 -13
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +79 -47
- inspect_ai/_util/transcript.py +37 -4
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29752 -24492
- inspect_ai/_view/www/log-schema.json +262 -215
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +19 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +28 -5
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +28 -20
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +30 -2
- inspect_ai/log/_recorders/eval.py +2 -0
- inspect_ai/model/_call_tools.py +31 -7
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_providers/google.py +24 -6
- inspect_ai/model/_providers/openai.py +17 -3
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
- inspect_ai/util/__init__.py +12 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +17 -13
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +3 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +7 -8
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_subprocess.py +3 -3
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
@@ -38,9 +38,9 @@ class EnvironmentSpec:
|
|
38
38
|
for i, obs_spec in enumerate(env_obs_spec.values()):
|
39
39
|
self.observation_spec[i + 1] = convert(obs_spec)
|
40
40
|
|
41
|
-
assert isinstance(
|
42
|
-
|
43
|
-
)
|
41
|
+
assert isinstance(env.action_spec(), specs.Array), (
|
42
|
+
"Only a single action type is supported."
|
43
|
+
)
|
44
44
|
self.action_spec = {1: convert(env.action_spec())}
|
45
45
|
|
46
46
|
self.observation_manager = spec_manager.SpecManager(self.observation_spec)
|
@@ -234,12 +234,12 @@ class EnvironmentService(dm_env_rpc_pb2_grpc.EnvironmentServicer):
|
|
234
234
|
observations.
|
235
235
|
"""
|
236
236
|
with self._lock:
|
237
|
-
assert (
|
238
|
-
|
239
|
-
)
|
240
|
-
assert (
|
241
|
-
|
242
|
-
)
|
237
|
+
assert cur_world in self._envs, (
|
238
|
+
"Current world does not have an assosiated environment"
|
239
|
+
)
|
240
|
+
assert cur_world in self._joined_worlds, (
|
241
|
+
"Please join world before calling step."
|
242
|
+
)
|
243
243
|
env = self._envs[cur_world]
|
244
244
|
spec = self._specs[cur_world]
|
245
245
|
|
@@ -1,6 +1,8 @@
|
|
1
1
|
import re
|
2
2
|
from textwrap import dedent
|
3
3
|
|
4
|
+
from pydantic import Field
|
5
|
+
|
4
6
|
from inspect_ai._util.error import PrerequisiteError
|
5
7
|
from inspect_ai.tool._tool import Tool, ToolError, tool
|
6
8
|
from inspect_ai.tool._tool_call import ToolCall, ToolCallContent, ToolCallView
|
@@ -8,7 +10,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
|
|
8
10
|
from inspect_ai.tool._tool_with import tool_with
|
9
11
|
from inspect_ai.util._sandbox import SandboxEnvironment, sandbox_with
|
10
12
|
from inspect_ai.util._sandbox.docker.internal import INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB
|
11
|
-
from inspect_ai.util.
|
13
|
+
from inspect_ai.util._store_model import StoreModel, store_as
|
12
14
|
|
13
15
|
|
14
16
|
def web_browser(interactive: bool = True) -> list[Tool]:
|
@@ -97,12 +99,15 @@ def go_without_interactive_docs(tool: Tool) -> Tool:
|
|
97
99
|
# custom viewer for interactive tool calls that shows a truncated
|
98
100
|
# version of the current web accessibility tree if available
|
99
101
|
|
100
|
-
|
102
|
+
|
103
|
+
class WebBrowserStore(StoreModel):
|
104
|
+
web_at: str = Field(default_factory=str)
|
105
|
+
session_id: str = Field(default_factory=str)
|
101
106
|
|
102
107
|
|
103
108
|
def web_at_viewer(call: ToolCall) -> ToolCallView:
|
104
109
|
# get the web accessibility tree; if we have it, create a view from it
|
105
|
-
web_at =
|
110
|
+
web_at = store_as(WebBrowserStore).web_at
|
106
111
|
element_id = call.arguments.get("element_id", 0)
|
107
112
|
if web_at and element_id:
|
108
113
|
lines = web_at.splitlines()
|
@@ -332,15 +337,14 @@ def web_browser_refresh() -> Tool:
|
|
332
337
|
|
333
338
|
WEB_CLIENT_REQUEST = "/app/web_browser/web_client.py"
|
334
339
|
WEB_CLIENT_NEW_SESSION = "/app/web_browser/web_client_new_session.py"
|
335
|
-
BROWSER_SESSION_ID = "BROWSER_SESSION_ID"
|
336
340
|
|
337
341
|
|
338
342
|
async def web_browser_cmd(cmd: str, *args: str) -> str:
|
339
343
|
sandbox_env = await sandbox_with(WEB_CLIENT_NEW_SESSION)
|
340
344
|
session_flag = ""
|
341
345
|
if sandbox_env:
|
342
|
-
|
343
|
-
if not
|
346
|
+
store = store_as(WebBrowserStore)
|
347
|
+
if not store.session_id:
|
344
348
|
result = await sandbox_env.exec(["python3", WEB_CLIENT_NEW_SESSION])
|
345
349
|
|
346
350
|
if not result.success:
|
@@ -348,10 +352,9 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
|
|
348
352
|
f"Error creating new web browser session: {result.stderr}"
|
349
353
|
)
|
350
354
|
|
351
|
-
|
352
|
-
store().set(BROWSER_SESSION_ID, browser_session)
|
355
|
+
store.session_id = result.stdout.strip("\n")
|
353
356
|
|
354
|
-
session_flag = f"--session_name={
|
357
|
+
session_flag = f"--session_name={store.session_id}"
|
355
358
|
|
356
359
|
else:
|
357
360
|
sandbox_env = await web_browser_sandbox()
|
@@ -369,7 +372,9 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
|
|
369
372
|
)
|
370
373
|
else:
|
371
374
|
response = parse_web_browser_output(result.stdout)
|
372
|
-
if "
|
375
|
+
if "error" in response and response.get("error", "").strip() != "":
|
376
|
+
raise ToolError(str(response.get("error")) or "(unknown error)")
|
377
|
+
elif "web_at" in response:
|
373
378
|
web_at = (
|
374
379
|
str(response.get("web_at")) or "(no web accessiblity tree available)"
|
375
380
|
)
|
@@ -379,10 +384,8 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
|
|
379
384
|
line.partition("data:image/png;base64")[0] for line in web_at_lines
|
380
385
|
]
|
381
386
|
web_at = "\n".join(web_at_lines)
|
382
|
-
|
387
|
+
store_as(WebBrowserStore).web_at = web_at
|
383
388
|
return web_at
|
384
|
-
elif "error" in response:
|
385
|
-
raise ToolError(str(response.get("error")) or "(unknown error)")
|
386
389
|
else:
|
387
390
|
raise RuntimeError(
|
388
391
|
f"web_browser output must contain either 'error' or 'web_at' field: {result.stdout}"
|
inspect_ai/util/__init__.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
+
from inspect_ai._util.trace import trace_action, trace_message
|
2
|
+
|
1
3
|
from ._concurrency import concurrency
|
2
4
|
from ._console import input_screen
|
5
|
+
from ._display import DisplayType, display_type
|
3
6
|
from ._panel import InputPanel, input_panel
|
4
7
|
from ._resource import resource
|
5
8
|
from ._sandbox import (
|
@@ -11,23 +14,25 @@ from ._sandbox import (
|
|
11
14
|
SandboxEnvironments,
|
12
15
|
SandboxEnvironmentSpec,
|
13
16
|
SandboxEnvironmentType,
|
14
|
-
SandboxService,
|
15
17
|
sandbox,
|
16
|
-
sandbox_service,
|
17
18
|
sandbox_with,
|
18
19
|
sandboxenv,
|
19
20
|
)
|
20
21
|
from ._store import Store, store
|
22
|
+
from ._store_model import StoreModel, store_as
|
21
23
|
from ._subprocess import (
|
22
24
|
ExecResult,
|
23
25
|
subprocess,
|
24
26
|
)
|
25
27
|
from ._subtask import Subtask, subtask
|
28
|
+
from ._throttle import throttle
|
26
29
|
from ._trace import trace_enabled, trace_panel
|
27
30
|
|
28
31
|
__all__ = [
|
29
32
|
"ExecResult",
|
30
33
|
"concurrency",
|
34
|
+
"DisplayType",
|
35
|
+
"display_type",
|
31
36
|
"InputPanel",
|
32
37
|
"input_panel",
|
33
38
|
"input_screen",
|
@@ -44,12 +49,15 @@ __all__ = [
|
|
44
49
|
"sandboxenv",
|
45
50
|
"sandbox",
|
46
51
|
"sandbox_with",
|
47
|
-
"SandboxService",
|
48
|
-
"sandbox_service",
|
49
52
|
"Store",
|
50
53
|
"store",
|
54
|
+
"StoreModel",
|
55
|
+
"store_as",
|
51
56
|
"Subtask",
|
52
57
|
"subtask",
|
58
|
+
"throttle",
|
53
59
|
"trace_enabled",
|
54
60
|
"trace_panel",
|
61
|
+
"trace_action",
|
62
|
+
"trace_message",
|
55
63
|
]
|
@@ -7,6 +7,7 @@ from inspect_ai._util.constants import DEFAULT_DISPLAY
|
|
7
7
|
logger = getLogger(__name__)
|
8
8
|
|
9
9
|
DisplayType = Literal["full", "rich", "plain", "none"]
|
10
|
+
"""Console display type."""
|
10
11
|
|
11
12
|
|
12
13
|
_display_type: DisplayType | None = None
|
@@ -28,6 +29,11 @@ def init_display_type(display: str | None = None) -> DisplayType:
|
|
28
29
|
|
29
30
|
|
30
31
|
def display_type() -> DisplayType:
|
32
|
+
"""Get the current console display type.
|
33
|
+
|
34
|
+
Returns:
|
35
|
+
DisplayType: Display type.
|
36
|
+
"""
|
31
37
|
global _display_type
|
32
38
|
if _display_type:
|
33
39
|
return _display_type
|
inspect_ai/util/_panel.py
CHANGED
@@ -1,23 +1,46 @@
|
|
1
1
|
from typing import Any, Protocol, TypeVar
|
2
2
|
|
3
3
|
from textual.containers import Container
|
4
|
+
from typing_extensions import Self
|
4
5
|
|
5
6
|
|
6
7
|
class InputPanel(Container):
|
8
|
+
DEFAULT_TITLE = "Panel"
|
9
|
+
|
7
10
|
DEFAULT_CLASSES = "task-input-panel"
|
8
11
|
|
12
|
+
DEFAULT_CSS = """
|
13
|
+
InputPanel {
|
14
|
+
padding: 0 1 1 1;
|
15
|
+
}
|
16
|
+
"""
|
17
|
+
|
9
18
|
class Host(Protocol):
|
10
19
|
def set_title(self, title: str) -> None: ...
|
11
20
|
def activate(self) -> None: ...
|
12
21
|
def deactivate(self) -> None: ...
|
13
22
|
def close(self) -> None: ...
|
14
23
|
|
15
|
-
def __init__(self,
|
24
|
+
def __init__(self, host: Host) -> None:
|
25
|
+
"""Initialise the panel.
|
26
|
+
|
27
|
+
Panels are created as required by the input_panel() function so
|
28
|
+
you should NOT override __init__ with your own initialisation (rather,
|
29
|
+
you should define reactive props and/or methods that perform
|
30
|
+
initialisation).
|
31
|
+
|
32
|
+
You should also override the `DEFAULT_TITLE` variable for your panel to
|
33
|
+
provide a default tab title (you can change the title dynamically as
|
34
|
+
required using the `title` property).
|
35
|
+
|
36
|
+
Args:
|
37
|
+
host (InputPanel.Host): Interface to UI host of input panel.
|
38
|
+
"""
|
16
39
|
super().__init__()
|
17
|
-
self._title =
|
40
|
+
self._title = self.DEFAULT_TITLE
|
18
41
|
self._host = host
|
19
42
|
|
20
|
-
async def __aenter__(self) ->
|
43
|
+
async def __aenter__(self) -> Self:
|
21
44
|
self.activate()
|
22
45
|
return self
|
23
46
|
|
@@ -50,10 +73,10 @@ class InputPanel(Container):
|
|
50
73
|
pass
|
51
74
|
|
52
75
|
|
53
|
-
TP = TypeVar("TP", bound=InputPanel)
|
76
|
+
TP = TypeVar("TP", bound=InputPanel, covariant=True)
|
54
77
|
|
55
78
|
|
56
|
-
async def input_panel(
|
79
|
+
async def input_panel(panel: type[TP]) -> TP:
|
57
80
|
"""Create an input panel in the task display.
|
58
81
|
|
59
82
|
There can only be a single instance of an InputPanel with a given
|
@@ -65,19 +88,18 @@ async def input_panel(title: str, panel: type[TP]) -> TP:
|
|
65
88
|
the scope exits -- see below for open/close semantics)
|
66
89
|
|
67
90
|
```python
|
68
|
-
panel = await input_panel(
|
91
|
+
panel = await input_panel(CustomPanel)
|
69
92
|
panel.activate()
|
70
93
|
```
|
71
94
|
|
72
95
|
Activate and close an input panel using a context manager:
|
73
96
|
|
74
97
|
```python
|
75
|
-
async with await input_panel(
|
98
|
+
async with await input_panel(CustomPanel) as panel:
|
76
99
|
...
|
77
100
|
```
|
78
101
|
|
79
102
|
Args:
|
80
|
-
title (str): Input panel title.
|
81
103
|
panel (type[TP]): Type of panel widget (must derive from `InputPanel`)
|
82
104
|
|
83
105
|
Returns:
|
@@ -88,4 +110,4 @@ async def input_panel(title: str, panel: type[TP]) -> TP:
|
|
88
110
|
"""
|
89
111
|
from inspect_ai._display.core.active import task_screen
|
90
112
|
|
91
|
-
return await task_screen().input_panel(
|
113
|
+
return await task_screen().input_panel(panel)
|
@@ -13,7 +13,6 @@ from .environment import (
|
|
13
13
|
from .limits import OutputLimitExceededError, SandboxEnvironmentLimits
|
14
14
|
from .local import LocalSandboxEnvironment # noqa: F401
|
15
15
|
from .registry import sandboxenv
|
16
|
-
from .service import SandboxService, sandbox_service
|
17
16
|
|
18
17
|
__all__ = [
|
19
18
|
"OutputLimitExceededError",
|
@@ -27,6 +26,4 @@ __all__ = [
|
|
27
26
|
"sandboxenv",
|
28
27
|
"sandbox",
|
29
28
|
"sandbox_with",
|
30
|
-
"SandboxService",
|
31
|
-
"sandbox_service",
|
32
29
|
]
|
@@ -24,6 +24,10 @@ def sandbox(name: str | None = None) -> SandboxEnvironment:
|
|
24
24
|
|
25
25
|
Return:
|
26
26
|
SandboxEnvironment instance.
|
27
|
+
|
28
|
+
Raises:
|
29
|
+
ProcessLookupError: If there are no sandboxes available.
|
30
|
+
ValueError: If an invalid sandbox name is specified.
|
27
31
|
"""
|
28
32
|
# verify we have a context
|
29
33
|
environments = sandbox_environments_context_var.get(None)
|
@@ -102,7 +106,7 @@ async def sandbox_connections() -> dict[str, SandboxConnection]:
|
|
102
106
|
|
103
107
|
|
104
108
|
def raise_no_sandbox() -> NoReturn:
|
105
|
-
raise
|
109
|
+
raise ProcessLookupError(
|
106
110
|
"No sandbox environment has been provided for the current sample or task. "
|
107
111
|
+ "Please specify a sandbox for the sample or a global default sandbox for the task"
|
108
112
|
)
|
@@ -8,8 +8,8 @@ from typing import Any, Literal, TypedDict, cast
|
|
8
8
|
import yaml
|
9
9
|
from pydantic import BaseModel
|
10
10
|
|
11
|
-
from inspect_ai._util.display import display_type
|
12
11
|
from inspect_ai._util.error import PrerequisiteError
|
12
|
+
from inspect_ai.util._display import display_type
|
13
13
|
from inspect_ai.util._subprocess import ExecResult, subprocess
|
14
14
|
|
15
15
|
from .prereqs import (
|
@@ -29,11 +29,11 @@ async def compose_up(project: ComposeProject) -> None:
|
|
29
29
|
result = await compose_command(
|
30
30
|
["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT],
|
31
31
|
project=project,
|
32
|
+
# wait up to 5 minutes for container to go up (compose wait + 3 minutes)
|
33
|
+
timeout=300,
|
32
34
|
)
|
33
35
|
if not result.success:
|
34
|
-
msg =
|
35
|
-
f"Failed to start docker services for {project.config}: " f"{result.stderr}"
|
36
|
-
)
|
36
|
+
msg = f"Failed to start docker services for {project.config}: {result.stderr}"
|
37
37
|
raise RuntimeError(msg)
|
38
38
|
|
39
39
|
|
@@ -80,7 +80,11 @@ async def compose_cp(
|
|
80
80
|
output_limit: int | None = None,
|
81
81
|
) -> None:
|
82
82
|
result = await compose_command(
|
83
|
-
["cp", "--", src, dest],
|
83
|
+
["cp", "--", src, dest],
|
84
|
+
project=project,
|
85
|
+
timeout=120, # 2-minute timeout for file copies
|
86
|
+
cwd=cwd,
|
87
|
+
output_limit=output_limit,
|
84
88
|
)
|
85
89
|
if not result.success:
|
86
90
|
msg = f"Failed to copy file from '{src}' to '{dest}': {result.stderr}"
|
@@ -118,7 +122,7 @@ async def compose_ps(
|
|
118
122
|
command.append("--all")
|
119
123
|
if status:
|
120
124
|
command = command + ["--status", status]
|
121
|
-
result = await compose_command(command, project=project)
|
125
|
+
result = await compose_command(command, project=project, timeout=60)
|
122
126
|
if not result.success:
|
123
127
|
msg = f"Error querying for running services: {result.stderr}"
|
124
128
|
raise RuntimeError(msg)
|
@@ -136,6 +140,7 @@ async def compose_build(project: ComposeProject, capture_output: bool = False) -
|
|
136
140
|
result = await compose_command(
|
137
141
|
["build"],
|
138
142
|
project=project,
|
143
|
+
timeout=None, # no timeout for build
|
139
144
|
capture_output=capture_output,
|
140
145
|
)
|
141
146
|
if not result.success:
|
@@ -151,6 +156,7 @@ async def compose_pull(
|
|
151
156
|
return await compose_command(
|
152
157
|
["pull", "--ignore-buildable", "--policy", "missing", service],
|
153
158
|
project=project,
|
159
|
+
timeout=None, # no timeout for pull
|
154
160
|
capture_output=capture_output,
|
155
161
|
)
|
156
162
|
|
@@ -185,7 +191,7 @@ ComposeService = TypedDict(
|
|
185
191
|
|
186
192
|
|
187
193
|
async def compose_services(project: ComposeProject) -> dict[str, ComposeService]:
|
188
|
-
result = await compose_command(["config"], project=project)
|
194
|
+
result = await compose_command(["config"], project=project, timeout=60)
|
189
195
|
if not result.success:
|
190
196
|
raise RuntimeError(f"Error reading docker config: {result.stderr}")
|
191
197
|
return cast(dict[str, ComposeService], yaml.safe_load(result.stdout)["services"])
|
@@ -209,12 +215,13 @@ async def compose_ls() -> list[Project]:
|
|
209
215
|
|
210
216
|
async def compose_cleanup_images(
|
211
217
|
project: ComposeProject,
|
218
|
+
*,
|
212
219
|
cwd: str | None = None,
|
213
|
-
timeout: int | None
|
220
|
+
timeout: int | None,
|
214
221
|
) -> None:
|
215
222
|
# List the images that would be created for this compose
|
216
223
|
images_result = await compose_command(
|
217
|
-
["config", "--images"], project=project, cwd=cwd
|
224
|
+
["config", "--images"], project=project, timeout=timeout, cwd=cwd
|
218
225
|
)
|
219
226
|
|
220
227
|
# Remove those images explicitly
|
@@ -246,14 +253,11 @@ async def compose_cleanup_images(
|
|
246
253
|
logger.warning(msg)
|
247
254
|
|
248
255
|
|
249
|
-
DEFAULT_COMPOSE_TIMEOUT = 60
|
250
|
-
|
251
|
-
|
252
256
|
async def compose_command(
|
253
257
|
command: list[str],
|
254
258
|
*,
|
255
259
|
project: ComposeProject,
|
256
|
-
timeout: int | None
|
260
|
+
timeout: int | None,
|
257
261
|
input: str | bytes | None = None,
|
258
262
|
cwd: str | Path | None = None,
|
259
263
|
forward_env: bool = True,
|
@@ -78,7 +78,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
78
78
|
await compose_build(project)
|
79
79
|
|
80
80
|
# cleanup images created during build
|
81
|
-
await compose_cleanup_images(project)
|
81
|
+
await compose_cleanup_images(project, timeout=60)
|
82
82
|
|
83
83
|
services = await compose_services(project)
|
84
84
|
for name, service in services.items():
|
@@ -326,6 +326,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
326
326
|
container_tmpfile,
|
327
327
|
],
|
328
328
|
project=self._project,
|
329
|
+
timeout=60,
|
329
330
|
)
|
330
331
|
|
331
332
|
parent = PurePosixPath(file).parent
|
@@ -405,7 +406,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
405
406
|
|
406
407
|
# read and return w/ appropriate encoding
|
407
408
|
if text:
|
408
|
-
with open(dest_file, "r", encoding="utf-8") as f:
|
409
|
+
with open(dest_file, "r", newline="", encoding="utf-8") as f:
|
409
410
|
return f.read()
|
410
411
|
else:
|
411
412
|
with open(dest_file, "rb") as f:
|
@@ -424,13 +425,15 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
424
425
|
None,
|
425
426
|
)
|
426
427
|
|
427
|
-
# return container
|
428
|
+
# return container connection
|
428
429
|
if container:
|
429
430
|
return SandboxConnection(
|
430
|
-
command=f"docker exec -it {container}
|
431
|
-
|
431
|
+
command=f"docker exec -it {container} bash -l",
|
432
|
+
vscode_command=[
|
433
|
+
"remote-containers.attachToRunningContainer",
|
434
|
+
container,
|
435
|
+
],
|
432
436
|
)
|
433
|
-
|
434
437
|
# error (not currently running)
|
435
438
|
else:
|
436
439
|
raise ConnectionError(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from inspect_ai._util.constants import PKG_PATH
|
2
|
-
from inspect_ai._util.display import display_type
|
3
2
|
from inspect_ai._util.error import PrerequisiteError
|
3
|
+
from inspect_ai.util._display import display_type
|
4
4
|
from inspect_ai.util._subprocess import subprocess
|
5
5
|
|
6
6
|
INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB = "aisiuk/inspect-web-browser-tool"
|
@@ -84,10 +84,11 @@ def task_project_name(task: str) -> str:
|
|
84
84
|
if len(task) == 0:
|
85
85
|
task = "task"
|
86
86
|
|
87
|
-
|
87
|
+
# _- breaks docker project name constraints so we strip trailing underscores.
|
88
|
+
return f"inspect-{task[:12].rstrip('_')}-i{uuid().lower()[:6]}"
|
88
89
|
|
89
90
|
|
90
|
-
inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{
|
91
|
+
inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{6,}$"
|
91
92
|
|
92
93
|
|
93
94
|
def is_inspect_project(name: str) -> bool:
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import abc
|
4
4
|
from dataclasses import dataclass, field
|
5
|
-
from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
|
5
|
+
from typing import Any, Awaitable, Callable, Literal, NamedTuple, Union, overload
|
6
6
|
|
7
7
|
from pydantic import BaseModel, Field
|
8
8
|
|
@@ -34,12 +34,9 @@ class SandboxConnection(BaseModel):
|
|
34
34
|
command: str
|
35
35
|
"""Shell command to connect to sandbox."""
|
36
36
|
|
37
|
-
vscode_command: list[
|
37
|
+
vscode_command: list[Any] | None = Field(default=None)
|
38
38
|
"""Optional vscode command (+args) to connect to sandbox."""
|
39
39
|
|
40
|
-
container: str | None = Field(default=None)
|
41
|
-
"""Optional container name (will not apply to all sandboxes)."""
|
42
|
-
|
43
40
|
|
44
41
|
class SandboxEnvironment(abc.ABC):
|
45
42
|
"""Environment for executing arbitrary code from tools.
|
@@ -205,6 +202,10 @@ class SandboxEnvironment(abc.ABC):
|
|
205
202
|
|
206
203
|
File size is limited to 100 MiB.
|
207
204
|
|
205
|
+
When reading text files, implementations should preserve newline constructs
|
206
|
+
(e.g. crlf should be preserved not converted to lf). This is equivalent
|
207
|
+
to specifying `newline=""` in a call to the Python `open()` function.
|
208
|
+
|
208
209
|
Args:
|
209
210
|
file (str): Path to file (relative file paths will resolve to the
|
210
211
|
per-sample working directory).
|
@@ -101,7 +101,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
101
101
|
file = self._resolve_file(file)
|
102
102
|
verify_read_file_size(file)
|
103
103
|
if text:
|
104
|
-
with open(file, "r", encoding="utf-8") as f:
|
104
|
+
with open(file, "r", newline="", encoding="utf-8") as f:
|
105
105
|
return f.read()
|
106
106
|
else:
|
107
107
|
with open(file, "rb") as f:
|
@@ -75,9 +75,9 @@ async def test_read_and_write_file_text(sandbox_env: SandboxEnvironment) -> None
|
|
75
75
|
written_file_string = await sandbox_env.read_file(
|
76
76
|
"test_read_and_write_file_text.file", text=True
|
77
77
|
)
|
78
|
-
assert (
|
79
|
-
"
|
80
|
-
)
|
78
|
+
assert "great #content\nincluding newlines" == written_file_string, (
|
79
|
+
f"unexpected content: [{written_file_string}]"
|
80
|
+
)
|
81
81
|
await _cleanup_file(sandbox_env, "test_read_and_write_file_text.file")
|
82
82
|
|
83
83
|
|
@@ -219,9 +219,9 @@ async def test_exec_output(sandbox_env: SandboxEnvironment) -> None:
|
|
219
219
|
exec_result = await sandbox_env.exec(["sh", "-c", "echo foo; echo bar"])
|
220
220
|
expected = "foo\nbar\n"
|
221
221
|
# in the assertion message, we show the actual bytes to help debug newline issues
|
222
|
-
assert (
|
223
|
-
exec_result.stdout
|
224
|
-
)
|
222
|
+
assert exec_result.stdout == expected, (
|
223
|
+
f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
|
224
|
+
)
|
225
225
|
|
226
226
|
|
227
227
|
async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None:
|
@@ -248,13 +248,13 @@ async def test_exec_as_user(sandbox_env: SandboxEnvironment) -> None:
|
|
248
248
|
|
249
249
|
# Test exec as different users
|
250
250
|
root_result = await sandbox_env.exec(["whoami"], user="root")
|
251
|
-
assert (
|
252
|
-
root_result.stdout.strip()
|
253
|
-
)
|
251
|
+
assert root_result.stdout.strip() == "root", (
|
252
|
+
f"Expected 'root', got '{root_result.stdout.strip()}'"
|
253
|
+
)
|
254
254
|
myuser_result = await sandbox_env.exec(["whoami"], user=username)
|
255
|
-
assert (
|
256
|
-
myuser_result.stdout.strip()
|
257
|
-
)
|
255
|
+
assert myuser_result.stdout.strip() == username, (
|
256
|
+
f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
|
257
|
+
)
|
258
258
|
finally:
|
259
259
|
# Clean up
|
260
260
|
await sandbox_env.exec(["userdel", "-r", username], user="root")
|
@@ -266,9 +266,9 @@ async def test_exec_as_nonexistent_user(sandbox_env: SandboxEnvironment) -> None
|
|
266
266
|
expected_error = (
|
267
267
|
"unable to find user nonexistent: no matching entries in passwd file"
|
268
268
|
)
|
269
|
-
assert (
|
270
|
-
expected_error in result.stdout
|
271
|
-
)
|
269
|
+
assert expected_error in result.stdout, (
|
270
|
+
f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
|
271
|
+
)
|
272
272
|
|
273
273
|
|
274
274
|
async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None:
|
@@ -291,9 +291,9 @@ async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None:
|
|
291
291
|
file_path = cwd_subdirectory + "/" + file_name
|
292
292
|
await sandbox_env.write_file(file_path, "ls me plz")
|
293
293
|
current_dir_contents = (await sandbox_env.exec(["ls"], cwd=cwd_subdirectory)).stdout
|
294
|
-
assert (
|
295
|
-
file_name in current_dir_contents
|
296
|
-
)
|
294
|
+
assert file_name in current_dir_contents, (
|
295
|
+
f"{file_name} not found in {current_dir_contents}"
|
296
|
+
)
|
297
297
|
await _cleanup_file(sandbox_env, file_path)
|
298
298
|
|
299
299
|
|