inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/info.py +2 -2
- inspect_ai/_cli/log.py +2 -2
- inspect_ai/_cli/score.py +2 -2
- inspect_ai/_display/core/display.py +19 -0
- inspect_ai/_display/core/panel.py +37 -7
- inspect_ai/_display/core/progress.py +29 -2
- inspect_ai/_display/core/results.py +79 -40
- inspect_ai/_display/core/textual.py +21 -0
- inspect_ai/_display/rich/display.py +28 -8
- inspect_ai/_display/textual/app.py +107 -1
- inspect_ai/_display/textual/display.py +1 -1
- inspect_ai/_display/textual/widgets/samples.py +132 -91
- inspect_ai/_display/textual/widgets/task_detail.py +236 -0
- inspect_ai/_display/textual/widgets/tasks.py +74 -6
- inspect_ai/_display/textual/widgets/toggle.py +32 -0
- inspect_ai/_eval/context.py +2 -0
- inspect_ai/_eval/eval.py +4 -3
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/run.py +35 -2
- inspect_ai/_eval/task/log.py +13 -11
- inspect_ai/_eval/task/results.py +12 -3
- inspect_ai/_eval/task/run.py +139 -36
- inspect_ai/_eval/task/sandbox.py +2 -1
- inspect_ai/_util/_async.py +30 -1
- inspect_ai/_util/file.py +31 -4
- inspect_ai/_util/html.py +3 -0
- inspect_ai/_util/logger.py +6 -5
- inspect_ai/_util/platform.py +5 -6
- inspect_ai/_util/registry.py +1 -1
- inspect_ai/_view/server.py +9 -9
- inspect_ai/_view/www/App.css +2 -2
- inspect_ai/_view/www/dist/assets/index.css +2 -2
- inspect_ai/_view/www/dist/assets/index.js +352 -294
- inspect_ai/_view/www/log-schema.json +13 -0
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +16 -13
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
- inspect_ai/_view/www/src/types/log.d.ts +2 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
- inspect_ai/_view/www/yarn.lock +9 -4
- inspect_ai/approval/__init__.py +1 -1
- inspect_ai/approval/_human/approver.py +35 -0
- inspect_ai/approval/_human/console.py +62 -0
- inspect_ai/approval/_human/manager.py +108 -0
- inspect_ai/approval/_human/panel.py +233 -0
- inspect_ai/approval/_human/util.py +51 -0
- inspect_ai/dataset/_sources/hf.py +2 -2
- inspect_ai/dataset/_sources/util.py +1 -1
- inspect_ai/log/_file.py +106 -36
- inspect_ai/log/_recorders/eval.py +226 -158
- inspect_ai/log/_recorders/file.py +9 -6
- inspect_ai/log/_recorders/json.py +35 -12
- inspect_ai/log/_recorders/recorder.py +15 -15
- inspect_ai/log/_samples.py +52 -0
- inspect_ai/model/_model.py +14 -0
- inspect_ai/model/_model_output.py +4 -0
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/hf.py +106 -4
- inspect_ai/model/_providers/util/__init__.py +2 -0
- inspect_ai/model/_providers/util/hf_handler.py +200 -0
- inspect_ai/scorer/_common.py +1 -1
- inspect_ai/solver/_plan.py +0 -8
- inspect_ai/solver/_task_state.py +18 -1
- inspect_ai/solver/_use_tools.py +9 -1
- inspect_ai/tool/_tool_def.py +2 -2
- inspect_ai/tool/_tool_info.py +14 -2
- inspect_ai/tool/_tool_params.py +2 -1
- inspect_ai/tool/_tools/_execute.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
- inspect_ai/util/__init__.py +5 -6
- inspect_ai/util/_panel.py +91 -0
- inspect_ai/util/_sandbox/__init__.py +2 -6
- inspect_ai/util/_sandbox/context.py +4 -3
- inspect_ai/util/_sandbox/docker/compose.py +12 -2
- inspect_ai/util/_sandbox/docker/docker.py +19 -9
- inspect_ai/util/_sandbox/docker/util.py +10 -2
- inspect_ai/util/_sandbox/environment.py +47 -41
- inspect_ai/util/_sandbox/local.py +15 -10
- inspect_ai/util/_subprocess.py +43 -3
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/RECORD +90 -82
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/approval/_human.py +0 -123
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,5 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import abc
|
2
4
|
from dataclasses import dataclass, field
|
3
5
|
from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
|
@@ -6,49 +8,37 @@ from pydantic import BaseModel, Field
|
|
6
8
|
|
7
9
|
from .._subprocess import ExecResult
|
8
10
|
|
9
|
-
TaskInit = Callable[[str,
|
10
|
-
TaskCleanup = Callable[
|
11
|
+
TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
|
12
|
+
TaskCleanup = Callable[
|
13
|
+
[str, Union["SandboxEnvironmentConfigType", None], bool], Awaitable[None]
|
14
|
+
]
|
11
15
|
|
12
16
|
SampleInit = Callable[
|
13
|
-
[str,
|
17
|
+
[str, Union["SandboxEnvironmentConfigType", None], dict[str, str]],
|
18
|
+
Awaitable[dict[str, "SandboxEnvironment"]],
|
14
19
|
]
|
15
20
|
SampleCleanup = Callable[
|
16
|
-
[
|
21
|
+
[
|
22
|
+
str,
|
23
|
+
Union["SandboxEnvironmentConfigType", None],
|
24
|
+
dict[str, "SandboxEnvironment"],
|
25
|
+
bool,
|
26
|
+
],
|
27
|
+
Awaitable[None],
|
17
28
|
]
|
18
29
|
|
19
30
|
|
20
|
-
class
|
31
|
+
class SandboxConnection(BaseModel):
|
32
|
+
"""Information required to connect to sandbox."""
|
33
|
+
|
21
34
|
command: str
|
22
35
|
"""Shell command to connect to sandbox."""
|
23
36
|
|
24
|
-
|
25
|
-
"""
|
26
|
-
|
27
|
-
|
28
|
-
class SandboxConnectionLocal(SandboxConnectionBase):
|
29
|
-
type: Literal["local"] = Field(default="local")
|
30
|
-
|
31
|
-
|
32
|
-
class SandboxConnectionContainer(SandboxConnectionBase):
|
33
|
-
type: Literal["container"] = Field(default="container")
|
34
|
-
"""Sandbox login type."""
|
35
|
-
|
36
|
-
container: str
|
37
|
-
"""Container name."""
|
38
|
-
|
39
|
-
|
40
|
-
class SandboxConnectionSSH(SandboxConnectionBase):
|
41
|
-
type: Literal["ssh"] = Field(default="ssh")
|
42
|
-
"""Sandbox login type."""
|
43
|
-
|
44
|
-
destination: str
|
45
|
-
"""SSH destination server."""
|
37
|
+
vscode_command: list[str] | None = Field(default=None)
|
38
|
+
"""Optional vscode command (+args) to connect to sandbox."""
|
46
39
|
|
47
|
-
|
48
|
-
|
49
|
-
SandboxConnectionContainer, SandboxConnectionLocal, SandboxConnectionSSH
|
50
|
-
]
|
51
|
-
"""Information required to connect to sandbox."""
|
40
|
+
container: str | None = Field(default=None)
|
41
|
+
"""Optional container name (will not apply to all sandboxes)."""
|
52
42
|
|
53
43
|
|
54
44
|
class SandboxEnvironment(abc.ABC):
|
@@ -64,24 +54,29 @@ class SandboxEnvironment(abc.ABC):
|
|
64
54
|
return []
|
65
55
|
|
66
56
|
@classmethod
|
67
|
-
async def task_init(
|
57
|
+
async def task_init(
|
58
|
+
cls, task_name: str, config: SandboxEnvironmentConfigType | None
|
59
|
+
) -> None:
|
68
60
|
"""Called at task startup initialize resources.
|
69
61
|
|
70
62
|
Args:
|
71
63
|
task_name (str): Name of task using the sandbox environment.
|
72
|
-
config (
|
64
|
+
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
73
65
|
"""
|
74
66
|
pass
|
75
67
|
|
76
68
|
@classmethod
|
77
69
|
async def sample_init(
|
78
|
-
cls,
|
70
|
+
cls,
|
71
|
+
task_name: str,
|
72
|
+
config: SandboxEnvironmentConfigType | None,
|
73
|
+
metadata: dict[str, str],
|
79
74
|
) -> dict[str, "SandboxEnvironment"]:
|
80
75
|
"""Initialize sandbox environments for a sample.
|
81
76
|
|
82
77
|
Args:
|
83
78
|
task_name (str): Name of task using the sandbox environment.
|
84
|
-
config (
|
79
|
+
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
85
80
|
metadata (dict[str,str]): Sample `metadata` field
|
86
81
|
|
87
82
|
Returns:
|
@@ -96,7 +91,7 @@ class SandboxEnvironment(abc.ABC):
|
|
96
91
|
async def sample_cleanup(
|
97
92
|
cls,
|
98
93
|
task_name: str,
|
99
|
-
config:
|
94
|
+
config: SandboxEnvironmentConfigType | None,
|
100
95
|
environments: dict[str, "SandboxEnvironment"],
|
101
96
|
interrupted: bool,
|
102
97
|
) -> None:
|
@@ -104,7 +99,7 @@ class SandboxEnvironment(abc.ABC):
|
|
104
99
|
|
105
100
|
Args:
|
106
101
|
task_name (str): Name of task using the sandbox environment.
|
107
|
-
config (
|
102
|
+
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
108
103
|
environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample.
|
109
104
|
interrupted (bool): Was the task interrupted by an error or cancellation
|
110
105
|
"""
|
@@ -112,13 +107,13 @@ class SandboxEnvironment(abc.ABC):
|
|
112
107
|
|
113
108
|
@classmethod
|
114
109
|
async def task_cleanup(
|
115
|
-
cls, task_name: str, config:
|
110
|
+
cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
|
116
111
|
) -> None:
|
117
112
|
"""Called at task exit as a last chance to cleanup resources.
|
118
113
|
|
119
114
|
Args:
|
120
115
|
task_name (str): Name of task using the sandbox environment.
|
121
|
-
config (
|
116
|
+
config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
|
122
117
|
cleanup (bool): Whether to actually cleanup environment resources
|
123
118
|
(False if `--no-sandbox-cleanup` was specified)
|
124
119
|
"""
|
@@ -227,6 +222,15 @@ class SandboxEnvironment(abc.ABC):
|
|
227
222
|
...
|
228
223
|
|
229
224
|
async def connection(self) -> SandboxConnection:
|
225
|
+
"""Information required to connect to sandbox environment.
|
226
|
+
|
227
|
+
Returns:
|
228
|
+
SandboxConnection: connection information
|
229
|
+
|
230
|
+
Raises:
|
231
|
+
NotImplementedError: For sandboxes that don't provide connections
|
232
|
+
ConnectionError: If sandbox is not currently running.
|
233
|
+
"""
|
230
234
|
raise NotImplementedError("connection not implemented")
|
231
235
|
|
232
236
|
|
@@ -248,8 +252,10 @@ class SandboxEnvironmentSpec(NamedTuple):
|
|
248
252
|
"""Specification of a SandboxEnvironment."""
|
249
253
|
|
250
254
|
type: str
|
251
|
-
config:
|
255
|
+
config: SandboxEnvironmentConfigType | None = None
|
256
|
+
|
252
257
|
|
258
|
+
SandboxEnvironmentConfigType = BaseModel | str
|
253
259
|
|
254
260
|
SandboxEnvironmentType = SandboxEnvironmentSpec | str | tuple[str, str]
|
255
261
|
"""SandboxEnvironmentSpec and str and tuple shorthands for it.
|
@@ -7,8 +7,15 @@ import aiofiles
|
|
7
7
|
from typing_extensions import override
|
8
8
|
|
9
9
|
from .._subprocess import ExecResult, subprocess
|
10
|
-
from .environment import
|
11
|
-
|
10
|
+
from .environment import (
|
11
|
+
SandboxEnvironment,
|
12
|
+
SandboxEnvironmentConfigType,
|
13
|
+
)
|
14
|
+
from .limits import (
|
15
|
+
SandboxEnvironmentLimits,
|
16
|
+
verify_exec_result_size,
|
17
|
+
verify_read_file_size,
|
18
|
+
)
|
12
19
|
from .registry import sandboxenv
|
13
20
|
|
14
21
|
|
@@ -17,7 +24,10 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
17
24
|
@override
|
18
25
|
@classmethod
|
19
26
|
async def sample_init(
|
20
|
-
cls,
|
27
|
+
cls,
|
28
|
+
task_name: str,
|
29
|
+
config: SandboxEnvironmentConfigType | None,
|
30
|
+
metadata: dict[str, str],
|
21
31
|
) -> dict[str, SandboxEnvironment]:
|
22
32
|
return {"default": LocalSandboxEnvironment()}
|
23
33
|
|
@@ -26,7 +36,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
26
36
|
async def sample_cleanup(
|
27
37
|
cls,
|
28
38
|
task_name: str,
|
29
|
-
config:
|
39
|
+
config: SandboxEnvironmentConfigType | None,
|
30
40
|
environments: dict[str, SandboxEnvironment],
|
31
41
|
interrupted: bool,
|
32
42
|
) -> None:
|
@@ -63,6 +73,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
63
73
|
cwd=final_cwd,
|
64
74
|
env=env,
|
65
75
|
timeout=timeout,
|
76
|
+
output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
|
66
77
|
)
|
67
78
|
verify_exec_result_size(result)
|
68
79
|
return result
|
@@ -97,12 +108,6 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
97
108
|
async with aiofiles.open(file, "rb") as f:
|
98
109
|
return await f.read()
|
99
110
|
|
100
|
-
@override
|
101
|
-
async def connection(self) -> SandboxConnection:
|
102
|
-
return SandboxConnectionLocal(
|
103
|
-
command="/bin/bash --login", working_dir=self.directory.name
|
104
|
-
)
|
105
|
-
|
106
111
|
def _resolve_file(self, file: str) -> str:
|
107
112
|
path = Path(file)
|
108
113
|
if path.is_absolute():
|
inspect_ai/util/_subprocess.py
CHANGED
@@ -39,6 +39,7 @@ async def subprocess(
|
|
39
39
|
cwd: str | Path | None = None,
|
40
40
|
env: dict[str, str] = {},
|
41
41
|
capture_output: bool = True,
|
42
|
+
output_limit: int | None = None,
|
42
43
|
timeout: int | None = None,
|
43
44
|
) -> ExecResult[str]: ...
|
44
45
|
|
@@ -51,6 +52,7 @@ async def subprocess(
|
|
51
52
|
cwd: str | Path | None = None,
|
52
53
|
env: dict[str, str] = {},
|
53
54
|
capture_output: bool = True,
|
55
|
+
output_limit: int | None = None,
|
54
56
|
timeout: int | None = None,
|
55
57
|
) -> ExecResult[bytes]: ...
|
56
58
|
|
@@ -62,6 +64,7 @@ async def subprocess(
|
|
62
64
|
cwd: str | Path | None = None,
|
63
65
|
env: dict[str, str] = {},
|
64
66
|
capture_output: bool = True,
|
67
|
+
output_limit: int | None = None,
|
65
68
|
timeout: int | None = None,
|
66
69
|
) -> Union[ExecResult[str], ExecResult[bytes]]:
|
67
70
|
"""Execute and wait for a subprocess.
|
@@ -80,6 +83,8 @@ async def subprocess(
|
|
80
83
|
env (dict[str, str]): Additional environment variables.
|
81
84
|
capture_output (bool): Capture stderr and stdout into ExecResult
|
82
85
|
(if False, then output is redirected to parent stderr/stdout)
|
86
|
+
output_limit (int | None): Stop reading output if it exceeds
|
87
|
+
the specified limit (in bytes).
|
83
88
|
timeout (int | None): Timeout. If the timeout expires then
|
84
89
|
a `TimeoutError` will be raised.
|
85
90
|
|
@@ -119,10 +124,45 @@ async def subprocess(
|
|
119
124
|
# yield the proc
|
120
125
|
yield proc
|
121
126
|
|
127
|
+
# write stdin if specified
|
128
|
+
if proc.stdin is not None:
|
129
|
+
if input is not None:
|
130
|
+
proc.stdin.write(input)
|
131
|
+
await proc.stdin.drain()
|
132
|
+
proc.stdin.close()
|
133
|
+
await proc.stdin.wait_closed()
|
134
|
+
|
135
|
+
# read streams incrementally so we can check output limits
|
136
|
+
async def read_stream(stream: asyncio.StreamReader | None) -> bytes:
|
137
|
+
# return early for no stream
|
138
|
+
if stream is None:
|
139
|
+
return bytes()
|
140
|
+
|
141
|
+
# read 8k at a time
|
142
|
+
output = bytearray()
|
143
|
+
while True:
|
144
|
+
# read chunk and terminate if we are done
|
145
|
+
chunk = await stream.read(8192)
|
146
|
+
if not chunk:
|
147
|
+
break
|
148
|
+
|
149
|
+
# append to output
|
150
|
+
output.extend(chunk)
|
151
|
+
|
152
|
+
# stop if we have a limit and we have exceeded it
|
153
|
+
if output_limit is not None and len(output) > output_limit:
|
154
|
+
proc.kill()
|
155
|
+
break
|
156
|
+
|
157
|
+
# return stream output
|
158
|
+
return bytes(output)
|
159
|
+
|
122
160
|
# wait for it to execute and yield result
|
123
|
-
stdout, stderr = await
|
124
|
-
|
125
|
-
|
161
|
+
stdout, stderr = await asyncio.gather(
|
162
|
+
read_stream(proc.stdout), read_stream(proc.stderr)
|
163
|
+
)
|
164
|
+
returncode = await proc.wait()
|
165
|
+
success = returncode == 0
|
126
166
|
if text:
|
127
167
|
yield ExecResult[str](
|
128
168
|
success=success,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.51
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -68,7 +68,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
68
68
|
Requires-Dist: pytest-cov; extra == "dev"
|
69
69
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
70
70
|
Requires-Dist: pytest-xdist; extra == "dev"
|
71
|
-
Requires-Dist: ruff==0.8.
|
71
|
+
Requires-Dist: ruff==0.8.2; extra == "dev"
|
72
72
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
73
73
|
Requires-Dist: types-PyYAML; extra == "dev"
|
74
74
|
Requires-Dist: types-aiofiles; extra == "dev"
|