inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/plain/display.py +9 -11
- inspect_ai/_display/textual/app.py +5 -5
- inspect_ai/_display/textual/widgets/samples.py +47 -18
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +15 -0
- inspect_ai/_util/logger.py +23 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +301 -1
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +905 -751
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -5
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +35 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +358 -302
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +81 -52
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +6 -6
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +3 -3
- inspect_ai/solver/_solver.py +3 -0
- inspect_ai/solver/_task_state.py +10 -1
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
- inspect_ai/util/_sandbox/docker/compose.py +5 -9
- inspect_ai/util/_sandbox/docker/docker.py +20 -6
- inspect_ai/util/_sandbox/docker/util.py +10 -1
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +2 -1
- inspect_ai/util/_subprocess.py +4 -1
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/solver/_plan.py
CHANGED
@@ -2,6 +2,7 @@ import inspect
|
|
2
2
|
from logging import getLogger
|
3
3
|
from typing import Any, Awaitable, Callable, TypeVar, cast
|
4
4
|
|
5
|
+
from inspect_ai._util.interrupt import check_sample_interrupt
|
5
6
|
from inspect_ai._util.registry import (
|
6
7
|
RegistryInfo,
|
7
8
|
is_registry_object,
|
@@ -115,15 +116,14 @@ class Plan(Solver):
|
|
115
116
|
with solver_transcript(self.finish, state) as st:
|
116
117
|
state = await self.finish(state, generate)
|
117
118
|
st.complete(state)
|
118
|
-
|
119
|
-
# mark completed
|
120
|
-
state.completed = True
|
119
|
+
check_sample_interrupt()
|
121
120
|
|
122
121
|
finally:
|
123
122
|
# always do cleanup if we have one
|
124
123
|
if self.cleanup:
|
125
124
|
try:
|
126
125
|
await self.cleanup(state)
|
126
|
+
check_sample_interrupt()
|
127
127
|
except Exception as ex:
|
128
128
|
logger.warning(f"Exception occurred during plan cleanup: {ex}")
|
129
129
|
|
inspect_ai/solver/_solver.py
CHANGED
@@ -15,6 +15,7 @@ from typing import (
|
|
15
15
|
from typing_extensions import Unpack
|
16
16
|
|
17
17
|
from inspect_ai._util._async import is_callable_coroutine
|
18
|
+
from inspect_ai._util.interrupt import check_sample_interrupt
|
18
19
|
from inspect_ai._util.registry import (
|
19
20
|
RegistryInfo,
|
20
21
|
registry_add,
|
@@ -200,6 +201,7 @@ def solver(
|
|
200
201
|
state: TaskState, generate: Generate
|
201
202
|
) -> TaskState:
|
202
203
|
state = await original_call(state, generate)
|
204
|
+
check_sample_interrupt()
|
203
205
|
set_sample_state(state)
|
204
206
|
return state
|
205
207
|
|
@@ -215,6 +217,7 @@ def solver(
|
|
215
217
|
state: TaskState, generate: Generate
|
216
218
|
) -> TaskState:
|
217
219
|
state = await solver(state, generate)
|
220
|
+
check_sample_interrupt()
|
218
221
|
set_sample_state(state)
|
219
222
|
return state
|
220
223
|
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -7,7 +7,9 @@ from random import Random
|
|
7
7
|
from typing import Any, Iterable, SupportsIndex, Type, Union, cast, overload
|
8
8
|
|
9
9
|
from pydantic_core import to_jsonable_python
|
10
|
+
from shortuuid import uuid
|
10
11
|
|
12
|
+
from inspect_ai._util.interrupt import check_sample_interrupt
|
11
13
|
from inspect_ai.dataset._dataset import MT, Sample, metadata_as
|
12
14
|
from inspect_ai.model import (
|
13
15
|
ChatMessage,
|
@@ -164,6 +166,7 @@ class TaskState:
|
|
164
166
|
self._token_limit = token_limit
|
165
167
|
self._completed = completed
|
166
168
|
self._store = Store()
|
169
|
+
self._uuid = uuid()
|
167
170
|
|
168
171
|
if choices:
|
169
172
|
self.choices = Choices(choices)
|
@@ -333,7 +336,7 @@ class TaskState:
|
|
333
336
|
def completed(self) -> bool:
|
334
337
|
"""Is the task completed.
|
335
338
|
|
336
|
-
Additionally, checks message and token limits and raises if they are exceeded.
|
339
|
+
Additionally, checks message and token limits and raises if they are exceeded, and also checks for an operator interrupt of the sample.
|
337
340
|
"""
|
338
341
|
from inspect_ai.log._samples import set_active_sample_total_messages
|
339
342
|
|
@@ -356,6 +359,7 @@ class TaskState:
|
|
356
359
|
"token", value=self.token_usage, limit=self.token_limit, state=self
|
357
360
|
)
|
358
361
|
else:
|
362
|
+
check_sample_interrupt()
|
359
363
|
return self._completed
|
360
364
|
|
361
365
|
@completed.setter
|
@@ -371,6 +375,11 @@ class TaskState:
|
|
371
375
|
scores: dict[str, Score] | None = None
|
372
376
|
"""Scores yielded by running task."""
|
373
377
|
|
378
|
+
@property
|
379
|
+
def uuid(self) -> str:
|
380
|
+
"""Globally unique identifier for sample run."""
|
381
|
+
return self._uuid
|
382
|
+
|
374
383
|
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
|
375
384
|
"""Pydantic model interface to metadata.
|
376
385
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import os
|
3
|
-
from typing import Literal, Protocol,
|
3
|
+
from typing import Literal, Protocol, runtime_checkable
|
4
4
|
|
5
5
|
import httpx
|
6
6
|
from bs4 import BeautifulSoup, NavigableString
|
@@ -90,8 +90,8 @@ def web_search(
|
|
90
90
|
return_exceptions=True,
|
91
91
|
)
|
92
92
|
for page, link in zip(pages, links):
|
93
|
-
if page and not isinstance(page,
|
94
|
-
page_contents.append(
|
93
|
+
if page and not isinstance(page, BaseException):
|
94
|
+
page_contents.append(page)
|
95
95
|
urls.append(link.url)
|
96
96
|
snippets.append(link.snippet)
|
97
97
|
search_calls += 1
|
inspect_ai/util/_concurrency.py
CHANGED
@@ -1,13 +1,19 @@
|
|
1
1
|
import asyncio
|
2
|
+
import contextlib
|
3
|
+
import time
|
2
4
|
from dataclasses import dataclass
|
5
|
+
from typing import AsyncIterator
|
3
6
|
|
7
|
+
from inspect_ai._util.working import report_sample_waiting_time
|
4
8
|
|
5
|
-
|
9
|
+
|
10
|
+
@contextlib.asynccontextmanager
|
11
|
+
async def concurrency(
|
6
12
|
name: str,
|
7
13
|
concurrency: int,
|
8
14
|
key: str | None = None,
|
9
|
-
) ->
|
10
|
-
"""
|
15
|
+
) -> AsyncIterator[None]:
|
16
|
+
"""Concurrency context manager.
|
11
17
|
|
12
18
|
A concurrency context can be used to limit the number of coroutines
|
13
19
|
executing a block of code (e.g. calling an API). For example, here
|
@@ -32,9 +38,6 @@ def concurrency(
|
|
32
38
|
Used if the unique key isn't human readable -- e.g. includes
|
33
39
|
api tokens or account ids so that the more readable `name`
|
34
40
|
can be presented to users, e.g. in the console UI.
|
35
|
-
|
36
|
-
Returns:
|
37
|
-
Asyncio Semaphore for concurrency context.
|
38
41
|
"""
|
39
42
|
# sort out key
|
40
43
|
key = key if key else name
|
@@ -47,8 +50,11 @@ def concurrency(
|
|
47
50
|
)
|
48
51
|
_concurrency_semaphores[key] = semaphore
|
49
52
|
|
50
|
-
#
|
51
|
-
|
53
|
+
# wait and yield to protected code
|
54
|
+
start_wait = time.monotonic()
|
55
|
+
async with semaphore.semaphore:
|
56
|
+
report_sample_waiting_time(time.monotonic() - start_wait)
|
57
|
+
yield
|
52
58
|
|
53
59
|
|
54
60
|
def concurrency_status() -> dict[str, tuple[int, int]]:
|
@@ -5,6 +5,7 @@ from typing import Any, NoReturn, cast
|
|
5
5
|
from shortuuid import uuid
|
6
6
|
|
7
7
|
from inspect_ai._util.constants import SANDBOX_SETUP_TIMEOUT
|
8
|
+
from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
|
8
9
|
|
9
10
|
from .environment import (
|
10
11
|
SampleCleanup,
|
@@ -132,6 +133,9 @@ async def init_sandbox_environments_sample(
|
|
132
133
|
# verify that there is at least one environment and a 'default' env
|
133
134
|
validate_sandbox_environments(sandboxenv_type, environments)
|
134
135
|
|
136
|
+
# proxy environments (for recording SandboxEvent)
|
137
|
+
environments = {k: SandboxEnvironmentProxy(v) for k, v in environments.items()}
|
138
|
+
|
135
139
|
try:
|
136
140
|
# copy files into environments
|
137
141
|
await copy_sandbox_environment_files(files, environments)
|
@@ -148,6 +152,7 @@ async def init_sandbox_environments_sample(
|
|
148
152
|
return environments
|
149
153
|
|
150
154
|
except Exception as ex:
|
155
|
+
environments = unproxy_environments(environments)
|
151
156
|
await sample_cleanup(task_name, config, environments, True)
|
152
157
|
raise ex
|
153
158
|
|
@@ -161,9 +166,19 @@ async def cleanup_sandbox_environments_sample(
|
|
161
166
|
) -> None:
|
162
167
|
sandboxenv_type = registry_find_sandboxenv(type)
|
163
168
|
sample_cleanup = cast(SampleCleanup, getattr(sandboxenv_type, "sample_cleanup"))
|
169
|
+
environments = unproxy_environments(environments)
|
164
170
|
await sample_cleanup(task_name, config, environments, interrupted)
|
165
171
|
|
166
172
|
|
173
|
+
def unproxy_environments(
|
174
|
+
environments: dict[str, SandboxEnvironment],
|
175
|
+
) -> dict[str, SandboxEnvironment]:
|
176
|
+
return {
|
177
|
+
k: v._sandbox
|
178
|
+
for k, v in cast(dict[str, SandboxEnvironmentProxy], environments).items()
|
179
|
+
}
|
180
|
+
|
181
|
+
|
167
182
|
async def copy_sandbox_environment_files(
|
168
183
|
files: dict[str, bytes], environments: dict[str, SandboxEnvironment]
|
169
184
|
) -> None:
|
@@ -56,17 +56,22 @@ async def project_cleanup_shutdown(cleanup: bool) -> None:
|
|
56
56
|
title_style="bold",
|
57
57
|
title_justify="left",
|
58
58
|
)
|
59
|
+
table.add_column("Sample ID")
|
60
|
+
table.add_column("Epoch")
|
59
61
|
table.add_column("Container(s)", no_wrap=True)
|
60
|
-
table.add_column("Cleanup")
|
61
62
|
for project in shutdown_projects:
|
62
63
|
containers = await compose_ps(project, all=True)
|
63
64
|
table.add_row(
|
65
|
+
str(project.sample_id) if project.sample_id is not None else "",
|
66
|
+
str(project.epoch if project.epoch is not None else ""),
|
64
67
|
"\n".join(container["Name"] for container in containers),
|
65
|
-
f"[blue]inspect sandbox cleanup docker {project.name}[/blue]",
|
66
68
|
)
|
67
69
|
print(table)
|
68
70
|
print(
|
69
|
-
"\
|
71
|
+
"\n"
|
72
|
+
"Cleanup all containers : [blue]inspect sandbox cleanup docker[/blue]\n"
|
73
|
+
"Cleanup single container: [blue]inspect sandbox cleanup docker <container-id>[/blue]",
|
74
|
+
"\n",
|
70
75
|
)
|
71
76
|
|
72
77
|
# remove auto-compose files
|
@@ -28,7 +28,7 @@ COMPOSE_WAIT = 120
|
|
28
28
|
|
29
29
|
async def compose_up(
|
30
30
|
project: ComposeProject, services: dict[str, ComposeService]
|
31
|
-
) ->
|
31
|
+
) -> ExecResult[str]:
|
32
32
|
# compute the maximum amount of time we will
|
33
33
|
up_command = ["up", "--detach", "--wait"]
|
34
34
|
|
@@ -49,7 +49,8 @@ async def compose_up(
|
|
49
49
|
# passing the --wait flag (see https://github.com/docker/compose/issues/10596).
|
50
50
|
# In practice, we will catch any errors when calling compose_check_running()
|
51
51
|
# immediately after we call compose_up().
|
52
|
-
await compose_command(up_command, project=project, timeout=timeout)
|
52
|
+
result = await compose_command(up_command, project=project, timeout=timeout)
|
53
|
+
return result
|
53
54
|
|
54
55
|
|
55
56
|
async def compose_down(project: ComposeProject, quiet: bool = True) -> None:
|
@@ -121,14 +122,9 @@ async def compose_check_running(
|
|
121
122
|
unhealthy_services = services
|
122
123
|
for successful_service in successful_services:
|
123
124
|
unhealthy_services.remove(successful_service["Service"])
|
124
|
-
|
125
|
-
msg = (
|
126
|
-
"One or more docker containers failed to start from "
|
127
|
-
f"{project.config}: {','.join(unhealthy_services)}"
|
128
|
-
)
|
129
|
-
raise RuntimeError(msg)
|
125
|
+
return []
|
130
126
|
else:
|
131
|
-
|
127
|
+
return []
|
132
128
|
|
133
129
|
return [service["Service"] for service in running_services]
|
134
130
|
|
@@ -5,7 +5,7 @@ import os
|
|
5
5
|
import tempfile
|
6
6
|
from logging import getLogger
|
7
7
|
from pathlib import Path, PurePosixPath
|
8
|
-
from typing import Literal, Union,
|
8
|
+
from typing import Literal, Union, overload
|
9
9
|
|
10
10
|
from typing_extensions import override
|
11
11
|
|
@@ -139,8 +139,15 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
139
139
|
env[key] = str(value)
|
140
140
|
|
141
141
|
# create project
|
142
|
+
from inspect_ai.log._samples import sample_active
|
143
|
+
|
144
|
+
sample = sample_active()
|
142
145
|
project = await ComposeProject.create(
|
143
|
-
name=task_project_name(task_name),
|
146
|
+
name=task_project_name(task_name),
|
147
|
+
config=config,
|
148
|
+
sample_id=sample.sample.id if sample is not None else None,
|
149
|
+
epoch=sample.epoch if sample is not None else None,
|
150
|
+
env=env,
|
144
151
|
)
|
145
152
|
|
146
153
|
try:
|
@@ -148,13 +155,18 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
148
155
|
services = await compose_services(project)
|
149
156
|
|
150
157
|
# start the services
|
151
|
-
await compose_up(project, services)
|
158
|
+
result = await compose_up(project, services)
|
152
159
|
|
153
160
|
# check to ensure that the services are running
|
154
161
|
running_services = await compose_check_running(
|
155
162
|
list(services.keys()), project=project
|
156
163
|
)
|
157
164
|
|
165
|
+
if not running_services:
|
166
|
+
raise RuntimeError(
|
167
|
+
f"No services started.\nCompose up stderr: {result.stderr}"
|
168
|
+
)
|
169
|
+
|
158
170
|
# note that the project is running
|
159
171
|
project_startup(project)
|
160
172
|
|
@@ -209,9 +221,11 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
209
221
|
# (this enables us to show output for the cleanup operation)
|
210
222
|
if not interrupted:
|
211
223
|
# extract project from first environment
|
212
|
-
project =
|
213
|
-
|
214
|
-
|
224
|
+
project = (
|
225
|
+
next(iter(environments.values()))
|
226
|
+
.as_type(DockerSandboxEnvironment)
|
227
|
+
._project
|
228
|
+
)
|
215
229
|
# cleanup the project
|
216
230
|
await project_cleanup(project=project, quiet=True)
|
217
231
|
|
@@ -21,6 +21,8 @@ logger = getLogger(__name__)
|
|
21
21
|
class ComposeProject:
|
22
22
|
name: str
|
23
23
|
config: str | None
|
24
|
+
sample_id: int | str | None
|
25
|
+
epoch: int | None
|
24
26
|
env: dict[str, str] | None
|
25
27
|
|
26
28
|
@classmethod
|
@@ -28,6 +30,9 @@ class ComposeProject:
|
|
28
30
|
cls,
|
29
31
|
name: str,
|
30
32
|
config: SandboxEnvironmentConfigType | None,
|
33
|
+
*,
|
34
|
+
sample_id: int | str | None = None,
|
35
|
+
epoch: int | None = None,
|
31
36
|
env: dict[str, str] = {},
|
32
37
|
) -> "ComposeProject":
|
33
38
|
# resolve config to full path if we have one
|
@@ -58,16 +63,20 @@ class ComposeProject:
|
|
58
63
|
ensure_auto_compose_file(config)
|
59
64
|
|
60
65
|
# return project
|
61
|
-
return ComposeProject(name, config, env)
|
66
|
+
return ComposeProject(name, config, sample_id=sample_id, epoch=epoch, env=env)
|
62
67
|
|
63
68
|
def __init__(
|
64
69
|
self,
|
65
70
|
name: str,
|
66
71
|
config: str | None,
|
72
|
+
sample_id: int | str | None,
|
73
|
+
epoch: int | None,
|
67
74
|
env: dict[str, str],
|
68
75
|
) -> None:
|
69
76
|
self.name = name
|
70
77
|
self.config = config
|
78
|
+
self.sample_id = sample_id
|
79
|
+
self.epoch = epoch
|
71
80
|
self.env = env
|
72
81
|
|
73
82
|
def __eq__(self, other: object) -> bool:
|
@@ -2,12 +2,24 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import abc
|
4
4
|
from dataclasses import dataclass, field
|
5
|
-
from typing import
|
5
|
+
from typing import (
|
6
|
+
Any,
|
7
|
+
Awaitable,
|
8
|
+
Callable,
|
9
|
+
Literal,
|
10
|
+
NamedTuple,
|
11
|
+
Type,
|
12
|
+
TypeVar,
|
13
|
+
Union,
|
14
|
+
overload,
|
15
|
+
)
|
6
16
|
|
7
17
|
from pydantic import BaseModel, Field
|
8
18
|
|
9
19
|
from .._subprocess import ExecResult
|
10
20
|
|
21
|
+
ST = TypeVar("ST", bound="SandboxEnvironment")
|
22
|
+
|
11
23
|
TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
|
12
24
|
TaskCleanup = Callable[
|
13
25
|
[str, Union["SandboxEnvironmentConfigType", None], bool], Awaitable[None]
|
@@ -180,6 +192,25 @@ class SandboxEnvironment(abc.ABC):
|
|
180
192
|
"""
|
181
193
|
raise NotImplementedError("connection not implemented")
|
182
194
|
|
195
|
+
def as_type(self, sandbox_cls: Type[ST]) -> ST:
|
196
|
+
"""Verify and return a reference to a subclass of SandboxEnvironment.
|
197
|
+
|
198
|
+
Args:
|
199
|
+
sandbox_cls: Class of sandbox (subclass of SandboxEnvironment)
|
200
|
+
|
201
|
+
Returns:
|
202
|
+
Reference to the sandbox using the requested type.
|
203
|
+
|
204
|
+
Raises:
|
205
|
+
TypeError: If the sandbox is not of the requested type.
|
206
|
+
"""
|
207
|
+
if isinstance(self, sandbox_cls):
|
208
|
+
return self
|
209
|
+
else:
|
210
|
+
raise TypeError(
|
211
|
+
f"Expected instance of {sandbox_cls.__name__}, got {type(self).__name__}"
|
212
|
+
)
|
213
|
+
|
183
214
|
@classmethod
|
184
215
|
def config_files(cls) -> list[str]:
|
185
216
|
"""Standard config files for this provider (used for automatic discovery)"""
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import shlex
|
2
|
+
from typing import Literal, Type, Union, overload
|
3
|
+
|
4
|
+
from pydantic import JsonValue
|
5
|
+
from pydantic_core import to_jsonable_python
|
6
|
+
from typing_extensions import override
|
7
|
+
|
8
|
+
from inspect_ai._util.text import truncate_lines
|
9
|
+
from inspect_ai.util._subprocess import ExecResult
|
10
|
+
|
11
|
+
from .environment import (
|
12
|
+
ST,
|
13
|
+
SandboxConnection,
|
14
|
+
SandboxEnvironment,
|
15
|
+
SandboxEnvironmentConfigType,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
class SandboxEnvironmentProxy(SandboxEnvironment):
|
20
|
+
def __init__(self, sandbox: SandboxEnvironment) -> None:
|
21
|
+
self._sandbox = sandbox
|
22
|
+
|
23
|
+
@override
|
24
|
+
async def exec(
|
25
|
+
self,
|
26
|
+
cmd: list[str],
|
27
|
+
input: str | bytes | None = None,
|
28
|
+
cwd: str | None = None,
|
29
|
+
env: dict[str, str] = {},
|
30
|
+
user: str | None = None,
|
31
|
+
timeout: int | None = None,
|
32
|
+
timeout_retry: bool = True,
|
33
|
+
) -> ExecResult[str]:
|
34
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
35
|
+
|
36
|
+
# make call
|
37
|
+
result = await self._sandbox.exec(
|
38
|
+
cmd, input, cwd, env, user, timeout, timeout_retry
|
39
|
+
)
|
40
|
+
|
41
|
+
# yield event
|
42
|
+
options: dict[str, JsonValue] = {}
|
43
|
+
if cwd:
|
44
|
+
options["cwd"] = cwd
|
45
|
+
if env:
|
46
|
+
options["env"] = to_jsonable_python(env)
|
47
|
+
if user:
|
48
|
+
options["user"] = user
|
49
|
+
if timeout is not None:
|
50
|
+
options["timeout"] = timeout
|
51
|
+
if timeout_retry is not True:
|
52
|
+
options["timeout_retry"] = timeout_retry
|
53
|
+
transcript()._event(
|
54
|
+
SandboxEvent(
|
55
|
+
action="exec",
|
56
|
+
cmd=" ".join([shlex.quote(c) for c in cmd]),
|
57
|
+
input=content_display(input) if input is not None else None,
|
58
|
+
options=options,
|
59
|
+
result=result.returncode,
|
60
|
+
output=content_display(
|
61
|
+
f"{result.stderr}\n\n{result.stdout}"
|
62
|
+
if result.stderr
|
63
|
+
else result.stdout
|
64
|
+
),
|
65
|
+
)
|
66
|
+
)
|
67
|
+
|
68
|
+
# return result
|
69
|
+
return result
|
70
|
+
|
71
|
+
@override
|
72
|
+
async def write_file(self, file: str, contents: str | bytes) -> None:
|
73
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
74
|
+
|
75
|
+
# make call
|
76
|
+
await self._sandbox.write_file(file, contents)
|
77
|
+
|
78
|
+
# yield event
|
79
|
+
transcript()._event(
|
80
|
+
SandboxEvent(
|
81
|
+
action="write_file", file=file, input=content_display(contents)
|
82
|
+
)
|
83
|
+
)
|
84
|
+
|
85
|
+
@overload
|
86
|
+
async def read_file(self, file: str, text: Literal[True] = True) -> str: ...
|
87
|
+
|
88
|
+
@overload
|
89
|
+
async def read_file(self, file: str, text: Literal[False]) -> bytes: ...
|
90
|
+
|
91
|
+
@override
|
92
|
+
async def read_file(self, file: str, text: bool = True) -> Union[str | bytes]:
|
93
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
94
|
+
|
95
|
+
# make call
|
96
|
+
if text is True:
|
97
|
+
output: str | bytes = await self._sandbox.read_file(file, True)
|
98
|
+
else:
|
99
|
+
output = await self._sandbox.read_file(file, False)
|
100
|
+
|
101
|
+
# yield event
|
102
|
+
transcript()._event(
|
103
|
+
SandboxEvent(action="read_file", file=file, output=content_display(output))
|
104
|
+
)
|
105
|
+
|
106
|
+
# return result
|
107
|
+
return output
|
108
|
+
|
109
|
+
@override
|
110
|
+
async def connection(self) -> SandboxConnection:
|
111
|
+
return await self._sandbox.connection()
|
112
|
+
|
113
|
+
@override
|
114
|
+
def as_type(self, sandbox_cls: Type[ST]) -> ST:
|
115
|
+
if isinstance(self._sandbox, sandbox_cls):
|
116
|
+
return self._sandbox
|
117
|
+
else:
|
118
|
+
raise TypeError(
|
119
|
+
f"Expected instance of {sandbox_cls.__name__}, got {type(self._sandbox).__name__}"
|
120
|
+
)
|
121
|
+
|
122
|
+
@classmethod
|
123
|
+
async def sample_cleanup(
|
124
|
+
cls,
|
125
|
+
task_name: str,
|
126
|
+
config: SandboxEnvironmentConfigType | None,
|
127
|
+
environments: dict[str, SandboxEnvironment],
|
128
|
+
interrupted: bool,
|
129
|
+
) -> None:
|
130
|
+
pass
|
131
|
+
|
132
|
+
|
133
|
+
def content_display(content: str | bytes) -> str:
|
134
|
+
if isinstance(content, str):
|
135
|
+
content, truncated = truncate_lines(content, 20)
|
136
|
+
if truncated:
|
137
|
+
content = f"{content}\n\nOutput truncated ({truncated} additional lines)"
|
138
|
+
return content
|
139
|
+
else:
|
140
|
+
return f"binary ({pretty_size(len(content))})"
|
141
|
+
|
142
|
+
|
143
|
+
def pretty_size(size: int) -> str:
|
144
|
+
if size < 1024:
|
145
|
+
return f"{size} B"
|
146
|
+
if size < 1024 * 1024:
|
147
|
+
return f"{size / 1024:.2f} KB"
|
148
|
+
|
149
|
+
return f"{size / (1024 * 1024):.2f} MB"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import tempfile
|
2
2
|
import warnings
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Literal, Union,
|
4
|
+
from typing import Literal, Union, overload
|
5
5
|
|
6
6
|
from typing_extensions import override
|
7
7
|
|
@@ -40,8 +40,8 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
40
40
|
interrupted: bool,
|
41
41
|
) -> None:
|
42
42
|
for environment in environments.values():
|
43
|
-
|
44
|
-
|
43
|
+
sandbox = environment.as_type(LocalSandboxEnvironment)
|
44
|
+
sandbox.directory.cleanup()
|
45
45
|
|
46
46
|
def __init__(self) -> None:
|
47
47
|
self.directory = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
|
@@ -445,7 +445,8 @@ async def test_exec_stdout_is_limited(sandbox_env: SandboxEnvironment) -> None:
|
|
445
445
|
assert "limit of 10 MiB was exceeded" in str(e_info.value)
|
446
446
|
truncated_output = e_info.value.truncated_output
|
447
447
|
# `yes` outputs 'y\n' (ASCII) so the size equals the string length.
|
448
|
-
|
448
|
+
# some shells additionally output 'canceled\n' so we add a fudge factor for that
|
449
|
+
assert truncated_output and (len(truncated_output) - 10 * 1024**2) < 10
|
449
450
|
|
450
451
|
|
451
452
|
async def test_exec_stderr_is_limited(sandbox_env: SandboxEnvironment) -> None:
|
inspect_ai/util/_subprocess.py
CHANGED
@@ -199,7 +199,10 @@ async def subprocess(
|
|
199
199
|
else:
|
200
200
|
result = await asyncio.wait_for(anext(rc), timeout=timeout)
|
201
201
|
return cast(Union[ExecResult[str], ExecResult[bytes]], result)
|
202
|
-
|
202
|
+
# wait_for raises asyncio.TimeoutError under Python 3.10, but TimeoutError
|
203
|
+
# under Python >= 3.11! asyncio.timeout (introduced in Python 3.11) always
|
204
|
+
# raises the standard TimeoutError
|
205
|
+
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
203
206
|
# terminate timed out process -- try for graceful termination
|
204
207
|
# then be more forceful if requied
|
205
208
|
try:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.68
|
3
|
+
Version: 0.3.70
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Security Institute
|
6
6
|
License: MIT License
|
@@ -26,7 +26,7 @@ Requires-Dist: beautifulsoup4
|
|
26
26
|
Requires-Dist: click>=8.1.3
|
27
27
|
Requires-Dist: debugpy
|
28
28
|
Requires-Dist: docstring-parser>=0.16
|
29
|
-
Requires-Dist: fsspec
|
29
|
+
Requires-Dist: fsspec<=2024.12.0,>=2023.1.0
|
30
30
|
Requires-Dist: httpx
|
31
31
|
Requires-Dist: ijson>=3.2.0
|
32
32
|
Requires-Dist: jsonlines>=3.0.0
|
@@ -45,7 +45,7 @@ Requires-Dist: s3fs>=2023
|
|
45
45
|
Requires-Dist: semver>=3.0.0
|
46
46
|
Requires-Dist: shortuuid
|
47
47
|
Requires-Dist: tenacity
|
48
|
-
Requires-Dist: textual
|
48
|
+
Requires-Dist: textual>=0.86.2
|
49
49
|
Requires-Dist: typing_extensions>=4.9.0
|
50
50
|
Requires-Dist: zipp>=3.19.1
|
51
51
|
Provides-Extra: dev
|
@@ -53,7 +53,7 @@ Requires-Dist: anthropic; extra == "dev"
|
|
53
53
|
Requires-Dist: aioboto3; extra == "dev"
|
54
54
|
Requires-Dist: azure-ai-inference; extra == "dev"
|
55
55
|
Requires-Dist: google-cloud-aiplatform; extra == "dev"
|
56
|
-
Requires-Dist: google-
|
56
|
+
Requires-Dist: google-genai; extra == "dev"
|
57
57
|
Requires-Dist: goodfire; extra == "dev"
|
58
58
|
Requires-Dist: griffe; extra == "dev"
|
59
59
|
Requires-Dist: groq; extra == "dev"
|
@@ -71,7 +71,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
71
71
|
Requires-Dist: pytest-cov; extra == "dev"
|
72
72
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
73
73
|
Requires-Dist: pytest-xdist; extra == "dev"
|
74
|
-
Requires-Dist: ruff==0.9.
|
74
|
+
Requires-Dist: ruff==0.9.6; extra == "dev"
|
75
75
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
76
76
|
Requires-Dist: types-Markdown; extra == "dev"
|
77
77
|
Requires-Dist: types-PyYAML; extra == "dev"
|