inspect-ai 0.3.69__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/textual/app.py +3 -2
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +904 -750
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +32 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +2 -0
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +74 -67
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import os
|
3
|
-
from typing import Literal, Protocol,
|
3
|
+
from typing import Literal, Protocol, runtime_checkable
|
4
4
|
|
5
5
|
import httpx
|
6
6
|
from bs4 import BeautifulSoup, NavigableString
|
@@ -90,8 +90,8 @@ def web_search(
|
|
90
90
|
return_exceptions=True,
|
91
91
|
)
|
92
92
|
for page, link in zip(pages, links):
|
93
|
-
if page and not isinstance(page,
|
94
|
-
page_contents.append(
|
93
|
+
if page and not isinstance(page, BaseException):
|
94
|
+
page_contents.append(page)
|
95
95
|
urls.append(link.url)
|
96
96
|
snippets.append(link.snippet)
|
97
97
|
search_calls += 1
|
inspect_ai/util/_concurrency.py
CHANGED
@@ -1,13 +1,19 @@
|
|
1
1
|
import asyncio
|
2
|
+
import contextlib
|
3
|
+
import time
|
2
4
|
from dataclasses import dataclass
|
5
|
+
from typing import AsyncIterator
|
3
6
|
|
7
|
+
from inspect_ai._util.working import report_sample_waiting_time
|
4
8
|
|
5
|
-
|
9
|
+
|
10
|
+
@contextlib.asynccontextmanager
|
11
|
+
async def concurrency(
|
6
12
|
name: str,
|
7
13
|
concurrency: int,
|
8
14
|
key: str | None = None,
|
9
|
-
) ->
|
10
|
-
"""
|
15
|
+
) -> AsyncIterator[None]:
|
16
|
+
"""Concurrency context manager.
|
11
17
|
|
12
18
|
A concurrency context can be used to limit the number of coroutines
|
13
19
|
executing a block of code (e.g calling an API). For example, here
|
@@ -32,9 +38,6 @@ def concurrency(
|
|
32
38
|
Used if the unique key isn't human readable -- e.g. includes
|
33
39
|
api tokens or account ids so that the more readable `name`
|
34
40
|
can be presented to users e.g in console UI>
|
35
|
-
|
36
|
-
Returns:
|
37
|
-
Asyncio Semaphore for concurrency context.
|
38
41
|
"""
|
39
42
|
# sort out key
|
40
43
|
key = key if key else name
|
@@ -47,8 +50,11 @@ def concurrency(
|
|
47
50
|
)
|
48
51
|
_concurrency_semaphores[key] = semaphore
|
49
52
|
|
50
|
-
#
|
51
|
-
|
53
|
+
# wait and yield to protected code
|
54
|
+
start_wait = time.monotonic()
|
55
|
+
async with semaphore.semaphore:
|
56
|
+
report_sample_waiting_time(time.monotonic() - start_wait)
|
57
|
+
yield
|
52
58
|
|
53
59
|
|
54
60
|
def concurrency_status() -> dict[str, tuple[int, int]]:
|
@@ -5,6 +5,7 @@ from typing import Any, NoReturn, cast
|
|
5
5
|
from shortuuid import uuid
|
6
6
|
|
7
7
|
from inspect_ai._util.constants import SANDBOX_SETUP_TIMEOUT
|
8
|
+
from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
|
8
9
|
|
9
10
|
from .environment import (
|
10
11
|
SampleCleanup,
|
@@ -132,6 +133,9 @@ async def init_sandbox_environments_sample(
|
|
132
133
|
# verify that there is at least one environment and a 'default' env
|
133
134
|
validate_sandbox_environments(sandboxenv_type, environments)
|
134
135
|
|
136
|
+
# proxy environments (for recording SandboxEvent)
|
137
|
+
environments = {k: SandboxEnvironmentProxy(v) for k, v in environments.items()}
|
138
|
+
|
135
139
|
try:
|
136
140
|
# copy files into environments
|
137
141
|
await copy_sandbox_environment_files(files, environments)
|
@@ -148,6 +152,7 @@ async def init_sandbox_environments_sample(
|
|
148
152
|
return environments
|
149
153
|
|
150
154
|
except Exception as ex:
|
155
|
+
environments = unproxy_environments(environments)
|
151
156
|
await sample_cleanup(task_name, config, environments, True)
|
152
157
|
raise ex
|
153
158
|
|
@@ -161,9 +166,19 @@ async def cleanup_sandbox_environments_sample(
|
|
161
166
|
) -> None:
|
162
167
|
sandboxenv_type = registry_find_sandboxenv(type)
|
163
168
|
sample_cleanup = cast(SampleCleanup, getattr(sandboxenv_type, "sample_cleanup"))
|
169
|
+
environments = unproxy_environments(environments)
|
164
170
|
await sample_cleanup(task_name, config, environments, interrupted)
|
165
171
|
|
166
172
|
|
173
|
+
def unproxy_environments(
|
174
|
+
environments: dict[str, SandboxEnvironment],
|
175
|
+
) -> dict[str, SandboxEnvironment]:
|
176
|
+
return {
|
177
|
+
k: v._sandbox
|
178
|
+
for k, v in cast(dict[str, SandboxEnvironmentProxy], environments).items()
|
179
|
+
}
|
180
|
+
|
181
|
+
|
167
182
|
async def copy_sandbox_environment_files(
|
168
183
|
files: dict[str, bytes], environments: dict[str, SandboxEnvironment]
|
169
184
|
) -> None:
|
@@ -5,7 +5,7 @@ import os
|
|
5
5
|
import tempfile
|
6
6
|
from logging import getLogger
|
7
7
|
from pathlib import Path, PurePosixPath
|
8
|
-
from typing import Literal, Union,
|
8
|
+
from typing import Literal, Union, overload
|
9
9
|
|
10
10
|
from typing_extensions import override
|
11
11
|
|
@@ -145,7 +145,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
145
145
|
project = await ComposeProject.create(
|
146
146
|
name=task_project_name(task_name),
|
147
147
|
config=config,
|
148
|
-
sample_id=sample.id if sample is not None else None,
|
148
|
+
sample_id=sample.sample.id if sample is not None else None,
|
149
149
|
epoch=sample.epoch if sample is not None else None,
|
150
150
|
env=env,
|
151
151
|
)
|
@@ -221,9 +221,11 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
221
221
|
# (this enables us to show output for the cleanup operation)
|
222
222
|
if not interrupted:
|
223
223
|
# extract project from first environment
|
224
|
-
project =
|
225
|
-
|
226
|
-
|
224
|
+
project = (
|
225
|
+
next(iter(environments.values()))
|
226
|
+
.as_type(DockerSandboxEnvironment)
|
227
|
+
._project
|
228
|
+
)
|
227
229
|
# cleanup the project
|
228
230
|
await project_cleanup(project=project, quiet=True)
|
229
231
|
|
@@ -2,12 +2,24 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import abc
|
4
4
|
from dataclasses import dataclass, field
|
5
|
-
from typing import
|
5
|
+
from typing import (
|
6
|
+
Any,
|
7
|
+
Awaitable,
|
8
|
+
Callable,
|
9
|
+
Literal,
|
10
|
+
NamedTuple,
|
11
|
+
Type,
|
12
|
+
TypeVar,
|
13
|
+
Union,
|
14
|
+
overload,
|
15
|
+
)
|
6
16
|
|
7
17
|
from pydantic import BaseModel, Field
|
8
18
|
|
9
19
|
from .._subprocess import ExecResult
|
10
20
|
|
21
|
+
ST = TypeVar("ST", bound="SandboxEnvironment")
|
22
|
+
|
11
23
|
TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
|
12
24
|
TaskCleanup = Callable[
|
13
25
|
[str, Union["SandboxEnvironmentConfigType", None], bool], Awaitable[None]
|
@@ -180,6 +192,25 @@ class SandboxEnvironment(abc.ABC):
|
|
180
192
|
"""
|
181
193
|
raise NotImplementedError("connection not implemented")
|
182
194
|
|
195
|
+
def as_type(self, sandbox_cls: Type[ST]) -> ST:
|
196
|
+
"""Verify and return a reference to a subclass of SandboxEnvironment.
|
197
|
+
|
198
|
+
Args:
|
199
|
+
sandbox_cls: Class of sandbox (subclass of SandboxEnvironment)
|
200
|
+
|
201
|
+
Returns:
|
202
|
+
Reference to the sandbox using the requested type.
|
203
|
+
|
204
|
+
Raises:
|
205
|
+
TypeError: If the sandbox is not of the requested type.
|
206
|
+
"""
|
207
|
+
if isinstance(self, sandbox_cls):
|
208
|
+
return self
|
209
|
+
else:
|
210
|
+
raise TypeError(
|
211
|
+
f"Expected instance of {sandbox_cls.__name__}, got {type(self).__name__}"
|
212
|
+
)
|
213
|
+
|
183
214
|
@classmethod
|
184
215
|
def config_files(cls) -> list[str]:
|
185
216
|
"""Standard config files for this provider (used for automatic discovery)"""
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import shlex
|
2
|
+
from typing import Literal, Type, Union, overload
|
3
|
+
|
4
|
+
from pydantic import JsonValue
|
5
|
+
from pydantic_core import to_jsonable_python
|
6
|
+
from typing_extensions import override
|
7
|
+
|
8
|
+
from inspect_ai._util.text import truncate_lines
|
9
|
+
from inspect_ai.util._subprocess import ExecResult
|
10
|
+
|
11
|
+
from .environment import (
|
12
|
+
ST,
|
13
|
+
SandboxConnection,
|
14
|
+
SandboxEnvironment,
|
15
|
+
SandboxEnvironmentConfigType,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
class SandboxEnvironmentProxy(SandboxEnvironment):
|
20
|
+
def __init__(self, sandbox: SandboxEnvironment) -> None:
|
21
|
+
self._sandbox = sandbox
|
22
|
+
|
23
|
+
@override
|
24
|
+
async def exec(
|
25
|
+
self,
|
26
|
+
cmd: list[str],
|
27
|
+
input: str | bytes | None = None,
|
28
|
+
cwd: str | None = None,
|
29
|
+
env: dict[str, str] = {},
|
30
|
+
user: str | None = None,
|
31
|
+
timeout: int | None = None,
|
32
|
+
timeout_retry: bool = True,
|
33
|
+
) -> ExecResult[str]:
|
34
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
35
|
+
|
36
|
+
# make call
|
37
|
+
result = await self._sandbox.exec(
|
38
|
+
cmd, input, cwd, env, user, timeout, timeout_retry
|
39
|
+
)
|
40
|
+
|
41
|
+
# yield event
|
42
|
+
options: dict[str, JsonValue] = {}
|
43
|
+
if cwd:
|
44
|
+
options["cwd"] = cwd
|
45
|
+
if env:
|
46
|
+
options["env"] = to_jsonable_python(env)
|
47
|
+
if user:
|
48
|
+
options["user"] = user
|
49
|
+
if timeout is not None:
|
50
|
+
options["timeout"] = timeout
|
51
|
+
if timeout_retry is not True:
|
52
|
+
options["timeout_retry"] = timeout_retry
|
53
|
+
transcript()._event(
|
54
|
+
SandboxEvent(
|
55
|
+
action="exec",
|
56
|
+
cmd=" ".join([shlex.quote(c) for c in cmd]),
|
57
|
+
input=content_display(input) if input is not None else None,
|
58
|
+
options=options,
|
59
|
+
result=result.returncode,
|
60
|
+
output=content_display(
|
61
|
+
f"{result.stderr}\n\n{result.stdout}"
|
62
|
+
if result.stderr
|
63
|
+
else result.stdout
|
64
|
+
),
|
65
|
+
)
|
66
|
+
)
|
67
|
+
|
68
|
+
# return result
|
69
|
+
return result
|
70
|
+
|
71
|
+
@override
|
72
|
+
async def write_file(self, file: str, contents: str | bytes) -> None:
|
73
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
74
|
+
|
75
|
+
# make call
|
76
|
+
await self._sandbox.write_file(file, contents)
|
77
|
+
|
78
|
+
# yield event
|
79
|
+
transcript()._event(
|
80
|
+
SandboxEvent(
|
81
|
+
action="write_file", file=file, input=content_display(contents)
|
82
|
+
)
|
83
|
+
)
|
84
|
+
|
85
|
+
@overload
|
86
|
+
async def read_file(self, file: str, text: Literal[True] = True) -> str: ...
|
87
|
+
|
88
|
+
@overload
|
89
|
+
async def read_file(self, file: str, text: Literal[False]) -> bytes: ...
|
90
|
+
|
91
|
+
@override
|
92
|
+
async def read_file(self, file: str, text: bool = True) -> Union[str | bytes]:
|
93
|
+
from inspect_ai.log._transcript import SandboxEvent, transcript
|
94
|
+
|
95
|
+
# make call
|
96
|
+
if text is True:
|
97
|
+
output: str | bytes = await self._sandbox.read_file(file, True)
|
98
|
+
else:
|
99
|
+
output = await self._sandbox.read_file(file, False)
|
100
|
+
|
101
|
+
# yield event
|
102
|
+
transcript()._event(
|
103
|
+
SandboxEvent(action="read_file", file=file, output=content_display(output))
|
104
|
+
)
|
105
|
+
|
106
|
+
# return result
|
107
|
+
return output
|
108
|
+
|
109
|
+
@override
|
110
|
+
async def connection(self) -> SandboxConnection:
|
111
|
+
return await self._sandbox.connection()
|
112
|
+
|
113
|
+
@override
|
114
|
+
def as_type(self, sandbox_cls: Type[ST]) -> ST:
|
115
|
+
if isinstance(self._sandbox, sandbox_cls):
|
116
|
+
return self._sandbox
|
117
|
+
else:
|
118
|
+
raise TypeError(
|
119
|
+
f"Expected instance of {sandbox_cls.__name__}, got {type(self._sandbox).__name__}"
|
120
|
+
)
|
121
|
+
|
122
|
+
@classmethod
|
123
|
+
async def sample_cleanup(
|
124
|
+
cls,
|
125
|
+
task_name: str,
|
126
|
+
config: SandboxEnvironmentConfigType | None,
|
127
|
+
environments: dict[str, SandboxEnvironment],
|
128
|
+
interrupted: bool,
|
129
|
+
) -> None:
|
130
|
+
pass
|
131
|
+
|
132
|
+
|
133
|
+
def content_display(content: str | bytes) -> str:
|
134
|
+
if isinstance(content, str):
|
135
|
+
content, truncated = truncate_lines(content, 20)
|
136
|
+
if truncated:
|
137
|
+
content = f"{content}\n\nOutput truncated ({truncated} additional lines)"
|
138
|
+
return content
|
139
|
+
else:
|
140
|
+
return f"binary ({pretty_size(len(content))})"
|
141
|
+
|
142
|
+
|
143
|
+
def pretty_size(size: int) -> str:
|
144
|
+
if size < 1024:
|
145
|
+
return f"{size} B"
|
146
|
+
if size < 1024 * 1024:
|
147
|
+
return f"{size / 1024:.2f} KB"
|
148
|
+
|
149
|
+
return f"{size / (1024 * 1024):.2f} MB"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import tempfile
|
2
2
|
import warnings
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Literal, Union,
|
4
|
+
from typing import Literal, Union, overload
|
5
5
|
|
6
6
|
from typing_extensions import override
|
7
7
|
|
@@ -40,8 +40,8 @@ class LocalSandboxEnvironment(SandboxEnvironment):
|
|
40
40
|
interrupted: bool,
|
41
41
|
) -> None:
|
42
42
|
for environment in environments.values():
|
43
|
-
|
44
|
-
|
43
|
+
sandbox = environment.as_type(LocalSandboxEnvironment)
|
44
|
+
sandbox.directory.cleanup()
|
45
45
|
|
46
46
|
def __init__(self) -> None:
|
47
47
|
self.directory = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.70
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Security Institute
|
6
6
|
License: MIT License
|
@@ -26,7 +26,7 @@ Requires-Dist: beautifulsoup4
|
|
26
26
|
Requires-Dist: click>=8.1.3
|
27
27
|
Requires-Dist: debugpy
|
28
28
|
Requires-Dist: docstring-parser>=0.16
|
29
|
-
Requires-Dist: fsspec
|
29
|
+
Requires-Dist: fsspec<=2024.12.0,>=2023.1.0
|
30
30
|
Requires-Dist: httpx
|
31
31
|
Requires-Dist: ijson>=3.2.0
|
32
32
|
Requires-Dist: jsonlines>=3.0.0
|
@@ -45,7 +45,7 @@ Requires-Dist: s3fs>=2023
|
|
45
45
|
Requires-Dist: semver>=3.0.0
|
46
46
|
Requires-Dist: shortuuid
|
47
47
|
Requires-Dist: tenacity
|
48
|
-
Requires-Dist: textual
|
48
|
+
Requires-Dist: textual>=0.86.2
|
49
49
|
Requires-Dist: typing_extensions>=4.9.0
|
50
50
|
Requires-Dist: zipp>=3.19.1
|
51
51
|
Provides-Extra: dev
|