inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +4 -2
- inspect_ai/_cli/eval.py +2 -0
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +0 -2
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/rich/display.py +4 -4
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/samples.py +41 -5
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/run.py +141 -119
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +322 -226
- inspect_ai/_view/www/log-schema.json +221 -138
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +53 -35
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +27 -5
- inspect_ai/log/_recorders/eval.py +21 -8
- inspect_ai/log/_samples.py +10 -5
- inspect_ai/log/_transcript.py +28 -1
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +82 -17
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/{_trace.py → _conversation.py} +9 -8
- inspect_ai/model/_model.py +2 -2
- inspect_ai/model/_providers/anthropic.py +9 -7
- inspect_ai/model/_providers/azureai.py +6 -4
- inspect_ai/model/_providers/bedrock.py +6 -4
- inspect_ai/model/_providers/google.py +103 -14
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +6 -9
- inspect_ai/model/_providers/openai.py +34 -8
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_providers/vertex.py +17 -4
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/tool/__init__.py +9 -1
- inspect_ai/tool/_tool.py +9 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -13
- inspect_ai/util/_sandbox/docker/docker.py +20 -13
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/environment.py +13 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
@@ -138,28 +138,31 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
138
138
|
# start the services
|
139
139
|
await compose_up(project)
|
140
140
|
|
141
|
+
# check to ensure that the services are running
|
142
|
+
running_services = await compose_check_running(
|
143
|
+
list(services.keys()), project=project
|
144
|
+
)
|
145
|
+
|
141
146
|
# note that the project is running
|
142
147
|
project_startup(project)
|
143
148
|
|
144
|
-
#
|
145
|
-
await compose_check_running(list(services.keys()), project=project)
|
146
|
-
|
147
|
-
# create sandbox environments
|
149
|
+
# create sandbox environments for all running services
|
148
150
|
default_service: str | None = None
|
149
151
|
environments: dict[str, SandboxEnvironment] = {}
|
150
152
|
for service, service_info in services.items():
|
151
|
-
|
152
|
-
|
153
|
+
if service in running_services:
|
154
|
+
# update the project w/ the working directory
|
155
|
+
working_dir = await container_working_dir(service, project)
|
153
156
|
|
154
|
-
|
155
|
-
|
157
|
+
# create the docker sandbox environemnt
|
158
|
+
docker_env = DockerSandboxEnvironment(service, project, working_dir)
|
156
159
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
+
# save reference to default service if requested
|
161
|
+
if service_info.get("x-default", False):
|
162
|
+
default_service = service
|
160
163
|
|
161
|
-
|
162
|
-
|
164
|
+
# record service => environment
|
165
|
+
environments[service] = docker_env
|
163
166
|
|
164
167
|
# confirm that we have a 'default' environemnt
|
165
168
|
if environments.get("default", None) is None and default_service is None:
|
@@ -225,6 +228,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
225
228
|
env: dict[str, str] = {},
|
226
229
|
user: str | None = None,
|
227
230
|
timeout: int | None = None,
|
231
|
+
timeout_retry: bool = True,
|
228
232
|
) -> ExecResult[str]:
|
229
233
|
# additional args
|
230
234
|
args = []
|
@@ -251,6 +255,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
251
255
|
args + [self._service] + cmd,
|
252
256
|
project=self._project,
|
253
257
|
timeout=timeout,
|
258
|
+
timeout_retry=timeout_retry,
|
254
259
|
input=input,
|
255
260
|
output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
|
256
261
|
)
|
@@ -428,11 +433,13 @@ class DockerSandboxEnvironment(SandboxEnvironment):
|
|
428
433
|
# return container connection
|
429
434
|
if container:
|
430
435
|
return SandboxConnection(
|
436
|
+
type="docker",
|
431
437
|
command=f"docker exec -it {container} bash -l",
|
432
438
|
vscode_command=[
|
433
439
|
"remote-containers.attachToRunningContainer",
|
434
440
|
container,
|
435
441
|
],
|
442
|
+
container=container,
|
436
443
|
)
|
437
444
|
# error (not currently running)
|
438
445
|
else:
|
@@ -84,7 +84,8 @@ def task_project_name(task: str) -> str:
|
|
84
84
|
if len(task) == 0:
|
85
85
|
task = "task"
|
86
86
|
|
87
|
-
|
87
|
+
# _- breaks docker project name constraints so we strip trailing underscores.
|
88
|
+
return f"inspect-{task[:12].rstrip('_')}-i{uuid().lower()[:6]}"
|
88
89
|
|
89
90
|
|
90
91
|
inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{6,}$"
|
@@ -31,12 +31,18 @@ SampleCleanup = Callable[
|
|
31
31
|
class SandboxConnection(BaseModel):
|
32
32
|
"""Information required to connect to sandbox."""
|
33
33
|
|
34
|
+
type: str
|
35
|
+
"""Sandbox type name (e.g. 'docker', 'local', etc.)"""
|
36
|
+
|
34
37
|
command: str
|
35
38
|
"""Shell command to connect to sandbox."""
|
36
39
|
|
37
40
|
vscode_command: list[Any] | None = Field(default=None)
|
38
41
|
"""Optional vscode command (+args) to connect to sandbox."""
|
39
42
|
|
43
|
+
container: str | None = Field(default=None)
|
44
|
+
"""Optional container name (does not apply to all sandboxes)."""
|
45
|
+
|
40
46
|
|
41
47
|
class SandboxEnvironment(abc.ABC):
|
42
48
|
"""Environment for executing arbitrary code from tools.
|
@@ -139,6 +145,7 @@ class SandboxEnvironment(abc.ABC):
|
|
139
145
|
env: dict[str, str] = {},
|
140
146
|
user: str | None = None,
|
141
147
|
timeout: int | None = None,
|
148
|
+
timeout_retry: bool = True,
|
142
149
|
) -> ExecResult[str]:
|
143
150
|
"""Execute a command within a sandbox environment.
|
144
151
|
|
@@ -155,12 +162,17 @@ class SandboxEnvironment(abc.ABC):
|
|
155
162
|
env (dict[str,str]): Environment variables for execution.
|
156
163
|
user (str | None): Optional username or UID to run the command as.
|
157
164
|
timeout (int | None): Optional execution timeout (seconds).
|
165
|
+
timeout_retry (bool): Retry the command in the case that it times out.
|
166
|
+
Commands will be retried up to twice, with a timeout of no greater
|
167
|
+
than 60 seconds for the first retry and 30 for the second.
|
168
|
+
|
158
169
|
|
159
170
|
Returns:
|
160
171
|
Execution result (status code, stderr/stdout, etc.)
|
161
172
|
|
162
173
|
Raises:
|
163
|
-
TimeoutError: If the specified `timeout` expires
|
174
|
+
TimeoutError: If the specified `timeout` expires
|
175
|
+
(and `timeout_retry` attempts also timeout).
|
164
176
|
UnicodeDecodeError: If an error occurs while
|
165
177
|
decoding the command output.
|
166
178
|
PermissionError: If the user does not have
|
@@ -75,9 +75,9 @@ async def test_read_and_write_file_text(sandbox_env: SandboxEnvironment) -> None
|
|
75
75
|
written_file_string = await sandbox_env.read_file(
|
76
76
|
"test_read_and_write_file_text.file", text=True
|
77
77
|
)
|
78
|
-
assert (
|
79
|
-
"
|
80
|
-
)
|
78
|
+
assert "great #content\nincluding newlines" == written_file_string, (
|
79
|
+
f"unexpected content: [{written_file_string}]"
|
80
|
+
)
|
81
81
|
await _cleanup_file(sandbox_env, "test_read_and_write_file_text.file")
|
82
82
|
|
83
83
|
|
@@ -219,9 +219,9 @@ async def test_exec_output(sandbox_env: SandboxEnvironment) -> None:
|
|
219
219
|
exec_result = await sandbox_env.exec(["sh", "-c", "echo foo; echo bar"])
|
220
220
|
expected = "foo\nbar\n"
|
221
221
|
# in the assertion message, we show the actual bytes to help debug newline issues
|
222
|
-
assert (
|
223
|
-
exec_result.stdout
|
224
|
-
)
|
222
|
+
assert exec_result.stdout == expected, (
|
223
|
+
f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
|
224
|
+
)
|
225
225
|
|
226
226
|
|
227
227
|
async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None:
|
@@ -248,13 +248,13 @@ async def test_exec_as_user(sandbox_env: SandboxEnvironment) -> None:
|
|
248
248
|
|
249
249
|
# Test exec as different users
|
250
250
|
root_result = await sandbox_env.exec(["whoami"], user="root")
|
251
|
-
assert (
|
252
|
-
root_result.stdout.strip()
|
253
|
-
)
|
251
|
+
assert root_result.stdout.strip() == "root", (
|
252
|
+
f"Expected 'root', got '{root_result.stdout.strip()}'"
|
253
|
+
)
|
254
254
|
myuser_result = await sandbox_env.exec(["whoami"], user=username)
|
255
|
-
assert (
|
256
|
-
myuser_result.stdout.strip()
|
257
|
-
)
|
255
|
+
assert myuser_result.stdout.strip() == username, (
|
256
|
+
f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
|
257
|
+
)
|
258
258
|
finally:
|
259
259
|
# Clean up
|
260
260
|
await sandbox_env.exec(["userdel", "-r", username], user="root")
|
@@ -266,9 +266,9 @@ async def test_exec_as_nonexistent_user(sandbox_env: SandboxEnvironment) -> None
|
|
266
266
|
expected_error = (
|
267
267
|
"unable to find user nonexistent: no matching entries in passwd file"
|
268
268
|
)
|
269
|
-
assert (
|
270
|
-
expected_error in result.stdout
|
271
|
-
)
|
269
|
+
assert expected_error in result.stdout, (
|
270
|
+
f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
|
271
|
+
)
|
272
272
|
|
273
273
|
|
274
274
|
async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None:
|
@@ -291,9 +291,9 @@ async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None:
|
|
291
291
|
file_path = cwd_subdirectory + "/" + file_name
|
292
292
|
await sandbox_env.write_file(file_path, "ls me plz")
|
293
293
|
current_dir_contents = (await sandbox_env.exec(["ls"], cwd=cwd_subdirectory)).stdout
|
294
|
-
assert (
|
295
|
-
file_name in current_dir_contents
|
296
|
-
)
|
294
|
+
assert file_name in current_dir_contents, (
|
295
|
+
f"{file_name} not found in {current_dir_contents}"
|
296
|
+
)
|
297
297
|
await _cleanup_file(sandbox_env, file_path)
|
298
298
|
|
299
299
|
|
inspect_ai/util/_store.py
CHANGED
@@ -34,8 +34,8 @@ class Store:
|
|
34
34
|
inheriting from Pydantic `BaseModel`)
|
35
35
|
"""
|
36
36
|
|
37
|
-
def __init__(self) -> None:
|
38
|
-
self._data
|
37
|
+
def __init__(self, data: dict[str, Any] | None = None) -> None:
|
38
|
+
self._data = deepcopy(data) if data else {}
|
39
39
|
|
40
40
|
@overload
|
41
41
|
def get(self, key: str, default: None = None) -> Any: ...
|
inspect_ai/util/_subprocess.py
CHANGED
@@ -101,9 +101,9 @@ async def subprocess(
|
|
101
101
|
input = input.encode() if isinstance(input, str) else input
|
102
102
|
|
103
103
|
# function to run command (we may or may not run it w/ concurrency)
|
104
|
-
async def run_command() ->
|
105
|
-
|
106
|
-
|
104
|
+
async def run_command() -> AsyncGenerator[
|
105
|
+
Union[Process, ExecResult[str], ExecResult[bytes]], None
|
106
|
+
]:
|
107
107
|
if isinstance(args, str):
|
108
108
|
proc = await asyncio.create_subprocess_shell(
|
109
109
|
args,
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.56
|
3
|
+
Version: 0.3.58
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
67
67
|
Requires-Dist: pytest-cov; extra == "dev"
|
68
68
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
69
69
|
Requires-Dist: pytest-xdist; extra == "dev"
|
70
|
-
Requires-Dist: ruff==0.
|
70
|
+
Requires-Dist: ruff==0.9.1; extra == "dev"
|
71
71
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
72
72
|
Requires-Dist: types-PyYAML; extra == "dev"
|
73
73
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|