inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +7 -3
- inspect_ai/_cli/eval.py +17 -2
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +4 -3
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +4 -9
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +119 -16
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/score.py +1 -0
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/results.py +50 -22
- inspect_ai/_eval/task/run.py +180 -124
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25375 -1846
- inspect_ai/_view/www/log-schema.json +129 -15
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +8 -10
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +75 -2
- inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +62 -27
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/Json.mjs +12 -6
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_recorders/eval.py +19 -8
- inspect_ai/log/_samples.py +26 -5
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +59 -12
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/_conversation.py +61 -0
- inspect_ai/model/_generate_config.py +10 -4
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +7 -2
- inspect_ai/model/_providers/anthropic.py +109 -51
- inspect_ai/model/_providers/azureai.py +26 -24
- inspect_ai/model/_providers/bedrock.py +43 -44
- inspect_ai/model/_providers/google.py +121 -58
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +17 -20
- inspect_ai/model/_providers/openai.py +32 -21
- inspect_ai/model/_providers/openai_o1.py +9 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +8 -8
- inspect_ai/model/_providers/vertex.py +18 -8
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +2 -2
- inspect_ai/solver/__init__.py +2 -5
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +11 -1
- inspect_ai/tool/_tool.py +21 -3
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -11
- inspect_ai/util/_sandbox/docker/docker.py +84 -14
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/environment.py +27 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/model/_trace.py +0 -48
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
@@ -5,28 +5,28 @@ from rich.console import RenderableType
|
|
5
5
|
from rich.table import Table
|
6
6
|
from rich.text import Text
|
7
7
|
from textual.app import ComposeResult
|
8
|
-
from textual.containers import
|
9
|
-
Horizontal,
|
10
|
-
HorizontalGroup,
|
11
|
-
Vertical,
|
12
|
-
VerticalGroup,
|
13
|
-
)
|
8
|
+
from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
|
14
9
|
from textual.reactive import reactive
|
15
10
|
from textual.widget import Widget
|
16
11
|
from textual.widgets import (
|
17
12
|
Button,
|
18
13
|
Collapsible,
|
14
|
+
Link,
|
19
15
|
LoadingIndicator,
|
20
16
|
OptionList,
|
21
17
|
Static,
|
22
18
|
)
|
23
19
|
from textual.widgets.option_list import Option, Separator
|
24
20
|
|
21
|
+
from inspect_ai._display.textual.widgets.port_mappings import get_url
|
25
22
|
from inspect_ai._util.format import format_progress_time
|
23
|
+
from inspect_ai._util.port_names import get_service_by_port
|
26
24
|
from inspect_ai._util.registry import registry_unqualified_name
|
27
25
|
from inspect_ai.log._samples import ActiveSample
|
26
|
+
from inspect_ai.log._transcript import ToolEvent
|
28
27
|
|
29
28
|
from .clock import Clock
|
29
|
+
from .sandbox import SandboxView
|
30
30
|
from .transcript import TranscriptView
|
31
31
|
|
32
32
|
|
@@ -73,6 +73,7 @@ class SamplesView(Widget):
|
|
73
73
|
|
74
74
|
async def set_highlighted_sample(self, highlighted: int | None) -> None:
|
75
75
|
sample_info = self.query_one(SampleInfo)
|
76
|
+
sample_vnc = self.query_one(SampleVNC)
|
76
77
|
transcript_view = self.query_one(TranscriptView)
|
77
78
|
sample_toolbar = self.query_one(SampleToolbar)
|
78
79
|
if highlighted is not None:
|
@@ -82,12 +83,14 @@ class SamplesView(Widget):
|
|
82
83
|
transcript_view.display = True
|
83
84
|
sample_toolbar.display = True
|
84
85
|
await sample_info.sync_sample(sample)
|
86
|
+
await sample_vnc.sync_sample(sample)
|
85
87
|
await transcript_view.sync_sample(sample)
|
86
88
|
await sample_toolbar.sync_sample(sample)
|
87
89
|
return
|
88
90
|
|
89
91
|
# otherwise hide ui
|
90
92
|
sample_info.display = False
|
93
|
+
sample_vnc.display = False
|
91
94
|
transcript_view.display = False
|
92
95
|
sample_toolbar.display = False
|
93
96
|
|
@@ -181,10 +184,59 @@ class SamplesList(OptionList):
|
|
181
184
|
return None
|
182
185
|
|
183
186
|
|
184
|
-
class
|
187
|
+
class SampleVNC(Horizontal):
|
188
|
+
DEFAULT_CSS = """
|
189
|
+
SampleVNC {
|
190
|
+
layout: grid;
|
191
|
+
grid-size: 2 1;
|
192
|
+
grid-columns: auto 1fr;
|
193
|
+
}
|
194
|
+
SampleVNC Static {
|
195
|
+
color: $secondary;
|
196
|
+
}
|
197
|
+
SampleVNC Link {
|
198
|
+
color: $accent;
|
199
|
+
}
|
200
|
+
"""
|
201
|
+
|
202
|
+
def __init__(self) -> None:
|
203
|
+
super().__init__()
|
204
|
+
self._sample: ActiveSample | None = None
|
205
|
+
|
206
|
+
def compose(self) -> ComposeResult:
|
207
|
+
yield Static("VNC: ")
|
208
|
+
yield Link("")
|
209
|
+
|
210
|
+
async def sync_sample(self, sample: ActiveSample) -> None:
|
211
|
+
if sample == self._sample:
|
212
|
+
return
|
213
|
+
|
214
|
+
# default to hidden (show if we find a vnc connection)
|
215
|
+
self.display = False
|
216
|
+
|
217
|
+
# is there a vnc connection? if so populate
|
218
|
+
for connection in [c for c in sample.sandboxes.values() if c.ports]:
|
219
|
+
for port in connection.ports or []:
|
220
|
+
service = get_service_by_port(port.container_port, port.protocol)
|
221
|
+
if service == "noVNC" and port.mappings:
|
222
|
+
host_mappings = port.mappings
|
223
|
+
link = self.query_one(Link)
|
224
|
+
vnc_url = get_url(host_mappings[0].host_port, service)
|
225
|
+
if vnc_url:
|
226
|
+
link.text = vnc_url
|
227
|
+
link.url = link.text
|
228
|
+
self.display = True
|
229
|
+
break
|
230
|
+
|
231
|
+
|
232
|
+
class SampleInfo(Vertical):
|
185
233
|
DEFAULT_CSS = """
|
186
234
|
SampleInfo {
|
187
235
|
color: $text-muted;
|
236
|
+
layout: grid;
|
237
|
+
grid-size: 1 2;
|
238
|
+
grid-rows: auto 1;
|
239
|
+
grid-gutter: 1;
|
188
240
|
}
|
189
241
|
SampleInfo Collapsible {
|
190
242
|
padding: 0;
|
@@ -217,11 +269,13 @@ class SampleInfo(Horizontal):
|
|
217
269
|
def __init__(self) -> None:
|
218
270
|
super().__init__()
|
219
271
|
self._sample: ActiveSample | None = None
|
272
|
+
self._sandbox_count: int | None = None
|
220
273
|
|
221
274
|
def compose(self) -> ComposeResult:
|
222
275
|
with Collapsible(title=""):
|
223
276
|
yield SampleLimits()
|
224
277
|
yield SandboxesView()
|
278
|
+
yield SampleVNC()
|
225
279
|
|
226
280
|
async def sync_sample(self, sample: ActiveSample | None) -> None:
|
227
281
|
if sample is None:
|
@@ -232,12 +286,14 @@ class SampleInfo(Horizontal):
|
|
232
286
|
limits = self.query_one(SampleLimits)
|
233
287
|
await limits.sync_sample(sample)
|
234
288
|
|
289
|
+
new_sandbox_count = len(sample.sandboxes)
|
235
290
|
# bail if we've already processed this sample
|
236
|
-
if self._sample == sample:
|
291
|
+
if self._sample == sample and self._sandbox_count == new_sandbox_count:
|
237
292
|
return
|
238
293
|
|
239
294
|
# set sample
|
240
295
|
self._sample = sample
|
296
|
+
self._sandbox_count = new_sandbox_count
|
241
297
|
|
242
298
|
# update UI
|
243
299
|
self.display = True
|
@@ -245,6 +301,7 @@ class SampleInfo(Horizontal):
|
|
245
301
|
self.query_one(Collapsible).title = title
|
246
302
|
sandboxes = self.query_one(SandboxesView)
|
247
303
|
await sandboxes.sync_sample(sample)
|
304
|
+
await self.query_one(SampleVNC).sync_sample(sample)
|
248
305
|
|
249
306
|
|
250
307
|
class SampleLimits(Widget):
|
@@ -294,6 +351,9 @@ class SandboxesView(Vertical):
|
|
294
351
|
background: transparent;
|
295
352
|
height: auto;
|
296
353
|
}
|
354
|
+
#sandboxes-list {
|
355
|
+
height: auto;
|
356
|
+
}
|
297
357
|
SandboxesView Static {
|
298
358
|
background: transparent;
|
299
359
|
}
|
@@ -311,16 +371,24 @@ class SandboxesView(Vertical):
|
|
311
371
|
|
312
372
|
async def sync_sample(self, sample: ActiveSample) -> None:
|
313
373
|
if len(sample.sandboxes) > 0:
|
374
|
+
multiple_sandboxes = len(sample.sandboxes) > 1
|
314
375
|
self.display = True
|
315
376
|
sandboxes_caption = cast(Static, self.query_one("#sandboxes-caption"))
|
316
|
-
sandboxes_caption.update(
|
377
|
+
sandboxes_caption.update(
|
378
|
+
f"[bold]sandbox container{'s' if multiple_sandboxes else ''}:[/bold]"
|
379
|
+
)
|
317
380
|
|
318
381
|
sandboxes_list = self.query_one("#sandboxes-list")
|
319
382
|
await sandboxes_list.remove_children()
|
383
|
+
|
320
384
|
await sandboxes_list.mount_all(
|
321
|
-
[
|
385
|
+
[
|
386
|
+
SandboxView(connection, name if multiple_sandboxes else None)
|
387
|
+
for name, connection in sample.sandboxes.items()
|
388
|
+
]
|
322
389
|
)
|
323
|
-
|
390
|
+
|
391
|
+
await sandboxes_list.mount(
|
324
392
|
Static(
|
325
393
|
"[italic]Hold down Alt (or Option) to select text for copying[/italic]",
|
326
394
|
classes="clipboard-message",
|
@@ -332,16 +400,29 @@ class SandboxesView(Vertical):
|
|
332
400
|
|
333
401
|
|
334
402
|
class SampleToolbar(Horizontal):
|
403
|
+
STATUS_GROUP = "status_group"
|
404
|
+
TIMEOUT_TOOL_CALL = "timeout_tool_call"
|
335
405
|
CANCEL_SCORE_OUTPUT = "cancel_score_output"
|
336
406
|
CANCEL_RAISE_ERROR = "cancel_raise_error"
|
337
407
|
PENDING_STATUS = "pending_status"
|
338
408
|
PENDING_CAPTION = "pending_caption"
|
339
409
|
|
340
410
|
DEFAULT_CSS = f"""
|
411
|
+
SampleToolbar {{
|
412
|
+
grid-size: 5 1;
|
413
|
+
grid-columns: auto auto 1fr auto auto;
|
414
|
+
}}
|
415
|
+
SampleToolbar #{STATUS_GROUP} {{
|
416
|
+
min-width: 20;
|
417
|
+
}}
|
341
418
|
SampleToolbar Button {{
|
342
419
|
margin-bottom: 1;
|
343
420
|
margin-right: 2;
|
344
|
-
min-width:
|
421
|
+
min-width: 18;
|
422
|
+
}}
|
423
|
+
SampleToolbar #{TIMEOUT_TOOL_CALL} {{
|
424
|
+
color: $secondary-darken-3;
|
425
|
+
min-width: 16;
|
345
426
|
}}
|
346
427
|
SampleToolbar #{CANCEL_SCORE_OUTPUT} {{
|
347
428
|
color: $primary-darken-3;
|
@@ -356,9 +437,16 @@ class SampleToolbar(Horizontal):
|
|
356
437
|
self.sample: ActiveSample | None = None
|
357
438
|
|
358
439
|
def compose(self) -> ComposeResult:
|
359
|
-
with
|
360
|
-
|
361
|
-
|
440
|
+
with HorizontalGroup(id=self.STATUS_GROUP):
|
441
|
+
with VerticalGroup(id=self.PENDING_STATUS):
|
442
|
+
yield Static("Executing...", id=self.PENDING_CAPTION)
|
443
|
+
yield HorizontalGroup(EventLoadingIndicator(), Clock())
|
444
|
+
yield Button(
|
445
|
+
Text("Timeout Tool"),
|
446
|
+
id=self.TIMEOUT_TOOL_CALL,
|
447
|
+
tooltip="Cancel the tool call and report a timeout to the model.",
|
448
|
+
)
|
449
|
+
yield Horizontal()
|
362
450
|
yield Button(
|
363
451
|
Text("Cancel (Score)"),
|
364
452
|
id=self.CANCEL_SCORE_OUTPUT,
|
@@ -372,12 +460,21 @@ class SampleToolbar(Horizontal):
|
|
372
460
|
|
373
461
|
def on_mount(self) -> None:
|
374
462
|
self.query_one("#" + self.PENDING_STATUS).visible = False
|
463
|
+
self.query_one("#" + self.TIMEOUT_TOOL_CALL).display = False
|
375
464
|
self.query_one("#" + self.CANCEL_SCORE_OUTPUT).display = False
|
376
465
|
self.query_one("#" + self.CANCEL_RAISE_ERROR).display = False
|
377
466
|
|
378
467
|
def on_button_pressed(self, event: Button.Pressed) -> None:
|
379
468
|
if self.sample:
|
380
|
-
if event.button.id == self.
|
469
|
+
if event.button.id == self.TIMEOUT_TOOL_CALL:
|
470
|
+
last_event = (
|
471
|
+
self.sample.transcript.events[-1]
|
472
|
+
if self.sample.transcript.events
|
473
|
+
else None
|
474
|
+
)
|
475
|
+
if isinstance(last_event, ToolEvent):
|
476
|
+
last_event.cancel()
|
477
|
+
elif event.button.id == self.CANCEL_SCORE_OUTPUT:
|
381
478
|
self.sample.interrupt("score")
|
382
479
|
elif event.button.id == self.CANCEL_RAISE_ERROR:
|
383
480
|
self.sample.interrupt("error")
|
@@ -389,6 +486,7 @@ class SampleToolbar(Horizontal):
|
|
389
486
|
self.sample = sample
|
390
487
|
|
391
488
|
pending_status = self.query_one("#" + self.PENDING_STATUS)
|
489
|
+
timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
|
392
490
|
clock = self.query_one(Clock)
|
393
491
|
cancel_score_output = cast(
|
394
492
|
Button, self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
|
@@ -419,14 +517,19 @@ class SampleToolbar(Horizontal):
|
|
419
517
|
pending_caption.update(
|
420
518
|
Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
|
421
519
|
)
|
520
|
+
|
521
|
+
timeout_tool.display = isinstance(last_event, ToolEvent)
|
522
|
+
|
422
523
|
clock.start(last_event.timestamp.timestamp())
|
423
524
|
else:
|
424
525
|
pending_status.visible = False
|
526
|
+
timeout_tool.display = False
|
425
527
|
clock.stop()
|
426
528
|
|
427
529
|
else:
|
428
530
|
self.display = False
|
429
531
|
pending_status.visible = False
|
532
|
+
timeout_tool.display = False
|
430
533
|
clock.stop()
|
431
534
|
|
432
535
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
from textual.app import ComposeResult
|
2
|
+
from textual.containers import Horizontal, Vertical
|
3
|
+
from textual.widgets import Static
|
4
|
+
|
5
|
+
from inspect_ai.util._sandbox.environment import SandboxConnection
|
6
|
+
|
7
|
+
from .port_mappings import PortMappingsView
|
8
|
+
|
9
|
+
|
10
|
+
class SandboxView(Vertical):
|
11
|
+
DEFAULT_CSS = """
|
12
|
+
.indent {
|
13
|
+
width: 2;
|
14
|
+
}
|
15
|
+
.no_indent {
|
16
|
+
width: 0;
|
17
|
+
}
|
18
|
+
"""
|
19
|
+
|
20
|
+
def __init__(
|
21
|
+
self,
|
22
|
+
connection: SandboxConnection,
|
23
|
+
name: str | None, # if None, no header or indent
|
24
|
+
) -> None:
|
25
|
+
super().__init__()
|
26
|
+
self.sandbox_name = name
|
27
|
+
self.connection = connection
|
28
|
+
|
29
|
+
def compose(self) -> ComposeResult:
|
30
|
+
if self.sandbox_name:
|
31
|
+
yield Static(self.sandbox_name)
|
32
|
+
with Horizontal():
|
33
|
+
yield Static("", classes="indent" if self.sandbox_name else "no_indent")
|
34
|
+
with Vertical():
|
35
|
+
yield Static(self.connection.command)
|
36
|
+
if self.connection.ports:
|
37
|
+
yield PortMappingsView(self.connection.ports)
|
inspect_ai/_eval/eval.py
CHANGED
@@ -7,11 +7,12 @@ from shortuuid import uuid
|
|
7
7
|
from typing_extensions import Unpack
|
8
8
|
|
9
9
|
from inspect_ai._cli.util import parse_cli_args
|
10
|
-
from inspect_ai._display.core.active import display
|
10
|
+
from inspect_ai._display.core.active import display as task_display
|
11
11
|
from inspect_ai._util.config import resolve_args
|
12
12
|
from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
|
13
13
|
from inspect_ai._util.error import PrerequisiteError
|
14
14
|
from inspect_ai._util.file import absolute_file_path
|
15
|
+
from inspect_ai._util.logger import warn_once
|
15
16
|
from inspect_ai._util.platform import platform_init
|
16
17
|
from inspect_ai._util.registry import registry_lookup
|
17
18
|
from inspect_ai.approval._apply import init_tool_approval
|
@@ -34,7 +35,7 @@ from inspect_ai.scorer._reducer import reducer_log_names
|
|
34
35
|
from inspect_ai.solver._chain import chain
|
35
36
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
36
37
|
from inspect_ai.util import SandboxEnvironmentType
|
37
|
-
from inspect_ai.util.
|
38
|
+
from inspect_ai.util._display import DisplayType, display_type, init_display_type
|
38
39
|
|
39
40
|
from .context import init_eval_context
|
40
41
|
from .loader import ResolvedTask, resolve_tasks
|
@@ -55,6 +56,7 @@ def eval(
|
|
55
56
|
solver: Solver | list[Solver] | SolverSpec | None = None,
|
56
57
|
tags: list[str] | None = None,
|
57
58
|
trace: bool | None = None,
|
59
|
+
display: DisplayType | None = None,
|
58
60
|
approval: str | list[ApprovalPolicy] | None = None,
|
59
61
|
log_level: str | None = None,
|
60
62
|
log_level_transcript: str | None = None,
|
@@ -100,7 +102,8 @@ def eval(
|
|
100
102
|
solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
|
101
103
|
Optional (uses task solver by default).
|
102
104
|
tags (list[str] | None): Tags to associate with this evaluation run.
|
103
|
-
trace
|
105
|
+
trace (bool | None): Trace message interactions with evaluated model to terminal.
|
106
|
+
display (DisplayType | None): Task display type (defaults to 'full').
|
104
107
|
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
105
108
|
Either a path to an approval policy config file or a list of approval policies.
|
106
109
|
Defaults to no approval policy.
|
@@ -150,9 +153,11 @@ def eval(
|
|
150
153
|
platform_init()
|
151
154
|
|
152
155
|
# resolve eval trace
|
153
|
-
max_tasks, max_samples =
|
156
|
+
max_tasks, max_samples = init_eval_display(
|
157
|
+
display, trace, max_tasks, max_samples, model
|
158
|
+
)
|
154
159
|
|
155
|
-
return
|
160
|
+
return task_display().run_task_app(
|
156
161
|
main=eval_async(
|
157
162
|
tasks=tasks,
|
158
163
|
model=model,
|
@@ -163,7 +168,6 @@ def eval(
|
|
163
168
|
sandbox_cleanup=sandbox_cleanup,
|
164
169
|
solver=solver,
|
165
170
|
tags=tags,
|
166
|
-
trace=trace,
|
167
171
|
approval=approval,
|
168
172
|
log_level=log_level,
|
169
173
|
log_level_transcript=log_level_transcript,
|
@@ -201,7 +205,6 @@ async def eval_async(
|
|
201
205
|
sandbox_cleanup: bool | None = None,
|
202
206
|
solver: Solver | list[Solver] | SolverSpec | None = None,
|
203
207
|
tags: list[str] | None = None,
|
204
|
-
trace: bool | None = None,
|
205
208
|
approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
|
206
209
|
log_level: str | None = None,
|
207
210
|
log_level_transcript: str | None = None,
|
@@ -247,7 +250,6 @@ async def eval_async(
|
|
247
250
|
solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
|
248
251
|
Optional (uses task solver by default).
|
249
252
|
tags (list[str] | None): Tags to associate with this evaluation run.
|
250
|
-
trace: (bool | None): Trace message interactions with evaluated model to terminal.
|
251
253
|
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
252
254
|
Either a path to an approval policy config file or a list of approval policies.
|
253
255
|
Defaults to no approval policy.
|
@@ -329,8 +331,8 @@ async def eval_async(
|
|
329
331
|
log.warning("No inspect tasks were found at the specified paths.")
|
330
332
|
return []
|
331
333
|
|
332
|
-
# apply
|
333
|
-
if
|
334
|
+
# apply conversation display constraints
|
335
|
+
if display_type() == "conversation":
|
334
336
|
# single task at a time
|
335
337
|
if max_tasks is not None:
|
336
338
|
max_tasks = 1
|
@@ -371,7 +373,6 @@ async def eval_async(
|
|
371
373
|
epochs_reducer=reducer_log_names(epochs_reducer)
|
372
374
|
if epochs_reducer
|
373
375
|
else None,
|
374
|
-
trace=trace,
|
375
376
|
approval=config_from_approval_policies(approval) if approval else None,
|
376
377
|
fail_on_error=fail_on_error,
|
377
378
|
message_limit=message_limit,
|
@@ -467,6 +468,7 @@ def eval_retry(
|
|
467
468
|
max_sandboxes: int | None = None,
|
468
469
|
sandbox_cleanup: bool | None = None,
|
469
470
|
trace: bool | None = None,
|
471
|
+
display: DisplayType | None = None,
|
470
472
|
fail_on_error: bool | float | None = None,
|
471
473
|
debug_errors: bool | None = None,
|
472
474
|
log_samples: bool | None = None,
|
@@ -501,6 +503,7 @@ def eval_retry(
|
|
501
503
|
sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
|
502
504
|
(defaults to True)
|
503
505
|
trace (bool | None): Trace message interactions with evaluated model to terminal.
|
506
|
+
display (DisplayType | None): Task display type (defaults to 'full').
|
504
507
|
fail_on_error (bool | float | None): `True` to fail on first sample error
|
505
508
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
506
509
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
@@ -529,9 +532,9 @@ def eval_retry(
|
|
529
532
|
platform_init()
|
530
533
|
|
531
534
|
# resolve eval trace
|
532
|
-
max_tasks, max_samples =
|
535
|
+
max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)
|
533
536
|
|
534
|
-
return
|
537
|
+
return task_display().run_task_app(
|
535
538
|
main=eval_retry_async(
|
536
539
|
tasks=tasks,
|
537
540
|
log_level=log_level,
|
@@ -800,9 +803,8 @@ def eval_init(
|
|
800
803
|
|
801
804
|
# resolve tasks (set active model to resolve uses of the
|
802
805
|
# 'default' model in tools, solvers, and scorers)
|
803
|
-
from inspect_ai._display.core.active import display
|
804
806
|
|
805
|
-
with
|
807
|
+
with task_display().suspend_task_app():
|
806
808
|
resolved_tasks: list[ResolvedTask] = []
|
807
809
|
for m in models:
|
808
810
|
init_active_model(m, generate_config)
|
@@ -816,17 +818,27 @@ def eval_init(
|
|
816
818
|
return models, approval, resolved_tasks
|
817
819
|
|
818
820
|
|
819
|
-
def
|
821
|
+
def init_eval_display(
|
822
|
+
display: DisplayType | None,
|
820
823
|
trace: bool | None,
|
821
824
|
max_tasks: int | None,
|
822
825
|
max_samples: int | None,
|
823
826
|
model: Any = None,
|
824
827
|
) -> tuple[int | None, int | None]:
|
825
|
-
#
|
826
|
-
init_trace(trace)
|
827
|
-
|
828
|
-
# adapt task/samples as required
|
828
|
+
# propagate any trace value to display_type
|
829
829
|
if trace:
|
830
|
+
warn_once(
|
831
|
+
log,
|
832
|
+
"WARNING: The --trace flag is deprecated (use --display=conversation instead)",
|
833
|
+
)
|
834
|
+
display = "conversation"
|
835
|
+
|
836
|
+
# apply default and init
|
837
|
+
display = display or display_type()
|
838
|
+
init_display_type(display)
|
839
|
+
|
840
|
+
# adapt task/samples as required if we are in conversation mode
|
841
|
+
if display_type() == "conversation":
|
830
842
|
# single task at a time
|
831
843
|
if max_tasks is not None:
|
832
844
|
max_tasks = 1
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -33,7 +33,7 @@ from inspect_ai.model import (
|
|
33
33
|
)
|
34
34
|
from inspect_ai.model._generate_config import GenerateConfig
|
35
35
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
36
|
-
from inspect_ai.util import SandboxEnvironmentType
|
36
|
+
from inspect_ai.util import DisplayType, SandboxEnvironmentType
|
37
37
|
|
38
38
|
from .eval import eval, eval_init
|
39
39
|
from .loader import ResolvedTask, resolve_task_args
|
@@ -59,6 +59,7 @@ def eval_set(
|
|
59
59
|
solver: Solver | list[Solver] | SolverSpec | None = None,
|
60
60
|
tags: list[str] | None = None,
|
61
61
|
trace: bool | None = None,
|
62
|
+
display: DisplayType | None = None,
|
62
63
|
approval: str | list[ApprovalPolicy] | None = None,
|
63
64
|
score: bool = True,
|
64
65
|
log_level: str | None = None,
|
@@ -116,6 +117,7 @@ def eval_set(
|
|
116
117
|
evaluating task(s). Optional (uses task solver by default).
|
117
118
|
tags (list[str] | None): Tags to associate with this evaluation run.
|
118
119
|
trace: (bool | None): Trace message interactions with evaluated model to terminal.
|
120
|
+
display (DisplayType | None): Task display type (defaults to 'full').
|
119
121
|
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
120
122
|
Either a path to an approval policy config file or a list of approval policies.
|
121
123
|
Defaults to no approval policy.
|
@@ -180,6 +182,7 @@ def eval_set(
|
|
180
182
|
solver=solver,
|
181
183
|
tags=tags,
|
182
184
|
trace=trace,
|
185
|
+
display=display,
|
183
186
|
approval=approval,
|
184
187
|
log_level=log_level,
|
185
188
|
log_level_transcript=log_level_transcript,
|
@@ -501,9 +504,6 @@ def latest_completed_task_eval_logs(
|
|
501
504
|
# take the most recent completed log for each id
|
502
505
|
latest_completed_logs: list[Log] = []
|
503
506
|
for id, id_logs in logs_by_id.items():
|
504
|
-
# filter on completed
|
505
|
-
id_logs = [id_log for id_log in id_logs if id_log[1].status != "started"]
|
506
|
-
|
507
507
|
# continue if there are no target logs
|
508
508
|
if len(id_logs) == 0:
|
509
509
|
continue
|
@@ -517,11 +517,13 @@ def latest_completed_task_eval_logs(
|
|
517
517
|
latest_completed_logs.append(id_logs[0])
|
518
518
|
|
519
519
|
# remove the rest if requested
|
520
|
+
# (don't remove 'started' in case it's needed for post-mortem debugging)
|
520
521
|
if cleanup_older:
|
521
522
|
fs = filesystem(id_logs[0][0].name)
|
522
523
|
for id_log in id_logs[1:]:
|
523
524
|
try:
|
524
|
-
|
525
|
+
if id_log.header.status != "started":
|
526
|
+
fs.rm(id_log.info.name)
|
525
527
|
except Exception as ex:
|
526
528
|
logger.warning(f"Error attempt to remove '{id_log[0].name}': {ex}")
|
527
529
|
|
inspect_ai/_eval/score.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from .task import Task, TaskInfo, PreviousTask, Tasks # noqa: I001, F401
|
1
|
+
from .task import Task, TaskInfo, PreviousTask, Tasks, task_with # noqa: I001, F401
|
2
2
|
from .epochs import Epochs
|
3
3
|
|
4
|
-
__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks"]
|
4
|
+
__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks", "task_with"]
|