inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +4 -2
- inspect_ai/_cli/eval.py +2 -0
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +0 -2
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/rich/display.py +4 -4
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/samples.py +41 -5
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/run.py +141 -119
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +322 -226
- inspect_ai/_view/www/log-schema.json +221 -138
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +53 -35
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +27 -5
- inspect_ai/log/_recorders/eval.py +21 -8
- inspect_ai/log/_samples.py +10 -5
- inspect_ai/log/_transcript.py +28 -1
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +82 -17
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/{_trace.py → _conversation.py} +9 -8
- inspect_ai/model/_model.py +2 -2
- inspect_ai/model/_providers/anthropic.py +9 -7
- inspect_ai/model/_providers/azureai.py +6 -4
- inspect_ai/model/_providers/bedrock.py +6 -4
- inspect_ai/model/_providers/google.py +103 -14
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +6 -9
- inspect_ai/model/_providers/openai.py +34 -8
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_providers/vertex.py +17 -4
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/tool/__init__.py +9 -1
- inspect_ai/tool/_tool.py +9 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -13
- inspect_ai/util/_sandbox/docker/docker.py +20 -13
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/environment.py +13 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
@@ -7,7 +7,7 @@ from inspect_ai._eval.evalset import eval_set
 from inspect_ai._eval.list import list_tasks
 from inspect_ai._eval.registry import task
 from inspect_ai._eval.score import score, score_async
-from inspect_ai._eval.task import Epochs, Task, TaskInfo, Tasks
+from inspect_ai._eval.task import Epochs, Task, TaskInfo, Tasks, task_with
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai.solver._human_agent.agent import human_agent

@@ -29,4 +29,5 @@ __all__ = [
     "TaskInfo",
     "Tasks",
     "task",
+    "task_with",
 ]
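The change above re-exports task_with from the top-level inspect_ai package. A hedged usage sketch follows; only the export appears in the diff, so the task_with() call signature, the placeholder task, and the model name are assumptions for illustration:

    from inspect_ai import Task, eval, task_with

    base = Task(dataset=[], name="letter_count")       # placeholder task definition
    variant = task_with(base, name="letter_count_v2")  # assumed: derive a task with modified attributes
    eval(variant, model="openai/gpt-4o")                # illustrative model name only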
inspect_ai/_cli/common.py
CHANGED
@@ -17,7 +17,7 @@ class CommonOptions(TypedDict):
     log_level: str
     log_level_transcript: str
     log_dir: str
-    display: Literal["full", "rich", "plain", "none"]
+    display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
     debug: bool
     debug_port: int

@@ -64,7 +64,9 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     )
     @click.option(
         "--display",
-        type=click.Choice(
+        type=click.Choice(
+            ["full", "conversation", "rich", "plain", "none"], case_sensitive=False
+        ),
         default=DEFAULT_DISPLAY,
         envvar="INSPECT_DISPLAY",
         help="Set the display type (defaults to 'full')",
inspect_ai/_cli/eval.py
CHANGED
@@ -118,6 +118,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         "--trace",
         type=bool,
         is_flag=True,
+        hidden=True,
         envvar="INSPECT_EVAL_TRACE",
         help="Trace message interactions with evaluated model to terminal.",
     )

@@ -886,6 +887,7 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
         "--trace",
         type=bool,
         is_flag=True,
+        hidden=True,
         help="Trace message interactions with evaluated model to terminal.",
         envvar="INSPECT_EVAL_TRACE",
     )
inspect_ai/_cli/trace.py
CHANGED
@@ -62,11 +62,21 @@ def list_command(json: bool) -> None:

 @trace_command.command("dump")
 @click.argument("trace-file", type=str, required=False)
-def dump_command(trace_file: str | None) -> None:
+@click.option(
+    "--filter",
+    type=str,
+    help="Filter (applied to trace message field).",
+)
+def dump_command(trace_file: str | None, filter: str | None) -> None:
     """Dump a trace file to stdout (as a JSON array of log records)."""
     trace_file_path = _resolve_trace_file_path(trace_file)

     traces = read_trace_file(trace_file_path)
+
+    if filter:
+        filter = filter.lower()
+        traces = [trace for trace in traces if filter in trace.message.lower()]
+
     print(
         to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
     )

@@ -74,17 +84,26 @@ def dump_command(trace_file: str | None) -> None:

 @trace_command.command("anomalies")
 @click.argument("trace-file", type=str, required=False)
+@click.option(
+    "--filter",
+    type=str,
+    help="Filter (applied to trace message field).",
+)
 @click.option(
     "--all",
     is_flag=True,
     default=False,
     help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
 )
-def anomolies_command(trace_file: str | None, all: bool) -> None:
+def anomolies_command(trace_file: str | None, filter: str | None, all: bool) -> None:
     """Look for anomalies in a trace file (never completed or cancelled actions)."""
     trace_file_path = _resolve_trace_file_path(trace_file)
     traces = read_trace_file(trace_file_path)

+    if filter:
+        filter = filter.lower()
+        traces = [trace for trace in traces if filter in trace.message.lower()]
+
     # Track started actions
     running_actions: dict[str, ActionTraceRecord] = {}
     canceled_actions: dict[str, ActionTraceRecord] = {}
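Both trace subcommands above gain a --filter option that does a case-insensitive substring match against each record's message field. Below is a minimal standalone sketch of that same filtering logic; TraceRecord is a hypothetical stand-in for the records returned by read_trace_file:

    from dataclasses import dataclass

    @dataclass
    class TraceRecord:
        message: str

    def filter_traces(traces: list[TraceRecord], filter: str | None) -> list[TraceRecord]:
        # mirror the CLI behavior: no filter means the traces pass through unchanged
        if not filter:
            return traces
        filter = filter.lower()
        return [trace for trace in traces if filter in trace.message.lower()]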
inspect_ai/_display/core/active.py
CHANGED
@@ -4,7 +4,6 @@ from contextvars import ContextVar
 import rich

 from inspect_ai.util._display import display_type
-from inspect_ai.util._trace import trace_enabled

 from ..rich.display import RichDisplay
 from ..textual.display import TextualDisplay

@@ -17,7 +16,6 @@ def display() -> Display:
     if (
         display_type() == "full"
         and sys.stdout.isatty()
-        and not trace_enabled()
         and not rich.get_console().is_jupyter
     ):
         _active_display = TextualDisplay()
inspect_ai/_display/core/panel.py
CHANGED
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
 def task_title(profile: TaskProfile, show_model: bool) -> str:
     eval_epochs = profile.eval_config.epochs or 1
     epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
-    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
+    samples = f"{profile.samples // eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
     title = f"{registry_unqualified_name(profile.name)} ({samples})"
     if show_model:
         title = f"{title}: {profile.model}"
inspect_ai/_display/rich/display.py
CHANGED
@@ -15,7 +15,6 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai.log._transcript import InputEvent, transcript
 from inspect_ai.util._display import display_type
 from inspect_ai.util._throttle import throttle
-from inspect_ai.util._trace import trace_enabled

 from ..core.config import task_config
 from ..core.display import (

@@ -151,7 +150,8 @@ class RichDisplay(Display):
     @throttle(1)
     def _update_display(self) -> None:
         if (
-
+            display_type() != "conversation"
+            and self.tasks is not None
             and self.tasks
             and self.progress_ui is not None
             and self.live is not None

@@ -170,7 +170,7 @@ class RichTaskScreen(TaskScreen):
     def __init__(self, live: Live) -> None:
         self.theme = rich_theme()
         self.live = live
-        status_text = "Working" if
+        status_text = "Working" if display_type() == "conversation" else "Task running"
         self.status = self.live.console.status(
             f"[{self.theme.meta} bold]{status_text}...[/{self.theme.meta} bold]",
             spinner="clock",

@@ -189,7 +189,7 @@ class RichTaskScreen(TaskScreen):
     ) -> Iterator[Console]:
         # determine transient based on trace mode
         if transient is None:
-            transient =
+            transient = display_type() != "conversation"

         # clear live task status and transient status
         self.live.update("", refresh=True)
inspect_ai/_display/textual/app.py
CHANGED
@@ -284,7 +284,10 @@ class TaskScreenApp(App[TR]):

     def update_samples(self) -> None:
         samples_view = self.query_one(SamplesView)
-
+        active_and_started_samples = [
+            sample for sample in active_samples() if sample.started is not None
+        ]
+        samples_view.set_samples(active_and_started_samples)

     def update_footer(self) -> None:
         left, right = task_footer()
inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -25,6 +25,7 @@ from textual.widgets.option_list import Option, Separator
 from inspect_ai._util.format import format_progress_time
 from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.log._samples import ActiveSample
+from inspect_ai.log._transcript import ToolEvent

 from .clock import Clock
 from .transcript import TranscriptView

@@ -332,16 +333,29 @@ class SandboxesView(Vertical):


 class SampleToolbar(Horizontal):
+    STATUS_GROUP = "status_group"
+    TIMEOUT_TOOL_CALL = "timeout_tool_call"
     CANCEL_SCORE_OUTPUT = "cancel_score_output"
     CANCEL_RAISE_ERROR = "cancel_raise_error"
     PENDING_STATUS = "pending_status"
     PENDING_CAPTION = "pending_caption"

     DEFAULT_CSS = f"""
+    SampleToolbar {{
+        grid-size: 5 1;
+        grid-columns: auto auto 1fr auto auto;
+    }}
+    SampleToolbar #{STATUS_GROUP} {{
+        min-width: 20;
+    }}
     SampleToolbar Button {{
         margin-bottom: 1;
         margin-right: 2;
-        min-width:
+        min-width: 18;
+    }}
+    SampleToolbar #{TIMEOUT_TOOL_CALL} {{
+        color: $secondary-darken-3;
+        min-width: 16;
     }}
     SampleToolbar #{CANCEL_SCORE_OUTPUT} {{
         color: $primary-darken-3;

@@ -356,9 +370,16 @@ class SampleToolbar(Horizontal):
         self.sample: ActiveSample | None = None

     def compose(self) -> ComposeResult:
-        with
-
-
+        with HorizontalGroup(id=self.STATUS_GROUP):
+            with VerticalGroup(id=self.PENDING_STATUS):
+                yield Static("Executing...", id=self.PENDING_CAPTION)
+                yield HorizontalGroup(EventLoadingIndicator(), Clock())
+        yield Button(
+            Text("Timeout Tool"),
+            id=self.TIMEOUT_TOOL_CALL,
+            tooltip="Cancel the tool call and report a timeout to the model.",
+        )
+        yield Horizontal()
         yield Button(
             Text("Cancel (Score)"),
             id=self.CANCEL_SCORE_OUTPUT,

@@ -372,12 +393,21 @@ class SampleToolbar(Horizontal):

     def on_mount(self) -> None:
         self.query_one("#" + self.PENDING_STATUS).visible = False
+        self.query_one("#" + self.TIMEOUT_TOOL_CALL).display = False
         self.query_one("#" + self.CANCEL_SCORE_OUTPUT).display = False
         self.query_one("#" + self.CANCEL_RAISE_ERROR).display = False

     def on_button_pressed(self, event: Button.Pressed) -> None:
         if self.sample:
-            if event.button.id == self.
+            if event.button.id == self.TIMEOUT_TOOL_CALL:
+                last_event = (
+                    self.sample.transcript.events[-1]
+                    if self.sample.transcript.events
+                    else None
+                )
+                if isinstance(last_event, ToolEvent):
+                    last_event.cancel()
+            elif event.button.id == self.CANCEL_SCORE_OUTPUT:
                 self.sample.interrupt("score")
             elif event.button.id == self.CANCEL_RAISE_ERROR:
                 self.sample.interrupt("error")

@@ -389,6 +419,7 @@ class SampleToolbar(Horizontal):
         self.sample = sample

         pending_status = self.query_one("#" + self.PENDING_STATUS)
+        timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
         clock = self.query_one(Clock)
         cancel_score_output = cast(
             Button, self.query_one("#" + self.CANCEL_SCORE_OUTPUT)

@@ -419,14 +450,19 @@ class SampleToolbar(Horizontal):
                 pending_caption.update(
                     Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
                 )
+
+                timeout_tool.display = isinstance(last_event, ToolEvent)
+
                 clock.start(last_event.timestamp.timestamp())
             else:
                 pending_status.visible = False
+                timeout_tool.display = False
                 clock.stop()

         else:
             self.display = False
             pending_status.visible = False
+            timeout_tool.display = False
             clock.stop()

inspect_ai/_eval/eval.py
CHANGED
@@ -7,11 +7,12 @@ from shortuuid import uuid
 from typing_extensions import Unpack

 from inspect_ai._cli.util import parse_cli_args
-from inspect_ai._display.core.active import display
+from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
 from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import registry_lookup
 from inspect_ai.approval._apply import init_tool_approval

@@ -34,7 +35,7 @@ from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_ai.util.
+from inspect_ai.util._display import DisplayType, display_type, init_display_type

 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks

@@ -55,6 +56,7 @@ def eval(
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,

@@ -100,7 +102,8 @@ def eval(
         solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
           Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        trace
+        trace (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.

@@ -150,9 +153,11 @@ def eval(
     platform_init()

     # resolve eval trace
-    max_tasks, max_samples =
+    max_tasks, max_samples = init_eval_display(
+        display, trace, max_tasks, max_samples, model
+    )

-    return
+    return task_display().run_task_app(
         main=eval_async(
             tasks=tasks,
             model=model,

@@ -163,7 +168,6 @@ def eval(
             sandbox_cleanup=sandbox_cleanup,
             solver=solver,
             tags=tags,
-            trace=trace,
             approval=approval,
             log_level=log_level,
             log_level_transcript=log_level_transcript,

@@ -201,7 +205,6 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
-    trace: bool | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,

@@ -247,7 +250,6 @@ async def eval_async(
         solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
           Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        trace: (bool | None): Trace message interactions with evaluated model to terminal.
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.

@@ -329,8 +331,8 @@ async def eval_async(
         log.warning("No inspect tasks were found at the specified paths.")
         return []

-    # apply
-    if
+    # apply conversation display constraints
+    if display_type() == "conversation":
         # single task at a time
         if max_tasks is not None:
             max_tasks = 1

@@ -371,7 +373,6 @@ async def eval_async(
             epochs_reducer=reducer_log_names(epochs_reducer)
             if epochs_reducer
             else None,
-            trace=trace,
             approval=config_from_approval_policies(approval) if approval else None,
             fail_on_error=fail_on_error,
             message_limit=message_limit,

@@ -467,6 +468,7 @@ def eval_retry(
     max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,

@@ -501,6 +503,7 @@ def eval_retry(
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         fail_on_error (bool | float | None): `True` to fail on first sample error
           (default); `False` to never fail on sample errors; Value between 0 and 1
           to fail if a proportion of total samples fails. Value greater than 1 to fail

@@ -529,9 +532,9 @@ def eval_retry(
     platform_init()

     # resolve eval trace
-    max_tasks, max_samples =
+    max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)

-    return
+    return task_display().run_task_app(
         main=eval_retry_async(
             tasks=tasks,
             log_level=log_level,

@@ -800,9 +803,8 @@ def eval_init(

     # resolve tasks (set active model to resolve uses of the
     # 'default' model in tools, solvers, and scorers)
-    from inspect_ai._display.core.active import display

-    with
+    with task_display().suspend_task_app():
         resolved_tasks: list[ResolvedTask] = []
         for m in models:
             init_active_model(m, generate_config)

@@ -816,17 +818,27 @@ def eval_init(
     return models, approval, resolved_tasks


-def
+def init_eval_display(
+    display: DisplayType | None,
     trace: bool | None,
     max_tasks: int | None,
     max_samples: int | None,
     model: Any = None,
 ) -> tuple[int | None, int | None]:
-    #
-    init_trace(trace)
-
-    # adapt task/samples as required
+    # propagate any trace value to display_type
     if trace:
+        warn_once(
+            log,
+            "WARNING: The --trace flag is deprecated (use --display=conversation instead)",
+        )
+        display = "conversation"
+
+    # apply default and init
+    display = display or display_type()
+    init_display_type(display)
+
+    # adapt task/samples as required if we are in conversation mode
+    if display_type() == "conversation":
         # single task at a time
         if max_tasks is not None:
             max_tasks = 1
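Taken together, these eval.py changes add a display argument to eval() and eval_retry() and route the deprecated trace flag to the new "conversation" display type. A hedged usage sketch follows; the task path and model name are placeholders, not values from the diff:

    from inspect_ai import eval

    # preferred: request the conversation display explicitly
    eval("example_task.py", model="openai/gpt-4o", display="conversation")

    # still accepted, but now warns and is mapped to display="conversation"
    eval("example_task.py", model="openai/gpt-4o", trace=True)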
inspect_ai/_eval/evalset.py
CHANGED
@@ -33,7 +33,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.solver._solver import Solver, SolverSpec
-from inspect_ai.util import SandboxEnvironmentType
+from inspect_ai.util import DisplayType, SandboxEnvironmentType

 from .eval import eval, eval_init
 from .loader import ResolvedTask, resolve_task_args

@@ -59,6 +59,7 @@ def eval_set(
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
     score: bool = True,
     log_level: str | None = None,

@@ -116,6 +117,7 @@ def eval_set(
           evaluating task(s). ptional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
         trace: (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.

@@ -180,6 +182,7 @@ def eval_set(
         solver=solver,
         tags=tags,
         trace=trace,
+        display=display,
         approval=approval,
         log_level=log_level,
         log_level_transcript=log_level_transcript,

@@ -501,9 +504,6 @@ def latest_completed_task_eval_logs(
     # take the most recent completed log for each id
     latest_completed_logs: list[Log] = []
     for id, id_logs in logs_by_id.items():
-        # filter on completed
-        id_logs = [id_log for id_log in id_logs if id_log[1].status != "started"]
-
         # continue if there are no target logs
         if len(id_logs) == 0:
             continue

@@ -517,11 +517,13 @@ def latest_completed_task_eval_logs(
         latest_completed_logs.append(id_logs[0])

         # remove the rest if requested
+        # (don't remove 'started' in case its needed for post-mortum debugging)
         if cleanup_older:
             fs = filesystem(id_logs[0][0].name)
             for id_log in id_logs[1:]:
                 try:
-
+                    if id_log.header.status != "started":
+                        fs.rm(id_log.info.name)
                 except Exception as ex:
                     logger.warning(f"Error attempt to remove '{id_log[0].name}': {ex}")

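eval_set() now simply forwards the new display argument through to eval(). A brief sketch under the same assumptions as above (placeholder task, model, and log directory; the (success, logs) return shape is not shown in this diff):

    from inspect_ai import eval_set

    success, logs = eval_set(
        tasks=["example_task.py"],
        model="openai/gpt-4o",
        log_dir="logs/example-set",
        display="conversation",
    )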
inspect_ai/_eval/run.py
CHANGED
@@ -42,7 +42,7 @@ from .task.log import TaskLogger
 from .task.run import TaskRunOptions, task_run
 from .task.rundir import task_run_dir_switching
 from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
-from .task.util import task_run_dir
+from .task.util import slice_dataset, task_run_dir

 log = logging.getLogger(__name__)

@@ -70,12 +70,23 @@ async def eval_run(
     # get cwd before switching to task dir
     eval_wd = os.getcwd()

+    # ensure sample ids
+    for resolved_task in tasks:
+        # add sample ids to dataset if they aren't there (start at 1 not 0)
+        task = resolved_task.task
+        for id, sample in enumerate(task.dataset):
+            if sample.id is None:
+                sample.id = id + 1
+
+        # Ensure sample ids are unique
+        ensure_unique_ids(task.dataset)
+
     # run startup pass for the sandbox environments
     shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
     if has_sandbox:
         cleanup = eval_config.sandbox_cleanup is not False
         shutdown_sandbox_environments = await startup_sandbox_environments(
-            resolve_sandbox_environment(eval_sandbox), tasks, cleanup
+            resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
         )

     # resolve solver and solver spec

@@ -146,14 +157,6 @@ async def eval_run(
         else:
             task.fail_on_error = task_eval_config.fail_on_error

-        # add sample ids to dataset if they aren't there (start at 1 not 0)
-        for id, sample in enumerate(task.dataset):
-            if sample.id is None:
-                sample.id = id + 1
-
-        # Ensure sample ids are unique
-        ensure_unique_ids(task.dataset)
-
         # create and track the logger
         logger = TaskLogger(
             task_name=task.name,

@@ -340,13 +343,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
 async def startup_sandbox_environments(
     eval_sandbox: SandboxEnvironmentSpec | None,
     tasks: list[ResolvedTask],
+    config: EvalConfig,
     cleanup: bool,
 ) -> Callable[[], Awaitable[None]]:
     # find unique sandboxenvs
     sandboxenvs: Set[TaskSandboxEnvironment] = set()
     for task in tasks:
         # resolve each sample and add to sandboxenvs
-
+        dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
+        for sample in dataset:
             sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
             if sandbox is not None and sandbox not in sandboxenvs:
                 sandboxenvs.add(sandbox)
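The sample-id pass moves out of the per-task config loop and runs once at the top of eval_run(), before startup_sandbox_environments() slices the dataset. Below is a minimal sketch of that pass together with an assumed equivalent of ensure_unique_ids(), whose implementation is not shown in the diff:

    def assign_sample_ids(samples: list) -> None:
        # ids start at 1, not 0, and existing ids are left untouched
        for id, sample in enumerate(samples):
            if sample.id is None:
                sample.id = id + 1

    def check_unique_ids(samples: list) -> None:
        # assumed equivalent of ensure_unique_ids(); raises on duplicates
        ids = [sample.id for sample in samples]
        if len(ids) != len(set(ids)):
            raise ValueError("Duplicate sample ids found in dataset.")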
inspect_ai/_eval/task/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from .task import Task, TaskInfo, PreviousTask, Tasks  # noqa: I001, F401
+from .task import Task, TaskInfo, PreviousTask, Tasks, task_with  # noqa: I001, F401
 from .epochs import Epochs

-__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks"]
+__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks", "task_with"]