inspect-ai 0.3.53__py3-none-any.whl → 0.3.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +26 -1
- inspect_ai/_cli/main.py +2 -0
- inspect_ai/_cli/trace.py +244 -0
- inspect_ai/_display/textual/app.py +5 -1
- inspect_ai/_display/textual/widgets/tasks.py +13 -3
- inspect_ai/_eval/eval.py +17 -0
- inspect_ai/_eval/task/images.py +4 -14
- inspect_ai/_eval/task/log.py +2 -1
- inspect_ai/_eval/task/run.py +26 -10
- inspect_ai/_util/constants.py +3 -3
- inspect_ai/_util/display.py +1 -0
- inspect_ai/_util/logger.py +34 -8
- inspect_ai/_util/trace.py +275 -0
- inspect_ai/log/_log.py +3 -0
- inspect_ai/log/_message.py +2 -2
- inspect_ai/log/_recorders/eval.py +6 -17
- inspect_ai/log/_recorders/json.py +19 -17
- inspect_ai/model/_cache.py +22 -16
- inspect_ai/model/_call_tools.py +9 -1
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +11 -12
- inspect_ai/model/_providers/bedrock.py +1 -1
- inspect_ai/model/_providers/openai.py +11 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
- inspect_ai/util/_sandbox/context.py +6 -1
- inspect_ai/util/_sandbox/docker/compose.py +58 -19
- inspect_ai/util/_sandbox/docker/docker.py +11 -11
- inspect_ai/util/_sandbox/docker/util.py +0 -6
- inspect_ai/util/_sandbox/service.py +17 -7
- inspect_ai/util/_subprocess.py +6 -1
- inspect_ai/util/_subtask.py +8 -2
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/METADATA +7 -7
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/RECORD +37 -35
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -42,6 +42,7 @@ LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not s
 NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
+NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
 MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
 MAX_RETRIES_HELP = (
     f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -257,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=NO_SCORE_HELP,
         envvar="INSPECT_EVAL_NO_SCORE",
     )
+    @click.option(
+        "--no-score-display",
+        type=bool,
+        is_flag=True,
+        help=NO_SCORE_HELP,
+        envvar="INSPECT_EVAL_SCORE_DISPLAY",
+    )
     @click.option(
         "--max-tokens",
         type=int,
@@ -339,7 +347,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         "--logprobs",
         type=bool,
         is_flag=True,
-        help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, vLLM only.",
+        help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only.",
         envvar="INSPECT_EVAL_LOGPROBS",
     )
     @click.option(
@@ -446,6 +454,7 @@ def eval_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
     **common: Unpack[CommonOptions],
 ) -> None:
@@ -495,6 +504,7 @@ def eval_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=False,
         **config,
     )
@@ -603,6 +613,7 @@ def eval_set_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     bundle_dir: str | None,
     bundle_overwrite: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -654,6 +665,7 @@ def eval_set_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=True,
         retry_attempts=retry_attempts,
         retry_wait=retry_wait,
@@ -706,6 +718,7 @@ def eval_exec(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     is_eval_set: bool = False,
     retry_attempts: int | None = None,
     retry_wait: int | None = None,
@@ -746,6 +759,7 @@ def eval_exec(
     log_images = False if log_images is False else None
     trace = True if trace else None
     score = False if no_score else True
+    score_display = False if no_score_display else None

     # build params
     params: dict[str, Any] = (
@@ -781,6 +795,7 @@ def eval_exec(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
         )
         | kwargs
     )
@@ -915,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=NO_SCORE_HELP,
     envvar="INSPECT_EVAL_SCORE",
 )
+@click.option(
+    "--no-score-display",
+    type=bool,
+    is_flag=True,
+    help=NO_SCORE_HELP,
+    envvar="INSPECT_EVAL_SCORE_DISPLAY",
+)
 @click.option(
     "--max-connections",
     type=int,
@@ -940,6 +962,7 @@ def eval_retry_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     max_connections: int | None,
     max_retries: int | None,
     timeout: int | None,
@@ -954,6 +977,7 @@ def eval_retry_command(
     log_samples = False if no_log_samples else None
     log_images = False if log_images is False else None
     score = False if no_score else True
+    score_display = False if no_score_display else None

     # resolve fail_on_error
     if no_fail_on_error is True:
@@ -984,6 +1008,7 @@ def eval_retry_command(
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
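The net effect of the CLI changes above is a new `--no-score-display` flag (with the `INSPECT_EVAL_SCORE_DISPLAY` environment variable) on the eval, eval-set, and eval-retry commands. A minimal sketch of exercising it follows; the task file and model string are placeholders, and it assumes the `inspect` CLI from this package is on PATH.

```python
# Minimal sketch (not part of the diff): invoking the new --no-score-display flag.
# "my_task.py" and the model string are placeholders.
import subprocess

subprocess.run(
    [
        "inspect", "eval", "my_task.py",
        "--model", "openai/gpt-4o",
        "--no-score-display",  # suppress realtime scoring metrics in the display
    ],
    check=True,
)
```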
inspect_ai/_cli/main.py
CHANGED
@@ -11,6 +11,7 @@ from .list import list_command
 from .log import log_command
 from .sandbox import sandbox_command
 from .score import score_command
+from .trace import trace_command
 from .view import view_command


@@ -46,6 +47,7 @@ inspect.add_command(log_command)
 inspect.add_command(score_command)
 inspect.add_command(view_command)
 inspect.add_command(sandbox_command)
+inspect.add_command(trace_command)


 def main() -> None:
inspect_ai/_cli/trace.py
ADDED
@@ -0,0 +1,244 @@
+import os
+import shlex
+import time
+from datetime import datetime
+from json import dumps
+from pathlib import Path
+from typing import Callable, cast
+
+import click
+from pydantic_core import to_json
+from rich import print as r_print
+from rich.console import Console, RenderableType
+from rich.table import Column, Table
+
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai._util.logger import TRACE_FILE_NAME
+from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file
+
+
+@click.group("trace")
+def trace_command() -> None:
+    """List and read execution traces.
+
+    Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
+    """
+    return None
+
+
+@trace_command.command("list")
+@click.option(
+    "--json",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Output listing as JSON",
+)
+def list_command(json: bool) -> None:
+    """List all trace files."""
+    trace_dir = inspect_trace_dir()
+    trace_files: list[dict[str, float | str]] = [
+        {"mtime": f.lstat().st_mtime, "file": f.absolute().as_posix()}
+        for f in trace_dir.iterdir()
+        if f.is_file()
+    ]
+    trace_files.sort(key=lambda f: cast(float, f["mtime"]), reverse=True)
+    if json:
+        print(dumps(trace_files, indent=2))
+    else:
+        table = Table(box=None, show_header=True, pad_edge=False)
+        table.add_column("Time")
+        table.add_column("Trace File")
+        for file in trace_files:
+            mtime = datetime.fromtimestamp(cast(float, file["mtime"])).astimezone()
+            table.add_row(
+                mtime.strftime("%d-%b %H:%M:%S %Z"), shlex.quote(str(file["file"]))
+            )
+        r_print(table)
+
+
+@trace_command.command("dump")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+def read_command(trace_file: str) -> None:
+    """Dump a trace file to stdout (as a JSON array of log records)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+
+    traces = read_trace_file(trace_file_path)
+    print(
+        to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
+    )
+
+
+@trace_command.command("anomalies")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+@click.option(
+    "--all",
+    is_flag=True,
+    default=False,
+    help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
+)
+def anomolies_command(trace_file: str, all: bool) -> None:
+    """Look for anomalies in a trace file (never completed or cancelled actions)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+    traces = read_trace_file(trace_file_path)
+
+    # Track started actions
+    running_actions: dict[str, ActionTraceRecord] = {}
+    canceled_actions: dict[str, ActionTraceRecord] = {}
+    error_actions: dict[str, ActionTraceRecord] = {}
+    timeout_actions: dict[str, ActionTraceRecord] = {}
+
+    def action_started(trace: ActionTraceRecord) -> None:
+        running_actions[trace.trace_id] = trace
+
+    def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
+        start_trace = running_actions.get(trace.trace_id)
+        if start_trace:
+            del running_actions[trace.trace_id]
+            return start_trace
+        else:
+            raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
+
+    def action_failed(trace: ActionTraceRecord) -> None:
+        if all:
+            error_actions[start_trace.trace_id] = trace
+
+    def action_canceled(trace: ActionTraceRecord) -> None:
+        canceled_actions[start_trace.trace_id] = trace
+
+    def action_timeout(trace: ActionTraceRecord) -> None:
+        if all:
+            timeout_actions[start_trace.trace_id] = trace
+
+    for trace in traces:
+        if isinstance(trace, ActionTraceRecord):
+            match trace.event:
+                case "enter":
+                    action_started(trace)
+                case "exit":
+                    action_completed(trace)
+                case "cancel":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_canceled(trace)
+                case "error":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_failed(trace)
+                case "timeout":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_timeout(trace)
+                case _:
+                    print(f"Unknown event type: {trace.event}")
+
+    # do we have any traces?
+    if (
+        len(running_actions)
+        + len(canceled_actions)
+        + len(error_actions)
+        + len(timeout_actions)
+        == 0
+    ):
+        print(f"TRACE: {shlex.quote(trace_file_path.as_posix())}\n")
+        if all:
+            print("No anomalies found in trace log.")
+        else:
+            print(
+                "No running or cancelled actions found in trace log (pass --all to see errors and timeouts)."
+            )
+        return
+
+    with open(os.devnull, "w") as f:
+        # generate output
+        console = Console(record=True, file=f)
+
+        def print_fn(o: RenderableType) -> None:
+            console.print(o, highlight=False)
+
+        print_fn(f"[bold]TRACE: {shlex.quote(trace_file_path.as_posix())}[bold]")
+
+        _print_bucket(print_fn, "Running Actions", running_actions)
+        _print_bucket(print_fn, "Cancelled Actions", canceled_actions)
+        _print_bucket(print_fn, "Error Actions", error_actions)
+        _print_bucket(print_fn, "Timeout Actions", timeout_actions)
+
+        # print
+        print(console.export_text(styles=True).strip())
+
+
+def _print_bucket(
+    print_fn: Callable[[RenderableType], None],
+    label: str,
+    bucket: dict[str, ActionTraceRecord],
+) -> None:
+    if len(bucket) > 0:
+        # Sort the items in chronological order of when
+        # they finished so the first finished item is at the top
+        sorted_actions = sorted(
+            bucket.values(),
+            key=lambda record: (record.start_time or 0) + (record.duration or 0),
+            reverse=True,
+        )
+
+        # create table
+        table = Table(
+            Column(""),
+            Column("", justify="right"),
+            Column(""),
+            Column("", width=22),
+            box=None,
+            title=label,
+            title_justify="left",
+            title_style="bold",
+            pad_edge=False,
+            padding=(0, 1),
+        )
+
+        for action in sorted_actions:
+            # Compute duration (use the event duration or time since started)
+            duration = (
+                action.duration
+                if action.duration is not None
+                else time.time() - action.start_time
+                if action.start_time is not None
+                else 0.0
+            )
+
+            # The event start time
+            start_time = formatTime(action.start_time) if action.start_time else "None"
+
+            # Event detail
+            detail = (
+                f"{action.detail or action.message} {action.error}"
+                if action.event == "error"
+                else (action.detail or action.message)
+            )
+
+            table.add_row(
+                action.action,
+                f"{round(duration, 2):.2f}s".rjust(8),
+                f" {detail}",
+                start_time,
+            )
+
+        print_fn("")
+        print_fn(table)
+
+
+def resolve_trace_file_path(trace_file: str) -> Path:
+    trace_file_path = Path(trace_file)
+    if not trace_file_path.is_absolute():
+        trace_file_path = inspect_trace_dir() / trace_file_path
+
+    if not trace_file_path.exists():
+        raise PrerequisiteError(
+            f"The specified trace file '{trace_file_path}' does not exist."
+        )
+
+    return trace_file_path
+
+
+def formatTime(timestamp: float) -> str:
+    dt = datetime.fromtimestamp(timestamp).astimezone()
+    return dt.strftime("%H:%M:%S %Z")
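For readers who want the same information programmatically, the helpers this module imports (`inspect_trace_dir()` and `read_trace_file()` from `inspect_ai._util.trace`, also added in this release) can be used directly. A minimal sketch, assuming `read_trace_file()` returns a list of trace records as the `dump` command above suggests:

```python
# Minimal sketch (assumptions noted above): read the newest trace file using the
# helpers imported by the new trace CLI module.
from inspect_ai._util.trace import inspect_trace_dir, read_trace_file

trace_files = sorted(
    (f for f in inspect_trace_dir().iterdir() if f.is_file()),
    key=lambda f: f.lstat().st_mtime,
    reverse=True,
)
if trace_files:
    records = read_trace_file(trace_files[0])  # parsed trace records (assumed list)
    print(f"{len(records)} trace records in {trace_files[0].name}")
```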
inspect_ai/_display/textual/app.py
CHANGED
@@ -197,7 +197,11 @@ class TaskScreenApp(App[TR]):

         # add task
         try:
-            yield self.query_one(TasksView).add_task(task)
+            task_view = self.query_one(TasksView)
+            task_view.set_display_metrics(
+                profile.eval_config.score_display is not False
+            )
+            yield task_view.add_task(task)
         finally:
             pass

inspect_ai/_display/textual/widgets/tasks.py
CHANGED
@@ -72,6 +72,7 @@ class TasksView(Container):
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
         self.sample_count_width = 0
+        self.display_metrics = True

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -89,7 +90,11 @@ class TasksView(Container):
     def add_task(self, task: TaskWithResult) -> TaskDisplay:
         self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width, self.sample_count_width
+            task,
+            self.description_width,
+            self.model_name_width,
+            self.sample_count_width,
+            self.display_metrics,
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
@@ -97,6 +102,9 @@ class TasksView(Container):

         return task_display

+    def set_display_metrics(self, display_metrics: bool) -> None:
+        self.display_metrics = display_metrics
+
     def update_count_width(self, samples: int) -> None:
         sample_count_str = progress_count(samples, samples, self.sample_count_width)
         self.sample_count_width = min(
@@ -174,6 +182,7 @@ class TaskProgressView(Widget):
         description_width: int,
         model_name_width: int,
         sample_count_width: int,
+        display_metrics: bool,
     ) -> None:
         super().__init__()
         self.t = task
@@ -190,6 +199,7 @@ class TaskProgressView(Widget):
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

         self.sample_count_width: int = sample_count_width
+        self.display_metrics = display_metrics

     metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
     metrics_width: reactive[int | None] = reactive(None)
@@ -198,7 +208,7 @@ class TaskProgressView(Widget):
     samples_total: reactive[int] = reactive(0)

     def compose(self) -> ComposeResult:
-        yield self.toggle
+        yield (self.toggle if self.display_metrics else Static())
         yield TaskStatusIcon()
         yield Static(
             progress_description(self.t.profile, self.description_width, pad=True)
@@ -274,7 +284,7 @@ class TaskProgressView(Widget):

     def update_metrics_label(self) -> None:
         # compute the label (with a min size)
-        if self.metrics is not None:
+        if self.metrics is not None and self.metrics_display is not None:
             metric_label = task_metric(self.metrics, self.metrics_width)
             self.metrics_width = len(metric_label)
             self.metrics_display.update(metric_label)
inspect_ai/_eval/eval.py
CHANGED
@@ -76,6 +76,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -139,6 +140,7 @@ def eval(
          If not specified, an appropriate default for the format and filesystem is
          chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
        score (bool): Score output (defaults to True)
+       score_display (bool | None): Show scoring metrics in realtime (defaults to True)
        **kwargs (GenerateConfigArgs): Model generation options.

     Returns:
@@ -183,6 +185,7 @@ def eval(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             **kwargs,
         )
     )
@@ -220,6 +223,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -282,6 +286,7 @@ async def eval_async(
          If not specified, an appropriate default for the format and filesystem is
          chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
        score (bool): Score output (defaults to True)
+       score_display (bool | None): Show scoring metrics in realtime (defaults to True)
        **kwargs (GenerateConfigArgs): Model generation options.

     Returns:
@@ -380,6 +385,7 @@ async def eval_async(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            score_display=score_display,
         )

         # run tasks - 2 codepaths, one for the traditional task at a time
@@ -467,6 +473,7 @@ def eval_retry(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -507,6 +514,7 @@ def eval_retry(
          If not specified, an appropriate default for the format and filesystem is
          chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
        score (bool): Score output (defaults to True)
+       score_display (bool | None): Show scoring metrics in realtime (defaults to True)
        max_retries (int | None):
          Maximum number of times to retry request.
        timeout: (int | None):
@@ -541,6 +549,7 @@ def eval_retry(
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
@@ -565,6 +574,7 @@ async def eval_retry_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -603,6 +613,7 @@ async def eval_retry_async(
          If not specified, an appropriate default for the format and filesystem is
          chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
        score (bool): Score output (defaults to True)
+       score_display (bool | None): Show scoring metrics in realtime (defaults to True)
        max_retries (int | None):
          Maximum number of times to retry request.
        timeout: (int | None):
@@ -699,6 +710,11 @@ async def eval_retry_async(
         log_buffer = (
             log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
         )
+        score_display = (
+            score_display
+            if score_display is not None
+            else eval_log.eval.config.score_display
+        )

         config = eval_log.plan.config
         config.max_retries = max_retries or config.max_retries
@@ -740,6 +756,7 @@ async def eval_retry_async(
                 log_images=log_images,
                 log_buffer=log_buffer,
                 score=score,
+                score_display=score_display,
                 **dict(config),
             )
         )[0]
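On the Python API side, the new `score_display` argument is threaded through `eval()`, `eval_async()`, `eval_retry()`, and `eval_retry_async()` as shown above. A minimal sketch of using it (the task path and model string are placeholders):

```python
# Minimal sketch (placeholders noted above): disable realtime scoring metrics
# via the new score_display argument on eval().
from inspect_ai import eval

logs = eval(
    "my_task.py",           # placeholder task
    model="openai/gpt-4o",  # example model string
    score_display=False,    # new in this release; omitting it keeps the default display
)
```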
inspect_ai/_eval/task/images.py
CHANGED
@@ -30,13 +30,8 @@ async def samples_with_base64_images(samples: list[Sample]) -> list[Sample]:

 async def sample_with_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return Sample(
-            input=await messages_with_base64_images(sample.input),
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": await messages_with_base64_images(sample.input)}
         )
     else:
         return sample
@@ -44,13 +39,8 @@ async def sample_with_base64_images(sample: Sample) -> Sample:

 def sample_without_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return Sample(
-            input=messages_without_base64_images(sample.input),
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": messages_without_base64_images(sample.input)}
         )
     else:
         return sample
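The refactor above replaces field-by-field reconstruction of `Sample` with Pydantic v2's `model_copy(update=...)`, which copies a model and overrides only the given fields. A standalone sketch of that pattern (the `Record` model here is a stand-in, not inspect_ai's `Sample`):

```python
# Standalone sketch of the model_copy(update=...) pattern used above (Pydantic v2).
# Record is a stand-in model, not inspect_ai's Sample.
from pydantic import BaseModel

class Record(BaseModel):
    input: list[str]
    target: str = ""

record = Record(input=["first", "second"], target="42")
updated = record.model_copy(update={"input": ["first (rewritten)", "second"]})
assert updated.target == "42"                # untouched fields are preserved
assert record.input == ["first", "second"]   # the original is unchanged
```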
inspect_ai/_eval/task/log.py
CHANGED
@@ -69,10 +69,11 @@ class TaskLogger:
         )
         packages = {PKG_NAME: importlib_metadata.version(PKG_NAME)}

-        #
+        # redact authentication oriented model_args
         model_args = model_args.copy()
         if "api_key" in model_args:
             del model_args["api_key"]
+        model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}

         # cwd_relative_path for sandbox config
         if sandbox and isinstance(sandbox.config, str):
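The logging change above redacts authentication-oriented model arguments before they are written to the eval log: `api_key` is deleted and any `aws_*` keys are filtered out. A standalone illustration of that filtering:

```python
# Standalone illustration of the redaction performed above (not inspect_ai code).
model_args = {"api_key": "sk-...", "aws_access_key_id": "AKIA...", "temperature": 0.7}
model_args = dict(model_args)
model_args.pop("api_key", None)
model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}
assert model_args == {"temperature": 0.7}
```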
inspect_ai/_eval/task/run.py
CHANGED
@@ -217,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         log_location=log_location,
     )

-    with display().task(profile) as td:
+    with display().task(
+        profile,
+    ) as td:
         try:
             # start the log
             await log_start(logger, plan, generate_config)
@@ -252,7 +254,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:

             # track when samples complete and update progress as we go
             progress_results: list[dict[str, SampleScore]] = []
-            update_metrics_display = update_metrics_display_fn(td)
+            update_metrics_display = update_metrics_display_fn(
+                td,
+                display_metrics=profile.eval_config.score_display is not False,
+            )

             def sample_complete(sample_score: dict[str, SampleScore]) -> None:
                 # Capture the result
@@ -400,7 +405,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:


 def update_metrics_display_fn(
-    td: TaskDisplay,
+    td: TaskDisplay,
+    initial_interval: float = 0,
+    min_interval: float = 0.9,
+    display_metrics: bool = True,
 ) -> Callable[
     [
         int,
@@ -420,6 +428,10 @@ def update_metrics_display_fn(
         reducers: ScoreReducer | list[ScoreReducer] | None,
         metrics: list[Metric] | dict[str, list[Metric]] | None,
     ) -> None:
+        # Don't compute metrics if they are not being displayed
+        if not display_metrics:
+            return None
+
         nonlocal next_compute_time
         time_start = time.perf_counter()
         if time_start >= next_compute_time:
@@ -568,14 +580,18 @@ async def task_run_sample(
                 state = await plan(state, generate)

             except TimeoutError:
-
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
+                if time_limit is not None:
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type="time",
+                            message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                            limit=time_limit,
+                        )
+                    )
+                else:
+                    py_logger.warning(
+                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
                     )
-                )

             # capture most recent state for scoring
             state = sample_state() or state
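The new `else` branch above logs a warning when a `TimeoutError` surfaces at the top of the sample stack without a configured time limit, i.e. when user code applied its own timeout but let the exception escape. A standalone sketch of the pattern the warning asks for, catching the timeout where it is applied (plain asyncio, Python 3.11+; not inspect_ai internals):

```python
# Standalone sketch (asyncio, Python 3.11+): apply a timeout and handle the
# resulting TimeoutError locally instead of letting it reach the sample runner.
import asyncio

async def fetch_with_timeout(seconds: float) -> str | None:
    try:
        async with asyncio.timeout(seconds):
            await asyncio.sleep(10)  # stand-in for real work (e.g. a tool call)
            return "done"
    except TimeoutError:
        return None  # handled here, so it never propagates upward

print(asyncio.run(fetch_with_timeout(0.1)))  # -> None
```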