inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +55 -1
- inspect_ai/_cli/main.py +2 -0
- inspect_ai/_cli/trace.py +244 -0
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/app.py +5 -1
- inspect_ai/_display/textual/widgets/task_detail.py +3 -0
- inspect_ai/_display/textual/widgets/tasks.py +97 -6
- inspect_ai/_eval/eval.py +33 -0
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/images.py +4 -14
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +40 -20
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +2 -0
- inspect_ai/_util/constants.py +3 -3
- inspect_ai/_util/display.py +1 -0
- inspect_ai/_util/logger.py +34 -8
- inspect_ai/_util/trace.py +275 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +80 -43
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +6 -0
- inspect_ai/log/_message.py +2 -2
- inspect_ai/log/_recorders/eval.py +8 -18
- inspect_ai/log/_recorders/json.py +19 -17
- inspect_ai/model/_cache.py +22 -16
- inspect_ai/model/_call_tools.py +9 -1
- inspect_ai/model/_generate_config.py +8 -2
- inspect_ai/model/_model.py +11 -12
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/bedrock.py +18 -2
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
- inspect_ai/util/_sandbox/context.py +7 -3
- inspect_ai/util/_sandbox/docker/compose.py +58 -19
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +20 -16
- inspect_ai/util/_sandbox/docker/util.py +3 -9
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- inspect_ai/util/_sandbox/service.py +17 -7
- inspect_ai/util/_subprocess.py +6 -1
- inspect_ai/util/_subtask.py +8 -2
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
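The largest display change is in `inspect_ai/_display/textual/widgets/tasks.py` (the first diff below): sample-count and metric columns are now sized to the widest value seen so far, clamped to the new `MAX_COUNT_WIDTH` / `MAX_METRIC_WIDTH` caps, so columns stay aligned across tasks without growing unbounded. A minimal standalone sketch of that clamping pattern, with illustrative names and values (not the widget API itself):

```python
MAX_COUNT_WIDTH = 15  # cap taken from the diff below


def clamp_count_width(current_width: int, rendered_count: str) -> int:
    """Grow the column to fit the widest count seen so far, up to a cap."""
    return min(max(current_width, len(rendered_count)), MAX_COUNT_WIDTH)


# hypothetical usage: the width only ever grows (until the cap), keeping columns aligned
width = 0
for rendered in ["5/10", "123/456", "10,000/10,000"]:
    width = clamp_count_width(width, rendered)
    print(rendered.rjust(width))
```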
inspect_ai/_display/textual/widgets/tasks.py
CHANGED
```diff
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )
 
+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+
 
 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,8 @@ class TasksView(Container):
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0
+        self.display_metrics = True
 
     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +85,48 @@
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()
 
     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task,
+            task,
+            self.description_width,
+            self.model_name_width,
+            self.sample_count_width,
+            self.display_metrics,
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display
 
+    def set_display_metrics(self, display_metrics: bool) -> None:
+        self.display_metrics = display_metrics
+
+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +177,19 @@ class TaskProgressView(Widget):
     """
 
     def __init__(
-        self,
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
+        display_metrics: bool,
     ) -> None:
         super().__init__()
         self.t = task
 
         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,8 +198,17 @@
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")
 
+        self.sample_count_width: int = sample_count_width
+        self.display_metrics = display_metrics
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
-        yield self.toggle
+        yield (self.toggle if self.display_metrics else Static())
         yield TaskStatusIcon()
         yield Static(
             progress_description(self.t.profile, self.description_width, pad=True)
@@ -191,13 +244,51 @@
         self.task_progress.complete()
 
     def sample_complete(self, complete: int, total: int) -> None:
-        self.
+        self.samples_complete = complete
+        self.samples_total = total
 
     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-
-
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)
 
+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None and self.metrics_display is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+
 
 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
```
inspect_ai/_eval/eval.py
CHANGED
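This release threads two new options through the `eval()` family of functions: `max_sandboxes` (a per-provider cap on concurrently running sandboxes) and `score_display` (whether to compute and show scoring metrics in realtime). A hedged usage sketch, where the task reference and model are placeholders:

```python
from inspect_ai import eval

# hypothetical task/model; the new keyword arguments are the point here
logs = eval(
    "examples/my_task.py",       # placeholder task reference
    model="openai/gpt-4o-mini",  # placeholder model
    max_sandboxes=10,            # limit concurrent sandboxes per provider
    score_display=False,         # skip realtime metric computation/display
)
```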
```diff
@@ -71,10 +71,12 @@ def eval(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -129,6 +131,8 @@ def eval(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -136,6 +140,7 @@ def eval(
           If not specified, an appropriate default for the format and filesystem is
           chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.
 
     Returns:
@@ -175,10 +180,12 @@ def eval(
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             **kwargs,
         )
     )
@@ -211,10 +218,12 @@ async def eval_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -268,6 +277,8 @@ async def eval_async(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -275,6 +286,7 @@ async def eval_async(
           If not specified, an appropriate default for the format and filesystem is
           chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.
 
     Returns:
@@ -368,10 +380,12 @@ async def eval_async(
                 max_samples=max_samples,
                 max_tasks=max_tasks,
                 max_subprocesses=max_subprocesses,
+                max_sandboxes=max_sandboxes,
                 sandbox_cleanup=sandbox_cleanup,
                 log_samples=log_samples,
                 log_images=log_images,
                 log_buffer=log_buffer,
+                score_display=score_display,
             )
 
             # run tasks - 2 codepaths, one for the traditional task at a time
@@ -450,6 +464,7 @@ def eval_retry(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
     fail_on_error: bool | float | None = None,
@@ -458,6 +473,7 @@ def eval_retry(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -480,6 +496,8 @@ def eval_retry(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
@@ -496,6 +514,7 @@ def eval_retry(
           If not specified, an appropriate default for the format and filesystem is
           chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
           Maximum number of times to retry request.
         timeout: (int | None):
@@ -522,6 +541,7 @@ def eval_retry(
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             sandbox_cleanup=sandbox_cleanup,
             fail_on_error=fail_on_error,
             debug_errors=debug_errors,
@@ -529,6 +549,7 @@ def eval_retry(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             max_retries=max_retries,
             timeout=timeout,
             max_connections=max_connections,
@@ -545,6 +566,7 @@ async def eval_retry_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -552,6 +574,7 @@ async def eval_retry_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -574,6 +597,7 @@ async def eval_retry_async(
           (default is 1)
         max_subprocesses (int): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -589,6 +613,7 @@ async def eval_retry_async(
           If not specified, an appropriate default for the format and filesystem is
           chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
           Maximum number of times to retry request.
         timeout: (int | None):
@@ -665,6 +690,7 @@ async def eval_retry_async(
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
+        max_sandboxes = max_sandboxes or eval_log.eval.config.max_sandboxes
        sandbox_cleanup = (
            sandbox_cleanup
            if sandbox_cleanup is not None
@@ -684,6 +710,11 @@ async def eval_retry_async(
        log_buffer = (
            log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
        )
+        score_display = (
+            score_display
+            if score_display is not None
+            else eval_log.eval.config.score_display
+        )
 
        config = eval_log.plan.config
        config.max_retries = max_retries or config.max_retries
@@ -720,10 +751,12 @@ async def eval_retry_async(
                 max_samples=max_samples,
                 max_tasks=max_tasks,
                 max_subprocesses=max_subprocesses,
+                max_sandboxes=max_sandboxes,
                 log_samples=log_samples,
                 log_images=log_images,
                 log_buffer=log_buffer,
                 score=score,
+                score_display=score_display,
                 **dict(config),
             )
         )[0]
```
inspect_ai/_eval/evalset.py
CHANGED
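`eval_set()` gains the same `max_sandboxes` pass-through. A short usage sketch, with a placeholder task and log directory:

```python
from inspect_ai import eval_set

# placeholders for the task and log directory; max_sandboxes is forwarded to eval()
success, logs = eval_set(
    tasks="examples/my_task.py",
    log_dir="logs/my-eval-set",
    max_sandboxes=5,
)
```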
```diff
@@ -75,6 +75,7 @@ def eval_set(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -144,6 +145,8 @@ def eval_set(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -193,6 +196,7 @@ def eval_set(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
```
inspect_ai/_eval/registry.py
CHANGED
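The registry fix guards against modules that lack a `__file__` attribute (for example, dynamically created modules), where direct attribute access would raise `AttributeError`. A small illustration of the condition being guarded:

```python
import types

# a dynamically created module has no __file__ attribute
mod = types.ModuleType("dynamic_module")
print(hasattr(mod, "__file__"))  # False -> the guarded branch is skipped

exec("x = 1", mod.__dict__)
print(mod.x)  # 1 -- the module works fine, it simply has no source file path
```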
```diff
@@ -146,8 +146,8 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # module import, so set its task file and run dir
         if get_installed_package_name(task_type) is None:
             module = inspect.getmodule(task_type)
-            if module and module
-                file = Path(module
+            if module and hasattr(module, "__file__"):
+                file = Path(getattr(module, "__file__"))
                 setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
                 setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())
 
```
inspect_ai/_eval/task/images.py
CHANGED
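The image helpers now build the updated `Sample` with Pydantic's `model_copy(update=...)` instead of reconstructing it field by field, which keeps every field not being changed intact. A generic sketch of that pattern (the model below is illustrative, not inspect_ai's `Sample`):

```python
from pydantic import BaseModel


class Record(BaseModel):
    input: str
    target: str
    metadata: dict[str, str] = {}


original = Record(input="raw", target="answer", metadata={"source": "demo"})

# copy with a single field replaced; all other fields carry over unchanged
updated = original.model_copy(update={"input": "processed"})

print(updated.input)     # processed
print(updated.target)    # answer
print(updated.metadata)  # {'source': 'demo'}
```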
```diff
@@ -30,13 +30,8 @@ async def samples_with_base64_images(samples: list[Sample]) -> list[Sample]:
 
 async def sample_with_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return
-            input
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": await messages_with_base64_images(sample.input)}
         )
     else:
         return sample
@@ -44,13 +39,8 @@ async def sample_with_base64_images(sample: Sample) -> Sample:
 
 def sample_without_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return
-            input
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": messages_without_base64_images(sample.input)}
         )
     else:
         return sample
```
inspect_ai/_eval/task/results.py
CHANGED
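Metric values that come back as a dict or list are now expanded into one `EvalMetric` per entry (named `{metric}_{key}` or `{metric}_{index}`) rather than being cast to a single float. A standalone sketch of that flattening rule, using plain floats in place of `EvalMetric` (a hypothetical helper, not the library function):

```python
def flatten_metric(name: str, value: float | dict | list) -> dict[str, float]:
    """Expand dict/list metric values into flat {name: float} entries."""
    if isinstance(value, dict):
        return {f"{name}_{key}": float(val) for key, val in value.items()}
    elif isinstance(value, list):
        return {f"{name}_{idx}": float(item) for idx, item in enumerate(value)}
    else:
        return {name: float(value)}


print(flatten_metric("accuracy", 0.82))
# {'accuracy': 0.82}
print(flatten_metric("accuracy", {"easy": 0.9, "hard": 0.7}))
# {'accuracy_easy': 0.9, 'accuracy_hard': 0.7}
print(flatten_metric("per_epoch", [0.5, 0.6, 0.8]))
# {'per_epoch_0': 0.5, 'per_epoch_1': 0.6, 'per_epoch_2': 0.8}
```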
```diff
@@ -267,10 +267,28 @@ def scorers_from_metric_dict(
             value = target_metric(metric_scores)
         else:
             value = float("Nan")
-
-
-
-        )
+
+        # convert the value to a float (either by expanding the dict or array)
+        # or by casting to a float
+        if isinstance(value, dict):
+            for key, val in value.items():
+                name = f"{metric_name}_{key}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, val),
+                )
+        elif isinstance(value, list):
+            for idx, item in enumerate(value):
+                name = f"{metric_name}_{idx}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, item),
+                )
+        else:
+            result_metrics[metric_name] = EvalMetric(
+                name=metric_name,
+                value=cast(float, value),
+            )
 
         # create a scorer result for this metric
         # TODO: What if there is separate simple scorer which has a name collision with
```
inspect_ai/_eval/task/run.py
CHANGED
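Among other changes, the sample runner now treats a `TimeoutError` as a sample time-limit event only when a `time_limit` was actually configured, and logs a warning otherwise. The underlying mechanics are the standard asyncio pattern of enforcing a limit with `asyncio.wait_for` and catching the timeout at the top of the stack; a minimal sketch of that pattern (not inspect_ai's runner):

```python
import asyncio


async def run_sample(work: float, time_limit: float | None) -> str:
    try:
        # enforce the limit only when one was configured
        if time_limit is not None:
            await asyncio.wait_for(asyncio.sleep(work), timeout=time_limit)
        else:
            await asyncio.sleep(work)
        return "completed"
    except asyncio.TimeoutError:  # alias of the builtin TimeoutError on Python 3.11+
        if time_limit is not None:
            return f"stopped: exceeded time limit ({time_limit:,} seconds)"
        # a timeout with no configured limit is unexpected -- surface it
        raise


print(asyncio.run(run_sample(0.2, time_limit=0.05)))  # stopped: exceeded time limit ...
print(asyncio.run(run_sample(0.01, time_limit=1.0)))  # completed
```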
```diff
@@ -178,6 +178,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     else:
         plan = Plan(unroll(solver), internal=True)
 
+    # add setup solver(s) if specified
+    if task.setup:
+        plan.steps = unroll(task.setup) + plan.steps
+
     # reaolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
@@ -213,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         log_location=log_location,
     )
 
-    with display().task(
+    with display().task(
+        profile,
+    ) as td:
         try:
             # start the log
             await log_start(logger, plan, generate_config)
@@ -248,7 +254,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
 
             # track when samples complete and update progress as we go
             progress_results: list[dict[str, SampleScore]] = []
-            update_metrics_display = update_metrics_display_fn(
+            update_metrics_display = update_metrics_display_fn(
+                td,
+                display_metrics=profile.eval_config.score_display is not False,
+            )
 
             def sample_complete(sample_score: dict[str, SampleScore]) -> None:
                 # Capture the result
@@ -275,6 +284,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                     sample=sample,
                     state=state,
                     sandbox=sandbox,
+                    max_sandboxes=config.max_sandboxes,
                     sandbox_cleanup=sandbox_cleanup,
                     plan=plan,
                     scorers=scorers,
@@ -395,7 +405,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
 
 
 def update_metrics_display_fn(
-    td: TaskDisplay,
+    td: TaskDisplay,
+    initial_interval: float = 0,
+    min_interval: float = 0.9,
+    display_metrics: bool = True,
 ) -> Callable[
     [
         int,
@@ -415,6 +428,10 @@ def update_metrics_display_fn(
         reducers: ScoreReducer | list[ScoreReducer] | None,
         metrics: list[Metric] | dict[str, list[Metric]] | None,
     ) -> None:
+        # Don't compute metrics if they are not being displayed
+        if not display_metrics:
+            return None
+
         nonlocal next_compute_time
         time_start = time.perf_counter()
         if time_start >= next_compute_time:
@@ -456,6 +473,7 @@ async def task_run_sample(
     sample: Sample,
     state: TaskState,
     sandbox: SandboxEnvironmentSpec | None,
+    max_sandboxes: int | None,
     sandbox_cleanup: bool,
     plan: Plan,
     scorers: list[Scorer] | None,
@@ -482,8 +500,8 @@ async def task_run_sample(
         await logger.log_sample(previous_sample, flush=False)
 
         # return score
-
-
+        sample_scores = (
+            {
                 key: SampleScore(
                     sample_id=previous_sample.id,
                     value=score.value,
@@ -493,8 +511,11 @@ async def task_run_sample(
                 )
                 for key, score in previous_sample.scores.items()
             }
-
-
+            if previous_sample.scores
+            else {}
+        )
+        sample_complete(sample_scores)
+        return sample_scores
 
     # use semaphore if provided
     semaphore_cm: asyncio.Semaphore | contextlib.AbstractAsyncContextManager[None] = (
@@ -510,7 +531,7 @@ async def task_run_sample(
 
     # use sandbox if provided
     sandboxenv_cm = (
-        sandboxenv_context(task_name, sandbox, sandbox_cleanup, sample)
+        sandboxenv_context(task_name, sandbox, max_sandboxes, sandbox_cleanup, sample)
         if sandbox or sample.sandbox is not None
         else contextlib.nullcontext()
     )
@@ -559,14 +580,18 @@ async def task_run_sample(
                 state = await plan(state, generate)
 
             except TimeoutError:
-
-
-
-
-
-
+                if time_limit is not None:
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type="time",
+                            message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                            limit=time_limit,
+                        )
+                    )
+                else:
+                    py_logger.warning(
+                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
                     )
-                )
 
             # capture most recent state for scoring
             state = sample_state() or state
@@ -866,10 +891,5 @@ def create_sample_semaphore(
         else DEFAULT_MAX_CONNECTIONS
     )
 
-    # if max_tasks is specified and max_samples is less
-    # than max_tasks then bump it up
-    if config.max_tasks is not None:
-        max_samples = max(max_samples, config.max_tasks)
-
     # return the semaphore
     return asyncio.Semaphore(max_samples)
```