inspect-ai 0.3.52__py3-none-any.whl → 0.3.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +29 -0
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/widgets/task_detail.py +3 -0
- inspect_ai/_display/textual/widgets/tasks.py +86 -5
- inspect_ai/_eval/eval.py +16 -0
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +14 -10
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +2 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +80 -43
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +3 -0
- inspect_ai/log/_recorders/eval.py +8 -7
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/bedrock.py +17 -1
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/util/_sandbox/context.py +1 -2
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -5
- inspect_ai/util/_sandbox/docker/util.py +3 -3
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/METADATA +1 -3
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/RECORD +46 -46
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -192,6 +193,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--message-limit",
     type=int,
@@ -361,6 +368,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
     envvar="INSPECT_EVAL_CACHE_PROMPT",
 )
+@click.option(
+    "--reasoning-effort",
+    type=click.Choice(["low", "medium", "high"]),
+    help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+    envvar="INSPECT_EVAL_REASONING_EFFORT",
+)
 @click.option(
     "--log-format",
     type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -419,12 +432,14 @@ def eval_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -472,6 +487,7 @@ def eval_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -573,12 +589,14 @@ def eval_set_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -628,6 +646,7 @@ def eval_set_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -679,6 +698,7 @@ def eval_exec(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -756,6 +776,7 @@ def eval_exec(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -834,6 +855,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -904,6 +931,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -947,6 +975,7 @@ def eval_retry_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
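Both new options follow the existing pattern of envvar-backed Click options: the command-line flag takes precedence, then the environment variable, then a default of None. A minimal standalone sketch of that resolution (the demo command below is illustrative, not part of inspect_ai):

import click

@click.command()
@click.option(
    "--max-sandboxes",
    type=int,
    help="Maximum number of sandboxes (per-provider) to run in parallel.",
    envvar="INSPECT_EVAL_MAX_SANDBOXES",
)
def demo(max_sandboxes: int | None) -> None:
    # Click resolves --max-sandboxes first, then INSPECT_EVAL_MAX_SANDBOXES,
    # and leaves the value as None if neither is provided.
    click.echo(f"max_sandboxes={max_sandboxes}")

if __name__ == "__main__":
    demo()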
inspect_ai/_display/core/progress.py
CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"


-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
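The new width parameter lets a caller right-justify the counter into a fixed column so that several task rows line up. A standalone restatement of the function, showing its output on example values:

def progress_count(complete: int, total: int, width: int | None = None) -> str:
    # pad so the complete count occupies as many characters as the total
    total_str = f"{total:,}"
    complete_str = f"{complete:,}"
    padding = max(0, len(total_str) - len(complete_str))
    padded = " " * padding + f"[{complete_str}/{total_str}]"
    # optionally right-justify into a fixed-width column
    if width is not None:
        padded = padded.rjust(width)
    return padded

print(progress_count(7, 1500))      # "    [7/1,500]"  (13 characters)
print(progress_count(7, 1500, 15))  # "      [7/1,500]" (right-justified to 15)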
inspect_ai/_display/core/results.py
CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message


-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"

-    if show_reducer:
-
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str


 def task_metrics(scores: list[EvalScore]) -> str:
inspect_ai/_display/textual/widgets/tasks.py
CHANGED
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )

+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+

 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,7 @@ class TasksView(Container):
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +84,41 @@ class TasksView(Container):
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()

     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width
+            task, self.description_width, self.model_name_width, self.sample_count_width
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display

+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +169,18 @@ class TaskProgressView(Widget):
     """

     def __init__(
-        self,
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
     ) -> None:
         super().__init__()
         self.t = task

         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,6 +189,14 @@ class TaskProgressView(Widget):
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

+        self.sample_count_width: int = sample_count_width
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
         yield self.toggle
         yield TaskStatusIcon()
@@ -191,13 +234,51 @@ class TaskProgressView(Widget):
         self.task_progress.complete()

     def sample_complete(self, complete: int, total: int) -> None:
-        self.
+        self.samples_complete = complete
+        self.samples_total = total

     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-
-
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)

+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+

 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
inspect_ai/_eval/eval.py
CHANGED
@@ -71,6 +71,7 @@ def eval(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -129,6 +130,8 @@ def eval(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -175,6 +178,7 @@ def eval(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -211,6 +215,7 @@ async def eval_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -268,6 +273,8 @@ async def eval_async(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -368,6 +375,7 @@ async def eval_async(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         log_samples=log_samples,
         log_images=log_images,
@@ -450,6 +458,7 @@ def eval_retry(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
     fail_on_error: bool | float | None = None,
@@ -480,6 +489,8 @@ def eval_retry(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
@@ -522,6 +533,7 @@ def eval_retry(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         fail_on_error=fail_on_error,
         debug_errors=debug_errors,
@@ -545,6 +557,7 @@ async def eval_retry_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -574,6 +587,7 @@ async def eval_retry_async(
           (default is 1)
         max_subprocesses (int): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -665,6 +679,7 @@ async def eval_retry_async(
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
+        max_sandboxes = max_sandboxes or eval_log.eval.config.max_sandboxes
         sandbox_cleanup = (
             sandbox_cleanup
             if sandbox_cleanup is not None
@@ -720,6 +735,7 @@ async def eval_retry_async(
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
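Called from Python, the new argument sits alongside the existing concurrency limits. A brief usage sketch (the task file and model below are illustrative):

from inspect_ai import eval

# illustrative task file and model; max_sandboxes caps concurrent sandbox
# environments per provider, alongside max_samples/max_tasks/max_subprocesses
logs = eval(
    "security_guide.py",
    model="openai/gpt-4o",
    max_samples=20,
    max_sandboxes=8,
)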
inspect_ai/_eval/evalset.py
CHANGED
@@ -75,6 +75,7 @@ def eval_set(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -144,6 +145,8 @@ def eval_set(
           (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
           run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+          to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
           even if specified as a filename or URL (defaults to False)
@@ -193,6 +196,7 @@ def eval_set(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
inspect_ai/_eval/registry.py
CHANGED
@@ -146,8 +146,8 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # module import, so set its task file and run dir
         if get_installed_package_name(task_type) is None:
             module = inspect.getmodule(task_type)
-            if module and module
-                file = Path(module
+            if module and hasattr(module, "__file__"):
+                file = Path(getattr(module, "__file__"))
                 setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
                 setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())

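The switch to hasattr()/getattr() guards against modules that have no __file__ attribute at all; built-in modules are the simplest example:

import builtins
import inspect

# builtins has no __file__, so direct attribute access would raise; hasattr() is safe
print(hasattr(builtins, "__file__"))  # False
print(hasattr(inspect, "__file__"))   # True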
inspect_ai/_eval/task/results.py
CHANGED
@@ -267,10 +267,28 @@ def scorers_from_metric_dict(
             value = target_metric(metric_scores)
         else:
             value = float("Nan")
-
-
-
-        )
+
+        # convert the value to a float (either by expanding the dict or array)
+        # or by casting to a float
+        if isinstance(value, dict):
+            for key, val in value.items():
+                name = f"{metric_name}_{key}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, val),
+                )
+        elif isinstance(value, list):
+            for idx, item in enumerate(value):
+                name = f"{metric_name}_{idx}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, item),
+                )
+        else:
+            result_metrics[metric_name] = EvalMetric(
+                name=metric_name,
+                value=cast(float, value),
+            )

         # create a scorer result for this metric
         # TODO: What if there is separate simple scorer which has a name collision with
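The expansion added above flattens dict- and list-valued metrics into individually named entries rather than forcing a single float. A standalone sketch of the same naming scheme (plain Python, not using inspect_ai types):

def flatten_metric(metric_name, value):
    # mirror the dict/list expansion performed in scorers_from_metric_dict
    result = {}
    if isinstance(value, dict):
        for key, val in value.items():
            result[f"{metric_name}_{key}"] = float(val)
    elif isinstance(value, list):
        for idx, item in enumerate(value):
            result[f"{metric_name}_{idx}"] = float(item)
    else:
        result[metric_name] = float(value)
    return result

print(flatten_metric("pr", {"precision": 0.8, "recall": 0.7}))
# {'pr_precision': 0.8, 'pr_recall': 0.7}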
inspect_ai/_eval/task/run.py
CHANGED
@@ -178,6 +178,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     else:
         plan = Plan(unroll(solver), internal=True)

+    # add setup solver(s) if specified
+    if task.setup:
+        plan.steps = unroll(task.setup) + plan.steps
+
     # reaolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
@@ -275,6 +279,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                 sample=sample,
                 state=state,
                 sandbox=sandbox,
+                max_sandboxes=config.max_sandboxes,
                 sandbox_cleanup=sandbox_cleanup,
                 plan=plan,
                 scorers=scorers,
@@ -456,6 +461,7 @@ async def task_run_sample(
     sample: Sample,
     state: TaskState,
     sandbox: SandboxEnvironmentSpec | None,
+    max_sandboxes: int | None,
     sandbox_cleanup: bool,
     plan: Plan,
     scorers: list[Scorer] | None,
@@ -482,8 +488,8 @@ async def task_run_sample(
             await logger.log_sample(previous_sample, flush=False)

         # return score
-
-
+        sample_scores = (
+            {
                 key: SampleScore(
                     sample_id=previous_sample.id,
                     value=score.value,
@@ -493,8 +499,11 @@ async def task_run_sample(
                 )
                 for key, score in previous_sample.scores.items()
             }
-
-
+            if previous_sample.scores
+            else {}
+        )
+        sample_complete(sample_scores)
+        return sample_scores

     # use semaphore if provided
     semaphore_cm: asyncio.Semaphore | contextlib.AbstractAsyncContextManager[None] = (
@@ -510,7 +519,7 @@ async def task_run_sample(

     # use sandbox if provided
     sandboxenv_cm = (
-        sandboxenv_context(task_name, sandbox, sandbox_cleanup, sample)
+        sandboxenv_context(task_name, sandbox, max_sandboxes, sandbox_cleanup, sample)
         if sandbox or sample.sandbox is not None
         else contextlib.nullcontext()
     )
@@ -866,10 +875,5 @@ def create_sample_semaphore(
         else DEFAULT_MAX_CONNECTIONS
     )

-    # if max_tasks is specified and max_samples is less
-    # than max_tasks then bump it up
-    if config.max_tasks is not None:
-        max_samples = max(max_samples, config.max_tasks)
-
     # return the semaphore
     return asyncio.Semaphore(max_samples)
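The task.setup hook that task_run() now splices in front of the plan corresponds to a new setup parameter on Task (also reflected in task.py above). A hedged usage sketch: the task name, dataset contents, and solvers below are invented for illustration, and it assumes setup accepts the same solver values that solver does, as the unroll(task.setup) call suggests:

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate, system_message

@task
def demo_task():
    return Task(
        dataset=[Sample(input="What is 2 + 2?", target="4")],
        # runs before the main solver chain per the task_run() change above
        setup=system_message("Answer with just the number."),
        solver=generate(),
        scorer=match(),
    )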