inspect-ai 0.3.52__py3-none-any.whl → 0.3.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. inspect_ai/_cli/eval.py +29 -0
  2. inspect_ai/_display/core/progress.py +9 -3
  3. inspect_ai/_display/core/results.py +8 -4
  4. inspect_ai/_display/textual/widgets/task_detail.py +3 -0
  5. inspect_ai/_display/textual/widgets/tasks.py +86 -5
  6. inspect_ai/_eval/eval.py +16 -0
  7. inspect_ai/_eval/evalset.py +4 -0
  8. inspect_ai/_eval/registry.py +2 -2
  9. inspect_ai/_eval/task/results.py +22 -4
  10. inspect_ai/_eval/task/run.py +14 -10
  11. inspect_ai/_eval/task/sandbox.py +72 -43
  12. inspect_ai/_eval/task/task.py +4 -0
  13. inspect_ai/_eval/task/util.py +2 -0
  14. inspect_ai/_view/www/App.css +13 -0
  15. inspect_ai/_view/www/dist/assets/index.css +13 -0
  16. inspect_ai/_view/www/dist/assets/index.js +80 -43
  17. inspect_ai/_view/www/src/App.mjs +31 -6
  18. inspect_ai/_view/www/src/Types.mjs +6 -0
  19. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  20. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  21. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  22. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  23. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  24. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  25. inspect_ai/log/_log.py +3 -0
  26. inspect_ai/log/_recorders/eval.py +8 -7
  27. inspect_ai/model/_generate_config.py +6 -0
  28. inspect_ai/model/_providers/azureai.py +1 -1
  29. inspect_ai/model/_providers/bedrock.py +17 -1
  30. inspect_ai/model/_providers/hf.py +1 -1
  31. inspect_ai/model/_providers/openai.py +32 -8
  32. inspect_ai/model/_providers/providers.py +1 -1
  33. inspect_ai/model/_providers/vllm.py +1 -1
  34. inspect_ai/util/_sandbox/context.py +1 -2
  35. inspect_ai/util/_sandbox/docker/config.py +8 -10
  36. inspect_ai/util/_sandbox/docker/docker.py +9 -5
  37. inspect_ai/util/_sandbox/docker/util.py +3 -3
  38. inspect_ai/util/_sandbox/environment.py +7 -2
  39. inspect_ai/util/_sandbox/limits.py +1 -1
  40. inspect_ai/util/_sandbox/local.py +8 -9
  41. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/METADATA +1 -3
  42. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/RECORD +46 -46
  43. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/LICENSE +0 -0
  44. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/WHEEL +0 -0
  45. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/entry_points.txt +0 -0
  46. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.53.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -192,6 +193,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--message-limit",
     type=int,
@@ -361,6 +368,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
     envvar="INSPECT_EVAL_CACHE_PROMPT",
 )
+@click.option(
+    "--reasoning-effort",
+    type=click.Choice(["low", "medium", "high"]),
+    help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+    envvar="INSPECT_EVAL_REASONING_EFFORT",
+)
 @click.option(
     "--log-format",
     type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -419,12 +432,14 @@ def eval_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -472,6 +487,7 @@ def eval_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -573,12 +589,14 @@ def eval_set_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -628,6 +646,7 @@ def eval_set_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -679,6 +698,7 @@ def eval_exec(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -756,6 +776,7 @@ def eval_exec(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -834,6 +855,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -904,6 +931,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -947,6 +975,7 @@ def eval_retry_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
inspect_ai/_display/core/progress.py CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"


-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-    return " " * padding + f"[{complete_str}/{total_str}]"
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
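
The new width parameter only right-justifies the already padded count string. A minimal standalone sketch (re-implementing the padding logic from the hunk above rather than importing the package module) illustrates the effect:

    def progress_count(complete: int, total: int, width: int | None = None) -> str:
        # pad so the numerator stays aligned with the widest possible total
        total_str = f"{total:,}"
        complete_str = f"{complete:,}"
        padding = max(0, len(total_str) - len(complete_str))
        padded = " " * padding + f"[{complete_str}/{total_str}]"
        # then right-justify the whole label to the requested column width
        if width is not None:
            padded = padded.rjust(width)
        return padded

    print(repr(progress_count(5, 1000)))      # '    [5/1,000]'
    print(repr(progress_count(5, 1000, 15)))  # '      [5/1,000]'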
inspect_ai/_display/core/results.py CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message


-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"

-    if show_reducer:
-        return f"{metric.name}/{metric.reducer}: {value}"
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-        return f"{metric.name}: {value}"
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str


def task_metrics(scores: list[EvalScore]) -> str:
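
The width argument added to task_metric serves the same purpose at the metric-label level: labels from different tasks can be right-justified to a shared column. A simplified sketch of the formatting branch above, with a plain dataclass standing in for TaskDisplayMetric:

    from dataclasses import dataclass

    @dataclass
    class Metric:  # stand-in for TaskDisplayMetric
        name: str
        value: float
        reducer: str | None = None

    def format_metric(metric: Metric, show_reducer: bool, width: int | None = None) -> str:
        value = f"{metric.value:.2f}"
        if show_reducer and metric.reducer is not None:
            metric_str = f"{metric.name}/{metric.reducer}: {value}"
        else:
            metric_str = f"{metric.name}: {value}"
        if width is not None:
            metric_str = metric_str.rjust(width)  # pad on the left to the shared width
        return metric_str

    print(format_metric(Metric("accuracy", 0.8, "mean"), True, 25))
    # prints 'accuracy/mean: 0.80' right-justified to 25 characters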
inspect_ai/_display/textual/widgets/task_detail.py CHANGED
@@ -224,6 +224,9 @@ class TaskMetrics(Widget):
         self.recompute_grid()

     def recompute_grid(self) -> None:
+        if not self.is_mounted:
+            return
+
         grid = self.query_one(f"#{self.grid_id()}")

         grid.remove_children()
inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )

+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+

 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,7 @@ class TasksView(Container):
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +84,41 @@
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()

     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width
+            task, self.description_width, self.model_name_width, self.sample_count_width
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display

+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +169,18 @@
     """

     def __init__(
-        self, task: TaskWithResult, description_width: int, model_name_width: int
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
     ) -> None:
         super().__init__()
         self.t = task

         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,6 +189,14 @@
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

+        self.sample_count_width: int = sample_count_width
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
         yield self.toggle
         yield TaskStatusIcon()
@@ -191,13 +234,51 @@
         self.task_progress.complete()

     def sample_complete(self, complete: int, total: int) -> None:
-        self.count_display.update(progress_count(complete, total))
+        self.samples_complete = complete
+        self.samples_total = total

     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-        if len(metrics) > 0:
-            self.metrics_display.update(task_metric(metrics))
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)

+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+

 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
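
Functionally, the new reactive attributes and update_progress_widths let the tasks view negotiate one shared column width across all task rows. A rough, framework-free sketch of that negotiation (Textual widgets and reactives omitted):

    MAX_METRIC_WIDTH = 25

    def negotiate_metrics_width(row_widths: list[int | None]) -> int:
        # take the widest metrics label reported by any task row,
        # clamped so one long label cannot blow out the column
        widest = max((w for w in row_widths if w is not None), default=0)
        return min(widest, MAX_METRIC_WIDTH)

    print(negotiate_metrics_width([12, None, 18]))  # 18
    print(negotiate_metrics_width([12, None, 40]))  # 25 (clamped)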
inspect_ai/_eval/eval.py CHANGED
@@ -71,6 +71,7 @@ def eval(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -129,6 +130,8 @@ def eval(
             (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+            to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
@@ -175,6 +178,7 @@ def eval(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -211,6 +215,7 @@ async def eval_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -268,6 +273,8 @@ async def eval_async(
             (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+            to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
@@ -368,6 +375,7 @@ async def eval_async(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         log_samples=log_samples,
         log_images=log_images,
@@ -450,6 +458,7 @@ def eval_retry(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
     fail_on_error: bool | float | None = None,
@@ -480,6 +489,8 @@ def eval_retry(
             (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+            to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
             (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
@@ -522,6 +533,7 @@ def eval_retry(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         fail_on_error=fail_on_error,
         debug_errors=debug_errors,
@@ -545,6 +557,7 @@ async def eval_retry_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -574,6 +587,7 @@ async def eval_retry_async(
             (default is 1)
         max_subprocesses (int): Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
+        max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
             (defaults to True)
         fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -665,6 +679,7 @@ async def eval_retry_async(
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
+        max_sandboxes = max_sandboxes or eval_log.eval.config.max_sandboxes
         sandbox_cleanup = (
             sandbox_cleanup
             if sandbox_cleanup is not None
@@ -720,6 +735,7 @@ async def eval_retry_async(
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
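
Programmatic use of the new argument mirrors the CLI flag. A hypothetical call (the task file and model below are placeholders, not part of this diff):

    from inspect_ai import eval

    logs = eval(
        "my_sandboxed_task.py",   # placeholder task
        model="openai/gpt-4o",    # placeholder model
        max_samples=50,
        max_sandboxes=10,         # new in 0.3.53: per-provider cap on concurrent sandboxes
    )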
inspect_ai/_eval/evalset.py CHANGED
@@ -75,6 +75,7 @@ def eval_set(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -144,6 +145,8 @@ def eval_set(
             (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+            to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
@@ -193,6 +196,7 @@ def eval_set(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
inspect_ai/_eval/registry.py CHANGED
@@ -146,8 +146,8 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # module import, so set its task file and run dir
         if get_installed_package_name(task_type) is None:
             module = inspect.getmodule(task_type)
-            if module and module.__file__:
-                file = Path(module.__file__)
+            if module and hasattr(module, "__file__"):
+                file = Path(getattr(module, "__file__"))
                 setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
                 setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())

inspect_ai/_eval/task/results.py CHANGED
@@ -267,10 +267,28 @@ def scorers_from_metric_dict(
             value = target_metric(metric_scores)
         else:
             value = float("Nan")
-        result_metrics[metric_name] = EvalMetric(
-            name=metric_name,
-            value=cast(float, value),
-        )
+
+        # convert the value to a float (either by expanding the dict or array)
+        # or by casting to a float
+        if isinstance(value, dict):
+            for key, val in value.items():
+                name = f"{metric_name}_{key}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, val),
+                )
+        elif isinstance(value, list):
+            for idx, item in enumerate(value):
+                name = f"{metric_name}_{idx}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, item),
+                )
+        else:
+            result_metrics[metric_name] = EvalMetric(
+                name=metric_name,
+                value=cast(float, value),
+            )

         # create a scorer result for this metric
         # TODO: What if there is separate simple scorer which has a name collision with
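
The effect of the expanded branch above is that a metric returning a dict or list is flattened into one EvalMetric per entry, named <metric>_<key> or <metric>_<index>, while scalar values are recorded exactly as before. A standalone sketch of that flattening (plain floats standing in for EvalMetric):

    def expand_metric(metric_name: str, value) -> dict[str, float]:
        result: dict[str, float] = {}
        if isinstance(value, dict):
            for key, val in value.items():
                result[f"{metric_name}_{key}"] = float(val)
        elif isinstance(value, list):
            for idx, item in enumerate(value):
                result[f"{metric_name}_{idx}"] = float(item)
        else:
            result[metric_name] = float(value)
        return result

    print(expand_metric("accuracy", {"easy": 0.9, "hard": 0.5}))
    # {'accuracy_easy': 0.9, 'accuracy_hard': 0.5}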
inspect_ai/_eval/task/run.py CHANGED
@@ -178,6 +178,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     else:
         plan = Plan(unroll(solver), internal=True)

+    # add setup solver(s) if specified
+    if task.setup:
+        plan.steps = unroll(task.setup) + plan.steps
+
     # reaolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
@@ -275,6 +279,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
             sample=sample,
             state=state,
             sandbox=sandbox,
+            max_sandboxes=config.max_sandboxes,
             sandbox_cleanup=sandbox_cleanup,
             plan=plan,
             scorers=scorers,
@@ -456,6 +461,7 @@ async def task_run_sample(
     sample: Sample,
     state: TaskState,
     sandbox: SandboxEnvironmentSpec | None,
+    max_sandboxes: int | None,
     sandbox_cleanup: bool,
     plan: Plan,
     scorers: list[Scorer] | None,
@@ -482,8 +488,8 @@
             await logger.log_sample(previous_sample, flush=False)

             # return score
-            if previous_sample.scores:
-                return {
+            sample_scores = (
+                {
                     key: SampleScore(
                         sample_id=previous_sample.id,
                         value=score.value,
@@ -493,8 +499,11 @@
                     )
                     for key, score in previous_sample.scores.items()
                 }
-            else:
-                return {}
+                if previous_sample.scores
+                else {}
+            )
+            sample_complete(sample_scores)
+            return sample_scores

     # use semaphore if provided
     semaphore_cm: asyncio.Semaphore | contextlib.AbstractAsyncContextManager[None] = (
@@ -510,7 +519,7 @@

     # use sandbox if provided
     sandboxenv_cm = (
-        sandboxenv_context(task_name, sandbox, sandbox_cleanup, sample)
+        sandboxenv_context(task_name, sandbox, max_sandboxes, sandbox_cleanup, sample)
         if sandbox or sample.sandbox is not None
         else contextlib.nullcontext()
     )
@@ -866,10 +875,5 @@ def create_sample_semaphore(
         else DEFAULT_MAX_CONNECTIONS
     )

-    # if max_tasks is specified and max_samples is less
-    # than max_tasks then bump it up
-    if config.max_tasks is not None:
-        max_samples = max(max_samples, config.max_tasks)
-
     # return the semaphore
     return asyncio.Semaphore(max_samples)
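
Of the run.py changes, the setup handling is the simplest to picture: any setup solver(s) declared by the task are unrolled and prepended to the plan so they always execute before the main solver chain. A minimal sketch with plain lists standing in for Plan and solvers (the step names are illustrative only):

    def unroll(step_or_steps):
        # mirror the unroll helper: accept a single step or a list of steps
        return list(step_or_steps) if isinstance(step_or_steps, (list, tuple)) else [step_or_steps]

    plan_steps = ["solve", "critique"]      # hypothetical existing plan
    task_setup = "install_tools"            # hypothetical setup solver

    if task_setup:
        plan_steps = unroll(task_setup) + plan_steps

    print(plan_steps)  # ['install_tools', 'solve', 'critique']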