inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. inspect_ai/_cli/eval.py +55 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/core/progress.py +9 -3
  5. inspect_ai/_display/core/results.py +8 -4
  6. inspect_ai/_display/textual/app.py +5 -1
  7. inspect_ai/_display/textual/widgets/task_detail.py +3 -0
  8. inspect_ai/_display/textual/widgets/tasks.py +97 -6
  9. inspect_ai/_eval/eval.py +33 -0
  10. inspect_ai/_eval/evalset.py +4 -0
  11. inspect_ai/_eval/registry.py +2 -2
  12. inspect_ai/_eval/task/images.py +4 -14
  13. inspect_ai/_eval/task/results.py +22 -4
  14. inspect_ai/_eval/task/run.py +40 -20
  15. inspect_ai/_eval/task/sandbox.py +72 -43
  16. inspect_ai/_eval/task/task.py +4 -0
  17. inspect_ai/_eval/task/util.py +2 -0
  18. inspect_ai/_util/constants.py +3 -3
  19. inspect_ai/_util/display.py +1 -0
  20. inspect_ai/_util/logger.py +34 -8
  21. inspect_ai/_util/trace.py +275 -0
  22. inspect_ai/_view/www/App.css +13 -0
  23. inspect_ai/_view/www/dist/assets/index.css +13 -0
  24. inspect_ai/_view/www/dist/assets/index.js +80 -43
  25. inspect_ai/_view/www/src/App.mjs +31 -6
  26. inspect_ai/_view/www/src/Types.mjs +6 -0
  27. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  28. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  29. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  30. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  31. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  32. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  33. inspect_ai/log/_log.py +6 -0
  34. inspect_ai/log/_message.py +2 -2
  35. inspect_ai/log/_recorders/eval.py +8 -18
  36. inspect_ai/log/_recorders/json.py +19 -17
  37. inspect_ai/model/_cache.py +22 -16
  38. inspect_ai/model/_call_tools.py +9 -1
  39. inspect_ai/model/_generate_config.py +8 -2
  40. inspect_ai/model/_model.py +11 -12
  41. inspect_ai/model/_providers/azureai.py +1 -1
  42. inspect_ai/model/_providers/bedrock.py +18 -2
  43. inspect_ai/model/_providers/hf.py +1 -1
  44. inspect_ai/model/_providers/openai.py +32 -8
  45. inspect_ai/model/_providers/providers.py +1 -1
  46. inspect_ai/model/_providers/vllm.py +1 -1
  47. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  48. inspect_ai/util/_sandbox/context.py +7 -3
  49. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  50. inspect_ai/util/_sandbox/docker/config.py +8 -10
  51. inspect_ai/util/_sandbox/docker/docker.py +20 -16
  52. inspect_ai/util/_sandbox/docker/util.py +3 -9
  53. inspect_ai/util/_sandbox/environment.py +7 -2
  54. inspect_ai/util/_sandbox/limits.py +1 -1
  55. inspect_ai/util/_sandbox/local.py +8 -9
  56. inspect_ai/util/_sandbox/service.py +17 -7
  57. inspect_ai/util/_subprocess.py +6 -1
  58. inspect_ai/util/_subtask.py +8 -2
  59. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
  60. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
  61. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
  62. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
  63. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
  64. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )

+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+

 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,8 @@ class TasksView(Container):
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0
+        self.display_metrics = True

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +85,48 @@ class TasksView(Container):
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()

     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width
+            task,
+            self.description_width,
+            self.model_name_width,
+            self.sample_count_width,
+            self.display_metrics,
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display

+    def set_display_metrics(self, display_metrics: bool) -> None:
+        self.display_metrics = display_metrics
+
+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +177,19 @@ class TaskProgressView(Widget):
     """

     def __init__(
-        self, task: TaskWithResult, description_width: int, model_name_width: int
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
+        display_metrics: bool,
     ) -> None:
         super().__init__()
         self.t = task

         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,8 +198,17 @@ class TaskProgressView(Widget):
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

+        self.sample_count_width: int = sample_count_width
+        self.display_metrics = display_metrics
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
-        yield self.toggle
+        yield (self.toggle if self.display_metrics else Static())
         yield TaskStatusIcon()
         yield Static(
             progress_description(self.t.profile, self.description_width, pad=True)
@@ -191,13 +244,51 @@ class TaskProgressView(Widget):
         self.task_progress.complete()

     def sample_complete(self, complete: int, total: int) -> None:
-        self.count_display.update(progress_count(complete, total))
+        self.samples_complete = complete
+        self.samples_total = total

     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-        if len(metrics) > 0:
-            self.metrics_display.update(task_metric(metrics))
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)

+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None and self.metrics_display is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+

 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
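Note: the TaskProgressView changes above move the sample count and metrics into Textual reactive attributes with watch methods, so the widget redraws itself whenever a value is assigned. A minimal sketch of that pattern (generic names, not taken from inspect_ai):

    from textual.reactive import reactive
    from textual.widgets import Static

    class CounterLabel(Static):
        # assigning to a reactive attribute triggers its watch method
        count: reactive[int] = reactive(0)

        def watch_count(self, count: int) -> None:
            # re-render the label with the new value
            self.update(f"count: {count}")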
inspect_ai/_eval/eval.py CHANGED
@@ -71,10 +71,12 @@ def eval(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -129,6 +131,8 @@
            (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+           to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
            even if specified as a filename or URL (defaults to False)
@@ -136,6 +140,7 @@
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.

     Returns:
@@ -175,10 +180,12 @@
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             **kwargs,
         )
     )
@@ -211,10 +218,12 @@ async def eval_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -268,6 +277,8 @@ async def eval_async(
            (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+           to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
            even if specified as a filename or URL (defaults to False)
@@ -275,6 +286,7 @@ async def eval_async(
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.

     Returns:
@@ -368,10 +380,12 @@ async def eval_async(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        score_display=score_display,
     )

     # run tasks - 2 codepaths, one for the traditional task at a time
@@ -450,6 +464,7 @@ def eval_retry(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
     fail_on_error: bool | float | None = None,
@@ -458,6 +473,7 @@
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -480,6 +496,8 @@
            (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+           to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
@@ -496,6 +514,7 @@
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
            Maximum number of times to retry request.
         timeout: (int | None):
@@ -522,6 +541,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         fail_on_error=fail_on_error,
         debug_errors=debug_errors,
@@ -529,6 +549,7 @@
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
@@ -545,6 +566,7 @@ async def eval_retry_async(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -552,6 +574,7 @@
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -574,6 +597,7 @@ async def eval_retry_async(
            (default is 1)
         max_subprocesses (int): Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
+        max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
            (defaults to True)
         fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -589,6 +613,7 @@
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
            Maximum number of times to retry request.
         timeout: (int | None):
@@ -665,6 +690,7 @@ async def eval_retry_async(
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
+        max_sandboxes = max_sandboxes or eval_log.eval.config.max_sandboxes
         sandbox_cleanup = (
            sandbox_cleanup
            if sandbox_cleanup is not None
@@ -684,6 +710,11 @@
         log_buffer = (
            log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
         )
+        score_display = (
+            score_display
+            if score_display is not None
+            else eval_log.eval.config.score_display
+        )

         config = eval_log.plan.config
         config.max_retries = max_retries or config.max_retries
@@ -720,10 +751,12 @@
                 max_samples=max_samples,
                 max_tasks=max_tasks,
                 max_subprocesses=max_subprocesses,
+                max_sandboxes=max_sandboxes,
                 log_samples=log_samples,
                 log_images=log_images,
                 log_buffer=log_buffer,
                 score=score,
+                score_display=score_display,
                 **dict(config),
             )
         )[0]
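Usage note: the two new options pass straight through eval() (and eval_retry()/eval_set()). A minimal sketch, where the task file and model are placeholders but max_sandboxes and score_display are the parameters added above:

    from inspect_ai import eval

    logs = eval(
        "security_guide.py",    # placeholder task file
        model="openai/gpt-4o",  # placeholder model
        max_sandboxes=8,        # cap concurrent sandboxes per provider
        score_display=False,    # skip computing/showing realtime scoring metrics
    )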
inspect_ai/_eval/evalset.py CHANGED
@@ -75,6 +75,7 @@ def eval_set(
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
+    max_sandboxes: int | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
@@ -144,6 +145,8 @@ def eval_set(
            (default is 1)
         max_subprocesses (int | None): Maximum number of subprocesses to
            run in parallel (default is os.cpu_count())
+        max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
+           to run in parallel.
         log_samples: (bool | None): Log detailed samples and scores (defaults to True)
         log_images: (bool | None): Log base64 encoded version of images,
            even if specified as a filename or URL (defaults to False)
@@ -193,6 +196,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
inspect_ai/_eval/registry.py CHANGED
@@ -146,8 +146,8 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # module import, so set its task file and run dir
         if get_installed_package_name(task_type) is None:
             module = inspect.getmodule(task_type)
-            if module and module.__file__:
-                file = Path(module.__file__)
+            if module and hasattr(module, "__file__"):
+                file = Path(getattr(module, "__file__"))
                 setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
                 setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())

inspect_ai/_eval/task/images.py CHANGED
@@ -30,13 +30,8 @@ async def samples_with_base64_images(samples: list[Sample]) -> list[Sample]:

 async def sample_with_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return Sample(
-            input=await messages_with_base64_images(sample.input),
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": await messages_with_base64_images(sample.input)}
         )
     else:
         return sample
@@ -44,13 +39,8 @@ async def sample_with_base64_images(sample: Sample) -> Sample:

 def sample_without_base64_images(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
-        return Sample(
-            input=messages_without_base64_images(sample.input),
-            target=sample.target,
-            id=sample.id,
-            metadata=sample.metadata,
-            files=sample.files,
-            choices=sample.choices,
+        return sample.model_copy(
+            update={"input": messages_without_base64_images(sample.input)}
         )
     else:
         return sample
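Usage note: the rewrite relies on Pydantic's model_copy(update=...), which copies every field of the Sample and overrides only input, so fields not listed in the old constructor call (for example sandbox, which this release references elsewhere) are no longer dropped. A generic sketch of the mechanism, not specific to inspect_ai:

    from pydantic import BaseModel

    class Point(BaseModel):
        x: int
        y: int

    p = Point(x=1, y=2)
    q = p.model_copy(update={"x": 10})  # q.y == 2 is carried over automatically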
inspect_ai/_eval/task/results.py CHANGED
@@ -267,10 +267,28 @@ def scorers_from_metric_dict(
             value = target_metric(metric_scores)
         else:
             value = float("Nan")
-        result_metrics[metric_name] = EvalMetric(
-            name=metric_name,
-            value=cast(float, value),
-        )
+
+        # convert the value to a float (either by expanding the dict or array)
+        # or by casting to a float
+        if isinstance(value, dict):
+            for key, val in value.items():
+                name = f"{metric_name}_{key}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, val),
+                )
+        elif isinstance(value, list):
+            for idx, item in enumerate(value):
+                name = f"{metric_name}_{idx}"
+                result_metrics[name] = EvalMetric(
+                    name=name,
+                    value=cast(float, item),
+                )
+        else:
+            result_metrics[metric_name] = EvalMetric(
+                name=metric_name,
+                value=cast(float, value),
+            )

         # create a scorer result for this metric
         # TODO: What if there is separate simple scorer which has a name collision with
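Usage note: with this change a metric that returns a dict or list is expanded into one EvalMetric per entry instead of being force-cast to a single float. Under this logic, a hypothetical metric named "prf" returning {"precision": 0.8, "recall": 0.7} would surface as prf_precision and prf_recall, and a list-valued result would be suffixed by index (prf_0, prf_1, ...).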
inspect_ai/_eval/task/run.py CHANGED
@@ -178,6 +178,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     else:
         plan = Plan(unroll(solver), internal=True)

+    # add setup solver(s) if specified
+    if task.setup:
+        plan.steps = unroll(task.setup) + plan.steps
+
     # reaolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
@@ -213,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         log_location=log_location,
     )

-    with display().task(profile) as td:
+    with display().task(
+        profile,
+    ) as td:
         try:
             # start the log
             await log_start(logger, plan, generate_config)
@@ -248,7 +254,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:

             # track when samples complete and update progress as we go
             progress_results: list[dict[str, SampleScore]] = []
-            update_metrics_display = update_metrics_display_fn(td)
+            update_metrics_display = update_metrics_display_fn(
+                td,
+                display_metrics=profile.eval_config.score_display is not False,
+            )

             def sample_complete(sample_score: dict[str, SampleScore]) -> None:
                 # Capture the result
@@ -275,6 +284,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                     sample=sample,
                     state=state,
                     sandbox=sandbox,
+                    max_sandboxes=config.max_sandboxes,
                     sandbox_cleanup=sandbox_cleanup,
                     plan=plan,
                     scorers=scorers,
@@ -395,7 +405,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:


 def update_metrics_display_fn(
-    td: TaskDisplay, initial_interval: float = 0, min_interval: float = 0.9
+    td: TaskDisplay,
+    initial_interval: float = 0,
+    min_interval: float = 0.9,
+    display_metrics: bool = True,
 ) -> Callable[
     [
         int,
@@ -415,6 +428,10 @@ def update_metrics_display_fn(
         reducers: ScoreReducer | list[ScoreReducer] | None,
         metrics: list[Metric] | dict[str, list[Metric]] | None,
     ) -> None:
+        # Don't compute metrics if they are not being displayed
+        if not display_metrics:
+            return None
+
         nonlocal next_compute_time
         time_start = time.perf_counter()
         if time_start >= next_compute_time:
@@ -456,6 +473,7 @@ async def task_run_sample(
     sample: Sample,
     state: TaskState,
     sandbox: SandboxEnvironmentSpec | None,
+    max_sandboxes: int | None,
     sandbox_cleanup: bool,
     plan: Plan,
     scorers: list[Scorer] | None,
@@ -482,8 +500,8 @@ async def task_run_sample(
             await logger.log_sample(previous_sample, flush=False)

         # return score
-        if previous_sample.scores:
-            return {
+        sample_scores = (
+            {
                 key: SampleScore(
                     sample_id=previous_sample.id,
                     value=score.value,
@@ -493,8 +511,11 @@ async def task_run_sample(
                 )
                 for key, score in previous_sample.scores.items()
             }
-        else:
-            return {}
+            if previous_sample.scores
+            else {}
+        )
+        sample_complete(sample_scores)
+        return sample_scores

     # use semaphore if provided
     semaphore_cm: asyncio.Semaphore | contextlib.AbstractAsyncContextManager[None] = (
@@ -510,7 +531,7 @@

     # use sandbox if provided
     sandboxenv_cm = (
-        sandboxenv_context(task_name, sandbox, sandbox_cleanup, sample)
+        sandboxenv_context(task_name, sandbox, max_sandboxes, sandbox_cleanup, sample)
         if sandbox or sample.sandbox is not None
         else contextlib.nullcontext()
     )
@@ -559,14 +580,18 @@
                 state = await plan(state, generate)

             except TimeoutError:
-                # notify the user
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
+                if time_limit is not None:
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type="time",
+                            message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                            limit=time_limit,
+                        )
+                    )
+                else:
+                    py_logger.warning(
+                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
                     )
-                )

             # capture most recent state for scoring
             state = sample_state() or state
@@ -866,10 +891,5 @@ def create_sample_semaphore(
         else DEFAULT_MAX_CONNECTIONS
     )

-    # if max_tasks is specified and max_samples is less
-    # than max_tasks then bump it up
-    if config.max_tasks is not None:
-        max_samples = max(max_samples, config.max_tasks)
-
     # return the semaphore
     return asyncio.Semaphore(max_samples)
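Usage note: the first run.py hunk wires up the new Task setup solver(s): when task.setup is present, its steps are prepended to the plan before the main solver runs. A minimal sketch of how a task author might use this (the dataset, prompt text, and the choice of system_message as the setup step are placeholders; the setup argument is the new piece):

    from inspect_ai import Task, task
    from inspect_ai.dataset import Sample
    from inspect_ai.solver import generate, system_message

    @task
    def demo():
        return Task(
            dataset=[Sample(input="What is 2 + 2?", target="4")],
            setup=system_message("Answer concisely."),  # runs before the main solver
            solver=generate(),
        )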