inspect-ai 0.3.51__py3-none-any.whl → 0.3.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. inspect_ai/_cli/eval.py +44 -2
  2. inspect_ai/_display/core/config.py +4 -0
  3. inspect_ai/_display/core/panel.py +1 -1
  4. inspect_ai/_display/core/progress.py +9 -3
  5. inspect_ai/_display/core/results.py +8 -4
  6. inspect_ai/_display/textual/widgets/task_detail.py +45 -13
  7. inspect_ai/_display/textual/widgets/tasks.py +86 -5
  8. inspect_ai/_display/textual/widgets/transcript.py +4 -17
  9. inspect_ai/_eval/eval.py +29 -1
  10. inspect_ai/_eval/evalset.py +7 -0
  11. inspect_ai/_eval/registry.py +2 -2
  12. inspect_ai/_eval/task/log.py +6 -1
  13. inspect_ai/_eval/task/results.py +22 -4
  14. inspect_ai/_eval/task/run.py +18 -12
  15. inspect_ai/_eval/task/sandbox.py +72 -43
  16. inspect_ai/_eval/task/task.py +4 -0
  17. inspect_ai/_eval/task/util.py +17 -6
  18. inspect_ai/_util/logger.py +10 -2
  19. inspect_ai/_util/samples.py +7 -0
  20. inspect_ai/_util/transcript.py +8 -0
  21. inspect_ai/_view/www/App.css +13 -0
  22. inspect_ai/_view/www/dist/assets/index.css +13 -0
  23. inspect_ai/_view/www/dist/assets/index.js +105 -55
  24. inspect_ai/_view/www/src/App.mjs +31 -6
  25. inspect_ai/_view/www/src/Types.mjs +6 -0
  26. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  27. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  28. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  29. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  30. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +18 -5
  31. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  32. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  33. inspect_ai/log/_log.py +6 -0
  34. inspect_ai/log/_recorders/eval.py +8 -7
  35. inspect_ai/model/_call_tools.py +2 -6
  36. inspect_ai/model/_generate_config.py +6 -0
  37. inspect_ai/model/_model.py +18 -4
  38. inspect_ai/model/_providers/azureai.py +22 -2
  39. inspect_ai/model/_providers/bedrock.py +17 -1
  40. inspect_ai/model/_providers/hf.py +1 -1
  41. inspect_ai/model/_providers/openai.py +32 -8
  42. inspect_ai/model/_providers/providers.py +1 -1
  43. inspect_ai/model/_providers/vllm.py +1 -1
  44. inspect_ai/model/_render.py +7 -6
  45. inspect_ai/model/_trace.py +1 -1
  46. inspect_ai/solver/_basic_agent.py +8 -1
  47. inspect_ai/tool/_tool_transcript.py +28 -0
  48. inspect_ai/util/_sandbox/context.py +1 -2
  49. inspect_ai/util/_sandbox/docker/config.py +8 -10
  50. inspect_ai/util/_sandbox/docker/docker.py +9 -5
  51. inspect_ai/util/_sandbox/docker/util.py +3 -3
  52. inspect_ai/util/_sandbox/environment.py +7 -2
  53. inspect_ai/util/_sandbox/limits.py +1 -1
  54. inspect_ai/util/_sandbox/local.py +8 -9
  55. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/METADATA +2 -4
  56. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/RECORD +60 -59
  57. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/LICENSE +0 -0
  58. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/WHEEL +0 -0
  59. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/entry_points.txt +0 -0
  60. {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -12,7 +12,7 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
 )
 from inspect_ai._util.file import filesystem
-from inspect_ai._util.samples import parse_samples_limit
+from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
 from inspect_ai.log._file import log_file_info
 from inspect_ai.model import GenerateConfigArgs
 from inspect_ai.scorer._reducer import create_reducers
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -144,6 +145,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Limit samples to evaluate e.g. 10 or 10-20",
         envvar="INSPECT_EVAL_LIMIT",
     )
+    @click.option(
+        "--sample-id",
+        type=str,
+        help="Evaluate specific sample(s) (comma separated list of ids)",
+        envvar="INSPECT_EVAL_SAMPLE_ID",
+    )
     @click.option(
         "--epochs",
         type=int,
@@ -186,6 +193,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=MAX_SUBPROCESSES_HELP,
         envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
     )
+    @click.option(
+        "--max-sandboxes",
+        type=int,
+        help=MAX_SANDBOXES_HELP,
+        envvar="INSPECT_EVAL_MAX_SANDBOXES",
+    )
     @click.option(
         "--message-limit",
         type=int,
@@ -355,6 +368,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
         envvar="INSPECT_EVAL_CACHE_PROMPT",
     )
+    @click.option(
+        "--reasoning-effort",
+        type=click.Choice(["low", "medium", "high"]),
+        help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+        envvar="INSPECT_EVAL_REASONING_EFFORT",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -391,6 +410,7 @@ def eval_command(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
     max_retries: int | None,
     timeout: int | None,
     max_connections: int | None,
@@ -412,12 +432,14 @@
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -458,12 +480,14 @@
         epochs=epochs,
         epochs_reducer=epochs_reducer,
         limit=limit,
+        sample_id=sample_id,
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -543,6 +567,7 @@ def eval_set_command(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
     max_retries: int | None,
     timeout: int | None,
     max_connections: int | None,
@@ -564,12 +589,14 @@
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -612,12 +639,14 @@
         epochs=epochs,
         epochs_reducer=epochs_reducer,
         limit=limit,
+        sample_id=sample_id,
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -662,12 +691,14 @@ def eval_exec(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -699,8 +730,9 @@
         else None
     )

-    # resolve range
+    # resolve range and sample id
     eval_limit = parse_samples_limit(limit)
+    eval_sample_id = parse_sample_id(sample_id)

     # resolve fail_on_error
     if no_fail_on_error is True:
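The new parse_sample_id helper comes from inspect_ai/_util/samples.py (+7 lines in this release), whose diff is not shown here. A minimal sketch of what it plausibly does, assuming it simply splits the comma-separated --sample-id value and coerces numeric ids to int (illustrative only, not the actual implementation):

    def parse_sample_id(sample_id: str | None) -> list[int | str] | None:
        # Hypothetical sketch; the real helper lives in inspect_ai/_util/samples.py
        # and is not shown in this diff.
        if sample_id is None:
            return None
        ids = [id.strip() for id in sample_id.split(",")]
        # coerce purely numeric ids to int so they can match integer sample ids
        return [int(id) if id.isdigit() else id for id in ids]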
@@ -734,6 +766,7 @@
         log_dir=log_dir,
         log_format=log_format,
         limit=eval_limit,
+        sample_id=eval_sample_id,
         epochs=eval_epochs,
         fail_on_error=fail_on_error,
         debug_errors=debug_errors,
@@ -743,6 +776,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -821,6 +855,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -891,6 +931,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -934,6 +975,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
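As the hunks above show, the new CLI options are passed straight through to the underlying Python API as sample_id= and max_sandboxes= keyword arguments. A hypothetical invocation, assuming inspect_ai.eval() accepts these arguments in 0.3.53 (the task file name is made up):

    from inspect_ai import eval

    # roughly equivalent to: inspect eval my_task.py --sample-id 1,2,5 --max-sandboxes 8
    logs = eval(
        "my_task.py",          # hypothetical task file
        sample_id=[1, 2, 5],   # run only these samples
        max_sandboxes=8,       # cap concurrent sandboxes per provider
    )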
inspect_ai/_display/core/config.py CHANGED
@@ -24,6 +24,10 @@ def task_config(
             config_print.append(
                 f"{name}: {','.join([approver['name'] for approver in value['approvers']])}"
             )
+        elif name == "sample_id":
+            value = value if isinstance(value, list) else [value]
+            value = [str(v) for v in value]
+            config_print.append(f"{name}: {','.join(value)}")
         elif name not in ["limit", "model"]:
             config_print.append(f"{name}: {value}")
     values = ", ".join(config_print)
inspect_ai/_display/core/panel.py CHANGED
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
 def task_title(profile: TaskProfile, show_model: bool) -> str:
     eval_epochs = profile.eval_config.epochs or 1
     epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
-    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples > 1 else ''}"
+    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
     title = f"{registry_unqualified_name(profile.name)} ({samples})"
     if show_model:
         title = f"{title}: {profile.model}"
inspect_ai/_display/core/progress.py CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"


-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-    return " " * padding + f"[{complete_str}/{total_str}]"
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
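To illustrate the new padding behaviour, the values below are worked out directly from the implementation above: the total "1,000" is 5 characters, so a 1-character count gets 4 spaces of left padding, and width= then right-justifies the whole label.

    from inspect_ai._display.core.progress import progress_count

    progress_count(5, 1000)            # "    [5/1,000]"   (13 characters)
    progress_count(5, 1000, width=15)  # "      [5/1,000]" (right-justified to 15)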
inspect_ai/_display/core/results.py CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message


-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"

-    if show_reducer:
-        return f"{metric.name}/{metric.reducer}: {value}"
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-        return f"{metric.name}: {value}"
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str


 def task_metrics(scores: list[EvalScore]) -> str:
inspect_ai/_display/textual/widgets/task_detail.py CHANGED
@@ -63,6 +63,9 @@ class TaskDetail(Widget):
     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
         # Group by reducer then scorer within reducers
         self.metrics = metrics
+
+        # clear the existing computed reducers
+        self.by_reducer = {}
         for metric in metrics:
             reducer_group = (
                 self.by_reducer[metric.reducer]
@@ -117,6 +120,7 @@
         for remove in to_remove:
             task_metric = self.existing_metrics[remove]
             task_metric.remove()
+            del self.existing_metrics[remove]

         # add or update widgets with metrics
         for reducer, scorers in self.by_reducer.items():
@@ -187,24 +191,52 @@ class TaskMetrics(Widget):
         self.grid: Grid = Grid()
         self.value_widgets: dict[str, Static] = {}

+    def grid_id(self) -> str:
+        return f"{self.id}-grid"
+
     def compose(self) -> ComposeResult:
-        # Just yield a single DataTable widget
+        # Yield the title and base grid
         yield Center(self._title())
-        with Grid():
-            for metric in self.metrics:
-                # Add the value static but keep it around
-                # for future updates
-                self.value_widgets[metric.name] = Static(
-                    self._metric_value(metric.value)
-                )
-
-                yield Static(metric.name)
-                yield self.value_widgets[metric.name]
+        yield Grid(id=self.grid_id())

     def update(self, metrics: list[TaskMetric]) -> None:
+        self.metrics = metrics
+
+        # We assume that generally the initial metric names will
+        # always match future updates (so we can just update values in line)
+        # but if an unrecognized metric appears on the scene, just
+        # recompute the whole grid
+        need_recompute = False
         for metric in metrics:
-            widget = self.value_widgets[metric.name]
-            widget.update(content=f"{metric.value:,.3f}")
+            widget = self.value_widgets.get(metric.name)
+            if widget:
+                # Just update the values themselves
+                widget.update(content=f"{metric.value:,.3f}")
+            else:
+                # Don't have a widget for this, recompute the whole grid
+                need_recompute = True
+                break
+
+        if need_recompute:
+            self.recompute_grid()
+
+    def on_mount(self) -> None:
+        self.recompute_grid()
+
+    def recompute_grid(self) -> None:
+        if not self.is_mounted:
+            return
+
+        grid = self.query_one(f"#{self.grid_id()}")
+
+        grid.remove_children()
+        for metric in self.metrics:
+            # Add the value static but keep it around
+            # for future updates
+            self.value_widgets[metric.name] = Static(self._metric_value(metric.value))
+
+            grid.mount(Static(metric.name))
+            grid.mount(self.value_widgets[metric.name])

     def _title(self) -> Widget:
         if self.scorer is None:
inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )

+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+

 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,7 @@
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +84,41 @@
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()

     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width
+            task, self.description_width, self.model_name_width, self.sample_count_width
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display

+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +169,18 @@ class TaskProgressView(Widget):
     """

     def __init__(
-        self, task: TaskWithResult, description_width: int, model_name_width: int
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
     ) -> None:
         super().__init__()
         self.t = task

         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,6 +189,14 @@
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

+        self.sample_count_width: int = sample_count_width
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
         yield self.toggle
         yield TaskStatusIcon()
@@ -191,13 +234,51 @@
         self.task_progress.complete()

     def sample_complete(self, complete: int, total: int) -> None:
-        self.count_display.update(progress_count(complete, total))
+        self.samples_complete = complete
+        self.samples_total = total

     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-        if len(metrics) > 0:
-            self.metrics_display.update(task_metric(metrics))
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)

+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+

 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
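TaskProgressView now routes its display updates through Textual reactive attributes: assigning to self.metrics, self.samples_complete, and friends triggers the corresponding watch methods, which rebuild the count and metrics labels. A minimal, generic sketch of that pattern (not inspect_ai code, just the standard Textual reactive/watch mechanism):

    from textual.reactive import reactive
    from textual.widgets import Static


    class Counter(Static):
        # reassigning `count` automatically invokes watch_count()
        count: reactive[int] = reactive(0)

        def watch_count(self, count: int) -> None:
            # re-render whenever the reactive value changes
            self.update(f"count: {count}")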
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -10,10 +10,10 @@ from textual.widget import Widget
 from textual.widgets import Static

 from inspect_ai._util.content import ContentText
-from inspect_ai._util.format import format_function_call
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
+    transcript_function,
     transcript_markdown,
     transcript_separator,
 )
@@ -36,6 +36,7 @@ from inspect_ai.log._transcript import (
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
+from inspect_ai.tool._tool_transcript import transcript_tool_call


 class TranscriptView(ScrollableContainer):
@@ -195,16 +196,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         display.extend(render_event(e) or [])

     # render the call
-    content: list[RenderableType] = []
-    if event.view:
-        if event.view.title:
-            content.append(Text.from_markup(f"[bold]{event.view.title}[/bold]\n"))
-        if event.view.format == "markdown":
-            content.append(transcript_markdown(event.view.content))
-        else:
-            content.append(event.view.content)
-    else:
-        content.append(render_function_call(event.function, event.arguments))
+    content = transcript_tool_call(event)

     # render the output
     if isinstance(event.result, list):
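The inline rendering removed above appears to have moved into the new inspect_ai/tool/_tool_transcript.py module (+28 lines in the file summary), whose contents are not shown in this diff. A plausible reconstruction of transcript_tool_call, based on the deleted code and the transcript_function helper imported earlier (treat the exact signature and import paths as assumptions):

    from rich.console import RenderableType
    from rich.text import Text

    from inspect_ai._util.transcript import transcript_function, transcript_markdown


    def transcript_tool_call(event) -> list[RenderableType]:
        # Hypothetical sketch; the real helper lives in inspect_ai/tool/_tool_transcript.py.
        content: list[RenderableType] = []
        if event.view:
            if event.view.title:
                content.append(Text.from_markup(f"[bold]{event.view.title}[/bold]\n"))
            if event.view.format == "markdown":
                content.append(transcript_markdown(event.view.content))
            else:
                content.append(event.view.content)
        else:
            content.append(transcript_function(event.function, event.arguments))
        return content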
@@ -266,7 +258,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
     for e in event.events:
         display.extend(render_event(e) or [])

-    content: list[RenderableType] = [render_function_call(event.name, event.input)]
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -309,11 +301,6 @@ def render_error_event(event: ErrorEvent) -> EventDisplay:
     return EventDisplay("error", event.error.traceback.strip())


-def render_function_call(function: str, arguments: dict[str, Any]) -> RenderableType:
-    call = format_function_call(function, arguments)
-    return transcript_markdown("```python\n" + call + "\n```\n")
-
-
 def render_as_json(json: Any) -> RenderableType:
     return transcript_markdown(
         "```json\n"