inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. inspect_ai/_cli/eval.py +55 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/core/progress.py +9 -3
  5. inspect_ai/_display/core/results.py +8 -4
  6. inspect_ai/_display/textual/app.py +5 -1
  7. inspect_ai/_display/textual/widgets/task_detail.py +3 -0
  8. inspect_ai/_display/textual/widgets/tasks.py +97 -6
  9. inspect_ai/_eval/eval.py +33 -0
  10. inspect_ai/_eval/evalset.py +4 -0
  11. inspect_ai/_eval/registry.py +2 -2
  12. inspect_ai/_eval/task/images.py +4 -14
  13. inspect_ai/_eval/task/results.py +22 -4
  14. inspect_ai/_eval/task/run.py +40 -20
  15. inspect_ai/_eval/task/sandbox.py +72 -43
  16. inspect_ai/_eval/task/task.py +4 -0
  17. inspect_ai/_eval/task/util.py +2 -0
  18. inspect_ai/_util/constants.py +3 -3
  19. inspect_ai/_util/display.py +1 -0
  20. inspect_ai/_util/logger.py +34 -8
  21. inspect_ai/_util/trace.py +275 -0
  22. inspect_ai/_view/www/App.css +13 -0
  23. inspect_ai/_view/www/dist/assets/index.css +13 -0
  24. inspect_ai/_view/www/dist/assets/index.js +80 -43
  25. inspect_ai/_view/www/src/App.mjs +31 -6
  26. inspect_ai/_view/www/src/Types.mjs +6 -0
  27. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  28. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  29. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  30. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  31. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  32. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  33. inspect_ai/log/_log.py +6 -0
  34. inspect_ai/log/_message.py +2 -2
  35. inspect_ai/log/_recorders/eval.py +8 -18
  36. inspect_ai/log/_recorders/json.py +19 -17
  37. inspect_ai/model/_cache.py +22 -16
  38. inspect_ai/model/_call_tools.py +9 -1
  39. inspect_ai/model/_generate_config.py +8 -2
  40. inspect_ai/model/_model.py +11 -12
  41. inspect_ai/model/_providers/azureai.py +1 -1
  42. inspect_ai/model/_providers/bedrock.py +18 -2
  43. inspect_ai/model/_providers/hf.py +1 -1
  44. inspect_ai/model/_providers/openai.py +32 -8
  45. inspect_ai/model/_providers/providers.py +1 -1
  46. inspect_ai/model/_providers/vllm.py +1 -1
  47. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  48. inspect_ai/util/_sandbox/context.py +7 -3
  49. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  50. inspect_ai/util/_sandbox/docker/config.py +8 -10
  51. inspect_ai/util/_sandbox/docker/docker.py +20 -16
  52. inspect_ai/util/_sandbox/docker/util.py +3 -9
  53. inspect_ai/util/_sandbox/environment.py +7 -2
  54. inspect_ai/util/_sandbox/limits.py +1 -1
  55. inspect_ai/util/_sandbox/local.py +8 -9
  56. inspect_ai/util/_sandbox/service.py +17 -7
  57. inspect_ai/util/_subprocess.py +6 -1
  58. inspect_ai/util/_subtask.py +8 -2
  59. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
  60. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
  61. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
  62. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
  63. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
  64. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -41,6 +42,7 @@ LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not s
 NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
+NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
 MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
 MAX_RETRIES_HELP = (
     f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -192,6 +194,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=MAX_SUBPROCESSES_HELP,
         envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
     )
+    @click.option(
+        "--max-sandboxes",
+        type=int,
+        help=MAX_SANDBOXES_HELP,
+        envvar="INSPECT_EVAL_MAX_SANDBOXES",
+    )
     @click.option(
         "--message-limit",
         type=int,
@@ -250,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=NO_SCORE_HELP,
         envvar="INSPECT_EVAL_NO_SCORE",
     )
+    @click.option(
+        "--no-score-display",
+        type=bool,
+        is_flag=True,
+        help=NO_SCORE_HELP,
+        envvar="INSPECT_EVAL_SCORE_DISPLAY",
+    )
     @click.option(
         "--max-tokens",
         type=int,
@@ -332,7 +347,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         "--logprobs",
         type=bool,
         is_flag=True,
-        help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, vLLM only.",
+        help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only.",
         envvar="INSPECT_EVAL_LOGPROBS",
     )
     @click.option(
@@ -361,6 +376,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
         envvar="INSPECT_EVAL_CACHE_PROMPT",
     )
+    @click.option(
+        "--reasoning-effort",
+        type=click.Choice(["low", "medium", "high"]),
+        help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+        envvar="INSPECT_EVAL_REASONING_EFFORT",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -419,18 +440,21 @@ def eval_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
     **common: Unpack[CommonOptions],
 ) -> None:
@@ -472,6 +496,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -479,6 +504,7 @@
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=False,
         **config,
     )
@@ -573,18 +599,21 @@ def eval_set_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     bundle_dir: str | None,
     bundle_overwrite: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -628,6 +657,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -635,6 +665,7 @@
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=True,
         retry_attempts=retry_attempts,
         retry_wait=retry_wait,
@@ -679,6 +710,7 @@ def eval_exec(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -686,6 +718,7 @@ def eval_exec(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     is_eval_set: bool = False,
     retry_attempts: int | None = None,
     retry_wait: int | None = None,
@@ -726,6 +759,7 @@
     log_images = False if log_images is False else None
     trace = True if trace else None
    score = False if no_score else True
+    score_display = False if no_score_display else None
 
     # build params
     params: dict[str, Any] = (
@@ -756,10 +790,12 @@
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
         )
         | kwargs
     )
@@ -834,6 +870,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -888,6 +930,13 @@
     help=NO_SCORE_HELP,
     envvar="INSPECT_EVAL_SCORE",
 )
+@click.option(
+    "--no-score-display",
+    type=bool,
+    is_flag=True,
+    help=NO_SCORE_HELP,
+    envvar="INSPECT_EVAL_SCORE_DISPLAY",
+)
 @click.option(
     "--max-connections",
     type=int,
@@ -904,6 +953,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
    max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -912,6 +962,7 @@
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     max_connections: int | None,
     max_retries: int | None,
     timeout: int | None,
@@ -926,6 +977,7 @@
     log_samples = False if no_log_samples else None
     log_images = False if log_images is False else None
     score = False if no_score else True
+    score_display = False if no_score_display else None
 
     # resolve fail_on_error
     if no_fail_on_error is True:
@@ -947,6 +999,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
@@ -955,6 +1008,7 @@
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
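For orientation (not part of the diff): the new flags above map to the environment variables shown (INSPECT_EVAL_MAX_SANDBOXES, INSPECT_EVAL_SCORE_DISPLAY, INSPECT_EVAL_REASONING_EFFORT), and eval_exec() forwards max_sandboxes and score_display into the params dict it passes to the eval/eval-set call. A minimal Python sketch of the equivalent usage follows; it assumes eval() accepts the same keyword arguments that eval_exec() assembles, and that reasoning_effort is a GenerateConfig field (suggested by the inspect_ai/model/_generate_config.py entry in the file list). The task file name is hypothetical.

from inspect_ai import eval

logs = eval(
    "theory_of_mind.py",      # hypothetical task file
    model="openai/o1",
    max_sandboxes=10,         # new in 0.3.54: per-provider cap on concurrent sandboxes
    score_display=False,      # new in 0.3.54: suppress realtime scoring metrics
    reasoning_effort="high",  # new in 0.3.54: generate-config option for o1-style models
)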
inspect_ai/_cli/main.py CHANGED
@@ -11,6 +11,7 @@ from .list import list_command
 from .log import log_command
 from .sandbox import sandbox_command
 from .score import score_command
+from .trace import trace_command
 from .view import view_command
 
 
@@ -46,6 +47,7 @@ inspect.add_command(log_command)
 inspect.add_command(score_command)
 inspect.add_command(view_command)
 inspect.add_command(sandbox_command)
+inspect.add_command(trace_command)
 
 
 def main() -> None:
inspect_ai/_cli/trace.py ADDED
@@ -0,0 +1,244 @@
+import os
+import shlex
+import time
+from datetime import datetime
+from json import dumps
+from pathlib import Path
+from typing import Callable, cast
+
+import click
+from pydantic_core import to_json
+from rich import print as r_print
+from rich.console import Console, RenderableType
+from rich.table import Column, Table
+
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai._util.logger import TRACE_FILE_NAME
+from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file
+
+
+@click.group("trace")
+def trace_command() -> None:
+    """List and read execution traces.
+
+    Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
+    """
+    return None
+
+
+@trace_command.command("list")
+@click.option(
+    "--json",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Output listing as JSON",
+)
+def list_command(json: bool) -> None:
+    """List all trace files."""
+    trace_dir = inspect_trace_dir()
+    trace_files: list[dict[str, float | str]] = [
+        {"mtime": f.lstat().st_mtime, "file": f.absolute().as_posix()}
+        for f in trace_dir.iterdir()
+        if f.is_file()
+    ]
+    trace_files.sort(key=lambda f: cast(float, f["mtime"]), reverse=True)
+    if json:
+        print(dumps(trace_files, indent=2))
+    else:
+        table = Table(box=None, show_header=True, pad_edge=False)
+        table.add_column("Time")
+        table.add_column("Trace File")
+        for file in trace_files:
+            mtime = datetime.fromtimestamp(cast(float, file["mtime"])).astimezone()
+            table.add_row(
+                mtime.strftime("%d-%b %H:%M:%S %Z"), shlex.quote(str(file["file"]))
+            )
+        r_print(table)
+
+
+@trace_command.command("dump")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+def read_command(trace_file: str) -> None:
+    """Dump a trace file to stdout (as a JSON array of log records)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+
+    traces = read_trace_file(trace_file_path)
+    print(
+        to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
+    )
+
+
+@trace_command.command("anomalies")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+@click.option(
+    "--all",
+    is_flag=True,
+    default=False,
+    help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
+)
+def anomolies_command(trace_file: str, all: bool) -> None:
+    """Look for anomalies in a trace file (never completed or cancelled actions)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+    traces = read_trace_file(trace_file_path)
+
+    # Track started actions
+    running_actions: dict[str, ActionTraceRecord] = {}
+    canceled_actions: dict[str, ActionTraceRecord] = {}
+    error_actions: dict[str, ActionTraceRecord] = {}
+    timeout_actions: dict[str, ActionTraceRecord] = {}
+
+    def action_started(trace: ActionTraceRecord) -> None:
+        running_actions[trace.trace_id] = trace
+
+    def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
+        start_trace = running_actions.get(trace.trace_id)
+        if start_trace:
+            del running_actions[trace.trace_id]
+            return start_trace
+        else:
+            raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
+
+    def action_failed(trace: ActionTraceRecord) -> None:
+        if all:
+            error_actions[start_trace.trace_id] = trace
+
+    def action_canceled(trace: ActionTraceRecord) -> None:
+        canceled_actions[start_trace.trace_id] = trace
+
+    def action_timeout(trace: ActionTraceRecord) -> None:
+        if all:
+            timeout_actions[start_trace.trace_id] = trace
+
+    for trace in traces:
+        if isinstance(trace, ActionTraceRecord):
+            match trace.event:
+                case "enter":
+                    action_started(trace)
+                case "exit":
+                    action_completed(trace)
+                case "cancel":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_canceled(trace)
+                case "error":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_failed(trace)
+                case "timeout":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_timeout(trace)
+                case _:
+                    print(f"Unknown event type: {trace.event}")
+
+    # do we have any traces?
+    if (
+        len(running_actions)
+        + len(canceled_actions)
+        + len(error_actions)
+        + len(timeout_actions)
+        == 0
+    ):
+        print(f"TRACE: {shlex.quote(trace_file_path.as_posix())}\n")
+        if all:
+            print("No anomalies found in trace log.")
+        else:
+            print(
+                "No running or cancelled actions found in trace log (pass --all to see errors and timeouts)."
+            )
+        return
+
+    with open(os.devnull, "w") as f:
+        # generate output
+        console = Console(record=True, file=f)
+
+        def print_fn(o: RenderableType) -> None:
+            console.print(o, highlight=False)
+
+        print_fn(f"[bold]TRACE: {shlex.quote(trace_file_path.as_posix())}[bold]")
+
+        _print_bucket(print_fn, "Running Actions", running_actions)
+        _print_bucket(print_fn, "Cancelled Actions", canceled_actions)
+        _print_bucket(print_fn, "Error Actions", error_actions)
+        _print_bucket(print_fn, "Timeout Actions", timeout_actions)
+
+        # print
+        print(console.export_text(styles=True).strip())
+
+
+def _print_bucket(
+    print_fn: Callable[[RenderableType], None],
+    label: str,
+    bucket: dict[str, ActionTraceRecord],
+) -> None:
+    if len(bucket) > 0:
+        # Sort the items in chronological order of when
+        # they finished so the first finished item is at the top
+        sorted_actions = sorted(
+            bucket.values(),
+            key=lambda record: (record.start_time or 0) + (record.duration or 0),
+            reverse=True,
+        )
+
+        # create table
+        table = Table(
+            Column(""),
+            Column("", justify="right"),
+            Column(""),
+            Column("", width=22),
+            box=None,
+            title=label,
+            title_justify="left",
+            title_style="bold",
+            pad_edge=False,
+            padding=(0, 1),
+        )
+
+        for action in sorted_actions:
+            # Compute duration (use the event duration or time since started)
+            duration = (
+                action.duration
+                if action.duration is not None
+                else time.time() - action.start_time
+                if action.start_time is not None
+                else 0.0
+            )
+
+            # The event start time
+            start_time = formatTime(action.start_time) if action.start_time else "None"
+
+            # Event detail
+            detail = (
+                f"{action.detail or action.message} {action.error}"
+                if action.event == "error"
+                else (action.detail or action.message)
+            )
+
+            table.add_row(
+                action.action,
+                f"{round(duration, 2):.2f}s".rjust(8),
+                f" {detail}",
+                start_time,
+            )
+
+        print_fn("")
+        print_fn(table)
+
+
+def resolve_trace_file_path(trace_file: str) -> Path:
+    trace_file_path = Path(trace_file)
+    if not trace_file_path.is_absolute():
+        trace_file_path = inspect_trace_dir() / trace_file_path
+
+    if not trace_file_path.exists():
+        raise PrerequisiteError(
+            f"The specified trace file '{trace_file_path}' does not exist."
+        )
+
+    return trace_file_path
+
+
+def formatTime(timestamp: float) -> str:
+    dt = datetime.fromtimestamp(timestamp).astimezone()
+    return dt.strftime("%H:%M:%S %Z")
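For orientation (not part of the diff): the new command is a thin wrapper over helpers in inspect_ai._util.trace, also added in this release (see inspect_ai/_util/trace.py in the file list). The sketch below drives those helpers directly from Python; it reuses only the names imported above (inspect_trace_dir, read_trace_file, ActionTraceRecord) and mirrors the enter/exit matching in the anomalies command, so treat it as illustrative rather than a documented API.

from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file

# most recently modified trace file (the same ordering `inspect trace list` uses)
latest = max(
    (f for f in inspect_trace_dir().iterdir() if f.is_file()),
    key=lambda f: f.lstat().st_mtime,
)

# actions that entered but never exited (the core of `inspect trace anomalies`)
open_actions: dict[str, ActionTraceRecord] = {}
for record in read_trace_file(latest):
    if isinstance(record, ActionTraceRecord):
        if record.event == "enter":
            open_actions[record.trace_id] = record
        else:
            open_actions.pop(record.trace_id, None)

for action in open_actions.values():
    print(action.action, action.message)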
inspect_ai/_display/core/progress.py CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"
 
 
-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-    return " " * padding + f"[{complete_str}/{total_str}]"
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
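To make the padding behaviour concrete, a hand-worked sketch of the new width parameter follows (expected strings derived from the code above, not captured output; the import path is the private module shown in this diff):

from inspect_ai._display.core.progress import progress_count

progress_count(5, 1000)            # "    [5/1,000]"  (4 pad spaces keep the count column stable)
progress_count(5, 1000, width=15)  # "      [5/1,000]" (additionally right-justified to 15 chars)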
inspect_ai/_display/core/results.py CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message
 
 
-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"
 
-    if show_reducer:
-        return f"{metric.name}/{metric.reducer}: {value}"
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-        return f"{metric.name}: {value}"
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str
 
 
 def task_metrics(scores: list[EvalScore]) -> str:
inspect_ai/_display/textual/app.py CHANGED
@@ -197,7 +197,11 @@ class TaskScreenApp(App[TR]):
 
         # add task
         try:
-            yield self.query_one(TasksView).add_task(task)
+            task_view = self.query_one(TasksView)
+            task_view.set_display_metrics(
+                profile.eval_config.score_display is not False
+            )
+            yield task_view.add_task(task)
         finally:
             pass
 
inspect_ai/_display/textual/widgets/task_detail.py CHANGED
@@ -224,6 +224,9 @@ class TaskMetrics(Widget):
         self.recompute_grid()
 
     def recompute_grid(self) -> None:
+        if not self.is_mounted:
+            return
+
         grid = self.query_one(f"#{self.grid_id()}")
 
         grid.remove_children()