inspect-ai 0.3.53__py3-none-any.whl → 0.3.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. inspect_ai/_cli/eval.py +26 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/textual/app.py +5 -1
  5. inspect_ai/_display/textual/widgets/tasks.py +13 -3
  6. inspect_ai/_eval/eval.py +17 -0
  7. inspect_ai/_eval/task/images.py +4 -14
  8. inspect_ai/_eval/task/log.py +2 -1
  9. inspect_ai/_eval/task/run.py +26 -10
  10. inspect_ai/_util/constants.py +3 -3
  11. inspect_ai/_util/display.py +1 -0
  12. inspect_ai/_util/logger.py +34 -8
  13. inspect_ai/_util/trace.py +275 -0
  14. inspect_ai/log/_log.py +3 -0
  15. inspect_ai/log/_message.py +2 -2
  16. inspect_ai/log/_recorders/eval.py +6 -17
  17. inspect_ai/log/_recorders/json.py +19 -17
  18. inspect_ai/model/_cache.py +22 -16
  19. inspect_ai/model/_call_tools.py +9 -1
  20. inspect_ai/model/_generate_config.py +2 -2
  21. inspect_ai/model/_model.py +11 -12
  22. inspect_ai/model/_providers/bedrock.py +1 -1
  23. inspect_ai/model/_providers/openai.py +11 -1
  24. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  25. inspect_ai/util/_sandbox/context.py +6 -1
  26. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  27. inspect_ai/util/_sandbox/docker/docker.py +11 -11
  28. inspect_ai/util/_sandbox/docker/util.py +0 -6
  29. inspect_ai/util/_sandbox/service.py +17 -7
  30. inspect_ai/util/_subprocess.py +6 -1
  31. inspect_ai/util/_subtask.py +8 -2
  32. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/METADATA +7 -7
  33. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/RECORD +37 -35
  34. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/LICENSE +0 -0
  35. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/WHEEL +0 -0
  36. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/entry_points.txt +0 -0
  37. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -42,6 +42,7 @@ LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not s
  NO_SCORE_HELP = (
  "Do not score model output (use the inspect score command to score output later)"
  )
+ NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
  MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
  MAX_RETRIES_HELP = (
  f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -257,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  help=NO_SCORE_HELP,
  envvar="INSPECT_EVAL_NO_SCORE",
  )
+ @click.option(
+ "--no-score-display",
+ type=bool,
+ is_flag=True,
+ help=NO_SCORE_HELP,
+ envvar="INSPECT_EVAL_SCORE_DISPLAY",
+ )
  @click.option(
  "--max-tokens",
  type=int,
@@ -339,7 +347,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  "--logprobs",
  type=bool,
  is_flag=True,
- help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, vLLM only.",
+ help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only.",
  envvar="INSPECT_EVAL_LOGPROBS",
  )
  @click.option(
@@ -446,6 +454,7 @@ def eval_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  log_format: Literal["eval", "json"] | None,
  **common: Unpack[CommonOptions],
  ) -> None:
@@ -495,6 +504,7 @@ def eval_command(
  log_images=log_images,
  log_buffer=log_buffer,
  no_score=no_score,
+ no_score_display=no_score_display,
  is_eval_set=False,
  **config,
  )
@@ -603,6 +613,7 @@ def eval_set_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  bundle_dir: str | None,
  bundle_overwrite: bool | None,
  log_format: Literal["eval", "json"] | None,
@@ -654,6 +665,7 @@ def eval_set_command(
  log_images=log_images,
  log_buffer=log_buffer,
  no_score=no_score,
+ no_score_display=no_score_display,
  is_eval_set=True,
  retry_attempts=retry_attempts,
  retry_wait=retry_wait,
@@ -706,6 +718,7 @@ def eval_exec(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  is_eval_set: bool = False,
  retry_attempts: int | None = None,
  retry_wait: int | None = None,
@@ -746,6 +759,7 @@ def eval_exec(
  log_images = False if log_images is False else None
  trace = True if trace else None
  score = False if no_score else True
+ score_display = False if no_score_display else None

  # build params
  params: dict[str, Any] = (
@@ -781,6 +795,7 @@ def eval_exec(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  )
  | kwargs
  )
@@ -915,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
  help=NO_SCORE_HELP,
  envvar="INSPECT_EVAL_SCORE",
  )
+ @click.option(
+ "--no-score-display",
+ type=bool,
+ is_flag=True,
+ help=NO_SCORE_HELP,
+ envvar="INSPECT_EVAL_SCORE_DISPLAY",
+ )
  @click.option(
  "--max-connections",
  type=int,
@@ -940,6 +962,7 @@ def eval_retry_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  max_connections: int | None,
  max_retries: int | None,
  timeout: int | None,
@@ -954,6 +977,7 @@ def eval_retry_command(
  log_samples = False if no_log_samples else None
  log_images = False if log_images is False else None
  score = False if no_score else True
+ score_display = False if no_score_display else None

  # resolve fail_on_error
  if no_fail_on_error is True:
@@ -984,6 +1008,7 @@ def eval_retry_command(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  max_retries=max_retries,
  timeout=timeout,
  max_connections=max_connections,
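
Throughout this file the new flag follows the same tri-state convention as the other logging options: leaving it off passes None (use the library default of True), while passing --no-score-display resolves to False. A minimal sketch of that mapping against the Python API added later in this diff (the wrapper function and task path are hypothetical, not part of inspect_ai):

from inspect_ai import eval

def run_from_cli(task_file: str, no_score_display: bool) -> None:
    # None means "not specified, fall back to the default"; False disables the
    # realtime metrics display (mirrors `score_display = False if no_score_display else None` above)
    score_display = False if no_score_display else None
    eval(task_file, score_display=score_display)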
inspect_ai/_cli/main.py CHANGED
@@ -11,6 +11,7 @@ from .list import list_command
  from .log import log_command
  from .sandbox import sandbox_command
  from .score import score_command
+ from .trace import trace_command
  from .view import view_command


@@ -46,6 +47,7 @@ inspect.add_command(log_command)
  inspect.add_command(score_command)
  inspect.add_command(view_command)
  inspect.add_command(sandbox_command)
+ inspect.add_command(trace_command)


  def main() -> None:
inspect_ai/_cli/trace.py ADDED
@@ -0,0 +1,244 @@
+ import os
+ import shlex
+ import time
+ from datetime import datetime
+ from json import dumps
+ from pathlib import Path
+ from typing import Callable, cast
+
+ import click
+ from pydantic_core import to_json
+ from rich import print as r_print
+ from rich.console import Console, RenderableType
+ from rich.table import Column, Table
+
+ from inspect_ai._util.error import PrerequisiteError
+ from inspect_ai._util.logger import TRACE_FILE_NAME
+ from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file
+
+
+ @click.group("trace")
+ def trace_command() -> None:
+ """List and read execution traces.
+
+ Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
+ """
+ return None
+
+
+ @trace_command.command("list")
+ @click.option(
+ "--json",
+ type=bool,
+ is_flag=True,
+ default=False,
+ help="Output listing as JSON",
+ )
+ def list_command(json: bool) -> None:
+ """List all trace files."""
+ trace_dir = inspect_trace_dir()
+ trace_files: list[dict[str, float | str]] = [
+ {"mtime": f.lstat().st_mtime, "file": f.absolute().as_posix()}
+ for f in trace_dir.iterdir()
+ if f.is_file()
+ ]
+ trace_files.sort(key=lambda f: cast(float, f["mtime"]), reverse=True)
+ if json:
+ print(dumps(trace_files, indent=2))
+ else:
+ table = Table(box=None, show_header=True, pad_edge=False)
+ table.add_column("Time")
+ table.add_column("Trace File")
+ for file in trace_files:
+ mtime = datetime.fromtimestamp(cast(float, file["mtime"])).astimezone()
+ table.add_row(
+ mtime.strftime("%d-%b %H:%M:%S %Z"), shlex.quote(str(file["file"]))
+ )
+ r_print(table)
+
+
+ @trace_command.command("dump")
+ @click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+ def read_command(trace_file: str) -> None:
+ """Dump a trace file to stdout (as a JSON array of log records)."""
+ trace_file_path = resolve_trace_file_path(trace_file)
+
+ traces = read_trace_file(trace_file_path)
+ print(
+ to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
+ )
+
+
+ @trace_command.command("anomalies")
+ @click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+ @click.option(
+ "--all",
+ is_flag=True,
+ default=False,
+ help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
+ )
+ def anomolies_command(trace_file: str, all: bool) -> None:
+ """Look for anomalies in a trace file (never completed or cancelled actions)."""
+ trace_file_path = resolve_trace_file_path(trace_file)
+ traces = read_trace_file(trace_file_path)
+
+ # Track started actions
+ running_actions: dict[str, ActionTraceRecord] = {}
+ canceled_actions: dict[str, ActionTraceRecord] = {}
+ error_actions: dict[str, ActionTraceRecord] = {}
+ timeout_actions: dict[str, ActionTraceRecord] = {}
+
+ def action_started(trace: ActionTraceRecord) -> None:
+ running_actions[trace.trace_id] = trace
+
+ def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
+ start_trace = running_actions.get(trace.trace_id)
+ if start_trace:
+ del running_actions[trace.trace_id]
+ return start_trace
+ else:
+ raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
+
+ def action_failed(trace: ActionTraceRecord) -> None:
+ if all:
+ error_actions[start_trace.trace_id] = trace
+
+ def action_canceled(trace: ActionTraceRecord) -> None:
+ canceled_actions[start_trace.trace_id] = trace
+
+ def action_timeout(trace: ActionTraceRecord) -> None:
+ if all:
+ timeout_actions[start_trace.trace_id] = trace
+
+ for trace in traces:
+ if isinstance(trace, ActionTraceRecord):
+ match trace.event:
+ case "enter":
+ action_started(trace)
+ case "exit":
+ action_completed(trace)
+ case "cancel":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_canceled(trace)
+ case "error":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_failed(trace)
+ case "timeout":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_timeout(trace)
+ case _:
+ print(f"Unknown event type: {trace.event}")
+
+ # do we have any traces?
+ if (
+ len(running_actions)
+ + len(canceled_actions)
+ + len(error_actions)
+ + len(timeout_actions)
+ == 0
+ ):
+ print(f"TRACE: {shlex.quote(trace_file_path.as_posix())}\n")
+ if all:
+ print("No anomalies found in trace log.")
+ else:
+ print(
+ "No running or cancelled actions found in trace log (pass --all to see errors and timeouts)."
+ )
+ return
+
+ with open(os.devnull, "w") as f:
+ # generate output
+ console = Console(record=True, file=f)
+
+ def print_fn(o: RenderableType) -> None:
+ console.print(o, highlight=False)
+
+ print_fn(f"[bold]TRACE: {shlex.quote(trace_file_path.as_posix())}[bold]")
+
+ _print_bucket(print_fn, "Running Actions", running_actions)
+ _print_bucket(print_fn, "Cancelled Actions", canceled_actions)
+ _print_bucket(print_fn, "Error Actions", error_actions)
+ _print_bucket(print_fn, "Timeout Actions", timeout_actions)
+
+ # print
+ print(console.export_text(styles=True).strip())
+
+
+ def _print_bucket(
+ print_fn: Callable[[RenderableType], None],
+ label: str,
+ bucket: dict[str, ActionTraceRecord],
+ ) -> None:
+ if len(bucket) > 0:
+ # Sort the items in chronological order of when
+ # they finished so the first finished item is at the top
+ sorted_actions = sorted(
+ bucket.values(),
+ key=lambda record: (record.start_time or 0) + (record.duration or 0),
+ reverse=True,
+ )
+
+ # create table
+ table = Table(
+ Column(""),
+ Column("", justify="right"),
+ Column(""),
+ Column("", width=22),
+ box=None,
+ title=label,
+ title_justify="left",
+ title_style="bold",
+ pad_edge=False,
+ padding=(0, 1),
+ )
+
+ for action in sorted_actions:
+ # Compute duration (use the event duration or time since started)
+ duration = (
+ action.duration
+ if action.duration is not None
+ else time.time() - action.start_time
+ if action.start_time is not None
+ else 0.0
+ )
+
+ # The event start time
+ start_time = formatTime(action.start_time) if action.start_time else "None"
+
+ # Event detail
+ detail = (
+ f"{action.detail or action.message} {action.error}"
+ if action.event == "error"
+ else (action.detail or action.message)
+ )
+
+ table.add_row(
+ action.action,
+ f"{round(duration, 2):.2f}s".rjust(8),
+ f" {detail}",
+ start_time,
+ )
+
+ print_fn("")
+ print_fn(table)
+
+
+ def resolve_trace_file_path(trace_file: str) -> Path:
+ trace_file_path = Path(trace_file)
+ if not trace_file_path.is_absolute():
+ trace_file_path = inspect_trace_dir() / trace_file_path
+
+ if not trace_file_path.exists():
+ raise PrerequisiteError(
+ f"The specified trace file '{trace_file_path}' does not exist."
+ )
+
+ return trace_file_path
+
+
+ def formatTime(timestamp: float) -> str:
+ dt = datetime.fromtimestamp(timestamp).astimezone()
+ return dt.strftime("%H:%M:%S %Z")
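
The trace subcommands above are thin wrappers over helpers in inspect_ai._util.trace (also new in this release; see inspect_ai/_util/trace.py in the file list). A rough programmatic equivalent of `inspect trace dump`, using only the imports and calls that appear in this module (these helpers are internal, not a documented public API):

from pydantic_core import to_json

from inspect_ai._util.logger import TRACE_FILE_NAME
from inspect_ai._util.trace import inspect_trace_dir, read_trace_file

# read the current trace file from the trace directory and dump it as JSON,
# just as the `dump` command does above
trace_path = inspect_trace_dir() / TRACE_FILE_NAME
records = read_trace_file(trace_path)
print(to_json(records, indent=2, exclude_none=True, fallback=lambda _: None).decode())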
inspect_ai/_display/textual/app.py CHANGED
@@ -197,7 +197,11 @@ class TaskScreenApp(App[TR]):

  # add task
  try:
- yield self.query_one(TasksView).add_task(task)
+ task_view = self.query_one(TasksView)
+ task_view.set_display_metrics(
+ profile.eval_config.score_display is not False
+ )
+ yield task_view.add_task(task)
  finally:
  pass

inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -72,6 +72,7 @@ class TasksView(Container):
  self.description_width = MAX_DESCRIPTION_WIDTH
  self.model_name_width = MAX_MODEL_NAME_WIDTH
  self.sample_count_width = 0
+ self.display_metrics = True

  def init_tasks(self, tasks: list[TaskSpec]) -> None:
  # clear existing tasks
@@ -89,7 +90,11 @@ class TasksView(Container):
  def add_task(self, task: TaskWithResult) -> TaskDisplay:
  self.update_count_width(task.profile.samples)
  task_display = TaskProgressView(
- task, self.description_width, self.model_name_width, self.sample_count_width
+ task,
+ self.description_width,
+ self.model_name_width,
+ self.sample_count_width,
+ self.display_metrics,
  )
  self.tasks.mount(task_display)
  self.tasks.scroll_to_widget(task_display)
@@ -97,6 +102,9 @@ class TasksView(Container):

  return task_display

+ def set_display_metrics(self, display_metrics: bool) -> None:
+ self.display_metrics = display_metrics
+
  def update_count_width(self, samples: int) -> None:
  sample_count_str = progress_count(samples, samples, self.sample_count_width)
  self.sample_count_width = min(
@@ -174,6 +182,7 @@ class TaskProgressView(Widget):
  description_width: int,
  model_name_width: int,
  sample_count_width: int,
+ display_metrics: bool,
  ) -> None:
  super().__init__()
  self.t = task
@@ -190,6 +199,7 @@ class TaskProgressView(Widget):
  self.task_detail = TaskDetail(id="task-detail", classes="hidden")

  self.sample_count_width: int = sample_count_width
+ self.display_metrics = display_metrics

  metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
  metrics_width: reactive[int | None] = reactive(None)
@@ -198,7 +208,7 @@ class TaskProgressView(Widget):
  samples_total: reactive[int] = reactive(0)

  def compose(self) -> ComposeResult:
- yield self.toggle
+ yield (self.toggle if self.display_metrics else Static())
  yield TaskStatusIcon()
  yield Static(
  progress_description(self.t.profile, self.description_width, pad=True)
@@ -274,7 +284,7 @@ class TaskProgressView(Widget):

  def update_metrics_label(self) -> None:
  # compute the label (with a min size)
- if self.metrics is not None:
+ if self.metrics is not None and self.metrics_display is not None:
  metric_label = task_metric(self.metrics, self.metrics_width)
  self.metrics_width = len(metric_label)
  self.metrics_display.update(metric_label)
inspect_ai/_eval/eval.py CHANGED
@@ -76,6 +76,7 @@ def eval(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
  ) -> list[EvalLog]:
  r"""Evaluate tasks using a Model.
@@ -139,6 +140,7 @@ def eval(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  **kwargs (GenerateConfigArgs): Model generation options.

  Returns:
@@ -183,6 +185,7 @@ def eval(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  **kwargs,
  )
  )
@@ -220,6 +223,7 @@ async def eval_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
  ) -> list[EvalLog]:
  r"""Evaluate tasks using a Model (async).
@@ -282,6 +286,7 @@ async def eval_async(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  **kwargs (GenerateConfigArgs): Model generation options.

  Returns:
@@ -380,6 +385,7 @@ async def eval_async(
  log_samples=log_samples,
  log_images=log_images,
  log_buffer=log_buffer,
+ score_display=score_display,
  )

  # run tasks - 2 codepaths, one for the traditional task at a time
@@ -467,6 +473,7 @@ def eval_retry(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  max_retries: int | None = None,
  timeout: int | None = None,
  max_connections: int | None = None,
@@ -507,6 +514,7 @@ def eval_retry(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  max_retries (int | None):
  Maximum number of times to retry request.
  timeout: (int | None):
@@ -541,6 +549,7 @@ def eval_retry(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  max_retries=max_retries,
  timeout=timeout,
  max_connections=max_connections,
@@ -565,6 +574,7 @@ async def eval_retry_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  max_retries: int | None = None,
  timeout: int | None = None,
  max_connections: int | None = None,
@@ -603,6 +613,7 @@ async def eval_retry_async(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  max_retries (int | None):
  Maximum number of times to retry request.
  timeout: (int | None):
@@ -699,6 +710,11 @@ async def eval_retry_async(
  log_buffer = (
  log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
  )
+ score_display = (
+ score_display
+ if score_display is not None
+ else eval_log.eval.config.score_display
+ )

  config = eval_log.plan.config
  config.max_retries = max_retries or config.max_retries
@@ -740,6 +756,7 @@ async def eval_retry_async(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  **dict(config),
  )
  )[0]
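
With this change the realtime metrics display can be controlled from the Python API as well as the CLI. A short usage sketch (the task path is hypothetical; scoring itself remains enabled because score defaults to True):

from inspect_ai import eval

# run an eval without the realtime scoring metrics display;
# passing None (the default) leaves the display enabled
logs = eval("my_task.py", score_display=False)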
inspect_ai/_eval/task/images.py CHANGED
@@ -30,13 +30,8 @@ async def samples_with_base64_images(samples: list[Sample]) -> list[Sample]:

  async def sample_with_base64_images(sample: Sample) -> Sample:
  if isinstance(sample.input, list):
- return Sample(
- input=await messages_with_base64_images(sample.input),
- target=sample.target,
- id=sample.id,
- metadata=sample.metadata,
- files=sample.files,
- choices=sample.choices,
+ return sample.model_copy(
+ update={"input": await messages_with_base64_images(sample.input)}
  )
  else:
  return sample
@@ -44,13 +39,8 @@ async def sample_with_base64_images(sample: Sample) -> Sample:

  def sample_without_base64_images(sample: Sample) -> Sample:
  if isinstance(sample.input, list):
- return Sample(
- input=messages_without_base64_images(sample.input),
- target=sample.target,
- id=sample.id,
- metadata=sample.metadata,
- files=sample.files,
- choices=sample.choices,
+ return sample.model_copy(
+ update={"input": messages_without_base64_images(sample.input)}
  )
  else:
  return sample
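
Using Pydantic's model_copy(update=...) copies every Sample field and replaces only input, so fields not listed in the old constructor call are no longer dropped by these helpers. A generic illustration of the pattern with a stand-in model (not inspect_ai's Sample class):

from pydantic import BaseModel

class Record(BaseModel):
    input: str
    target: str = ""
    metadata: dict | None = None

record = Record(input="raw", target="t", metadata={"k": 1})
updated = record.model_copy(update={"input": "processed"})
assert updated.target == "t" and updated.metadata == {"k": 1}  # other fields carried over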
inspect_ai/_eval/task/log.py CHANGED
@@ -69,10 +69,11 @@ class TaskLogger:
  )
  packages = {PKG_NAME: importlib_metadata.version(PKG_NAME)}

- # remove api_key from model_args
+ # redact authentication oriented model_args
  model_args = model_args.copy()
  if "api_key" in model_args:
  del model_args["api_key"]
+ model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}

  # cwd_relative_path for sandbox config
  if sandbox and isinstance(sandbox.config, str):
inspect_ai/_eval/task/run.py CHANGED
@@ -217,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
  log_location=log_location,
  )

- with display().task(profile) as td:
+ with display().task(
+ profile,
+ ) as td:
  try:
  # start the log
  await log_start(logger, plan, generate_config)
@@ -252,7 +254,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:

  # track when samples complete and update progress as we go
  progress_results: list[dict[str, SampleScore]] = []
- update_metrics_display = update_metrics_display_fn(td)
+ update_metrics_display = update_metrics_display_fn(
+ td,
+ display_metrics=profile.eval_config.score_display is not False,
+ )

  def sample_complete(sample_score: dict[str, SampleScore]) -> None:
  # Capture the result
@@ -400,7 +405,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:


  def update_metrics_display_fn(
- td: TaskDisplay, initial_interval: float = 0, min_interval: float = 0.9
+ td: TaskDisplay,
+ initial_interval: float = 0,
+ min_interval: float = 0.9,
+ display_metrics: bool = True,
  ) -> Callable[
  [
  int,
@@ -420,6 +428,10 @@ def update_metrics_display_fn(
  reducers: ScoreReducer | list[ScoreReducer] | None,
  metrics: list[Metric] | dict[str, list[Metric]] | None,
  ) -> None:
+ # Don't compute metrics if they are not being displayed
+ if not display_metrics:
+ return None
+
  nonlocal next_compute_time
  time_start = time.perf_counter()
  if time_start >= next_compute_time:
@@ -568,14 +580,18 @@ async def task_run_sample(
  state = await plan(state, generate)

  except TimeoutError:
- # notify the user
- transcript()._event(
- SampleLimitEvent(
- type="time",
- message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
- limit=time_limit,
+ if time_limit is not None:
+ transcript()._event(
+ SampleLimitEvent(
+ type="time",
+ message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+ limit=time_limit,
+ )
+ )
+ else:
+ py_logger.warning(
+ "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
  )
- )

  # capture most recent state for scoring
  state = sample_state() or state
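
update_metrics_display_fn throttles metric recomputation (at most once per min_interval seconds, after an initial_interval delay), and the new display_metrics flag short-circuits it entirely when score_display is off. A simplified, self-contained sketch of that throttling pattern, independent of Inspect's TaskDisplay and scorer types:

import time
from typing import Callable

def make_throttled_update(
    update: Callable[[int], None],
    initial_interval: float = 0,
    min_interval: float = 0.9,
    display_metrics: bool = True,
) -> Callable[[int], None]:
    # first recomputation is allowed after initial_interval, then at most once
    # per min_interval (mirrors the next_compute_time bookkeeping above)
    next_compute_time = time.perf_counter() + initial_interval

    def throttled(completed: int) -> None:
        nonlocal next_compute_time
        if not display_metrics:  # same short-circuit as the new code above
            return
        if time.perf_counter() >= next_compute_time:
            update(completed)
            next_compute_time = time.perf_counter() + min_interval

    return throttled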