inspect_ai-0.3.56-py3-none-any.whl → inspect_ai-0.3.58-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +4 -2
  3. inspect_ai/_cli/eval.py +2 -0
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +0 -2
  6. inspect_ai/_display/core/panel.py +1 -1
  7. inspect_ai/_display/rich/display.py +4 -4
  8. inspect_ai/_display/textual/app.py +4 -1
  9. inspect_ai/_display/textual/widgets/samples.py +41 -5
  10. inspect_ai/_eval/eval.py +32 -20
  11. inspect_ai/_eval/evalset.py +7 -5
  12. inspect_ai/_eval/run.py +16 -11
  13. inspect_ai/_eval/task/__init__.py +2 -2
  14. inspect_ai/_eval/task/images.py +40 -25
  15. inspect_ai/_eval/task/run.py +141 -119
  16. inspect_ai/_eval/task/task.py +140 -25
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/content.py +23 -1
  19. inspect_ai/_util/datetime.py +1 -1
  20. inspect_ai/_util/deprecation.py +1 -1
  21. inspect_ai/_util/images.py +20 -17
  22. inspect_ai/_util/json.py +11 -1
  23. inspect_ai/_util/kvstore.py +73 -0
  24. inspect_ai/_util/logger.py +2 -1
  25. inspect_ai/_util/notgiven.py +18 -0
  26. inspect_ai/_util/thread.py +5 -0
  27. inspect_ai/_util/trace.py +39 -3
  28. inspect_ai/_util/transcript.py +36 -7
  29. inspect_ai/_view/www/.prettierrc.js +12 -0
  30. inspect_ai/_view/www/dist/assets/index.js +322 -226
  31. inspect_ai/_view/www/log-schema.json +221 -138
  32. inspect_ai/_view/www/src/App.mjs +18 -9
  33. inspect_ai/_view/www/src/Types.mjs +0 -1
  34. inspect_ai/_view/www/src/api/Types.mjs +15 -4
  35. inspect_ai/_view/www/src/api/api-http.mjs +2 -0
  36. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
  37. inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
  38. inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
  39. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  40. inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
  41. inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
  42. inspect_ai/_view/www/src/components/Tools.mjs +18 -3
  43. inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
  44. inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
  45. inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
  46. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
  47. inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
  48. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
  49. inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
  50. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
  51. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
  52. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
  53. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
  54. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
  55. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
  56. inspect_ai/_view/www/src/types/log.d.ts +53 -35
  57. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  58. inspect_ai/approval/_human/util.py +2 -2
  59. inspect_ai/dataset/_sources/csv.py +2 -1
  60. inspect_ai/dataset/_sources/json.py +2 -1
  61. inspect_ai/dataset/_sources/util.py +15 -7
  62. inspect_ai/log/_condense.py +11 -1
  63. inspect_ai/log/_log.py +27 -5
  64. inspect_ai/log/_recorders/eval.py +21 -8
  65. inspect_ai/log/_samples.py +10 -5
  66. inspect_ai/log/_transcript.py +28 -1
  67. inspect_ai/model/__init__.py +10 -2
  68. inspect_ai/model/_call_tools.py +82 -17
  69. inspect_ai/model/_chat_message.py +2 -4
  70. inspect_ai/model/{_trace.py → _conversation.py} +9 -8
  71. inspect_ai/model/_model.py +2 -2
  72. inspect_ai/model/_providers/anthropic.py +9 -7
  73. inspect_ai/model/_providers/azureai.py +6 -4
  74. inspect_ai/model/_providers/bedrock.py +6 -4
  75. inspect_ai/model/_providers/google.py +103 -14
  76. inspect_ai/model/_providers/groq.py +7 -5
  77. inspect_ai/model/_providers/hf.py +11 -6
  78. inspect_ai/model/_providers/mistral.py +6 -9
  79. inspect_ai/model/_providers/openai.py +34 -8
  80. inspect_ai/model/_providers/openai_o1.py +10 -12
  81. inspect_ai/model/_providers/vertex.py +17 -4
  82. inspect_ai/scorer/__init__.py +13 -2
  83. inspect_ai/scorer/_metrics/__init__.py +2 -2
  84. inspect_ai/scorer/_metrics/std.py +3 -3
  85. inspect_ai/tool/__init__.py +9 -1
  86. inspect_ai/tool/_tool.py +9 -2
  87. inspect_ai/tool/_tool_info.py +2 -1
  88. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
  89. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
  90. inspect_ai/util/__init__.py +4 -3
  91. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  92. inspect_ai/util/_display.py +14 -4
  93. inspect_ai/util/_sandbox/context.py +12 -13
  94. inspect_ai/util/_sandbox/docker/compose.py +24 -13
  95. inspect_ai/util/_sandbox/docker/docker.py +20 -13
  96. inspect_ai/util/_sandbox/docker/util.py +2 -1
  97. inspect_ai/util/_sandbox/environment.py +13 -1
  98. inspect_ai/util/_sandbox/local.py +1 -0
  99. inspect_ai/util/_sandbox/self_check.py +18 -18
  100. inspect_ai/util/_store.py +2 -2
  101. inspect_ai/util/_subprocess.py +3 -3
  102. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
  103. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
  104. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
  105. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
  106. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
  107. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py CHANGED
@@ -7,7 +7,7 @@ from inspect_ai._eval.evalset import eval_set
 from inspect_ai._eval.list import list_tasks
 from inspect_ai._eval.registry import task
 from inspect_ai._eval.score import score, score_async
-from inspect_ai._eval.task import Epochs, Task, TaskInfo, Tasks
+from inspect_ai._eval.task import Epochs, Task, TaskInfo, Tasks, task_with
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai.solver._human_agent.agent import human_agent
 
@@ -29,4 +29,5 @@ __all__ = [
     "TaskInfo",
     "Tasks",
     "task",
+    "task_with",
 ]
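Note (not part of the diff): a minimal usage sketch for the newly exported task_with() helper. It assumes task_with() accepts an existing Task plus keyword overrides (e.g. epochs, name) and returns a modified copy; the task below is hypothetical.

    from inspect_ai import Task, task, task_with
    from inspect_ai.dataset import Sample
    from inspect_ai.scorer import match
    from inspect_ai.solver import generate

    @task
    def addition():
        # trivial single-sample task used only for illustration
        return Task(
            dataset=[Sample(input="What is 1 + 1?", target="2")],
            solver=generate(),
            scorer=match(),
        )

    # assumed API: derive a variant without touching the original task definition
    addition_three_epochs = task_with(addition(), epochs=3, name="addition-3-epochs")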
inspect_ai/_cli/common.py CHANGED
@@ -17,7 +17,7 @@ class CommonOptions(TypedDict):
     log_level: str
     log_level_transcript: str
     log_dir: str
-    display: Literal["full", "rich", "plain", "none"]
+    display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
     debug: bool
     debug_port: int
@@ -64,7 +64,9 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     )
     @click.option(
         "--display",
-        type=click.Choice(["full", "rich", "plain", "none"], case_sensitive=False),
+        type=click.Choice(
+            ["full", "conversation", "rich", "plain", "none"], case_sensitive=False
+        ),
         default=DEFAULT_DISPLAY,
         envvar="INSPECT_DISPLAY",
         help="Set the display type (defaults to 'full')",
inspect_ai/_cli/eval.py CHANGED
@@ -118,6 +118,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         "--trace",
         type=bool,
         is_flag=True,
+        hidden=True,
         envvar="INSPECT_EVAL_TRACE",
         help="Trace message interactions with evaluated model to terminal.",
     )
@@ -886,6 +887,7 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     "--trace",
     type=bool,
     is_flag=True,
+    hidden=True,
     help="Trace message interactions with evaluated model to terminal.",
     envvar="INSPECT_EVAL_TRACE",
 )
inspect_ai/_cli/trace.py CHANGED
@@ -62,11 +62,21 @@ def list_command(json: bool) -> None:
 
 @trace_command.command("dump")
 @click.argument("trace-file", type=str, required=False)
-def dump_command(trace_file: str | None) -> None:
+@click.option(
+    "--filter",
+    type=str,
+    help="Filter (applied to trace message field).",
+)
+def dump_command(trace_file: str | None, filter: str | None) -> None:
     """Dump a trace file to stdout (as a JSON array of log records)."""
     trace_file_path = _resolve_trace_file_path(trace_file)
 
     traces = read_trace_file(trace_file_path)
+
+    if filter:
+        filter = filter.lower()
+        traces = [trace for trace in traces if filter in trace.message.lower()]
+
     print(
         to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
     )
@@ -74,17 +84,26 @@ def dump_command(trace_file: str | None) -> None:
 
 @trace_command.command("anomalies")
 @click.argument("trace-file", type=str, required=False)
+@click.option(
+    "--filter",
+    type=str,
+    help="Filter (applied to trace message field).",
+)
 @click.option(
     "--all",
     is_flag=True,
     default=False,
     help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
 )
-def anomolies_command(trace_file: str | None, all: bool) -> None:
+def anomolies_command(trace_file: str | None, filter: str | None, all: bool) -> None:
     """Look for anomalies in a trace file (never completed or cancelled actions)."""
     trace_file_path = _resolve_trace_file_path(trace_file)
     traces = read_trace_file(trace_file_path)
 
+    if filter:
+        filter = filter.lower()
+        traces = [trace for trace in traces if filter in trace.message.lower()]
+
     # Track started actions
     running_actions: dict[str, ActionTraceRecord] = {}
     canceled_actions: dict[str, ActionTraceRecord] = {}
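Note (not part of the diff): the new --filter option does a case-insensitive substring match against each trace record's message field, e.g. 'inspect trace dump --filter compose'. A rough Python sketch of the same filtering, assuming read_trace_file() is importable from inspect_ai._util.trace and returns records with a message attribute (as the CLI code above suggests):

    from pathlib import Path

    from inspect_ai._util.trace import read_trace_file  # assumed location

    def filter_traces(trace_file: Path, filter: str):
        # same case-insensitive substring match the CLI applies
        filter = filter.lower()
        return [t for t in read_trace_file(trace_file) if filter in t.message.lower()]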
inspect_ai/_display/core/active.py CHANGED
@@ -4,7 +4,6 @@ from contextvars import ContextVar
 import rich
 
 from inspect_ai.util._display import display_type
-from inspect_ai.util._trace import trace_enabled
 
 from ..rich.display import RichDisplay
 from ..textual.display import TextualDisplay
@@ -17,7 +16,6 @@ def display() -> Display:
     if (
         display_type() == "full"
         and sys.stdout.isatty()
-        and not trace_enabled()
         and not rich.get_console().is_jupyter
     ):
         _active_display = TextualDisplay()
inspect_ai/_display/core/panel.py CHANGED
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
 def task_title(profile: TaskProfile, show_model: bool) -> str:
     eval_epochs = profile.eval_config.epochs or 1
     epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
-    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
+    samples = f"{profile.samples // eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
     title = f"{registry_unqualified_name(profile.name)} ({samples})"
     if show_model:
         title = f"{title}: {profile.model}"
inspect_ai/_display/rich/display.py CHANGED
@@ -15,7 +15,6 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai.log._transcript import InputEvent, transcript
 from inspect_ai.util._display import display_type
 from inspect_ai.util._throttle import throttle
-from inspect_ai.util._trace import trace_enabled
 
 from ..core.config import task_config
 from ..core.display import (
@@ -151,7 +150,8 @@ class RichDisplay(Display):
     @throttle(1)
     def _update_display(self) -> None:
         if (
-            self.tasks is not None
+            display_type() != "conversation"
+            and self.tasks is not None
             and self.tasks
             and self.progress_ui is not None
             and self.live is not None
@@ -170,7 +170,7 @@ class RichTaskScreen(TaskScreen):
     def __init__(self, live: Live) -> None:
         self.theme = rich_theme()
         self.live = live
-        status_text = "Working" if trace_enabled() else "Task running"
+        status_text = "Working" if display_type() == "conversation" else "Task running"
         self.status = self.live.console.status(
             f"[{self.theme.meta} bold]{status_text}...[/{self.theme.meta} bold]",
             spinner="clock",
@@ -189,7 +189,7 @@ class RichTaskScreen(TaskScreen):
     ) -> Iterator[Console]:
         # determine transient based on trace mode
         if transient is None:
-            transient = not trace_enabled()
+            transient = display_type() != "conversation"
 
         # clear live task status and transient status
         self.live.update("", refresh=True)
inspect_ai/_display/textual/app.py CHANGED
@@ -284,7 +284,10 @@ class TaskScreenApp(App[TR]):
 
     def update_samples(self) -> None:
         samples_view = self.query_one(SamplesView)
-        samples_view.set_samples(active_samples())
+        active_and_started_samples = [
+            sample for sample in active_samples() if sample.started is not None
+        ]
+        samples_view.set_samples(active_and_started_samples)
 
     def update_footer(self) -> None:
         left, right = task_footer()
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -25,6 +25,7 @@ from textual.widgets.option_list import Option, Separator
 from inspect_ai._util.format import format_progress_time
 from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.log._samples import ActiveSample
+from inspect_ai.log._transcript import ToolEvent
 
 from .clock import Clock
 from .transcript import TranscriptView
@@ -332,16 +333,29 @@ class SandboxesView(Vertical):
 
 
 class SampleToolbar(Horizontal):
+    STATUS_GROUP = "status_group"
+    TIMEOUT_TOOL_CALL = "timeout_tool_call"
     CANCEL_SCORE_OUTPUT = "cancel_score_output"
     CANCEL_RAISE_ERROR = "cancel_raise_error"
     PENDING_STATUS = "pending_status"
     PENDING_CAPTION = "pending_caption"
 
     DEFAULT_CSS = f"""
+    SampleToolbar {{
+        grid-size: 5 1;
+        grid-columns: auto auto 1fr auto auto;
+    }}
+    SampleToolbar #{STATUS_GROUP} {{
+        min-width: 20;
+    }}
     SampleToolbar Button {{
         margin-bottom: 1;
         margin-right: 2;
-        min-width: 20;
+        min-width: 18;
+    }}
+    SampleToolbar #{TIMEOUT_TOOL_CALL} {{
+        color: $secondary-darken-3;
+        min-width: 16;
     }}
     SampleToolbar #{CANCEL_SCORE_OUTPUT} {{
         color: $primary-darken-3;
@@ -356,9 +370,16 @@
         self.sample: ActiveSample | None = None
 
     def compose(self) -> ComposeResult:
-        with VerticalGroup(id=self.PENDING_STATUS):
-            yield Static("Executing...", id=self.PENDING_CAPTION)
-            yield HorizontalGroup(EventLoadingIndicator(), Clock())
+        with HorizontalGroup(id=self.STATUS_GROUP):
+            with VerticalGroup(id=self.PENDING_STATUS):
+                yield Static("Executing...", id=self.PENDING_CAPTION)
+                yield HorizontalGroup(EventLoadingIndicator(), Clock())
+        yield Button(
+            Text("Timeout Tool"),
+            id=self.TIMEOUT_TOOL_CALL,
+            tooltip="Cancel the tool call and report a timeout to the model.",
+        )
+        yield Horizontal()
         yield Button(
             Text("Cancel (Score)"),
             id=self.CANCEL_SCORE_OUTPUT,
@@ -372,12 +393,21 @@
 
     def on_mount(self) -> None:
         self.query_one("#" + self.PENDING_STATUS).visible = False
+        self.query_one("#" + self.TIMEOUT_TOOL_CALL).display = False
         self.query_one("#" + self.CANCEL_SCORE_OUTPUT).display = False
         self.query_one("#" + self.CANCEL_RAISE_ERROR).display = False
 
     def on_button_pressed(self, event: Button.Pressed) -> None:
         if self.sample:
-            if event.button.id == self.CANCEL_SCORE_OUTPUT:
+            if event.button.id == self.TIMEOUT_TOOL_CALL:
+                last_event = (
+                    self.sample.transcript.events[-1]
+                    if self.sample.transcript.events
+                    else None
+                )
+                if isinstance(last_event, ToolEvent):
+                    last_event.cancel()
+            elif event.button.id == self.CANCEL_SCORE_OUTPUT:
                 self.sample.interrupt("score")
             elif event.button.id == self.CANCEL_RAISE_ERROR:
                 self.sample.interrupt("error")
@@ -389,6 +419,7 @@
         self.sample = sample
 
         pending_status = self.query_one("#" + self.PENDING_STATUS)
+        timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
         clock = self.query_one(Clock)
         cancel_score_output = cast(
             Button, self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
@@ -419,14 +450,19 @@
                 pending_caption.update(
                     Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
                 )
+
+                timeout_tool.display = isinstance(last_event, ToolEvent)
+
                 clock.start(last_event.timestamp.timestamp())
             else:
                 pending_status.visible = False
+                timeout_tool.display = False
                 clock.stop()
 
         else:
            self.display = False
            pending_status.visible = False
+           timeout_tool.display = False
            clock.stop()
 
 
inspect_ai/_eval/eval.py CHANGED
@@ -7,11 +7,12 @@ from shortuuid import uuid
 from typing_extensions import Unpack
 
 from inspect_ai._cli.util import parse_cli_args
-from inspect_ai._display.core.active import display
+from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
 from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import registry_lookup
 from inspect_ai.approval._apply import init_tool_approval
@@ -34,7 +35,7 @@ from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_ai.util._trace import init_trace
+from inspect_ai.util._display import DisplayType, display_type, init_display_type
 
 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -55,6 +56,7 @@ def eval(
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -100,7 +102,8 @@ def eval(
         solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
           Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        trace: (bool | None): Trace message interactions with evaluated model to terminal.
+        trace (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.
@@ -150,9 +153,11 @@ def eval(
     platform_init()
 
     # resolve eval trace
-    max_tasks, max_samples = init_eval_trace(trace, max_tasks, max_samples, model)
+    max_tasks, max_samples = init_eval_display(
+        display, trace, max_tasks, max_samples, model
+    )
 
-    return display().run_task_app(
+    return task_display().run_task_app(
         main=eval_async(
             tasks=tasks,
             model=model,
@@ -163,7 +168,6 @@ def eval(
             sandbox_cleanup=sandbox_cleanup,
             solver=solver,
             tags=tags,
-            trace=trace,
             approval=approval,
             log_level=log_level,
             log_level_transcript=log_level_transcript,
@@ -201,7 +205,6 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
-    trace: bool | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -247,7 +250,6 @@ async def eval_async(
         solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
           Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        trace: (bool | None): Trace message interactions with evaluated model to terminal.
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.
@@ -329,8 +331,8 @@ async def eval_async(
         log.warning("No inspect tasks were found at the specified paths.")
         return []
 
-    # apply trace mode constraints
-    if trace:
+    # apply conversation display constraints
+    if display_type() == "conversation":
         # single task at a time
         if max_tasks is not None:
             max_tasks = 1
@@ -371,7 +373,6 @@ async def eval_async(
             epochs_reducer=reducer_log_names(epochs_reducer)
             if epochs_reducer
             else None,
-            trace=trace,
             approval=config_from_approval_policies(approval) if approval else None,
             fail_on_error=fail_on_error,
             message_limit=message_limit,
@@ -467,6 +468,7 @@ def eval_retry(
     max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
@@ -501,6 +503,7 @@ def eval_retry(
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         fail_on_error (bool | float | None): `True` to fail on first sample error
           (default); `False` to never fail on sample errors; Value between 0 and 1
           to fail if a proportion of total samples fails. Value greater than 1 to fail
@@ -529,9 +532,9 @@ def eval_retry(
     platform_init()
 
     # resolve eval trace
-    max_tasks, max_samples = init_eval_trace(trace, max_tasks, max_samples)
+    max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)
 
-    return display().run_task_app(
+    return task_display().run_task_app(
         main=eval_retry_async(
             tasks=tasks,
             log_level=log_level,
@@ -800,9 +803,8 @@ def eval_init(
 
     # resolve tasks (set active model to resolve uses of the
     # 'default' model in tools, solvers, and scorers)
-    from inspect_ai._display.core.active import display
 
-    with display().suspend_task_app():
+    with task_display().suspend_task_app():
         resolved_tasks: list[ResolvedTask] = []
         for m in models:
             init_active_model(m, generate_config)
@@ -816,17 +818,27 @@ def eval_init(
     return models, approval, resolved_tasks
 
 
-def init_eval_trace(
+def init_eval_display(
+    display: DisplayType | None,
     trace: bool | None,
     max_tasks: int | None,
     max_samples: int | None,
     model: Any = None,
 ) -> tuple[int | None, int | None]:
-    # init trace setting
-    init_trace(trace)
-
-    # adapt task/samples as required
+    # propagate any trace value to display_type
     if trace:
+        warn_once(
+            log,
+            "WARNING: The --trace flag is deprecated (use --display=conversation instead)",
+        )
+        display = "conversation"
+
+    # apply default and init
+    display = display or display_type()
+    init_display_type(display)
+
+    # adapt task/samples as required if we are in conversation mode
+    if display_type() == "conversation":
        # single task at a time
        if max_tasks is not None:
            max_tasks = 1
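Note (not part of the diff): with these changes the display argument supersedes the deprecated trace flag for eval() and eval_retry(). A minimal sketch (task file and model are hypothetical; "conversation" is one of the DisplayType values added above):

    from inspect_ai import eval

    logs = eval(
        "examples/theory_of_mind.py",  # hypothetical task file
        model="openai/gpt-4o",         # hypothetical model
        display="conversation",        # replaces the deprecated trace=True
    )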
inspect_ai/_eval/evalset.py CHANGED
@@ -33,7 +33,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.solver._solver import Solver, SolverSpec
-from inspect_ai.util import SandboxEnvironmentType
+from inspect_ai.util import DisplayType, SandboxEnvironmentType
 
 from .eval import eval, eval_init
 from .loader import ResolvedTask, resolve_task_args
@@ -59,6 +59,7 @@ def eval_set(
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
     trace: bool | None = None,
+    display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
     score: bool = True,
     log_level: str | None = None,
@@ -116,6 +117,7 @@ def eval_set(
           evaluating task(s). ptional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
         trace: (bool | None): Trace message interactions with evaluated model to terminal.
+        display (DisplayType | None): Task display type (defaults to 'full').
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
           Either a path to an approval policy config file or a list of approval policies.
           Defaults to no approval policy.
@@ -180,6 +182,7 @@ def eval_set(
             solver=solver,
             tags=tags,
             trace=trace,
+            display=display,
             approval=approval,
             log_level=log_level,
             log_level_transcript=log_level_transcript,
@@ -501,9 +504,6 @@ def latest_completed_task_eval_logs(
     # take the most recent completed log for each id
     latest_completed_logs: list[Log] = []
     for id, id_logs in logs_by_id.items():
-        # filter on completed
-        id_logs = [id_log for id_log in id_logs if id_log[1].status != "started"]
-
         # continue if there are no target logs
         if len(id_logs) == 0:
             continue
@@ -517,11 +517,13 @@
         latest_completed_logs.append(id_logs[0])
 
         # remove the rest if requested
+        # (don't remove 'started' in case its needed for post-mortum debugging)
         if cleanup_older:
             fs = filesystem(id_logs[0][0].name)
             for id_log in id_logs[1:]:
                 try:
-                    fs.rm(id_log[0].name)
+                    if id_log.header.status != "started":
+                        fs.rm(id_log.info.name)
                 except Exception as ex:
                     logger.warning(f"Error attempt to remove '{id_log[0].name}': {ex}")
 
inspect_ai/_eval/run.py CHANGED
@@ -42,7 +42,7 @@ from .task.log import TaskLogger
 from .task.run import TaskRunOptions, task_run
 from .task.rundir import task_run_dir_switching
 from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
-from .task.util import task_run_dir
+from .task.util import slice_dataset, task_run_dir
 
 log = logging.getLogger(__name__)
 
@@ -70,12 +70,23 @@ async def eval_run(
     # get cwd before switching to task dir
     eval_wd = os.getcwd()
 
+    # ensure sample ids
+    for resolved_task in tasks:
+        # add sample ids to dataset if they aren't there (start at 1 not 0)
+        task = resolved_task.task
+        for id, sample in enumerate(task.dataset):
+            if sample.id is None:
+                sample.id = id + 1
+
+        # Ensure sample ids are unique
+        ensure_unique_ids(task.dataset)
+
     # run startup pass for the sandbox environments
     shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
     if has_sandbox:
         cleanup = eval_config.sandbox_cleanup is not False
         shutdown_sandbox_environments = await startup_sandbox_environments(
-            resolve_sandbox_environment(eval_sandbox), tasks, cleanup
+            resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
         )
 
     # resolve solver and solver spec
@@ -146,14 +157,6 @@
             else:
                 task.fail_on_error = task_eval_config.fail_on_error
 
-            # add sample ids to dataset if they aren't there (start at 1 not 0)
-            for id, sample in enumerate(task.dataset):
-                if sample.id is None:
-                    sample.id = id + 1
-
-            # Ensure sample ids are unique
-            ensure_unique_ids(task.dataset)
-
             # create and track the logger
             logger = TaskLogger(
                 task_name=task.name,
@@ -340,13 +343,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 async def startup_sandbox_environments(
     eval_sandbox: SandboxEnvironmentSpec | None,
     tasks: list[ResolvedTask],
+    config: EvalConfig,
     cleanup: bool,
 ) -> Callable[[], Awaitable[None]]:
     # find unique sandboxenvs
     sandboxenvs: Set[TaskSandboxEnvironment] = set()
     for task in tasks:
         # resolve each sample and add to sandboxenvs
-        for sample in task.task.dataset:
+        dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
+        for sample in dataset:
             sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
             if sandbox is not None and sandbox not in sandboxenvs:
                 sandboxenvs.add(sandbox)
inspect_ai/_eval/task/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .task import Task, TaskInfo, PreviousTask, Tasks  # noqa: I001, F401
+from .task import Task, TaskInfo, PreviousTask, Tasks, task_with  # noqa: I001, F401
 from .epochs import Epochs
 
-__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks"]
+__all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks", "task_with"]