inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_display/plain/display.py +9 -11
  3. inspect_ai/_display/textual/app.py +5 -5
  4. inspect_ai/_display/textual/widgets/samples.py +47 -18
  5. inspect_ai/_display/textual/widgets/transcript.py +25 -12
  6. inspect_ai/_eval/eval.py +14 -2
  7. inspect_ai/_eval/evalset.py +6 -1
  8. inspect_ai/_eval/run.py +6 -0
  9. inspect_ai/_eval/task/run.py +44 -15
  10. inspect_ai/_eval/task/task.py +26 -3
  11. inspect_ai/_util/interrupt.py +15 -0
  12. inspect_ai/_util/logger.py +23 -0
  13. inspect_ai/_util/rich.py +7 -8
  14. inspect_ai/_util/text.py +301 -1
  15. inspect_ai/_util/transcript.py +10 -2
  16. inspect_ai/_util/working.py +46 -0
  17. inspect_ai/_view/www/dist/assets/index.css +56 -12
  18. inspect_ai/_view/www/dist/assets/index.js +905 -751
  19. inspect_ai/_view/www/log-schema.json +337 -2
  20. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  21. inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
  22. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  23. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
  24. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  25. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
  26. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  27. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
  28. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
  29. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
  30. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  31. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
  32. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
  33. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
  34. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  35. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  36. inspect_ai/_view/www/src/types/log.d.ts +188 -108
  37. inspect_ai/_view/www/src/utils/format.ts +7 -4
  38. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
  39. inspect_ai/log/__init__.py +2 -0
  40. inspect_ai/log/_condense.py +1 -0
  41. inspect_ai/log/_log.py +72 -12
  42. inspect_ai/log/_samples.py +5 -5
  43. inspect_ai/log/_transcript.py +31 -1
  44. inspect_ai/model/_call_tools.py +1 -1
  45. inspect_ai/model/_conversation.py +1 -1
  46. inspect_ai/model/_model.py +35 -16
  47. inspect_ai/model/_model_call.py +10 -3
  48. inspect_ai/model/_providers/anthropic.py +13 -2
  49. inspect_ai/model/_providers/bedrock.py +7 -0
  50. inspect_ai/model/_providers/cloudflare.py +20 -7
  51. inspect_ai/model/_providers/google.py +358 -302
  52. inspect_ai/model/_providers/groq.py +57 -23
  53. inspect_ai/model/_providers/hf.py +6 -0
  54. inspect_ai/model/_providers/mistral.py +81 -52
  55. inspect_ai/model/_providers/openai.py +9 -0
  56. inspect_ai/model/_providers/providers.py +6 -6
  57. inspect_ai/model/_providers/util/tracker.py +92 -0
  58. inspect_ai/model/_providers/vllm.py +13 -5
  59. inspect_ai/solver/_basic_agent.py +1 -3
  60. inspect_ai/solver/_bridge/patch.py +0 -2
  61. inspect_ai/solver/_limit.py +4 -4
  62. inspect_ai/solver/_plan.py +3 -3
  63. inspect_ai/solver/_solver.py +3 -0
  64. inspect_ai/solver/_task_state.py +10 -1
  65. inspect_ai/tool/_tools/_web_search.py +3 -3
  66. inspect_ai/util/_concurrency.py +14 -8
  67. inspect_ai/util/_sandbox/context.py +15 -0
  68. inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
  69. inspect_ai/util/_sandbox/docker/compose.py +5 -9
  70. inspect_ai/util/_sandbox/docker/docker.py +20 -6
  71. inspect_ai/util/_sandbox/docker/util.py +10 -1
  72. inspect_ai/util/_sandbox/environment.py +32 -1
  73. inspect_ai/util/_sandbox/events.py +149 -0
  74. inspect_ai/util/_sandbox/local.py +3 -3
  75. inspect_ai/util/_sandbox/self_check.py +2 -1
  76. inspect_ai/util/_subprocess.py +4 -1
  77. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
  78. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
  79. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
  80. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
  81. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
  82. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--time-limit",
         type=int,
-        help="Limit on total execution time for each sample.",
+        help="Limit on total running time for each sample.",
         envvar="INSPECT_EVAL_TIME_LIMIT",
     )
+    @click.option(
+        "--working-limit",
+        type=int,
+        help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+        envvar="INSPECT_EVAL_WORKING_LIMIT",
+    )
     @click.option(
         "--fail-on-error",
         type=float,
@@ -468,6 +474,7 @@ def eval_command(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -518,6 +525,7 @@ def eval_command(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -629,6 +637,7 @@ def eval_set_command(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -684,6 +693,7 @@ def eval_set_command(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -737,6 +747,7 @@ def eval_exec(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -817,6 +828,7 @@ def eval_exec(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
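
For illustration, a minimal self-contained sketch (not the inspect_ai CLI itself) of how a click option like the new --working-limit behaves, including its environment-variable fallback; the command name and output are hypothetical:

import click

@click.command()
@click.option(
    "--working-limit",
    type=int,
    help="Limit on total working time (in seconds) for each sample.",
    envvar="INSPECT_EVAL_WORKING_LIMIT",
)
def demo(working_limit: int | None) -> None:
    # None means no limit was supplied via the flag or the environment variable
    click.echo(f"working_limit={working_limit}")

if __name__ == "__main__":
    demo()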
inspect_ai/_display/plain/display.py CHANGED
@@ -119,14 +119,14 @@ class PlainTaskDisplay(TaskDisplay):
         self.samples_complete = 0
         self.samples_total = 0
         self.current_metrics: list[TaskDisplayMetric] | None = None
-        self.last_progress = 0  # Track last progress percentage
+        self.last_progress = 0

     @contextlib.contextmanager
     def progress(self) -> Iterator[Progress]:
         self.progress_display = PlainProgress(self.task.profile.steps)
         yield self.progress_display

-    @throttle(1)
+    @throttle(5)
     def _print_status_throttled(self) -> None:
         self._print_status()

@@ -135,13 +135,8 @@ class PlainTaskDisplay(TaskDisplay):
         if not self.progress_display:
             return

-        # Calculate current progress percentage
-        current_progress = int(
-            self.progress_display.current / self.progress_display.total * 100
-        )
-
-        # Only print on percentage changes to avoid too much output
-        if current_progress != self.last_progress:
+        # Only print when step count changes to avoid too much output
+        if self.progress_display.current != self.last_progress:
            status_parts: list[str] = []

            # if this is parallel print task and model to distinguish (limit both to 12 chars)
@@ -154,8 +149,11 @@ class PlainTaskDisplay(TaskDisplay):
            )

            # Add step progress
+           progress_percent = int(
+               self.progress_display.current / self.progress_display.total * 100
+           )
            status_parts.append(
-               f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {current_progress:3d}%"
+               f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {progress_percent:3d}%"
            )

            # Add sample progress
@@ -187,7 +185,7 @@ class PlainTaskDisplay(TaskDisplay):
            # Print on new line
            print(" | ".join(status_parts))

-           self.last_progress = current_progress
+           self.last_progress = self.progress_display.current

     def sample_complete(self, complete: int, total: int) -> None:
         self.samples_complete = complete
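
A rough standalone sketch (hypothetical class, not the PlainTaskDisplay code above) of the revised printing policy: status output is throttled and a line is emitted only when the completed step count changes, with the percentage derived at print time:

import time

class StepStatusPrinter:
    def __init__(self, total_steps: int) -> None:
        self.total = total_steps
        self.current = 0
        self.last_printed = -1   # last step count that was printed
        self.last_time = 0.0     # monotonic time of the last print

    def advance(self, steps: int = 1) -> None:
        self.current += steps
        now = time.monotonic()
        # throttle to one line every 5 seconds and only print on step-count changes
        if now - self.last_time >= 5 and self.current != self.last_printed:
            percent = int(self.current / self.total * 100)
            print(f"Steps: {self.current:3d}/{self.total} {percent:3d}%")
            self.last_printed = self.current
            self.last_time = now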
inspect_ai/_display/textual/app.py CHANGED
@@ -13,7 +13,6 @@ from typing import (

 import rich
 from rich.console import Console
-from rich.text import Text
 from textual.app import App, ComposeResult
 from textual.binding import Binding, BindingType
 from textual.css.query import NoMatches
@@ -186,7 +185,8 @@ class TaskScreenApp(App[TR]):
        # force repaint
        self.refresh(repaint=True)

-       # enable mouse support (this broke in textual 2.0 when running in VS Code)
+       # enable mouse support (this broke in textual 2.0 when running in VS Code
+       # however is fixed in textual 2.1)
        assert self.app._driver
        textual_enable_mouse_support(self.app._driver)

@@ -316,9 +316,9 @@ class TaskScreenApp(App[TR]):

        def set_unread(unread: int | None) -> None:
            if unread is not None:
-               console_tab.label = Text(f"Console ({unread}")
+               console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
            else:
-               console_tab.label = Text("Console")
+               console_tab.label = "Console"  # type: ignore[assignment]

        self.watch(console_view, "unread", set_unread)

@@ -385,7 +385,7 @@ class TaskScreenApp(App[TR]):
    def set_title(self, title: str) -> None:
        tabs = self.app.query_one(TabbedContent)
        tab = tabs.get_tab(self.tab_id)
-       tab.label = Text(title)
+       tab.label = title  # type: ignore[assignment]

    def activate(self) -> None:
        # show the tab
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -6,6 +6,7 @@ from rich.table import Table
 from rich.text import Text
 from textual.app import ComposeResult
 from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
+from textual.css.query import NoMatches
 from textual.reactive import reactive
 from textual.widget import Widget
 from textual.widgets import (
@@ -38,7 +39,7 @@ class SamplesView(Widget):
        padding: 0 1 0 1;
        layout: grid;
        grid-size: 2 3;
-       grid-rows: auto 1fr auto;
+       grid-rows: auto 1fr 3;
        grid-columns: 32 1fr;
        grid-gutter: 1;
    }
@@ -61,7 +62,10 @@ class SamplesView(Widget):
        )

    async def notify_active(self, active: bool) -> None:
-       await self.query_one(TranscriptView).notify_active(active)
+       try:
+           await self.query_one(TranscriptView).notify_active(active)
+       except NoMatches:
+           pass

    def set_samples(self, samples: list[ActiveSample]) -> None:
        # throttle to no more than 1 second per 100 samples
@@ -137,8 +141,8 @@ class SamplesList(OptionList):
        if highlighted_sample and (highlighted_sample not in self.samples):
            self.samples.append(highlighted_sample)

-       # sort the samples by execution time
-       self.samples.sort(key=lambda sample: sample.execution_time, reverse=True)
+       # sort the samples by running time
+       self.samples.sort(key=lambda sample: sample.running_time, reverse=True)

        # rebuild the list
        self.clear_options()
@@ -150,9 +154,7 @@ class SamplesList(OptionList):
            table.add_column(width=1)
            task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
            task_name.truncate(18, overflow="ellipsis", pad=True)
-           task_time = Text.from_markup(
-               f"{format_progress_time(sample.execution_time)}"
-           )
+           task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
            table.add_row(task_name, task_time, " ")
            sample_id = Text.from_markup(f"id: {sample.sample.id}")
            sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -408,11 +410,17 @@ class SampleToolbar(Horizontal):
    PENDING_STATUS = "pending_status"
    PENDING_CAPTION = "pending_caption"

+   TIMEOUT_TOOL_CALL_ENABLED = (
+       "Cancel the tool call and report a timeout to the model."
+   )
+   TIMEOUT_TOOL_CALL_DISABLED = "Cancelling tool call..."
+   CANCEL_SCORE_OUTPUT_ENABLED = (
+       "Cancel the sample and score whatever output has been generated so far."
+   )
+   CANCEL_RAISE_ERROR_ENABLED = "Cancel the sample and raise an error"
+   CANCEL_DISABLED = "Cancelling sample..."
+
    DEFAULT_CSS = f"""
-   SampleToolbar {{
-       grid-size: 5 1;
-       grid-columns: auto auto 1fr auto auto;
-   }}
    SampleToolbar #{STATUS_GROUP} {{
        width: 22;
    }}
@@ -445,18 +453,18 @@ class SampleToolbar(Horizontal):
        yield Button(
            Text("Timeout Tool"),
            id=self.TIMEOUT_TOOL_CALL,
-           tooltip="Cancel the tool call and report a timeout to the model.",
+           tooltip=self.TIMEOUT_TOOL_CALL_ENABLED,
        )
        yield Horizontal()
        yield Button(
            Text("Cancel (Score)"),
            id=self.CANCEL_SCORE_OUTPUT,
-           tooltip="Cancel the sample and score whatever output has been generated so far.",
+           tooltip=self.CANCEL_SCORE_OUTPUT_ENABLED,
        )
        yield Button(
            Text("Cancel (Error)"),
            id=self.CANCEL_RAISE_ERROR,
-           tooltip="Cancel the sample and raise an error (task will exit unless fail_on_error is set)",
+           tooltip=self.CANCEL_RAISE_ERROR_ENABLED,
        )

    def on_mount(self) -> None:
@@ -475,14 +483,26 @@ class SampleToolbar(Horizontal):
            )
            if isinstance(last_event, ToolEvent):
                last_event._cancel()
-       elif event.button.id == self.CANCEL_SCORE_OUTPUT:
-           self.sample.interrupt("score")
-       elif event.button.id == self.CANCEL_RAISE_ERROR:
-           self.sample.interrupt("error")
+               event.button.disabled = True
+               event.button.tooltip = self.TIMEOUT_TOOL_CALL_DISABLED
+       else:
+           if event.button.id == self.CANCEL_SCORE_OUTPUT:
+               self.sample.interrupt("score")
+           elif event.button.id == self.CANCEL_RAISE_ERROR:
+               self.sample.interrupt("error")
+           cancel_score_output = self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
+           cancel_score_output.disabled = True
+           cancel_score_output.tooltip = self.CANCEL_DISABLED
+           cancel_with_error = self.query_one("#" + self.CANCEL_RAISE_ERROR)
+           cancel_with_error.disabled = True
+           cancel_with_error.tooltip = self.CANCEL_DISABLED

    async def sync_sample(self, sample: ActiveSample | None) -> None:
        from inspect_ai.log._transcript import ModelEvent

+       # is it a new sample?
+       new_sample = sample != self.sample
+
        # track the sample
        self.sample = sample

@@ -499,6 +519,13 @@ class SampleToolbar(Horizontal):
            cancel_score_output.display = True
            cancel_with_error.display = not sample.fails_on_error

+           # if its a new sample then reset enabled states
+           if new_sample:
+               cancel_score_output.disabled = False
+               cancel_score_output.tooltip = self.CANCEL_SCORE_OUTPUT_ENABLED
+               cancel_with_error.disabled = False
+               cancel_with_error.tooltip = self.CANCEL_RAISE_ERROR_ENABLED
+
            # if we have a pending event then start the clock and show pending status
            last_event = (
                sample.transcript.events[-1]
@@ -520,6 +547,8 @@ class SampleToolbar(Horizontal):
                )

                timeout_tool.display = isinstance(last_event, ToolEvent)
+               timeout_tool.disabled = False
+               timeout_tool.tooltip = self.TIMEOUT_TOOL_CALL_ENABLED

                clock.start(last_event.timestamp.timestamp())
            else:
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -193,16 +193,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
    return EventDisplay(f"model: {event.model}", Group(*content))


-def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
-   # render sub-events
-   display: list[EventDisplay] = []
-   if event.events:
-       for e in event.events:
-           display.extend(render_event(e) or [])
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+   content: list[RenderableType] = []
+   for e in events:
+       event_displays = render_event(e) or []
+       for d in event_displays:
+           if d.content:
+               content.append(Text(" "))
+               content.append(transcript_separator(d.title, "black", "··"))
+               if isinstance(d.content, Markdown):
+                   set_transcript_markdown_options(d.content)
+               content.append(d.content)
+
+   return content
+

+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
    # render the call
    content = transcript_tool_call(event)

+   # render sub-events
+   if event.events:
+       content.extend(render_sub_events(event.events))
+
    # render the output
    if isinstance(event.result, list):
        result: ToolResult = "\n".join(
@@ -220,7 +233,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
        result = str(result).strip()
        content.extend(lines_display(result, 50))

-   return display + [EventDisplay("tool call", Group(*content))]
+   return [EventDisplay("tool call", Group(*content))]


 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +270,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:


 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+   # render header
+   content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
    # render sub-events
-   display: list[EventDisplay] = []
    if event.events:
-       for e in event.events:
-           display.extend(render_event(e) or [])
+       content.extend(render_sub_events(event.events))

-   content: list[RenderableType] = [transcript_function(event.name, event.input)]
    if event.result:
        content.append(Text())
        if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +284,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
        else:
            content.append(render_as_json(event.result))

-   return display + [EventDisplay(f"subtask: {event.name}", Group(*content))]
+   return [EventDisplay(f"subtask: {event.name}", Group(*content))]


 def render_input_event(event: InputEvent) -> EventDisplay:
inspect_ai/_eval/eval.py CHANGED
@@ -75,6 +75,7 @@ def eval(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
            so they can be debugged (defaults to False).
        message_limit (int | None): Limit on total messages used for each sample.
        token_limit (int | None): Limit on total tokens used for each sample.
-       time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples (int | None): Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
    message_limit = eval_log.eval.config.message_limit
    token_limit = eval_log.eval.config.token_limit
    time_limit = eval_log.eval.config.time_limit
+   working_limit = eval_log.eval.config.working_limit
    max_samples = max_samples or eval_log.eval.config.max_samples
    max_tasks = max_tasks or eval_log.eval.config.max_tasks
    max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
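
A hedged usage sketch of the new parameter (the task file and model string below are placeholders, not part of this release): combine a wall-clock limit with a working-time limit when calling eval().

from inspect_ai import eval

logs = eval(
    "my_task.py",                # hypothetical task file
    model="openai/gpt-4o-mini",  # any configured model
    time_limit=900,              # clock time per sample, in seconds
    working_limit=300,           # generation + tool calls; waiting on retries excluded
)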
inspect_ai/_eval/evalset.py CHANGED
@@ -79,6 +79,7 @@ def eval_set(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
    else:
        task.time_limit = task_eval_config.time_limit

+   # sample execution limit
+   if task_eval_config.working_limit is None:
+       task_eval_config.working_limit = task.working_limit
+   else:
+       task.working_limit = task_eval_config.working_limit
+
    # fail_on_error
    if task_eval_config.fail_on_error is None:
        task_eval_config.fail_on_error = task.fail_on_error
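
A minimal sketch of the defaulting rule applied above (an illustrative helper, not part of the package): an eval-level working_limit overrides the task's own value, otherwise the task default is promoted into the eval config so it gets recorded with the log.

def resolve_working_limit(
    config_value: int | None, task_value: int | None
) -> tuple[int | None, int | None]:
    # returns (config_value, task_value) after reconciliation
    if config_value is None:
        config_value = task_value
    else:
        task_value = config_value
    return config_value, task_value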
inspect_ai/_eval/task/run.py CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
    registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+   init_sample_working_limit,
+   sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -56,6 +60,7 @@ from inspect_ai.log._transcript import (
    SampleInitEvent,
    SampleLimitEvent,
    ScoreEvent,
+   StepEvent,
    transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +187,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
    if isinstance(solver, Plan):
        plan = solver
    elif isinstance(solver, Chain):
-       plan = Plan(list(solver), internal=True)
+       plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
    else:
-       plan = Plan(unroll(solver), internal=True)
+       plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)

    # add setup solver(s) if specified
    if task.setup:
@@ -308,6 +313,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                or config.fail_on_error is True
            ),
            time_limit=config.time_limit,
+           working_limit=config.working_limit,
            semaphore=sample_semaphore,
        )
        for (sample, state) in zip(samples, states)
@@ -500,6 +506,7 @@ async def task_run_sample(
    sample_complete: Callable[[dict[str, SampleScore]], None],
    fails_on_error: bool,
    time_limit: int | None,
+   working_limit: int | None,
    semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
    # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +577,37 @@ async def task_run_sample(
            message_limit=state.message_limit,
            token_limit=state.token_limit,
            time_limit=time_limit,
+           working_limit=working_limit,
            fails_on_error=fails_on_error,
            transcript=sample_transcript,
        ) as active,
    ):
+       start_time: float | None = None
        error: EvalError | None = None
        raise_error: BaseException | None = None
        results: dict[str, SampleScore] = {}
        try:
+           # begin init
+           transcript()._event(StepEvent(action="begin", name="init"))
+
+           # sample init event (remove file bodies as they have content or absolute paths)
+           event_sample = sample.model_copy(
+               update=dict(files={k: "" for k in sample.files.keys()})
+               if sample.files
+               else None
+           )
+           transcript()._event(
+               SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+           )
+
            async with sandboxenv_cm:
                try:
                    # update active sample wth sandboxes now that we are initialised
                    active.sandboxes = await sandbox_connections()

+                   # end init
+                   transcript()._event(StepEvent(action="end", name="init"))
+
                    # initialise timeout context manager
                    timeout_cm = (
                        timeout(time_limit)
@@ -590,23 +615,15 @@ async def task_run_sample(
                        else contextlib.nullcontext()
                    )

+                   # record start time
+                   start_time = time.monotonic()
+                   init_sample_working_limit(start_time, working_limit)
+
                    # run sample w/ optional timeout
                    async with timeout_cm:
                        # mark started
                        active.started = datetime.now().timestamp()

-                       # sample init event (remove file bodies as they have content or absolute paths)
-                       event_sample = sample.model_copy(
-                           update=dict(files={k: "" for k in sample.files.keys()})
-                           if sample.files
-                           else None
-                       )
-                       transcript()._event(
-                           SampleInitEvent(
-                               sample=event_sample, state=state_jsonable(state)
-                           )
-                       )
-
                        # set progress for plan then run it
                        state = await plan(state, generate)

@@ -661,11 +678,13 @@ async def task_run_sample(

                    # capture most recent state for scoring
                    state = ex.state or sample_state() or state
-                   state.completed = True

                except BaseException as ex:
                    error, raise_error = handle_error(ex)

+               # mark completed
+               state.completed = True
+
                # set timeout for scoring. if the original timeout was hit we still
                # want to provide opportunity for scoring, but we don't necessarily
                # want to wait the full timeout again (especially in the case where
@@ -768,6 +787,7 @@ async def task_run_sample(

            # log the sample
            await log_sample(
+               start_time=start_time,
                logger=logger,
                sample=sample,
                state=state,
@@ -788,6 +808,7 @@ async def task_run_sample(


 async def log_sample(
+   start_time: float | None,
    logger: TaskLogger,
    sample: Sample,
    state: TaskState,
@@ -804,6 +825,9 @@ async def log_sample(

    # construct sample for logging

+   # compute total time if we can
+   total_time = time.monotonic() - start_time if start_time is not None else None
+
    # if a limit was hit, note that in the Eval Sample
    limit = None
    for e in transcript().events:
@@ -827,8 +851,13 @@ async def log_sample(
        output=state.output,
        scores={k: v.score for k, v in scores.items()},
        store=dict(state.store.items()),
+       uuid=state.uuid,
        events=list(transcript().events),
        model_usage=sample_model_usage(),
+       total_time=round(total_time, 3) if total_time is not None else None,
+       working_time=round(total_time - sample_waiting_time(), 3)
+       if total_time is not None
+       else None,
        error=error,
        limit=limit,
    )
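
A simplified sketch of the timing accounting above, under the assumption that waiting time (retries, shared resources) is accumulated separately; the helper names here are illustrative only, while the real implementation lives in inspect_ai._util.working:

import time

_waiting_time = 0.0

def record_waiting(seconds: float) -> None:
    # called whenever the sample is waiting rather than working
    global _waiting_time
    _waiting_time += seconds

def sample_times(start_time: float) -> tuple[float, float]:
    # total_time is wall-clock time since the sample started;
    # working_time subtracts the accumulated waiting time
    total_time = time.monotonic() - start_time
    working_time = total_time - _waiting_time
    return round(total_time, 3), round(working_time, 3)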