inspect-ai 0.3.69__py3-none-any.whl → 0.3.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_display/textual/app.py +3 -2
  3. inspect_ai/_display/textual/widgets/samples.py +4 -10
  4. inspect_ai/_display/textual/widgets/transcript.py +25 -12
  5. inspect_ai/_eval/eval.py +14 -2
  6. inspect_ai/_eval/evalset.py +6 -1
  7. inspect_ai/_eval/run.py +6 -0
  8. inspect_ai/_eval/task/run.py +44 -15
  9. inspect_ai/_eval/task/task.py +26 -3
  10. inspect_ai/_util/interrupt.py +6 -0
  11. inspect_ai/_util/logger.py +19 -0
  12. inspect_ai/_util/rich.py +7 -8
  13. inspect_ai/_util/text.py +13 -0
  14. inspect_ai/_util/transcript.py +10 -2
  15. inspect_ai/_util/working.py +46 -0
  16. inspect_ai/_view/www/dist/assets/index.css +56 -12
  17. inspect_ai/_view/www/dist/assets/index.js +904 -750
  18. inspect_ai/_view/www/log-schema.json +337 -2
  19. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  20. inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
  21. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  22. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
  23. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  24. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
  25. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  26. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
  27. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
  28. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  29. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
  30. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
  31. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
  32. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  33. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  34. inspect_ai/_view/www/src/types/log.d.ts +188 -108
  35. inspect_ai/_view/www/src/utils/format.ts +7 -4
  36. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
  37. inspect_ai/log/__init__.py +2 -0
  38. inspect_ai/log/_condense.py +1 -0
  39. inspect_ai/log/_log.py +72 -12
  40. inspect_ai/log/_samples.py +5 -1
  41. inspect_ai/log/_transcript.py +31 -1
  42. inspect_ai/model/_call_tools.py +1 -1
  43. inspect_ai/model/_conversation.py +1 -1
  44. inspect_ai/model/_model.py +32 -16
  45. inspect_ai/model/_model_call.py +10 -3
  46. inspect_ai/model/_providers/anthropic.py +13 -2
  47. inspect_ai/model/_providers/bedrock.py +7 -0
  48. inspect_ai/model/_providers/cloudflare.py +20 -7
  49. inspect_ai/model/_providers/google.py +2 -0
  50. inspect_ai/model/_providers/groq.py +57 -23
  51. inspect_ai/model/_providers/hf.py +6 -0
  52. inspect_ai/model/_providers/mistral.py +78 -51
  53. inspect_ai/model/_providers/openai.py +9 -0
  54. inspect_ai/model/_providers/providers.py +1 -1
  55. inspect_ai/model/_providers/util/tracker.py +92 -0
  56. inspect_ai/model/_providers/vllm.py +13 -5
  57. inspect_ai/solver/_basic_agent.py +1 -3
  58. inspect_ai/solver/_bridge/patch.py +0 -2
  59. inspect_ai/solver/_limit.py +4 -4
  60. inspect_ai/solver/_plan.py +0 -3
  61. inspect_ai/solver/_task_state.py +7 -0
  62. inspect_ai/tool/_tools/_web_search.py +3 -3
  63. inspect_ai/util/_concurrency.py +14 -8
  64. inspect_ai/util/_sandbox/context.py +15 -0
  65. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  66. inspect_ai/util/_sandbox/environment.py +32 -1
  67. inspect_ai/util/_sandbox/events.py +149 -0
  68. inspect_ai/util/_sandbox/local.py +3 -3
  69. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +3 -3
  70. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +74 -67
  71. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
  72. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
  73. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
  74. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--time-limit",
         type=int,
-        help="Limit on total execution time for each sample.",
+        help="Limit on total running time for each sample.",
         envvar="INSPECT_EVAL_TIME_LIMIT",
     )
+    @click.option(
+        "--working-limit",
+        type=int,
+        help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+        envvar="INSPECT_EVAL_WORKING_LIMIT",
+    )
     @click.option(
         "--fail-on-error",
         type=float,
@@ -468,6 +474,7 @@ def eval_command(
     message_limit: int | None,
     token_limit: int | None,
    time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -518,6 +525,7 @@ def eval_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -629,6 +637,7 @@ def eval_set_command(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -684,6 +693,7 @@ def eval_set_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -737,6 +747,7 @@ def eval_exec(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -817,6 +828,7 @@ def eval_exec(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
inspect_ai/_display/textual/app.py CHANGED
@@ -185,7 +185,8 @@ class TaskScreenApp(App[TR]):
         # force repaint
         self.refresh(repaint=True)
 
-        # enable mouse support (this broke in textual 2.0 when running in VS Code)
+        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # however is fixed in textual 2.1)
         assert self.app._driver
         textual_enable_mouse_support(self.app._driver)
 
@@ -315,7 +316,7 @@ class TaskScreenApp(App[TR]):
 
         def set_unread(unread: int | None) -> None:
             if unread is not None:
-                console_tab.label = f"Console ({unread}"  # type: ignore[assignment]
+                console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
             else:
                 console_tab.label = "Console"  # type: ignore[assignment]
 
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -39,7 +39,7 @@ class SamplesView(Widget):
         padding: 0 1 0 1;
         layout: grid;
         grid-size: 2 3;
-        grid-rows: auto 1fr auto;
+        grid-rows: auto 1fr 3;
         grid-columns: 32 1fr;
         grid-gutter: 1;
     }
@@ -141,8 +141,8 @@ class SamplesList(OptionList):
         if highlighted_sample and (highlighted_sample not in self.samples):
             self.samples.append(highlighted_sample)
 
-        # sort the samples by execution time
-        self.samples.sort(key=lambda sample: sample.execution_time, reverse=True)
+        # sort the samples by running time
+        self.samples.sort(key=lambda sample: sample.running_time, reverse=True)
 
         # rebuild the list
         self.clear_options()
@@ -154,9 +154,7 @@ class SamplesList(OptionList):
             table.add_column(width=1)
             task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
             task_name.truncate(18, overflow="ellipsis", pad=True)
-            task_time = Text.from_markup(
-                f"{format_progress_time(sample.execution_time)}"
-            )
+            task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
             table.add_row(task_name, task_time, " ")
             sample_id = Text.from_markup(f"id: {sample.sample.id}")
             sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -423,10 +421,6 @@ class SampleToolbar(Horizontal):
     CANCEL_DISABLED = "Cancelling sample..."
 
     DEFAULT_CSS = f"""
-    SampleToolbar {{
-        grid-size: 5 1;
-        grid-columns: auto auto 1fr auto auto;
-    }}
     SampleToolbar #{STATUS_GROUP} {{
         width: 22;
     }}
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -193,16 +193,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     return EventDisplay(f"model: {event.model}", Group(*content))
 
 
-def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
-    # render sub-events
-    display: list[EventDisplay] = []
-    if event.events:
-        for e in event.events:
-            display.extend(render_event(e) or [])
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    for e in events:
+        event_displays = render_event(e) or []
+        for d in event_displays:
+            if d.content:
+                content.append(Text(" "))
+                content.append(transcript_separator(d.title, "black", "··"))
+                if isinstance(d.content, Markdown):
+                    set_transcript_markdown_options(d.content)
+                content.append(d.content)
+
+    return content
+
 
+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)
 
+    # render sub-events
+    if event.events:
+        content.extend(render_sub_events(event.events))
+
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -220,7 +233,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         result = str(result).strip()
     content.extend(lines_display(result, 50))
 
-    return display + [EventDisplay("tool call", Group(*content))]
+    return [EventDisplay("tool call", Group(*content))]
 
 
 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +270,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:
 
 
 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+    # render header
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
     # render sub-events
-    display: list[EventDisplay] = []
     if event.events:
-        for e in event.events:
-            display.extend(render_event(e) or [])
+        content.extend(render_sub_events(event.events))
 
-    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +284,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
         else:
             content.append(render_as_json(event.result))
 
-    return display + [EventDisplay(f"subtask: {event.name}", Group(*content))]
+    return [EventDisplay(f"subtask: {event.name}", Group(*content))]
 
 
 def render_input_event(event: InputEvent) -> EventDisplay:
inspect_ai/_eval/eval.py CHANGED
@@ -75,6 +75,7 @@ def eval(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
         token_limit: Limit on total tokens used for each sample.
-        time_limit: Limit on time (in seconds) for execution of each sample.
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
             so they can be debugged (defaults to False).
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples (int | None): Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
         message_limit = eval_log.eval.config.message_limit
         token_limit = eval_log.eval.config.token_limit
         time_limit = eval_log.eval.config.time_limit
+        working_limit = eval_log.eval.config.working_limit
         max_samples = max_samples or eval_log.eval.config.max_samples
         max_tasks = max_tasks or eval_log.eval.config.max_tasks
         max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
             message_limit=message_limit,
             token_limit=token_limit,
            time_limit=time_limit,
+            working_limit=working_limit,
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
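For callers, the net effect is that `working_limit` now sits alongside `time_limit` in `eval()` (and in `eval_set()` below), mirroring the new `--working-limit` CLI option. A minimal usage sketch; the task reference, model, and limit values are illustrative and not taken from this diff:

```python
from inspect_ai import eval

# time_limit bounds wall-clock time per sample, while working_limit bounds
# working time (model generation, tool calls, etc.), excluding time spent
# waiting on retries or shared resources.
logs = eval(
    "my_task.py",           # illustrative task reference
    model="openai/gpt-4o",  # illustrative model
    time_limit=900,         # 15 minutes of clock time per sample
    working_limit=600,      # 10 minutes of working time per sample
)
```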
inspect_ai/_eval/evalset.py CHANGED
@@ -79,6 +79,7 @@ def eval_set(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
         token_limit: Limit on total tokens used for each sample.
-        time_limit: Limit on time (in seconds) for execution of each sample.
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
         else:
             task.time_limit = task_eval_config.time_limit
 
+        # sample execution limit
+        if task_eval_config.working_limit is None:
+            task_eval_config.working_limit = task.working_limit
+        else:
+            task.working_limit = task_eval_config.working_limit
+
         # fail_on_error
         if task_eval_config.fail_on_error is None:
             task_eval_config.fail_on_error = task.fail_on_error
inspect_ai/_eval/task/run.py CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+    init_sample_working_limit,
+    sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -56,6 +60,7 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
+    StepEvent,
     transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +187,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if isinstance(solver, Plan):
         plan = solver
     elif isinstance(solver, Chain):
-        plan = Plan(list(solver), internal=True)
+        plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
     else:
-        plan = Plan(unroll(solver), internal=True)
+        plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)
 
     # add setup solver(s) if specified
     if task.setup:
@@ -308,6 +313,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                 or config.fail_on_error is True
             ),
            time_limit=config.time_limit,
+            working_limit=config.working_limit,
            semaphore=sample_semaphore,
        )
        for (sample, state) in zip(samples, states)
@@ -500,6 +506,7 @@ async def task_run_sample(
    sample_complete: Callable[[dict[str, SampleScore]], None],
    fails_on_error: bool,
    time_limit: int | None,
+    working_limit: int | None,
    semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
     # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +577,37 @@ async def task_run_sample(
             message_limit=state.message_limit,
             token_limit=state.token_limit,
             time_limit=time_limit,
+            working_limit=working_limit,
             fails_on_error=fails_on_error,
             transcript=sample_transcript,
         ) as active,
     ):
+        start_time: float | None = None
         error: EvalError | None = None
         raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
+            # begin init
+            transcript()._event(StepEvent(action="begin", name="init"))
+
+            # sample init event (remove file bodies as they have content or absolute paths)
+            event_sample = sample.model_copy(
+                update=dict(files={k: "" for k in sample.files.keys()})
+                if sample.files
+                else None
+            )
+            transcript()._event(
+                SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+            )
+
             async with sandboxenv_cm:
                 try:
                     # update active sample wth sandboxes now that we are initialised
                     active.sandboxes = await sandbox_connections()
 
+                    # end init
+                    transcript()._event(StepEvent(action="end", name="init"))
+
                     # initialise timeout context manager
                     timeout_cm = (
                         timeout(time_limit)
@@ -590,23 +615,15 @@ async def task_run_sample(
                         else contextlib.nullcontext()
                     )
 
+                    # record start time
+                    start_time = time.monotonic()
+                    init_sample_working_limit(start_time, working_limit)
+
                    # run sample w/ optional timeout
                    async with timeout_cm:
                        # mark started
                        active.started = datetime.now().timestamp()
 
-                        # sample init event (remove file bodies as they have content or absolute paths)
-                        event_sample = sample.model_copy(
-                            update=dict(files={k: "" for k in sample.files.keys()})
-                            if sample.files
-                            else None
-                        )
-                        transcript()._event(
-                            SampleInitEvent(
-                                sample=event_sample, state=state_jsonable(state)
-                            )
-                        )
-
                        # set progress for plan then run it
                        state = await plan(state, generate)
 
@@ -661,11 +678,13 @@ async def task_run_sample(
 
                     # capture most recent state for scoring
                     state = ex.state or sample_state() or state
-                    state.completed = True
 
                 except BaseException as ex:
                     error, raise_error = handle_error(ex)
 
+                # mark completed
+                state.completed = True
+
                 # set timeout for scoring. if the original timeout was hit we still
                 # want to provide opportunity for scoring, but we don't necessarily
                 # want to wait the full timeout again (especially in the case where
@@ -768,6 +787,7 @@ async def task_run_sample(
 
         # log the sample
         await log_sample(
+            start_time=start_time,
            logger=logger,
            sample=sample,
            state=state,
@@ -788,6 +808,7 @@ async def task_run_sample(
 
 
 async def log_sample(
+    start_time: float | None,
     logger: TaskLogger,
     sample: Sample,
     state: TaskState,
@@ -804,6 +825,9 @@ async def log_sample(
 
     # construct sample for logging
 
+    # compute total time if we can
+    total_time = time.monotonic() - start_time if start_time is not None else None
+
     # if a limit was hit, note that in the Eval Sample
     limit = None
     for e in transcript().events:
@@ -827,8 +851,13 @@ async def log_sample(
         output=state.output,
         scores={k: v.score for k, v in scores.items()},
         store=dict(state.store.items()),
+        uuid=state.uuid,
         events=list(transcript().events),
         model_usage=sample_model_usage(),
+        total_time=round(total_time, 3) if total_time is not None else None,
+        working_time=round(total_time - sample_waiting_time(), 3)
+        if total_time is not None
+        else None,
         error=error,
         limit=limit,
     )
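The sample timing recorded in `log_sample()` above reduces to simple arithmetic: total time comes from a monotonic clock, and working time is the total minus the waiting time accumulated by the new `inspect_ai._util.working` helpers (retries, shared resources). A standalone sketch of that bookkeeping, under the assumption that waiting is tracked by a per-sample accumulator (the accumulator here is illustrative, not the package's implementation):

```python
import time

# Illustrative stand-in for the per-sample waiting-time accumulator.
waiting_time = 0.0

def record_waiting(seconds: float) -> None:
    """Accumulate time spent waiting (e.g. retry backoff, shared resources)."""
    global waiting_time
    waiting_time += seconds

start_time = time.monotonic()
# ... run the sample: model generation, tool calls, plus some waiting ...
record_waiting(2.5)  # e.g. a provider retry backoff

total_time = time.monotonic() - start_time
working_time = total_time - waiting_time  # the quantity working_limit is meant to bound
```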
inspect_ai/_eval/task/task.py CHANGED
@@ -1,7 +1,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
-from typing import Any, Callable, Sequence, cast
+from typing import Any, Awaitable, Callable, Sequence, cast
 
 from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
@@ -17,6 +17,7 @@ from inspect_ai.scorer import Metric, Scorer
 from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
 from inspect_ai.solver import Plan, Solver, generate
 from inspect_ai.solver._chain import chain
+from inspect_ai.solver._task_state import TaskState
 from inspect_ai.util._sandbox.environment import (
     SandboxEnvironmentSpec,
     SandboxEnvironmentType,
@@ -46,6 +47,7 @@ class Task:
         dataset: Dataset | Sequence[Sample] | None = None,
         setup: Solver | list[Solver] | None = None,
         solver: Solver | list[Solver] = generate(),
+        cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
         config: GenerateConfig = GenerateConfig(),
@@ -56,6 +58,7 @@ class Task:
         message_limit: int | None = None,
         token_limit: int | None = None,
         time_limit: int | None = None,
+        working_limit: int | None = None,
         name: str | None = None,
         version: int = 0,
         metadata: dict[str, Any] | None = None,
@@ -69,6 +72,9 @@ class Task:
                 even when the main `solver` is replaced).
             solver: (Solver | list[Solver]): Solver or list of solvers.
                 Defaults to generate(), a normal call to the model.
+            cleanup: Optional cleanup function for task. Called after
+                all solvers have run for each sample (including if an
+                exception occurs during the run)
             scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
             metrics (list[Metric] | dict[str, list[Metric]] | None):
                 Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -86,7 +92,10 @@ class Task:
                 eval if a count of samples fails.
             message_limit (int | None): Limit on total messages used for each sample.
             token_limit (int | None): Limit on total tokens used for each sample.
-            time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+            time_limit: Limit on clock time (in seconds) for samples.
+            working_limit: Limit on working time (in seconds) for sample. Working
+                time includes model generation, tool calls, etc. but does not include
+                time spent waiting on retries or shared resources.
             name: (str | None): Task name. If not specified is automatically
                 determined based on the name of the task directory (or "task")
                 if its anonymous task (e.g. created in a notebook and passed to
@@ -123,6 +132,7 @@ class Task:
         self.dataset = resolve_dataset(dataset)
         self.setup = setup
         self.solver = resolve_solver(solver)
+        self.cleanup = cleanup
         self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
@@ -135,6 +145,7 @@ class Task:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.version = version
         self._name = name
         self.metadata = metadata
@@ -162,6 +173,7 @@ def task_with(
     dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
     setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
     solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
     scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
@@ -172,6 +184,7 @@ def task_with(
     message_limit: int | None | NotGiven = NOT_GIVEN,
     token_limit: int | None | NotGiven = NOT_GIVEN,
     time_limit: int | None | NotGiven = NOT_GIVEN,
+    working_limit: int | None | NotGiven = NOT_GIVEN,
     name: str | None | NotGiven = NOT_GIVEN,
     version: int | NotGiven = NOT_GIVEN,
     metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
@@ -185,6 +198,9 @@ def task_with(
             even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
             Defaults to generate(), a normal call to the model.
+        cleanup: Optional cleanup function for task. Called after
+            all solvers have run for each sample (including if an
+            exception occurs during the run)
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
             Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -202,7 +218,10 @@ def task_with(
             eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on execution time (in seconds) for sample. Execution
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         name: (str | None): Task name. If not specified is automatically
             determined based on the name of the task directory (or "task")
             if its anonymous task (e.g. created in a notebook and passed to
@@ -223,6 +242,8 @@ def task_with(
         task.setup = setup
     if not isinstance(solver, NotGiven):
         task.solver = resolve_solver(solver)
+    if not isinstance(cleanup, NotGiven):
+        task.cleanup = cleanup
     if not isinstance(scorer, NotGiven):
         task.scorer = resolve_scorer(scorer)
     if not isinstance(metrics, NotGiven):
@@ -245,6 +266,8 @@ def task_with(
         task.token_limit = token_limit
     if not isinstance(time_limit, NotGiven):
         task.time_limit = time_limit
+    if not isinstance(working_limit, NotGiven):
+        task.working_limit = working_limit
     if not isinstance(version, NotGiven):
         task.version = version
     if not isinstance(name, NotGiven):
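Putting the new `Task` options together, user code might look like the following. A minimal sketch: the dataset, cleanup body, and limit values are illustrative and not taken from this diff.

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.solver import TaskState, generate


async def release_resources(state: TaskState) -> None:
    # called after all solvers have run for each sample,
    # including when an exception occurs during the run
    ...  # e.g. tear down anything the sample acquired (illustrative)


@task
def my_task() -> Task:  # illustrative task
    return Task(
        dataset=[Sample(input="Say hello.", target="hello")],
        solver=generate(),
        cleanup=release_resources,
        time_limit=900,     # clock time per sample (seconds)
        working_limit=600,  # working time per sample (seconds)
    )
```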
inspect_ai/_util/interrupt.py CHANGED
@@ -1,9 +1,15 @@
 import asyncio
 
+from .working import check_sample_working_limit
+
 
 def check_sample_interrupt() -> None:
     from inspect_ai.log._samples import sample_active
 
+    # check for user interrupt
     sample = sample_active()
     if sample and sample.interrupt_action:
         raise asyncio.CancelledError()
+
+    # check for working_limit
+    check_sample_working_limit()
inspect_ai/_util/logger.py CHANGED
@@ -160,7 +160,9 @@ def init_logger(
 
     # init logging handler on demand
     global _logHandler
+    removed_root_handlers = False
     if not _logHandler:
+        removed_root_handlers = remove_non_pytest_root_logger_handlers()
         _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
         getLogger().addHandler(_logHandler)
 
@@ -173,6 +175,11 @@ def init_logger(
     getLogger("httpx").setLevel(capture_level)
     getLogger("botocore").setLevel(DEBUG)
 
+    if removed_root_handlers:
+        getLogger(PKG_NAME).warning(
+            "Inspect removed pre-existing root logger handlers and replaced them with its own handler."
+        )
+
     # set the levelno on the global handler
     _logHandler.display_level = levelno
 
@@ -180,6 +187,18 @@ def init_logger(
 _logHandler: LogHandler | None = None
 
 
+def remove_non_pytest_root_logger_handlers() -> bool:
+    root_logger = getLogger()
+    non_pytest_handlers = [
+        handler
+        for handler in root_logger.handlers
+        if handler.__module__ != "_pytest.logging"
+    ]
+    for handler in non_pytest_handlers:
+        root_logger.removeHandler(handler)
+    return len(non_pytest_handlers) > 0
+
+
 def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._message import LoggingMessage
     from inspect_ai.log._transcript import LoggerEvent, transcript
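For users who install their own root logging handlers before an eval runs, the change above means those handlers are now removed. A minimal illustration of the interaction; only the removal-and-warning behavior comes from the diff, the surrounding setup is illustrative:

```python
import logging

# User code installs a root handler before Inspect initializes its logger.
logging.basicConfig(level=logging.INFO)
logging.getLogger(__name__).info("this goes to the user's handler")

# Once inspect_ai initializes logging (e.g. when an eval starts), root handlers
# not installed by pytest are removed and replaced with Inspect's own handler,
# and a warning is logged:
#   "Inspect removed pre-existing root logger handlers and replaced them
#    with its own handler."
```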