inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
33
33
  registry_unqualified_name,
34
34
  )
35
35
  from inspect_ai._util.timeouts import Timeout, timeout
36
+ from inspect_ai._util.working import (
37
+ init_sample_working_limit,
38
+ sample_waiting_time,
39
+ )
36
40
  from inspect_ai._view.notify import view_notify_eval
37
41
  from inspect_ai.dataset import Dataset, Sample
38
42
  from inspect_ai.log import (
@@ -46,16 +50,13 @@ from inspect_ai.log import (
46
50
  from inspect_ai.log._condense import condense_sample
47
51
  from inspect_ai.log._file import eval_log_json_str
48
52
  from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
49
- from inspect_ai.log._samples import (
50
- active_sample,
51
- set_active_sample_message_limit,
52
- set_active_sample_token_limit,
53
- )
53
+ from inspect_ai.log._samples import active_sample
54
54
  from inspect_ai.log._transcript import (
55
55
  ErrorEvent,
56
56
  SampleInitEvent,
57
57
  SampleLimitEvent,
58
58
  ScoreEvent,
59
+ StepEvent,
59
60
  transcript,
60
61
  )
61
62
  from inspect_ai.model import (
@@ -182,9 +183,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
182
183
  if isinstance(solver, Plan):
183
184
  plan = solver
184
185
  elif isinstance(solver, Chain):
185
- plan = Plan(list(solver), internal=True)
186
+ plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
186
187
  else:
187
- plan = Plan(unroll(solver), internal=True)
188
+ plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)
188
189
 
189
190
  # add setup solver(s) if specified
190
191
  if task.setup:
@@ -308,6 +309,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
308
309
  or config.fail_on_error is True
309
310
  ),
310
311
  time_limit=config.time_limit,
312
+ working_limit=config.working_limit,
311
313
  semaphore=sample_semaphore,
312
314
  )
313
315
  for (sample, state) in zip(samples, states)
@@ -500,6 +502,7 @@ async def task_run_sample(
500
502
  sample_complete: Callable[[dict[str, SampleScore]], None],
501
503
  fails_on_error: bool,
502
504
  time_limit: int | None,
505
+ working_limit: int | None,
503
506
  semaphore: asyncio.Semaphore | None,
504
507
  ) -> dict[str, SampleScore] | None:
505
508
  # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +573,37 @@ async def task_run_sample(
570
573
  message_limit=state.message_limit,
571
574
  token_limit=state.token_limit,
572
575
  time_limit=time_limit,
576
+ working_limit=working_limit,
573
577
  fails_on_error=fails_on_error,
574
578
  transcript=sample_transcript,
575
579
  ) as active,
576
580
  ):
581
+ start_time: float | None = None
577
582
  error: EvalError | None = None
578
583
  raise_error: BaseException | None = None
579
584
  results: dict[str, SampleScore] = {}
580
585
  try:
586
+ # begin init
587
+ transcript()._event(StepEvent(action="begin", name="init"))
588
+
589
+ # sample init event (remove file bodies as they have content or absolute paths)
590
+ event_sample = sample.model_copy(
591
+ update=dict(files={k: "" for k in sample.files.keys()})
592
+ if sample.files
593
+ else None
594
+ )
595
+ transcript()._event(
596
+ SampleInitEvent(sample=event_sample, state=state_jsonable(state))
597
+ )
598
+
581
599
  async with sandboxenv_cm:
582
600
  try:
583
601
  # update active sample wth sandboxes now that we are initialised
584
602
  active.sandboxes = await sandbox_connections()
585
603
 
604
+ # end init
605
+ transcript()._event(StepEvent(action="end", name="init"))
606
+
586
607
  # initialise timeout context manager
587
608
  timeout_cm = (
588
609
  timeout(time_limit)
@@ -590,23 +611,15 @@ async def task_run_sample(
590
611
  else contextlib.nullcontext()
591
612
  )
592
613
 
614
+ # record start time
615
+ start_time = time.monotonic()
616
+ init_sample_working_limit(start_time, working_limit)
617
+
593
618
  # run sample w/ optional timeout
594
619
  async with timeout_cm:
595
620
  # mark started
596
621
  active.started = datetime.now().timestamp()
597
622
 
598
- # sample init event (remove file bodies as they have content or absolute paths)
599
- event_sample = sample.model_copy(
600
- update=dict(files={k: "" for k in sample.files.keys()})
601
- if sample.files
602
- else None
603
- )
604
- transcript()._event(
605
- SampleInitEvent(
606
- sample=event_sample, state=state_jsonable(state)
607
- )
608
- )
609
-
610
623
  # set progress for plan then run it
611
624
  state = await plan(state, generate)
612
625
 
@@ -661,11 +674,13 @@ async def task_run_sample(
661
674
 
662
675
  # capture most recent state for scoring
663
676
  state = ex.state or sample_state() or state
664
- state.completed = True
665
677
 
666
678
  except BaseException as ex:
667
679
  error, raise_error = handle_error(ex)
668
680
 
681
+ # mark completed
682
+ state.completed = True
683
+
669
684
  # set timeout for scoring. if the original timeout was hit we still
670
685
  # want to provide opportunity for scoring, but we don't necessarily
671
686
  # want to wait the full timeout again (especially in the case where
@@ -676,9 +691,10 @@ async def task_run_sample(
676
691
  assert time_limit
677
692
  timeout_cm = timeout(time_limit / 2)
678
693
 
679
- # turn off sample limits
680
- set_active_sample_token_limit(None)
681
- set_active_sample_message_limit(None)
694
+ # turn off message and token limits
695
+ state.message_limit = None
696
+ state.token_limit = None
697
+ set_sample_state(state)
682
698
 
683
699
  # scoring
684
700
  try:
@@ -768,6 +784,7 @@ async def task_run_sample(
768
784
 
769
785
  # log the sample
770
786
  await log_sample(
787
+ start_time=start_time,
771
788
  logger=logger,
772
789
  sample=sample,
773
790
  state=state,
@@ -788,6 +805,7 @@ async def task_run_sample(
788
805
 
789
806
 
790
807
  async def log_sample(
808
+ start_time: float | None,
791
809
  logger: TaskLogger,
792
810
  sample: Sample,
793
811
  state: TaskState,
@@ -804,6 +822,9 @@ async def log_sample(
804
822
 
805
823
  # construct sample for logging
806
824
 
825
+ # compute total time if we can
826
+ total_time = time.monotonic() - start_time if start_time is not None else None
827
+
807
828
  # if a limit was hit, note that in the Eval Sample
808
829
  limit = None
809
830
  for e in transcript().events:
@@ -827,8 +848,13 @@ async def log_sample(
827
848
  output=state.output,
828
849
  scores={k: v.score for k, v in scores.items()},
829
850
  store=dict(state.store.items()),
851
+ uuid=state.uuid,
830
852
  events=list(transcript().events),
831
853
  model_usage=sample_model_usage(),
854
+ total_time=round(total_time, 3) if total_time is not None else None,
855
+ working_time=round(total_time - sample_waiting_time(), 3)
856
+ if total_time is not None
857
+ else None,
832
858
  error=error,
833
859
  limit=limit,
834
860
  )
@@ -1,7 +1,7 @@
1
1
  from copy import deepcopy
2
2
  from dataclasses import dataclass
3
3
  from logging import getLogger
4
- from typing import Any, Callable, Sequence, cast
4
+ from typing import Any, Awaitable, Callable, Sequence, cast
5
5
 
6
6
  from pydantic import BaseModel
7
7
  from typing_extensions import TypedDict, Unpack
@@ -17,6 +17,7 @@ from inspect_ai.scorer import Metric, Scorer
17
17
  from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
18
18
  from inspect_ai.solver import Plan, Solver, generate
19
19
  from inspect_ai.solver._chain import chain
20
+ from inspect_ai.solver._task_state import TaskState
20
21
  from inspect_ai.util._sandbox.environment import (
21
22
  SandboxEnvironmentSpec,
22
23
  SandboxEnvironmentType,
@@ -46,6 +47,7 @@ class Task:
46
47
  dataset: Dataset | Sequence[Sample] | None = None,
47
48
  setup: Solver | list[Solver] | None = None,
48
49
  solver: Solver | list[Solver] = generate(),
50
+ cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
49
51
  scorer: Scorer | list[Scorer] | None = None,
50
52
  metrics: list[Metric] | dict[str, list[Metric]] | None = None,
51
53
  config: GenerateConfig = GenerateConfig(),
@@ -56,6 +58,7 @@ class Task:
56
58
  message_limit: int | None = None,
57
59
  token_limit: int | None = None,
58
60
  time_limit: int | None = None,
61
+ working_limit: int | None = None,
59
62
  name: str | None = None,
60
63
  version: int = 0,
61
64
  metadata: dict[str, Any] | None = None,
@@ -69,6 +72,9 @@ class Task:
69
72
  even when the main `solver` is replaced).
70
73
  solver: (Solver | list[Solver]): Solver or list of solvers.
71
74
  Defaults to generate(), a normal call to the model.
75
+ cleanup: Optional cleanup function for task. Called after
76
+ all solvers have run for each sample (including if an
77
+ exception occurs during the run)
72
78
  scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
73
79
  metrics (list[Metric] | dict[str, list[Metric]] | None):
74
80
  Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -86,7 +92,10 @@ class Task:
86
92
  eval if a count of samples fails.
87
93
  message_limit (int | None): Limit on total messages used for each sample.
88
94
  token_limit (int | None): Limit on total tokens used for each sample.
89
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
95
+ time_limit: Limit on clock time (in seconds) for samples.
96
+ working_limit: Limit on working time (in seconds) for sample. Working
97
+ time includes model generation, tool calls, etc. but does not include
98
+ time spent waiting on retries or shared resources.
90
99
  name: (str | None): Task name. If not specified is automatically
91
100
  determined based on the name of the task directory (or "task")
92
101
  if its anonymous task (e.g. created in a notebook and passed to
@@ -123,6 +132,7 @@ class Task:
123
132
  self.dataset = resolve_dataset(dataset)
124
133
  self.setup = setup
125
134
  self.solver = resolve_solver(solver)
135
+ self.cleanup = cleanup
126
136
  self.scorer = resolve_scorer(scorer)
127
137
  self.metrics = metrics
128
138
  self.config = config
@@ -135,6 +145,7 @@ class Task:
135
145
  self.message_limit = message_limit
136
146
  self.token_limit = token_limit
137
147
  self.time_limit = time_limit
148
+ self.working_limit = working_limit
138
149
  self.version = version
139
150
  self._name = name
140
151
  self.metadata = metadata
@@ -162,6 +173,7 @@ def task_with(
162
173
  dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
163
174
  setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
164
175
  solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
176
+ cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
165
177
  scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
166
178
  metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
167
179
  config: GenerateConfig | NotGiven = NOT_GIVEN,
@@ -172,6 +184,7 @@ def task_with(
172
184
  message_limit: int | None | NotGiven = NOT_GIVEN,
173
185
  token_limit: int | None | NotGiven = NOT_GIVEN,
174
186
  time_limit: int | None | NotGiven = NOT_GIVEN,
187
+ working_limit: int | None | NotGiven = NOT_GIVEN,
175
188
  name: str | None | NotGiven = NOT_GIVEN,
176
189
  version: int | NotGiven = NOT_GIVEN,
177
190
  metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
@@ -185,6 +198,9 @@ def task_with(
185
198
  even when the main `solver` is replaced).
186
199
  solver: (Solver | list[Solver]): Solver or list of solvers.
187
200
  Defaults to generate(), a normal call to the model.
201
+ cleanup: Optional cleanup function for task. Called after
202
+ all solvers have run for each sample (including if an
203
+ exception occurs during the run)
188
204
  scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
189
205
  metrics (list[Metric] | dict[str, list[Metric]] | None):
190
206
  Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -202,7 +218,10 @@ def task_with(
202
218
  eval if a count of samples fails.
203
219
  message_limit (int | None): Limit on total messages used for each sample.
204
220
  token_limit (int | None): Limit on total tokens used for each sample.
205
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
221
+ time_limit: Limit on clock time (in seconds) for samples.
222
+ working_limit: Limit on execution time (in seconds) for sample. Execution
223
+ time includes model generation, tool calls, etc. but does not include
224
+ time spent waiting on retries or shared resources.
206
225
  name: (str | None): Task name. If not specified is automatically
207
226
  determined based on the name of the task directory (or "task")
208
227
  if its anonymous task (e.g. created in a notebook and passed to
@@ -223,6 +242,8 @@ def task_with(
223
242
  task.setup = setup
224
243
  if not isinstance(solver, NotGiven):
225
244
  task.solver = resolve_solver(solver)
245
+ if not isinstance(cleanup, NotGiven):
246
+ task.cleanup = cleanup
226
247
  if not isinstance(scorer, NotGiven):
227
248
  task.scorer = resolve_scorer(scorer)
228
249
  if not isinstance(metrics, NotGiven):
@@ -245,6 +266,8 @@ def task_with(
245
266
  task.token_limit = token_limit
246
267
  if not isinstance(time_limit, NotGiven):
247
268
  task.time_limit = time_limit
269
+ if not isinstance(working_limit, NotGiven):
270
+ task.working_limit = working_limit
248
271
  if not isinstance(version, NotGiven):
249
272
  task.version = version
250
273
  if not isinstance(name, NotGiven):
@@ -13,6 +13,25 @@ class ContentText(BaseModel):
13
13
  """Text content."""
14
14
 
15
15
 
16
+ class ContentReasoning(BaseModel):
17
+ """Reasoning content.
18
+
19
+ See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
20
+ """
21
+
22
+ type: Literal["reasoning"] = Field(default="reasoning")
23
+ """Type."""
24
+
25
+ reasoning: str
26
+ """Reasoning content."""
27
+
28
+ signature: str | None = Field(default=None)
29
+ """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""
30
+
31
+ redacted: bool = Field(default=False)
32
+ """Indicates that the explicit content of this reasoning block has been redacted."""
33
+
34
+
16
35
  class ContentImage(BaseModel):
17
36
  """Image content."""
18
37
 
@@ -55,5 +74,5 @@ class ContentVideo(BaseModel):
55
74
  """Format of video data ('mp4', 'mpeg', or 'mov')"""
56
75
 
57
76
 
58
- Content = Union[ContentText, ContentImage, ContentAudio, ContentVideo]
77
+ Content = Union[ContentText, ContentReasoning, ContentImage, ContentAudio, ContentVideo]
59
78
  """Content sent to or received from a model."""
@@ -1,9 +1,15 @@
1
1
  import asyncio
2
2
 
3
+ from .working import check_sample_working_limit
4
+
3
5
 
4
6
  def check_sample_interrupt() -> None:
5
7
  from inspect_ai.log._samples import sample_active
6
8
 
9
+ # check for user interrupt
7
10
  sample = sample_active()
8
11
  if sample and sample.interrupt_action:
9
12
  raise asyncio.CancelledError()
13
+
14
+ # check for working_limit
15
+ check_sample_working_limit()
@@ -160,7 +160,9 @@ def init_logger(
160
160
 
161
161
  # init logging handler on demand
162
162
  global _logHandler
163
+ removed_root_handlers = False
163
164
  if not _logHandler:
165
+ removed_root_handlers = remove_non_pytest_root_logger_handlers()
164
166
  _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
165
167
  getLogger().addHandler(_logHandler)
166
168
 
@@ -173,6 +175,11 @@ def init_logger(
173
175
  getLogger("httpx").setLevel(capture_level)
174
176
  getLogger("botocore").setLevel(DEBUG)
175
177
 
178
+ if removed_root_handlers:
179
+ getLogger(PKG_NAME).warning(
180
+ "Inspect removed pre-existing root logger handlers and replaced them with its own handler."
181
+ )
182
+
176
183
  # set the levelno on the global handler
177
184
  _logHandler.display_level = levelno
178
185
 
@@ -180,6 +187,18 @@ def init_logger(
180
187
  _logHandler: LogHandler | None = None
181
188
 
182
189
 
190
def remove_non_pytest_root_logger_handlers() -> bool:
    """Detach root-logger handlers that were not installed by pytest.

    Handlers whose class lives in the ``_pytest.logging`` module are left in
    place so that pytest's log capture keeps working.

    Returns:
        True if at least one handler was removed.
    """
    root = getLogger()
    removed = False
    # iterate over a snapshot since removeHandler mutates root.handlers
    for handler in list(root.handlers):
        if handler.__module__ != "_pytest.logging":
            root.removeHandler(handler)
            removed = True
    return removed
200
+
201
+
183
202
  def notify_logger_record(record: LogRecord, write: bool) -> None:
184
203
  from inspect_ai.log._message import LoggingMessage
185
204
  from inspect_ai.log._transcript import LoggerEvent, transcript
inspect_ai/_util/rich.py CHANGED
@@ -2,23 +2,22 @@ from rich.console import RenderableType
2
2
  from rich.style import Style
3
3
  from rich.text import Text
4
4
 
5
+ from inspect_ai._util.text import truncate_lines
6
+
5
7
 
6
8
def lines_display(
    text: str, max_lines: int = 100, style: str | Style = ""
) -> list[RenderableType]:
    """Render text as rich renderables, limited to at most max_lines lines.

    When the text is truncated, an italic note reporting the number of
    dropped lines is appended after a blank spacer line.

    Args:
        text: Text to display.
        max_lines: Maximum number of lines to render.
        style: Rich style applied to all renderables.

    Returns:
        List of renderables (text plus optional truncation note).
    """
    body, dropped = truncate_lines(text, max_lines)

    renderables: list[RenderableType] = [Text(body, style=style)]
    if dropped is not None:
        note = Text.from_markup(
            f"[italic]Output truncated ({dropped} additional lines)...[/italic]",
            style=style,
        )
        renderables.extend([Text(), note])

    return renderables
inspect_ai/_util/text.py CHANGED
@@ -134,6 +134,19 @@ def truncate(text: str, length: int, overflow: str = "...", pad: bool = True) ->
134
134
  return truncated
135
135
 
136
136
 
137
+ def truncate_lines(
138
+ text: str, max_lines: int = 100, max_characters: int | None = 100 * 100
139
+ ) -> tuple[str, int | None]:
140
+ if max_characters is not None:
141
+ text = truncate(text, max_characters)
142
+ lines = text.splitlines()
143
+ if len(lines) > max_lines:
144
+ output = "\n".join(lines[0:max_lines])
145
+ return output, len(lines) - max_lines
146
+ else:
147
+ return text, None
148
+
149
+
137
150
  def generate_large_text(target_tokens: int) -> str:
138
151
  """Generate a large amount of text with approximately the target number of tokens"""
139
152
  generated_text = []
@@ -10,6 +10,8 @@ from rich.panel import Panel
10
10
  from rich.rule import Rule
11
11
  from rich.text import Text
12
12
 
13
+ from inspect_ai._util.content import ContentReasoning
14
+
13
15
  from .format import format_function_call
14
16
 
15
17
 
@@ -111,19 +113,31 @@ def transcript_panel(
111
113
  )
112
114
 
113
115
 
114
def transcript_reasoning(reasoning: ContentReasoning) -> list[RenderableType]:
    """Render a reasoning content block for transcript display.

    Args:
        reasoning: Reasoning content to render.

    Returns:
        Renderables: the reasoning wrapped in <think> markers, plus a spacer.
    """
    # redacted blocks carry no readable reasoning text; show a placeholder
    if reasoning.redacted:
        text = "Reasoning encrypted by model provider."
    else:
        text = reasoning.reasoning

    return [
        transcript_markdown(f"**<think>** \n{text} \n**</think>**\n\n", escape=True),
        Text(),
    ]
123
129
 
124
130
 
125
def transcript_separator(
    title: str, color: str, characters: str = "─"
) -> RenderableType:
    """Return a styled horizontal rule separating transcript sections.

    Args:
        title: Title displayed inside the rule.
        color: Rich color used for the rule's bold style.
        characters: Character(s) used to draw the rule line.

    NOTE(review): the default was "" in the reviewed text, but rich's Rule
    raises ValueError for a characters string with zero cell width, which
    would break every existing two-argument call — the default here is
    restored to the box-drawing dash, which appears to have been dropped
    during text extraction.
    """
    return Rule(
        title=title,
        characters=characters,
        style=f"{color} bold",
        align="center",
        end="\n\n",
    )
127
141
 
128
142
 
129
143
  def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableType:
@@ -0,0 +1,50 @@
1
+ import time
2
+ from contextvars import ContextVar
3
+
4
+
5
+ def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
6
+ _sample_working_limit.set(working_limit)
7
+ _sample_start_time.set(start_time)
8
+ _sample_waiting_time.set(0)
9
+
10
+
11
+ def sample_waiting_time() -> float:
12
+ return _sample_waiting_time.get()
13
+
14
+
15
+ def sample_working_time() -> float:
16
+ return time.monotonic() - _sample_start_time.get() - sample_waiting_time()
17
+
18
+
19
+ def report_sample_waiting_time(waiting_time: float) -> None:
20
+ _sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
21
+ check_sample_working_limit()
22
+
23
+
24
+ def check_sample_working_limit() -> None:
25
+ # no check if we don't have a limit
26
+ working_limit = _sample_working_limit.get()
27
+ if working_limit is None:
28
+ return
29
+
30
+ # are we over the limit?
31
+ running_time = time.monotonic() - _sample_start_time.get()
32
+ working_time = running_time - sample_waiting_time()
33
+ if working_time > working_limit:
34
+ from inspect_ai.solver._limit import SampleLimitExceededError
35
+
36
+ raise SampleLimitExceededError(
37
+ type="working",
38
+ value=int(working_time),
39
+ limit=int(working_limit),
40
+ message=f"Exceeded working time limit ({working_limit:,} seconds)",
41
+ )
42
+
43
+
44
+ _sample_working_limit: ContextVar[float | None] = ContextVar(
45
+ "sample_working_limit", default=None
46
+ )
47
+
48
+ _sample_start_time: ContextVar[float] = ContextVar("sample_start_time", default=0)
49
+
50
+ _sample_waiting_time: ContextVar[float] = ContextVar("sample_waiting_time", default=0)
@@ -805,15 +805,21 @@ table.table.table-sm td {
805
805
  overflow: unset;
806
806
  }
807
807
 
808
+ .markdown-content pre[class*="language-"],
808
809
  pre[class*="language-"].tool-output,
809
810
  .tool-output {
810
811
  background-color: #f8f8f8;
811
812
  }
813
+
814
+ .vscode-dark .model-call pre[class*="language-"],
815
+ .vscode-dark .markdown-content pre[class*="language-"],
812
816
  .vscode-dark pre[class*="language-"].tool-output,
813
817
  .vscode-dark .tool-output {
814
818
  background-color: #333333;
815
819
  }
816
820
 
821
+ .model-call pre[class*="language-"],
822
+ .markdown-content pre[class*="language-"],
817
823
  pre[class*="language-"].tool-output {
818
824
  border: none !important;
819
825
  box-shadow: none !important;