inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
218
218
  @click.option(
219
219
  "--time-limit",
220
220
  type=int,
221
- help="Limit on total execution time for each sample.",
221
+ help="Limit on total running time for each sample.",
222
222
  envvar="INSPECT_EVAL_TIME_LIMIT",
223
223
  )
224
+ @click.option(
225
+ "--working-limit",
226
+ type=int,
227
+ help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
228
+ envvar="INSPECT_EVAL_WORKING_LIMIT",
229
+ )
224
230
  @click.option(
225
231
  "--fail-on-error",
226
232
  type=float,
@@ -384,15 +390,19 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
384
390
  @click.option(
385
391
  "--reasoning-effort",
386
392
  type=click.Choice(["low", "medium", "high"]),
387
- help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
393
+ help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
388
394
  envvar="INSPECT_EVAL_REASONING_EFFORT",
389
395
  )
390
396
  @click.option(
391
- "--reasoning-history/--no-reasoning-history",
392
- type=bool,
393
- is_flag=True,
394
- default=True,
395
- help="Include reasoning in chat message history sent to generate.",
397
+ "--reasoning-tokens",
398
+ type=int,
399
+ help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
400
+ envvar="INSPECT_EVAL_REASONING_TOKENS",
401
+ )
402
+ @click.option(
403
+ "--reasoning-history",
404
+ type=click.Choice(["none", "all", "last", "auto"]),
405
+ help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
396
406
  envvar="INSPECT_EVAL_REASONING_HISTORY",
397
407
  )
398
408
  @click.option(
@@ -464,10 +474,12 @@ def eval_command(
464
474
  max_tool_output: int | None,
465
475
  cache_prompt: str | None,
466
476
  reasoning_effort: str | None,
467
- reasoning_history: bool | None,
477
+ reasoning_tokens: int | None,
478
+ reasoning_history: Literal["none", "all", "last", "auto"] | None,
468
479
  message_limit: int | None,
469
480
  token_limit: int | None,
470
481
  time_limit: int | None,
482
+ working_limit: int | None,
471
483
  max_samples: int | None,
472
484
  max_tasks: int | None,
473
485
  max_subprocesses: int | None,
@@ -518,6 +530,7 @@ def eval_command(
518
530
  message_limit=message_limit,
519
531
  token_limit=token_limit,
520
532
  time_limit=time_limit,
533
+ working_limit=working_limit,
521
534
  max_samples=max_samples,
522
535
  max_tasks=max_tasks,
523
536
  max_subprocesses=max_subprocesses,
@@ -625,10 +638,12 @@ def eval_set_command(
625
638
  max_tool_output: int | None,
626
639
  cache_prompt: str | None,
627
640
  reasoning_effort: str | None,
628
- reasoning_history: bool | None,
641
+ reasoning_tokens: int | None,
642
+ reasoning_history: Literal["none", "all", "last", "auto"] | None,
629
643
  message_limit: int | None,
630
644
  token_limit: int | None,
631
645
  time_limit: int | None,
646
+ working_limit: int | None,
632
647
  max_samples: int | None,
633
648
  max_tasks: int | None,
634
649
  max_subprocesses: int | None,
@@ -684,6 +699,7 @@ def eval_set_command(
684
699
  message_limit=message_limit,
685
700
  token_limit=token_limit,
686
701
  time_limit=time_limit,
702
+ working_limit=working_limit,
687
703
  max_samples=max_samples,
688
704
  max_tasks=max_tasks,
689
705
  max_subprocesses=max_subprocesses,
@@ -737,6 +753,7 @@ def eval_exec(
737
753
  message_limit: int | None,
738
754
  token_limit: int | None,
739
755
  time_limit: int | None,
756
+ working_limit: int | None,
740
757
  max_samples: int | None,
741
758
  max_tasks: int | None,
742
759
  max_subprocesses: int | None,
@@ -817,6 +834,7 @@ def eval_exec(
817
834
  message_limit=message_limit,
818
835
  token_limit=token_limit,
819
836
  time_limit=time_limit,
837
+ working_limit=working_limit,
820
838
  max_samples=max_samples,
821
839
  max_tasks=max_tasks,
822
840
  max_subprocesses=max_subprocesses,
@@ -143,3 +143,5 @@ class Display(Protocol):
143
143
 
144
144
  @contextlib.contextmanager
145
145
  def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]: ...
146
+
147
+ def display_counter(self, caption: str, value: str) -> None: ...
@@ -9,10 +9,12 @@ from .config import task_dict
9
9
 
10
10
 
11
11
  @throttle(1)
12
- def task_footer(style: str = "") -> tuple[RenderableType, RenderableType]:
12
+ def task_footer(
13
+ counters: dict[str, str], style: str = ""
14
+ ) -> tuple[RenderableType, RenderableType]:
13
15
  return (
14
16
  Text.from_markup(task_resources(), style=style),
15
- Text.from_markup(task_http_rate_limits(), style=style),
17
+ Text.from_markup(task_counters(counters), style=style),
16
18
  )
17
19
 
18
20
 
@@ -23,5 +25,13 @@ def task_resources() -> str:
23
25
  return task_dict(resources)
24
26
 
25
27
 
26
- def task_http_rate_limits() -> str:
28
+ def task_counters(counters: dict[str, str]) -> str:
29
+ return task_dict(counters | task_http_rate_limits())
30
+
31
+
32
+ def task_http_rate_limits() -> dict[str, str]:
33
+ return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
34
+
35
+
36
+ def task_http_rate_limits_str() -> str:
27
37
  return f"HTTP rate limits: {http_rate_limit_count():,}"
@@ -22,7 +22,7 @@ from ..core.display import (
22
22
  TaskSpec,
23
23
  TaskWithResult,
24
24
  )
25
- from ..core.footer import task_http_rate_limits
25
+ from ..core.footer import task_http_rate_limits_str
26
26
  from ..core.panel import task_panel, task_targets
27
27
  from ..core.results import task_metric, tasks_results
28
28
 
@@ -89,6 +89,10 @@ class PlainDisplay(Display):
89
89
  show_model_names=self.multiple_model_names,
90
90
  )
91
91
 
92
+ def display_counter(self, caption: str, value: str) -> None:
93
+ # Not supported for plain display as counters are only shown for tasks.
94
+ pass
95
+
92
96
  def _print_results(self) -> None:
93
97
  """Print final results using rich panels"""
94
98
  panels = tasks_results(self.tasks)
@@ -178,7 +182,7 @@ class PlainTaskDisplay(TaskDisplay):
178
182
  status_parts.append(resources)
179
183
 
180
184
  # Add rate limits
181
- rate_limits = task_http_rate_limits()
185
+ rate_limits = task_http_rate_limits_str()
182
186
  if rate_limits:
183
187
  status_parts.append(rate_limits)
184
188
 
@@ -60,6 +60,7 @@ class RichDisplay(Display):
60
60
  self.parallel = False
61
61
  self.live: Live | None = None
62
62
  self.timer_handle: asyncio.TimerHandle | None = None
63
+ self.counters: dict[str, str] = {}
63
64
  rich_initialise()
64
65
 
65
66
  @override
@@ -153,13 +154,20 @@ class RichDisplay(Display):
153
154
  and self.live.is_started
154
155
  ):
155
156
  if self.parallel:
156
- r = tasks_live_status(self.total_tasks, self.tasks, self.progress_ui)
157
+ r = tasks_live_status(
158
+ self.total_tasks, self.tasks, self.progress_ui, self.counters
159
+ )
157
160
  else:
158
- r = task_live_status(self.tasks, self.progress_ui)
161
+ r = task_live_status(self.tasks, self.progress_ui, self.counters)
159
162
  self.live.update(r, refresh=True)
160
163
 
161
164
  self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)
162
165
 
166
+ @override
167
+ def display_counter(self, caption: str, value: str) -> None:
168
+ self.counters[caption] = value
169
+ self._update_display()
170
+
163
171
 
164
172
  class RichTaskScreen(TaskScreen):
165
173
  def __init__(self, live: Live) -> None:
@@ -286,7 +294,9 @@ class RichTaskDisplay(TaskDisplay):
286
294
  self.p.complete()
287
295
 
288
296
 
289
- def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> RenderableType:
297
+ def task_live_status(
298
+ tasks: list[TaskStatus], progress: RProgress, counters: dict[str, str]
299
+ ) -> RenderableType:
290
300
  theme = rich_theme()
291
301
 
292
302
  # the panel contents
@@ -300,13 +310,16 @@ def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> Renderable
300
310
  show_model=len(tasks) == 1,
301
311
  body=Group("", progress),
302
312
  subtitle=subtitle,
303
- footer=task_footer(theme.light),
313
+ footer=task_footer(counters, theme.light),
304
314
  log_location=None,
305
315
  )
306
316
 
307
317
 
308
318
  def tasks_live_status(
309
- total_tasks: int, tasks: list[TaskStatus], progress: RProgress
319
+ total_tasks: int,
320
+ tasks: list[TaskStatus],
321
+ progress: RProgress,
322
+ counters: dict[str, str],
310
323
  ) -> RenderableType:
311
324
  # rendering context
312
325
  theme = rich_theme()
@@ -325,7 +338,7 @@ def tasks_live_status(
325
338
  footer_table = Table.grid(expand=True)
326
339
  footer_table.add_column()
327
340
  footer_table.add_column(justify="right")
328
- footer = task_footer(theme.light)
341
+ footer = task_footer(counters, theme.light)
329
342
  footer_table.add_row()
330
343
  footer_table.add_row(footer[0], footer[1])
331
344
 
@@ -89,6 +89,7 @@ class TaskScreenApp(App[TR]):
89
89
  self._total_tasks = 0
90
90
  self._parallel = False
91
91
  self._tasks: list[TaskWithResult] = []
92
+ self._counters: dict[str, str] = {}
92
93
 
93
94
  # all tasks processed by app
94
95
  self._app_tasks: list[TaskWithResult] = []
@@ -185,7 +186,8 @@ class TaskScreenApp(App[TR]):
185
186
  # force repaint
186
187
  self.refresh(repaint=True)
187
188
 
188
- # enable mouse support (this broke in textual 2.0 when running in VS Code)
189
+ # enable mouse support (this broke in textual 2.0 when running in VS Code
190
+ # however is fixed in textual 2.1)
189
191
  assert self.app._driver
190
192
  textual_enable_mouse_support(self.app._driver)
191
193
 
@@ -301,7 +303,7 @@ class TaskScreenApp(App[TR]):
301
303
  samples_view.set_samples(active_and_started_samples)
302
304
 
303
305
  def update_footer(self) -> None:
304
- left, right = task_footer()
306
+ left, right = task_footer(self._counters)
305
307
  footer = self.query_one(AppFooter)
306
308
  footer.left = left
307
309
  footer.right = right
@@ -315,7 +317,7 @@ class TaskScreenApp(App[TR]):
315
317
 
316
318
  def set_unread(unread: int | None) -> None:
317
319
  if unread is not None:
318
- console_tab.label = f"Console ({unread}" # type: ignore[assignment]
320
+ console_tab.label = f"Console ({unread})" # type: ignore[assignment]
319
321
  else:
320
322
  console_tab.label = "Console" # type: ignore[assignment]
321
323
 
@@ -376,6 +378,10 @@ class TaskScreenApp(App[TR]):
376
378
  except NoMatches:
377
379
  return None
378
380
 
381
+ def display_counter(self, caption: str, value: str) -> None:
382
+ self._counters[caption] = value
383
+ self.update_footer()
384
+
379
385
  class InputPanelHost(InputPanel.Host):
380
386
  def __init__(self, app: "TaskScreenApp[TR]", tab_id: str) -> None:
381
387
  self.app = app
@@ -72,3 +72,7 @@ class TextualDisplay(Display):
72
72
  def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
73
73
  with self.app.task_display(profile) as task_display:
74
74
  yield task_display
75
+
76
+ @override
77
+ def display_counter(self, caption: str, value: str) -> None:
78
+ self.app.display_counter(caption, value)
@@ -39,7 +39,7 @@ class SamplesView(Widget):
39
39
  padding: 0 1 0 1;
40
40
  layout: grid;
41
41
  grid-size: 2 3;
42
- grid-rows: auto 1fr auto;
42
+ grid-rows: auto 1fr 3;
43
43
  grid-columns: 32 1fr;
44
44
  grid-gutter: 1;
45
45
  }
@@ -141,8 +141,8 @@ class SamplesList(OptionList):
141
141
  if highlighted_sample and (highlighted_sample not in self.samples):
142
142
  self.samples.append(highlighted_sample)
143
143
 
144
- # sort the samples by execution time
145
- self.samples.sort(key=lambda sample: sample.execution_time, reverse=True)
144
+ # sort the samples by running time
145
+ self.samples.sort(key=lambda sample: sample.running_time, reverse=True)
146
146
 
147
147
  # rebuild the list
148
148
  self.clear_options()
@@ -154,9 +154,7 @@ class SamplesList(OptionList):
154
154
  table.add_column(width=1)
155
155
  task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
156
156
  task_name.truncate(18, overflow="ellipsis", pad=True)
157
- task_time = Text.from_markup(
158
- f"{format_progress_time(sample.execution_time)}"
159
- )
157
+ task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
160
158
  table.add_row(task_name, task_time, " ")
161
159
  sample_id = Text.from_markup(f"id: {sample.sample.id}")
162
160
  sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -423,10 +421,6 @@ class SampleToolbar(Horizontal):
423
421
  CANCEL_DISABLED = "Cancelling sample..."
424
422
 
425
423
  DEFAULT_CSS = f"""
426
- SampleToolbar {{
427
- grid-size: 5 1;
428
- grid-columns: auto auto 1fr auto auto;
429
- }}
430
424
  SampleToolbar #{STATUS_GROUP} {{
431
425
  width: 22;
432
426
  }}
@@ -9,7 +9,7 @@ from textual.containers import ScrollableContainer
9
9
  from textual.widget import Widget
10
10
  from textual.widgets import Static
11
11
 
12
- from inspect_ai._util.content import ContentText
12
+ from inspect_ai._util.content import ContentReasoning, ContentText
13
13
  from inspect_ai._util.rich import lines_display
14
14
  from inspect_ai._util.transcript import (
15
15
  set_transcript_markdown_options,
@@ -36,7 +36,6 @@ from inspect_ai.log._transcript import (
36
36
  )
37
37
  from inspect_ai.model._chat_message import (
38
38
  ChatMessage,
39
- ChatMessageAssistant,
40
39
  ChatMessageUser,
41
40
  )
42
41
  from inspect_ai.model._render import messages_preceding_assistant
@@ -193,16 +192,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
193
192
  return EventDisplay(f"model: {event.model}", Group(*content))
194
193
 
195
194
 
196
- def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
197
- # render sub-events
198
- display: list[EventDisplay] = []
199
- if event.events:
200
- for e in event.events:
201
- display.extend(render_event(e) or [])
195
+ def render_sub_events(events: list[Event]) -> list[RenderableType]:
196
+ content: list[RenderableType] = []
197
+ for e in events:
198
+ event_displays = render_event(e) or []
199
+ for d in event_displays:
200
+ if d.content:
201
+ content.append(Text(" "))
202
+ content.append(transcript_separator(d.title, "black", "··"))
203
+ if isinstance(d.content, Markdown):
204
+ set_transcript_markdown_options(d.content)
205
+ content.append(d.content)
202
206
 
207
+ return content
208
+
209
+
210
+ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
203
211
  # render the call
204
212
  content = transcript_tool_call(event)
205
213
 
214
+ # render sub-events
215
+ if event.events:
216
+ content.extend(render_sub_events(event.events))
217
+
206
218
  # render the output
207
219
  if isinstance(event.result, list):
208
220
  result: ToolResult = "\n".join(
@@ -220,7 +232,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
220
232
  result = str(result).strip()
221
233
  content.extend(lines_display(result, 50))
222
234
 
223
- return display + [EventDisplay("tool call", Group(*content))]
235
+ return [EventDisplay("tool call", Group(*content))]
224
236
 
225
237
 
226
238
  def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +269,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:
257
269
 
258
270
 
259
271
  def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
272
+ # render header
273
+ content: list[RenderableType] = [transcript_function(event.name, event.input)]
274
+
260
275
  # render sub-events
261
- display: list[EventDisplay] = []
262
276
  if event.events:
263
- for e in event.events:
264
- display.extend(render_event(e) or [])
277
+ content.extend(render_sub_events(event.events))
265
278
 
266
- content: list[RenderableType] = [transcript_function(event.name, event.input)]
267
279
  if event.result:
268
280
  content.append(Text())
269
281
  if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +283,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
271
283
  else:
272
284
  content.append(render_as_json(event.result))
273
285
 
274
- return display + [EventDisplay(f"subtask: {event.name}", Group(*content))]
286
+ return [EventDisplay(f"subtask: {event.name}", Group(*content))]
275
287
 
276
288
 
277
289
  def render_input_event(event: InputEvent) -> EventDisplay:
@@ -320,11 +332,16 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
320
332
  Text(),
321
333
  ]
322
334
 
323
- if isinstance(message, ChatMessageAssistant) and message.reasoning:
324
- content.extend(transcript_reasoning(message.reasoning))
325
-
326
- if message.text:
335
+ # deal with plain text or with content blocks
336
+ if isinstance(message.content, str):
327
337
  content.extend([transcript_markdown(message.text.strip(), escape=True)])
338
+ else:
339
+ for c in message.content:
340
+ if isinstance(c, ContentReasoning):
341
+ content.extend(transcript_reasoning(c))
342
+ elif isinstance(c, ContentText):
343
+ content.extend([transcript_markdown(c.text.strip(), escape=True)])
344
+
328
345
  return content
329
346
 
330
347
 
inspect_ai/_eval/eval.py CHANGED
@@ -75,6 +75,7 @@ def eval(
75
75
  message_limit: int | None = None,
76
76
  token_limit: int | None = None,
77
77
  time_limit: int | None = None,
78
+ working_limit: int | None = None,
78
79
  max_samples: int | None = None,
79
80
  max_tasks: int | None = None,
80
81
  max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
132
133
  so they can be debugged (defaults to False).
133
134
  message_limit: Limit on total messages used for each sample.
134
135
  token_limit: Limit on total tokens used for each sample.
135
- time_limit: Limit on time (in seconds) for execution of each sample.
136
+ time_limit: Limit on clock time (in seconds) for samples.
137
+ working_limit: Limit on working time (in seconds) for samples. Working
138
+ time includes model generation, tool calls, etc. but does not include
139
+ time spent waiting on retries or shared resources.
136
140
  max_samples: Maximum number of samples to run in parallel
137
141
  (default is max_connections)
138
142
  max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
186
190
  message_limit=message_limit,
187
191
  token_limit=token_limit,
188
192
  time_limit=time_limit,
193
+ working_limit=working_limit,
189
194
  max_samples=max_samples,
190
195
  max_tasks=max_tasks,
191
196
  max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
227
232
  message_limit: int | None = None,
228
233
  token_limit: int | None = None,
229
234
  time_limit: int | None = None,
235
+ working_limit: int | None = None,
230
236
  max_samples: int | None = None,
231
237
  max_tasks: int | None = None,
232
238
  max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
281
287
  so they can be debugged (defaults to False).
282
288
  message_limit (int | None): Limit on total messages used for each sample.
283
289
  token_limit (int | None): Limit on total tokens used for each sample.
284
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
290
+ time_limit: Limit on clock time (in seconds) for samples.
291
+ working_limit: Limit on working time (in seconds) for sample. Working
292
+ time includes model generation, tool calls, etc. but does not include
293
+ time spent waiting on retries or shared resources.
285
294
  max_samples (int | None): Maximum number of samples to run in parallel
286
295
  (default is max_connections)
287
296
  max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
395
404
  message_limit=message_limit,
396
405
  token_limit=token_limit,
397
406
  time_limit=time_limit,
407
+ working_limit=working_limit,
398
408
  max_samples=max_samples,
399
409
  max_tasks=max_tasks,
400
410
  max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
702
712
  message_limit = eval_log.eval.config.message_limit
703
713
  token_limit = eval_log.eval.config.token_limit
704
714
  time_limit = eval_log.eval.config.time_limit
715
+ working_limit = eval_log.eval.config.working_limit
705
716
  max_samples = max_samples or eval_log.eval.config.max_samples
706
717
  max_tasks = max_tasks or eval_log.eval.config.max_tasks
707
718
  max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
763
774
  message_limit=message_limit,
764
775
  token_limit=token_limit,
765
776
  time_limit=time_limit,
777
+ working_limit=working_limit,
766
778
  max_samples=max_samples,
767
779
  max_tasks=max_tasks,
768
780
  max_subprocesses=max_subprocesses,
@@ -79,6 +79,7 @@ def eval_set(
79
79
  message_limit: int | None = None,
80
80
  token_limit: int | None = None,
81
81
  time_limit: int | None = None,
82
+ working_limit: int | None = None,
82
83
  max_samples: int | None = None,
83
84
  max_tasks: int | None = None,
84
85
  max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
146
147
  so they can be debugged (defaults to False).
147
148
  message_limit: Limit on total messages used for each sample.
148
149
  token_limit: Limit on total tokens used for each sample.
149
- time_limit: Limit on time (in seconds) for execution of each sample.
150
+ time_limit: Limit on clock time (in seconds) for samples.
151
+ working_limit: Limit on working time (in seconds) for samples. Working
152
+ time includes model generation, tool calls, etc. but does not include
153
+ time spent waiting on retries or shared resources.
150
154
  max_samples: Maximum number of samples to run in parallel
151
155
  (default is max_connections)
152
156
  max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
202
206
  message_limit=message_limit,
203
207
  token_limit=token_limit,
204
208
  time_limit=time_limit,
209
+ working_limit=working_limit,
205
210
  max_samples=max_samples,
206
211
  max_tasks=max_tasks,
207
212
  max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
163
163
  else:
164
164
  task.time_limit = task_eval_config.time_limit
165
165
 
166
+ # sample working limit
167
+ if task_eval_config.working_limit is None:
168
+ task_eval_config.working_limit = task.working_limit
169
+ else:
170
+ task.working_limit = task_eval_config.working_limit
171
+
166
172
  # fail_on_error
167
173
  if task_eval_config.fail_on_error is None:
168
174
  task_eval_config.fail_on_error = task.fail_on_error