inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  import abc
2
- import asyncio
2
+ import contextlib
3
3
  import functools
4
4
  import json
5
5
  import logging
@@ -7,8 +7,9 @@ import os
7
7
  import time
8
8
  from contextvars import ContextVar
9
9
  from copy import deepcopy
10
+ from datetime import datetime
10
11
  from types import TracebackType
11
- from typing import Any, Callable, Literal, Type, cast
12
+ from typing import Any, AsyncIterator, Callable, Literal, Type, cast
12
13
 
13
14
  from pydantic_core import to_jsonable_python
14
15
  from tenacity import (
@@ -21,7 +22,12 @@ from tenacity import (
21
22
  )
22
23
 
23
24
  from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
24
- from inspect_ai._util.content import Content, ContentImage, ContentText
25
+ from inspect_ai._util.content import (
26
+ Content,
27
+ ContentImage,
28
+ ContentReasoning,
29
+ ContentText,
30
+ )
25
31
  from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
26
32
  from inspect_ai._util.interrupt import check_sample_interrupt
27
33
  from inspect_ai._util.platform import platform_init
@@ -33,6 +39,7 @@ from inspect_ai._util.registry import (
33
39
  )
34
40
  from inspect_ai._util.retry import log_rate_limit_retry
35
41
  from inspect_ai._util.trace import trace_action
42
+ from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
36
43
  from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
37
44
  from inspect_ai.tool._tool_def import ToolDef, tool_defs
38
45
  from inspect_ai.util import concurrency
@@ -147,6 +154,17 @@ class ModelAPI(abc.ABC):
147
154
  """Default max_tokens."""
148
155
  return None
149
156
 
157
+ def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
158
+ """Default max_tokens for a given config.
159
+
160
+ Args:
161
+ config: Generation config.
162
+
163
+ Returns:
164
+ Default maximum tokens for specified configuration.
165
+ """
166
+ return None
167
+
150
168
  def max_connections(self) -> int:
151
169
  """Default max_connections."""
152
170
  return DEFAULT_MAX_CONNECTIONS
@@ -179,9 +197,17 @@ class ModelAPI(abc.ABC):
179
197
  """Tool results can contain images"""
180
198
  return False
181
199
 
182
- def has_reasoning_history(self) -> bool:
183
- """Chat message assistant messages can include reasoning."""
184
- return False
200
+ def emulate_reasoning_history(self) -> bool:
201
+ """Chat message assistant messages with reasoning should play back reasoning with emulation (e.g. <think> tags)"""
202
+ return True
203
+
204
+ def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
205
+ """Force a specific reasoning history behavior for this provider."""
206
+ return None
207
+
208
+ def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
209
+ """Behavior to use for reasoning_history='auto'"""
210
+ return "all"
185
211
 
186
212
 
187
213
  class Model:
@@ -284,9 +310,10 @@ class Model:
284
310
  config = base_config.merge(config)
285
311
 
286
312
  # provide max_tokens from the model api if required
287
- config.max_tokens = (
288
- config.max_tokens if config.max_tokens else self.api.max_tokens()
289
- )
313
+ if config.max_tokens is None:
314
+ config.max_tokens = self.api.max_tokens_for_config(config)
315
+ if config.max_tokens is None:
316
+ config.max_tokens = self.api.max_tokens()
290
317
 
291
318
  # disable parallel tool calls if requested by any of our tools
292
319
  if disable_parallel_tools(tools):
@@ -301,8 +328,11 @@ class Model:
301
328
  input = [ChatMessageSystem(content=config.system_message)] + input
302
329
 
303
330
  # enforce concurrency limits
331
+ start_time = datetime.now()
332
+ working_start = sample_working_time()
304
333
  async with self._connection_concurrency(config):
305
- return await self._generate(
334
+ # generate
335
+ output = await self._generate(
306
336
  input=input,
307
337
  tools=tools,
308
338
  tool_choice=tool_choice,
@@ -310,6 +340,28 @@ class Model:
310
340
  cache=cache,
311
341
  )
312
342
 
343
+ # update the most recent ModelEvent with the actual start/completed
344
+ # times as well as a computation of working time (events are
345
+ # created _after_ the call to _generate, potentially in response
346
+ # to retries, so they need their timestamp updated so it accurately
347
+ # reflects the full start/end time which we know here)
348
+ from inspect_ai.log._transcript import ModelEvent, transcript
349
+
350
+ last_model_event = transcript().find_last_event(ModelEvent)
351
+ if last_model_event:
352
+ last_model_event.timestamp = start_time
353
+ last_model_event.working_start = working_start
354
+ completed = datetime.now()
355
+ last_model_event.completed = completed
356
+ last_model_event.working_time = (
357
+ output.time
358
+ if output.time is not None
359
+ else (completed - start_time).total_seconds()
360
+ )
361
+
362
+ # return output
363
+ return output
364
+
313
365
  async def _generate(
314
366
  self,
315
367
  input: list[ChatMessage],
@@ -348,9 +400,7 @@ class Model:
348
400
  tool_choice = "none"
349
401
 
350
402
  # handle reasoning history
351
- input = resolve_reasoning_history(
352
- input, config, self.api.has_reasoning_history()
353
- )
403
+ input = resolve_reasoning_history(input, config, self.api)
354
404
 
355
405
  # apply any tool model_input handlers
356
406
  input = resolve_tool_model_input(tdefs, input)
@@ -435,14 +485,16 @@ class Model:
435
485
  )
436
486
 
437
487
  with trace_action(logger, "Model", f"generate ({str(self)})"):
438
- time_start = time.perf_counter()
439
- result = await self.api.generate(
440
- input=input,
441
- tools=tools,
442
- tool_choice=tool_choice,
443
- config=config,
444
- )
445
- time_elapsed = time.perf_counter() - time_start
488
+ time_start = time.monotonic()
489
+ try:
490
+ result = await self.api.generate(
491
+ input=input,
492
+ tools=tools,
493
+ tool_choice=tool_choice,
494
+ config=config,
495
+ )
496
+ finally:
497
+ time_elapsed = time.monotonic() - time_start
446
498
 
447
499
  if isinstance(result, tuple):
448
500
  output, call = result
@@ -461,8 +513,12 @@ class Model:
461
513
  error_message = f"{error}\n\nRequest:\n{request}"
462
514
  raise RuntimeError(error_message)
463
515
 
464
- # update output with time elapsed
465
- output.time = time_elapsed
516
+ # update output with time (call.time captures time spent
517
+ # on the actual request that succeeds w/ status 200)
518
+ if call and call.time is not None:
519
+ output.time = call.time
520
+ else:
521
+ output.time = time_elapsed
466
522
 
467
523
  # add views to tool calls
468
524
  for choice in output.choices:
@@ -488,8 +544,13 @@ class Model:
488
544
 
489
545
  return output
490
546
 
491
- # call the model
547
+ # call the model (this will do retries, etc., so report waiting time
548
+ # as elapsed time - actual time for successful model call)
549
+ time_start = time.monotonic()
492
550
  model_output = await generate()
551
+ total_time = time.monotonic() - time_start
552
+ if model_output.time:
553
+ report_sample_waiting_time(total_time - model_output.time)
493
554
 
494
555
  # return results
495
556
  return model_output
@@ -513,7 +574,10 @@ class Model:
513
574
  # override the _connection_key() argument to provide a scope within which
514
575
  # to enforce max_connections (e.g. by account/api_key, by endpoint, etc.)
515
576
 
516
- def _connection_concurrency(self, config: GenerateConfig) -> asyncio.Semaphore:
577
+ @contextlib.asynccontextmanager
578
+ async def _connection_concurrency(
579
+ self, config: GenerateConfig
580
+ ) -> AsyncIterator[None]:
517
581
  """Get the appropriate connection semaphore for this model instance."""
518
582
  max_connections = (
519
583
  config.max_connections
@@ -521,11 +585,12 @@ class Model:
521
585
  else self.api.max_connections()
522
586
  )
523
587
  model_name = ModelName(self)
524
- return concurrency(
588
+ async with concurrency(
525
589
  name=f"{model_name.api}",
526
590
  concurrency=max_connections,
527
591
  key=f"Model{self.api.connection_key()}",
528
- )
592
+ ):
593
+ yield
529
594
 
530
595
  def _record_model_interaction(
531
596
  self,
@@ -833,68 +898,91 @@ def simple_input_messages(
833
898
 
834
899
 
835
900
  def resolve_reasoning_history(
836
- messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
901
+ messages: list[ChatMessage],
902
+ config: GenerateConfig,
903
+ model_api: ModelAPI,
837
904
  ) -> list[ChatMessage]:
838
- # determine if we are including reasoning history
839
- reasoning_history = config.reasoning_history is not False
840
-
841
905
  # determine up front if we have any reasoning content
842
906
  have_reasoning = any(
843
907
  [
844
- isinstance(m, ChatMessageAssistant) and m.reasoning is not None
908
+ isinstance(m, ChatMessageAssistant)
909
+ and isinstance(m.content, list)
910
+ and any([c for c in m.content if isinstance(c, ContentReasoning)])
845
911
  for m in messages
846
912
  ]
847
913
  )
848
914
  if not have_reasoning:
849
915
  return messages
850
916
 
851
- # API assistant message format directly supports reasoning history so we will:
852
- # (a) Remove reasoning content entirely if config says not to include it; or
853
- # (b) Leave the messages alone if config says to include it
854
- if api_has_reasoning_history:
855
- # remove reasoning history as per config
856
- if not reasoning_history:
857
- resolved_messages: list[ChatMessage] = []
858
- for message in messages:
859
- if isinstance(message, ChatMessageAssistant):
860
- resolved_messages.append(
861
- message.model_copy(update={"reasoning": None})
862
- )
863
- else:
864
- resolved_messages.append(message)
865
-
866
- return resolved_messages
867
-
868
- # include reasoning history as per config
869
- else:
870
- return messages
917
+ # determine reasoning history configuration
918
+ reasoning_history = (
919
+ config.reasoning_history if config.reasoning_history is not None else "auto"
920
+ )
871
921
 
872
- # API can't represent reasoning natively so include <think> tags
873
- elif reasoning_history:
922
+ # see if the provider is forcing a reasoning history
923
+ force = model_api.force_reasoning_history()
924
+ if force is not None:
925
+ reasoning_history = force
926
+ # if it's 'auto' then defer to the provider
927
+ elif reasoning_history == "auto":
928
+ reasoning_history = model_api.auto_reasoning_history()
929
+
930
+ # generate a version of message history with the correct history
931
+ if reasoning_history == "all":
932
+ resolved_messages: list[ChatMessage] = messages
933
+ else:
934
+ found_last = False
874
935
  resolved_messages = []
875
- for message in messages:
876
- if (
877
- isinstance(message, ChatMessageAssistant)
878
- and message.reasoning is not None
936
+ for message in reversed(messages):
937
+ if isinstance(message, ChatMessageAssistant) and isinstance(
938
+ message.content, list
879
939
  ):
880
- message = deepcopy(message)
881
- if isinstance(message.content, str):
882
- message.content = (
883
- f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
884
- )
885
- else:
886
- message.content.insert(
887
- 0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
888
- )
889
- message.reasoning = None
940
+ # is there reasoning in this message?
941
+ has_reasoning = any(
942
+ isinstance(c, ContentReasoning) for c in message.content
943
+ )
944
+ # remove it unless we are in "last" mode and haven't yet found last
945
+ if has_reasoning:
946
+ if reasoning_history == "none" or found_last:
947
+ message = message.model_copy(
948
+ update={
949
+ "content": [
950
+ content
951
+ for content in message.content
952
+ if not isinstance(content, ContentReasoning)
953
+ ]
954
+ }
955
+ )
956
+ found_last = True
890
957
 
891
958
  resolved_messages.append(message)
892
959
 
893
- return resolved_messages
960
+ # reverse them back
961
+ resolved_messages.reverse()
894
962
 
895
- # api doesn't handle reasoning and config says no reasoning_history, nothing to do
896
- else:
897
- return messages
963
+ # api can't represent reasoning natively so emulate it
964
+ if model_api.emulate_reasoning_history():
965
+ emulated_messages: list[ChatMessage] = []
966
+ for message in resolved_messages:
967
+ if isinstance(message, ChatMessageAssistant) and isinstance(
968
+ message.content, list
969
+ ):
970
+ content: list[Content] = []
971
+ for c in message.content:
972
+ if isinstance(c, ContentReasoning):
973
+ content.append(
974
+ ContentText(text=f"<think>\n{c.reasoning}\n</think>")
975
+ )
976
+ else:
977
+ content.append(c)
978
+ message = message.model_copy(update={"content": content})
979
+
980
+ emulated_messages.append(message)
981
+
982
+ resolved_messages = emulated_messages
983
+
984
+ # return messages
985
+ return resolved_messages
898
986
 
899
987
 
900
988
  def resolve_tool_model_input(
@@ -1184,6 +1272,10 @@ def set_model_usage(
1184
1272
  if total_usage.input_tokens_cache_read is None:
1185
1273
  total_usage.input_tokens_cache_read = 0
1186
1274
  total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
1275
+ if usage.reasoning_tokens is not None:
1276
+ if total_usage.reasoning_tokens is None:
1277
+ total_usage.reasoning_tokens = 0
1278
+ total_usage.reasoning_tokens += usage.reasoning_tokens
1187
1279
 
1188
1280
  model_usage[model] = total_usage
1189
1281
 
@@ -1,6 +1,6 @@
1
1
  from typing import Any, Callable
2
2
 
3
- from pydantic import BaseModel, JsonValue
3
+ from pydantic import BaseModel, Field, JsonValue
4
4
 
5
5
  from inspect_ai._util.json import jsonable_python
6
6
 
@@ -22,9 +22,15 @@ class ModelCall(BaseModel):
22
22
  response: dict[str, JsonValue]
23
23
  """Raw response data from model."""
24
24
 
25
+ time: float | None = Field(default=None)
26
+ """Time taken for underlying model call."""
27
+
25
28
  @staticmethod
26
29
  def create(
27
- request: Any, response: Any, filter: ModelCallFilter | None = None
30
+ request: Any,
31
+ response: Any,
32
+ filter: ModelCallFilter | None = None,
33
+ time: float | None = None,
28
34
  ) -> "ModelCall":
29
35
  """Create a ModelCall object.
30
36
 
@@ -36,6 +42,7 @@ class ModelCall(BaseModel):
36
42
  request (Any): Request object (dict, dataclass, BaseModel, etc.)
37
43
  response (Any): Response object (dict, dataclass, BaseModel, etc.)
38
44
  filter (ModelCallFilter): Function for filtering model call data.
45
+ time: Time taken for underlying ModelCall
39
46
  """
40
47
  request_dict = jsonable_python(request)
41
48
  if filter:
@@ -43,7 +50,7 @@ class ModelCall(BaseModel):
43
50
  response_dict = jsonable_python(response)
44
51
  if filter:
45
52
  response_dict = _walk_json_value(None, response_dict, filter)
46
- return ModelCall(request=request_dict, response=response_dict)
53
+ return ModelCall(request=request_dict, response=response_dict, time=time)
47
54
 
48
55
 
49
56
  def _walk_json_value(
@@ -26,6 +26,9 @@ class ModelUsage(BaseModel):
26
26
  input_tokens_cache_read: int | None = Field(default=None)
27
27
  """Number of tokens retrieved from the cache."""
28
28
 
29
+ reasoning_tokens: int | None = Field(default=None)
30
+ """Number of tokens used for reasoning."""
31
+
29
32
 
30
33
  StopReason = Literal[
31
34
  "stop",
@@ -27,11 +27,18 @@ from openai.types.chat.chat_completion_message_tool_call import Function
27
27
  from openai.types.completion_usage import CompletionUsage
28
28
  from openai.types.shared_params.function_definition import FunctionDefinition
29
29
 
30
- from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentText
30
+ from inspect_ai._util.content import (
31
+ Content,
32
+ ContentAudio,
33
+ ContentImage,
34
+ ContentReasoning,
35
+ ContentText,
36
+ )
31
37
  from inspect_ai._util.images import file_as_data_uri
32
38
  from inspect_ai._util.url import is_http_url
33
39
  from inspect_ai.model._call_tools import parse_tool_call
34
40
  from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
41
+ from inspect_ai.model._reasoning import parse_content_with_reasoning
35
42
  from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
36
43
 
37
44
  from ._chat_message import (
@@ -148,14 +155,14 @@ async def openai_chat_message(
148
155
  if message.tool_calls:
149
156
  return ChatCompletionAssistantMessageParam(
150
157
  role=message.role,
151
- content=message.text,
158
+ content=openai_assistant_content(message),
152
159
  tool_calls=[
153
160
  openai_chat_tool_call_param(call) for call in message.tool_calls
154
161
  ],
155
162
  )
156
163
  else:
157
164
  return ChatCompletionAssistantMessageParam(
158
- role=message.role, content=message.text
165
+ role=message.role, content=openai_assistant_content(message)
159
166
  )
160
167
  elif message.role == "tool":
161
168
  return ChatCompletionToolMessageParam(
@@ -175,16 +182,29 @@ async def openai_chat_messages(
175
182
  return [await openai_chat_message(message, model) for message in messages]
176
183
 
177
184
 
185
+ def openai_assistant_content(message: ChatMessageAssistant) -> str:
186
+ if isinstance(message.content, str):
187
+ content = message.content
188
+ else:
189
+ content = ""
190
+ for c in message.content:
191
+ if c.type == "reasoning":
192
+ attribs = ""
193
+ if c.signature is not None:
194
+ attribs = f'{attribs} signature="{c.signature}"'
195
+ if c.redacted:
196
+ attribs = f'{attribs} redacted="true"'
197
+ content = f"{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n"
198
+ elif c.type == "text":
199
+ content = f"{content}\n{c.text}"
200
+ return content
201
+
202
+
178
203
  def openai_chat_choices(choices: list[ChatCompletionChoice]) -> list[Choice]:
179
204
  oai_choices: list[Choice] = []
180
205
 
181
206
  for index, choice in enumerate(choices):
182
- if isinstance(choice.message.content, str):
183
- content = choice.message.content
184
- else:
185
- content = "\n".join(
186
- [c.text for c in choice.message.content if c.type == "text"]
187
- )
207
+ content = openai_assistant_content(choice.message)
188
208
  if choice.message.tool_calls:
189
209
  tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
190
210
  else:
@@ -274,35 +294,47 @@ def chat_messages_from_openai(
274
294
  chat_messages: list[ChatMessage] = []
275
295
 
276
296
  for message in messages:
297
+ content: str | list[Content] = []
277
298
  if message["role"] == "system" or message["role"] == "developer":
278
299
  sys_content = message["content"]
279
300
  if isinstance(sys_content, str):
280
301
  chat_messages.append(ChatMessageSystem(content=sys_content))
281
302
  else:
282
- chat_messages.append(
283
- ChatMessageSystem(
284
- content=[content_from_openai(c) for c in sys_content]
285
- )
286
- )
303
+ content = []
304
+ for sc in sys_content:
305
+ content.extend(content_from_openai(sc))
306
+ chat_messages.append(ChatMessageSystem(content=content))
287
307
  elif message["role"] == "user":
288
308
  user_content = message["content"]
289
309
  if isinstance(user_content, str):
290
310
  chat_messages.append(ChatMessageUser(content=user_content))
291
311
  else:
292
- chat_messages.append(
293
- ChatMessageUser(
294
- content=[content_from_openai(c) for c in user_content]
295
- )
296
- )
312
+ content = []
313
+ for uc in user_content:
314
+ content.extend(content_from_openai(uc))
315
+ chat_messages.append(ChatMessageUser(content=content))
297
316
  elif message["role"] == "assistant":
298
317
  # resolve content
299
- asst_content = message["content"]
318
+ asst_content = message.get("content", None)
300
319
  if isinstance(asst_content, str):
301
- content: str | list[Content] = asst_content
320
+ result = parse_content_with_reasoning(asst_content)
321
+ if result is not None:
322
+ content = [
323
+ ContentReasoning(
324
+ reasoning=result.reasoning,
325
+ signature=result.signature,
326
+ redacted=result.redacted,
327
+ ),
328
+ ContentText(text=result.content),
329
+ ]
330
+ else:
331
+ content = asst_content
302
332
  elif asst_content is None:
303
333
  content = message.get("refusal", None) or ""
304
334
  else:
305
- content = [content_from_openai(c) for c in asst_content]
335
+ content = []
336
+ for ac in asst_content:
337
+ content.extend(content_from_openai(ac, parse_reasoning=True))
306
338
 
307
339
  # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
308
340
  # interfaces e.g. DeepSeek do include this field so we pluck it out)
@@ -310,22 +342,25 @@ def chat_messages_from_openai(
310
342
  "reasoning", None
311
343
  )
312
344
  if reasoning is not None:
313
- reasoning = str(reasoning)
345
+ if isinstance(content, str):
346
+ content = [ContentText(text=content)]
347
+ else:
348
+ content.insert(0, ContentReasoning(reasoning=str(reasoning)))
314
349
 
315
350
  # return message
316
351
  if "tool_calls" in message:
317
352
  tool_calls: list[ToolCall] = []
318
- for tc in message["tool_calls"]:
319
- tool_calls.append(tool_call_from_openai(tc))
320
- tool_names[tc["id"]] = tc["function"]["name"]
353
+ for call in message["tool_calls"]:
354
+ tool_calls.append(tool_call_from_openai(call))
355
+ tool_names[call["id"]] = call["function"]["name"]
321
356
 
322
357
  else:
323
358
  tool_calls = []
359
+
324
360
  chat_messages.append(
325
361
  ChatMessageAssistant(
326
362
  content=content,
327
363
  tool_calls=tool_calls or None,
328
- reasoning=reasoning,
329
364
  )
330
365
  )
331
366
  elif message["role"] == "tool":
@@ -333,7 +368,9 @@ def chat_messages_from_openai(
333
368
  if isinstance(tool_content, str):
334
369
  content = tool_content
335
370
  else:
336
- content = [content_from_openai(c) for c in tool_content]
371
+ content = []
372
+ for tc in tool_content:
373
+ content.extend(content_from_openai(tc))
337
374
  chat_messages.append(
338
375
  ChatMessageTool(
339
376
  content=content,
@@ -357,20 +394,40 @@ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> Tool
357
394
 
358
395
  def content_from_openai(
359
396
  content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
360
- ) -> Content:
397
+ parse_reasoning: bool = False,
398
+ ) -> list[Content]:
361
399
  if content["type"] == "text":
362
- return ContentText(text=content["text"])
400
+ text = content["text"]
401
+ if parse_reasoning:
402
+ result = parse_content_with_reasoning(text)
403
+ if result:
404
+ return [
405
+ ContentReasoning(
406
+ reasoning=result.reasoning,
407
+ signature=result.signature,
408
+ redacted=result.redacted,
409
+ ),
410
+ ContentText(text=result.content),
411
+ ]
412
+ else:
413
+ return [ContentText(text=text)]
414
+ else:
415
+ return [ContentText(text=text)]
363
416
  elif content["type"] == "image_url":
364
- return ContentImage(
365
- image=content["image_url"]["url"], detail=content["image_url"]["detail"]
366
- )
417
+ return [
418
+ ContentImage(
419
+ image=content["image_url"]["url"], detail=content["image_url"]["detail"]
420
+ )
421
+ ]
367
422
  elif content["type"] == "input_audio":
368
- return ContentAudio(
369
- audio=content["input_audio"]["data"],
370
- format=content["input_audio"]["format"],
371
- )
423
+ return [
424
+ ContentAudio(
425
+ audio=content["input_audio"]["data"],
426
+ format=content["input_audio"]["format"],
427
+ )
428
+ ]
372
429
  elif content["type"] == "refusal":
373
- return ContentText(text=content["refusal"])
430
+ return [ContentText(text=content["refusal"])]
374
431
 
375
432
 
376
433
  def chat_message_assistant_from_openai(
@@ -380,11 +437,20 @@ def chat_message_assistant_from_openai(
380
437
  reasoning = getattr(message, "reasoning_content", None) or getattr(
381
438
  message, "reasoning", None
382
439
  )
440
+
441
+ msg_content = refusal or message.content or ""
442
+ if reasoning is not None:
443
+ content: str | list[Content] = [
444
+ ContentReasoning(reasoning=str(reasoning)),
445
+ ContentText(text=msg_content),
446
+ ]
447
+ else:
448
+ content = msg_content
449
+
383
450
  return ChatMessageAssistant(
384
- content=refusal or message.content or "",
451
+ content=content,
385
452
  source="generate",
386
453
  tool_calls=chat_tool_calls_from_openai(message, tools),
387
- reasoning=reasoning,
388
454
  )
389
455
 
390
456