inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +281 -153
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_common.py +117 -58
  157. inspect_ai/tool/_tools/_computer/_computer.py +80 -57
  158. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
  159. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
  160. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
  161. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
  162. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
  163. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
  164. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  165. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
  166. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
  167. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
  168. inspect_ai/tool/_tools/_computer/test_args.py +151 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  173. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  174. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  180. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  184. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  185. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  186. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  187. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  191. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  192. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  196. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  197. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  198. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  199. inspect_ai/util/__init__.py +2 -1
  200. inspect_ai/util/_display.py +12 -0
  201. inspect_ai/util/_sandbox/events.py +55 -21
  202. inspect_ai/util/_sandbox/self_check.py +131 -43
  203. inspect_ai/util/_subtask.py +11 -0
  204. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
  205. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
  207. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  208. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  209. inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
  210. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  211. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  212. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  214. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  215. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  216. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  217. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
  218. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
  219. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
inspect_ai/model/_chat_message.py

@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union
 
 from pydantic import BaseModel, Field, model_validator
 
-from inspect_ai._util.content import Content, ContentText
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError
 
@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
             self.content = text
         else:
             all_other = [content for content in self.content if content.type != "text"]
-            self.content = [ContentText(text=text)] + all_other
+            self.content = all_other + [ContentText(text=text)]
 
 
 class ChatMessageSystem(ChatMessageBase):
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""
 
-    reasoning: str | None = Field(default=None)
-    """Reasoning content."""
-
     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
     # content, however since this field doesn't exist in the OpenAI interface,
     # hosting providers (so far we've seen this with Together and Groq) may
@@ -110,12 +107,30 @@
     @classmethod
     def extract_reasoning(cls, data: Any) -> Any:
         if isinstance(data, dict):
+            # cleave apart <think> blocks
             content = data.get("content", None)
             if isinstance(content, str):
                 parsed = parse_content_with_reasoning(content)
                 if parsed:
-                    data["reasoning"] = parsed.reasoning
-                    data["content"] = parsed.content
+                    data["content"] = [
+                        ContentReasoning(reasoning=parsed.reasoning),
+                        ContentText(text=parsed.content),
+                    ]
+            # migrate messages that has explicit 'reasoning' field
+            # (which was our original representation of reasoning)
+            reasoning = data.get("reasoning", None)
+            if isinstance(reasoning, str):
+                # ensure that content is a list
+                content = data.get("content", None)
+                if content is None:
+                    data["content"] = []
+                elif isinstance(content, str):
+                    data["content"] = [ContentText(text=content)]
+                elif not isinstance(content, list):
+                    data["content"] = []
+                data["content"].insert(0, ContentReasoning(reasoning=reasoning))
+
+                del data["reasoning"]
         return data
 
 
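Net effect of the hunks above: assistant reasoning now lives in the message content list (as a ContentReasoning block) rather than in a dedicated reasoning field, and older payloads are migrated on load. A minimal sketch of how the migrated validator behaves, assuming the legacy payload is loaded via pydantic's model_validate (the sample values are illustrative):

from inspect_ai.model import ChatMessageAssistant

# pre-0.3.72 payload that still carries a top-level 'reasoning' field
legacy = {
    "content": "The answer is 42.",
    "reasoning": "Consider the question carefully...",
}
msg = ChatMessageAssistant.model_validate(legacy)

# the validator moves reasoning into a ContentReasoning block at the head of content
print([c.type for c in msg.content])  # expected: ['reasoning', 'text']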
inspect_ai/model/_conversation.py

@@ -1,6 +1,7 @@
 from rich.console import RenderableType
 from rich.text import Text
 
+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
@@ -41,14 +42,15 @@ def conversation_assistant_message(
     # build content
     content: list[RenderableType] = []
 
-    # reasoning
-    if message.reasoning:
-        content.extend(transcript_reasoning(message.reasoning))
-
-    # message text
-    content.extend(
-        [transcript_markdown(message.text, escape=True)] if message.text else []
-    )
+    # deal with plain text or with content blocks
+    if isinstance(message.content, str):
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
+    else:
+        for c in message.content:
+            if isinstance(c, ContentReasoning):
+                content.extend(transcript_reasoning(c))
+            elif isinstance(c, ContentText) and c.text:
+                content.extend([transcript_markdown(c.text.strip(), escape=True)])
 
     # print tool calls
     if message.tool_calls:
inspect_ai/model/_generate_config.py

@@ -1,8 +1,8 @@
 from contextvars import ContextVar
 from copy import deepcopy
-from typing import Literal, Union
+from typing import Any, Literal, Union
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing_extensions import TypedDict
 
 
@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
 
-    reasoning_history: bool | None
+    reasoning_tokens: int | None
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None
     """Include reasoning in chat message history sent to generate."""
 
 
@@ -148,9 +151,27 @@
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
 
-    reasoning_history: bool | None = Field(default=None)
+    reasoning_tokens: int | None = Field(default=None)
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
+        default=None
+    )
     """Include reasoning in chat message history sent to generate."""
 
+    # migrate reasoning_history as a bool
+    @model_validator(mode="before")
+    @classmethod
+    def migrate_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            reasoning_history = data.get("reasoning_history", None)
+            if reasoning_history is True:
+                data["reasoning_history"] = "all"
+            elif reasoning_history is False:
+                data["reasoning_history"] = "none"
+
+        return data
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":
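For callers, the bool-to-literal migration means existing configs keep loading while the new values add finer control. A short sketch of the behavior implied by the validator above (example values are illustrative):

from inspect_ai.model import GenerateConfig

# legacy boolean values are migrated by migrate_reasoning
assert GenerateConfig.model_validate({"reasoning_history": True}).reasoning_history == "all"
assert GenerateConfig.model_validate({"reasoning_history": False}).reasoning_history == "none"

# new-style usage: replay only the last reasoning block and budget reasoning tokens
config = GenerateConfig(reasoning_history="last", reasoning_tokens=4096)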
inspect_ai/model/_model.py

@@ -7,6 +7,7 @@ import os
 import time
 from contextvars import ContextVar
 from copy import deepcopy
+from datetime import datetime
 from types import TracebackType
 from typing import Any, AsyncIterator, Callable, Literal, Type, cast
 
@@ -21,7 +22,12 @@ from tenacity import (
 )
 
 from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
-from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.interrupt import check_sample_interrupt
 from inspect_ai._util.platform import platform_init
@@ -33,7 +39,7 @@ from inspect_ai._util.registry import (
 )
 from inspect_ai._util.retry import log_rate_limit_retry
 from inspect_ai._util.trace import trace_action
-from inspect_ai._util.working import report_sample_waiting_time
+from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -148,6 +154,17 @@ class ModelAPI(abc.ABC):
         """Default max_tokens."""
         return None
 
+    def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
+        """Default max_tokens for a given config.
+
+        Args:
+            config: Generation config.
+
+        Returns:
+            Default maximum tokens for specified configuration.
+        """
+        return None
+
     def max_connections(self) -> int:
         """Default max_connections."""
         return DEFAULT_MAX_CONNECTIONS
@@ -180,9 +197,17 @@
         """Tool results can contain images"""
         return False
 
-    def has_reasoning_history(self) -> bool:
-        """Chat message assistant messages can include reasoning."""
-        return False
+    def emulate_reasoning_history(self) -> bool:
+        """Chat message assistant messages with reasoning should playback reasoning with emulation (.e.g. <think> tags)"""
+        return True
+
+    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
+        """Force a specific reasoning history behavior for this provider."""
+        return None
+
+    def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
+        """Behavior to use for reasoning_history='auto'"""
+        return "all"
 
 
 class Model:
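Providers now describe their reasoning-history behavior through these three hooks rather than the old has_reasoning_history boolean. A hypothetical provider override might look like the sketch below (the class name and return values are illustrative, not from the package; the required generate() implementation is omitted):

from typing import Literal

from inspect_ai.model import ModelAPI


class NativeReasoningAPI(ModelAPI):
    # provider stores reasoning natively, so no <think> emulation is needed
    def emulate_reasoning_history(self) -> bool:
        return False

    # don't force a behavior; honor the reasoning_history config value
    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
        return None

    # when reasoning_history="auto", replay only the most recent reasoning block
    def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
        return "last"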
@@ -285,9 +310,10 @@ class Model:
         config = base_config.merge(config)
 
         # provide max_tokens from the model api if required
-        config.max_tokens = (
-            config.max_tokens if config.max_tokens else self.api.max_tokens()
-        )
+        if config.max_tokens is None:
+            config.max_tokens = self.api.max_tokens_for_config(config)
+            if config.max_tokens is None:
+                config.max_tokens = self.api.max_tokens()
 
         # disable parallel tool calls if requested by any of our tools
         if disable_parallel_tools(tools):
@@ -302,8 +328,11 @@
             input = [ChatMessageSystem(content=config.system_message)] + input
 
         # enforce concurrency limits
+        start_time = datetime.now()
+        working_start = sample_working_time()
         async with self._connection_concurrency(config):
-            return await self._generate(
+            # generate
+            output = await self._generate(
                 input=input,
                 tools=tools,
                 tool_choice=tool_choice,
@@ -311,6 +340,28 @@
                 cache=cache,
             )
 
+            # update the most recent ModelEvent with the actual start/completed
+            # times as well as a computation of working time (events are
+            # created _after_ the call to _generate, potentially in response
+            # to retries, so they need their timestamp updated so it accurately
+            # reflects the full start/end time which we know here)
+            from inspect_ai.log._transcript import ModelEvent, transcript
+
+            last_model_event = transcript().find_last_event(ModelEvent)
+            if last_model_event:
+                last_model_event.timestamp = start_time
+                last_model_event.working_start = working_start
+                completed = datetime.now()
+                last_model_event.completed = completed
+                last_model_event.working_time = (
+                    output.time
+                    if output.time is not None
+                    else (completed - start_time).total_seconds()
+                )
+
+            # return output
+            return output
+
     async def _generate(
         self,
         input: list[ChatMessage],
@@ -349,9 +400,7 @@
             tool_choice = "none"
 
         # handle reasoning history
-        input = resolve_reasoning_history(
-            input, config, self.api.has_reasoning_history()
-        )
+        input = resolve_reasoning_history(input, config, self.api)
 
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)
@@ -849,68 +898,91 @@ def simple_input_messages(
 
 
 def resolve_reasoning_history(
-    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+    messages: list[ChatMessage],
+    config: GenerateConfig,
+    model_api: ModelAPI,
 ) -> list[ChatMessage]:
-    # determine if we are including reasoning history
-    reasoning_history = config.reasoning_history is not False
-
     # determine up front if we have any reasoning content
     have_reasoning = any(
         [
-            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            isinstance(m, ChatMessageAssistant)
+            and isinstance(m.content, list)
+            and any([c for c in m.content if isinstance(c, ContentReasoning)])
             for m in messages
         ]
     )
     if not have_reasoning:
         return messages
 
-    # API asssistant message format directly supports reasoning history so we will:
-    # (a) Remove reasoning content entirely if config says not to include it; or
-    # (b) Leave the messages alone if config says to include it
-    if api_has_reasoning_history:
-        # remove reasoning history as per config
-        if not reasoning_history:
-            resolved_messages: list[ChatMessage] = []
-            for message in messages:
-                if isinstance(message, ChatMessageAssistant):
-                    resolved_messages.append(
-                        message.model_copy(update={"reasoning": None})
-                    )
-                else:
-                    resolved_messages.append(message)
-
-            return resolved_messages
-
-        # include reasoning history as per config
-        else:
-            return messages
+    # determine reasoning history configuration
+    reasoning_history = (
+        config.reasoning_history if config.reasoning_history is not None else "auto"
+    )
 
-    # API can't represent reasoning natively so include <think> tags
-    elif reasoning_history:
+    # see if the provider is forcing a reasoning history
+    force = model_api.force_reasoning_history()
+    if force is not None:
+        reasoning_history = force
+    # if it's 'auto' then defer to the provider
+    elif reasoning_history == "auto":
+        reasoning_history = model_api.auto_reasoning_history()
+
+    # generate a version of message history with the correct history
+    if reasoning_history == "all":
+        resolved_messages: list[ChatMessage] = messages
+    else:
+        found_last = False
         resolved_messages = []
-        for message in messages:
-            if (
-                isinstance(message, ChatMessageAssistant)
-                and message.reasoning is not None
+        for message in reversed(messages):
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
             ):
-                message = deepcopy(message)
-                if isinstance(message.content, str):
-                    message.content = (
-                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
-                    )
-                else:
-                    message.content.insert(
-                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
-                    )
-                message.reasoning = None
+                # is there reasoning in this message?
+                has_reasoning = any(
+                    isinstance(c, ContentReasoning) for c in message.content
+                )
+                # remove it unless we are in "last" mode and haven't yet found last
+                if has_reasoning:
+                    if reasoning_history == "none" or found_last:
+                        message = message.model_copy(
+                            update={
+                                "content": [
+                                    content
+                                    for content in message.content
+                                    if not isinstance(content, ContentReasoning)
+                                ]
+                            }
+                        )
+                    found_last = True
 
             resolved_messages.append(message)
 
-        return resolved_messages
+        # reverse them back
+        resolved_messages.reverse()
 
-    # api doesn't handle reasoning and config says no reasoning_history, nothing to do
-    else:
-        return messages
+    # api can't represent reasoning natively so emulate it
+    if model_api.emulate_reasoning_history():
+        emulated_messages: list[ChatMessage] = []
+        for message in resolved_messages:
+            if isinstance(message, ChatMessageAssistant) and isinstance(
+                message.content, list
+            ):
+                content: list[Content] = []
+                for c in message.content:
+                    if isinstance(c, ContentReasoning):
+                        content.append(
+                            ContentText(text=f"<think>\n{c.reasoning}\n</think>")
+                        )
+                    else:
+                        content.append(c)
+                message = message.model_copy(update={"content": content})
+
+            emulated_messages.append(message)
+
+        resolved_messages = emulated_messages
+
+    # return messages
+    return resolved_messages
 
 
 def resolve_tool_model_input(
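When a provider reports emulate_reasoning_history() as True, retained reasoning blocks are replayed as <think> text rather than native reasoning content. A standalone illustration of that transformation (it mirrors the loop above without calling the private helper; the sample strings are made up):

from inspect_ai._util.content import ContentReasoning, ContentText

content = [
    ContentReasoning(reasoning="First consider the units..."),
    ContentText(text="The answer is 42."),
]
emulated = [
    ContentText(text=f"<think>\n{c.reasoning}\n</think>")
    if isinstance(c, ContentReasoning)
    else c
    for c in content
]
print(emulated[0].text)  # "<think>\nFirst consider the units...\n</think>"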
@@ -1200,6 +1272,10 @@ def set_model_usage(
         if total_usage.input_tokens_cache_read is None:
             total_usage.input_tokens_cache_read = 0
         total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
+    if usage.reasoning_tokens is not None:
+        if total_usage.reasoning_tokens is None:
+            total_usage.reasoning_tokens = 0
+        total_usage.reasoning_tokens += usage.reasoning_tokens
 
     model_usage[model] = total_usage
 
inspect_ai/model/_model_output.py

@@ -26,6 +26,9 @@ class ModelUsage(BaseModel):
     input_tokens_cache_read: int | None = Field(default=None)
     """Number of tokens retrieved from the cache."""
 
+    reasoning_tokens: int | None = Field(default=None)
+    """Number of tokens used for reasoning."""
+
 
 StopReason = Literal[
     "stop",
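With the new field, per-model usage totals can accumulate reasoning tokens alongside the existing counters. A quick sketch mirroring the accumulation logic in set_model_usage above (the token counts are made up):

from inspect_ai.model import ModelUsage

total = ModelUsage(input_tokens=1000, output_tokens=700, total_tokens=1700)
usage = ModelUsage(input_tokens=200, output_tokens=300, total_tokens=500, reasoning_tokens=128)

if usage.reasoning_tokens is not None:
    if total.reasoning_tokens is None:
        total.reasoning_tokens = 0
    total.reasoning_tokens += usage.reasoning_tokens

print(total.reasoning_tokens)  # 128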