inspect-ai 0.3.70__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +134 -26
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  157. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  158. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  159. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  160. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  161. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  162. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  163. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  164. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  165. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  166. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  167. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  168. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  173. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  174. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  180. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  184. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  185. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  186. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  187. inspect_ai/util/__init__.py +2 -1
  188. inspect_ai/util/_display.py +12 -0
  189. inspect_ai/util/_sandbox/events.py +55 -21
  190. inspect_ai/util/_sandbox/self_check.py +131 -43
  191. inspect_ai/util/_subtask.py +11 -0
  192. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +1 -1
  193. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +197 -182
  194. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  195. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  196. inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
  197. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  198. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  199. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  200. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  201. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  203. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  204. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  205. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  207. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  208. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0

inspect_ai/model/_openai.py +106 -40

@@ -27,11 +27,18 @@ from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
 
-from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.model._call_tools import parse_tool_call
 from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
+from inspect_ai.model._reasoning import parse_content_with_reasoning
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
 
 from ._chat_message import (
@@ -148,14 +155,14 @@ async def openai_chat_message(
         if message.tool_calls:
             return ChatCompletionAssistantMessageParam(
                 role=message.role,
-                content=message.text,
+                content=openai_assistant_content(message),
                 tool_calls=[
                     openai_chat_tool_call_param(call) for call in message.tool_calls
                 ],
             )
         else:
             return ChatCompletionAssistantMessageParam(
-                role=message.role, content=message.text
+                role=message.role, content=openai_assistant_content(message)
             )
     elif message.role == "tool":
         return ChatCompletionToolMessageParam(
@@ -175,16 +182,29 @@ async def openai_chat_messages(
     return [await openai_chat_message(message, model) for message in messages]
 
 
+def openai_assistant_content(message: ChatMessageAssistant) -> str:
+    if isinstance(message.content, str):
+        content = message.content
+    else:
+        content = ""
+        for c in message.content:
+            if c.type == "reasoning":
+                attribs = ""
+                if c.signature is not None:
+                    attribs = f'{attribs} signature="{c.signature}"'
+                if c.redacted:
+                    attribs = f'{attribs} redacted="true"'
+                content = f"{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n"
+            elif c.type == "text":
+                content = f"{content}\n{c.text}"
+    return content
+
+
 def openai_chat_choices(choices: list[ChatCompletionChoice]) -> list[Choice]:
     oai_choices: list[Choice] = []
 
     for index, choice in enumerate(choices):
-        if isinstance(choice.message.content, str):
-            content = choice.message.content
-        else:
-            content = "\n".join(
-                [c.text for c in choice.message.content if c.type == "text"]
-            )
+        content = openai_assistant_content(choice.message)
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
@@ -274,35 +294,47 @@ def chat_messages_from_openai(
     chat_messages: list[ChatMessage] = []
 
     for message in messages:
+        content: str | list[Content] = []
         if message["role"] == "system" or message["role"] == "developer":
             sys_content = message["content"]
             if isinstance(sys_content, str):
                 chat_messages.append(ChatMessageSystem(content=sys_content))
             else:
-                chat_messages.append(
-                    ChatMessageSystem(
-                        content=[content_from_openai(c) for c in sys_content]
-                    )
-                )
+                content = []
+                for sc in sys_content:
+                    content.extend(content_from_openai(sc))
+                chat_messages.append(ChatMessageSystem(content=content))
         elif message["role"] == "user":
             user_content = message["content"]
            if isinstance(user_content, str):
                 chat_messages.append(ChatMessageUser(content=user_content))
             else:
-                chat_messages.append(
-                    ChatMessageUser(
-                        content=[content_from_openai(c) for c in user_content]
-                    )
-                )
+                content = []
+                for uc in user_content:
+                    content.extend(content_from_openai(uc))
+                chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
-            asst_content = message["content"]
+            asst_content = message.get("content", None)
             if isinstance(asst_content, str):
-                content: str | list[Content] = asst_content
+                result = parse_content_with_reasoning(asst_content)
+                if result is not None:
+                    content = [
+                        ContentReasoning(
+                            reasoning=result.reasoning,
+                            signature=result.signature,
+                            redacted=result.redacted,
+                        ),
+                        ContentText(text=result.content),
+                    ]
+                else:
+                    content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
             else:
-                content = [content_from_openai(c) for c in asst_content]
+                content = []
+                for ac in asst_content:
+                    content.extend(content_from_openai(ac, parse_reasoning=True))
 
             # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
             # interfaces e.g. DeepSeek do include this field so we pluck it out)
@@ -310,22 +342,25 @@
                 "reasoning", None
             )
             if reasoning is not None:
-                reasoning = str(reasoning)
+                if isinstance(content, str):
+                    content = [ContentText(text=content)]
+                else:
+                    content.insert(0, ContentReasoning(reasoning=str(reasoning)))
 
             # return message
             if "tool_calls" in message:
                 tool_calls: list[ToolCall] = []
-                for tc in message["tool_calls"]:
-                    tool_calls.append(tool_call_from_openai(tc))
-                    tool_names[tc["id"]] = tc["function"]["name"]
+                for call in message["tool_calls"]:
+                    tool_calls.append(tool_call_from_openai(call))
+                    tool_names[call["id"]] = call["function"]["name"]
 
             else:
                 tool_calls = []
+
             chat_messages.append(
                 ChatMessageAssistant(
                     content=content,
                     tool_calls=tool_calls or None,
-                    reasoning=reasoning,
                 )
             )
         elif message["role"] == "tool":
@@ -333,7 +368,9 @@
             if isinstance(tool_content, str):
                 content = tool_content
             else:
-                content = [content_from_openai(c) for c in tool_content]
+                content = []
+                for tc in tool_content:
+                    content.extend(content_from_openai(tc))
             chat_messages.append(
                 ChatMessageTool(
                     content=content,
@@ -357,20 +394,40 @@ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> Tool
 
 def content_from_openai(
     content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
-) -> Content:
+    parse_reasoning: bool = False,
+) -> list[Content]:
     if content["type"] == "text":
-        return ContentText(text=content["text"])
+        text = content["text"]
+        if parse_reasoning:
+            result = parse_content_with_reasoning(text)
+            if result:
+                return [
+                    ContentReasoning(
+                        reasoning=result.reasoning,
+                        signature=result.signature,
+                        redacted=result.redacted,
+                    ),
+                    ContentText(text=result.content),
+                ]
+            else:
+                return [ContentText(text=text)]
+        else:
+            return [ContentText(text=text)]
     elif content["type"] == "image_url":
-        return ContentImage(
-            image=content["image_url"]["url"], detail=content["image_url"]["detail"]
-        )
+        return [
+            ContentImage(
+                image=content["image_url"]["url"], detail=content["image_url"]["detail"]
+            )
+        ]
    elif content["type"] == "input_audio":
-        return ContentAudio(
-            audio=content["input_audio"]["data"],
-            format=content["input_audio"]["format"],
-        )
+        return [
+            ContentAudio(
+                audio=content["input_audio"]["data"],
+                format=content["input_audio"]["format"],
+            )
+        ]
     elif content["type"] == "refusal":
-        return ContentText(text=content["refusal"])
+        return [ContentText(text=content["refusal"])]
 
 
 def chat_message_assistant_from_openai(
@@ -380,11 +437,20 @@ def chat_message_assistant_from_openai(
     reasoning = getattr(message, "reasoning_content", None) or getattr(
         message, "reasoning", None
     )
+
+    msg_content = refusal or message.content or ""
+    if reasoning is not None:
+        content: str | list[Content] = [
+            ContentReasoning(reasoning=str(reasoning)),
+            ContentText(text=msg_content),
+        ]
+    else:
+        content = msg_content
+
     return ChatMessageAssistant(
-        content=refusal or message.content or "",
+        content=content,
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
-        reasoning=reasoning,
     )
 
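The `openai_assistant_content()` helper added above serializes `ContentReasoning` blocks into `<think>` tags (optionally carrying `signature` and `redacted` attributes) so that reasoning survives a round trip through OpenAI-compatible chat messages; `parse_content_with_reasoning()` and `content_from_openai(..., parse_reasoning=True)` recover it on the way back in. A minimal standalone sketch of that wrapping format follows; the `ReasoningBlock` dataclass and `wrap_with_think()` function are illustrative stand-ins, not part of the package.

    from dataclasses import dataclass

    @dataclass
    class ReasoningBlock:
        # mirrors the fields used on ContentReasoning in the hunks above
        reasoning: str
        signature: str | None = None
        redacted: bool = False

    def wrap_with_think(block: ReasoningBlock, answer: str) -> str:
        # same attribute handling as openai_assistant_content() in the diff
        attribs = ""
        if block.signature is not None:
            attribs += f' signature="{block.signature}"'
        if block.redacted:
            attribs += ' redacted="true"'
        return f"\n<think{attribs}>\n{block.reasoning}\n</think>\n\n{answer}"

    print(wrap_with_think(ReasoningBlock("work through the steps", signature="sig123"), "Final answer."))
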

inspect_ai/model/_providers/anthropic.py +134 -26

@@ -1,5 +1,6 @@
 import functools
 import os
+import re
 import sys
 from copy import copy
 from logging import getLogger
@@ -28,8 +29,12 @@ from anthropic.types import (
     ImageBlockParam,
     Message,
     MessageParam,
+    RedactedThinkingBlock,
+    RedactedThinkingBlockParam,
     TextBlock,
     TextBlockParam,
+    ThinkingBlock,
+    ThinkingBlockParam,
     ToolParam,
     ToolResultBlockParam,
     ToolUseBlock,
@@ -44,7 +49,12 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
     NO_CONTENT,
 )
-from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.content import (
+    Content,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+)
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.logger import warn_once
@@ -204,23 +214,33 @@ class AnthropicAPI(ModelAPI):
         request["system"] = system_param
         request["tools"] = tools_param
         if len(tools) > 0:
-            request["tool_choice"] = message_tool_choice(tool_choice)
+            request["tool_choice"] = message_tool_choice(
+                tool_choice, self.is_using_thinking(config)
+            )
 
         # additional options
-        request = request | self.completion_params(config)
+        req, headers, betas = self.completion_config(config)
+        request = request | req
 
         # extra headers (for time tracker and computer use)
-        extra_headers = {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
+        extra_headers = headers | {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
         if computer_use:
-            extra_headers["anthropic-beta"] = "computer-use-2024-10-22"
+            betas.append("computer-use-2024-10-22")
+        if len(betas) > 0:
+            extra_headers["anthropic-beta"] = ",".join(betas)
+
         request["extra_headers"] = extra_headers
 
         # extra_body
         if self.extra_body is not None:
             request["extra_body"] = self.extra_body
 
-        # make request
-        message = await self.client.messages.create(**request, stream=False)
+        # make request (stream if we are using reasoning)
+        if self.is_using_thinking(config):
+            async with self.client.messages.stream(**request) as stream:
+                message = await stream.get_final_message()
+        else:
+            message = await self.client.messages.create(**request, stream=False)
 
         # set response for ModelCall
         response = message.model_dump()
@@ -245,27 +265,67 @@
             else:
                 raise ex
 
-    def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
-        params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
-        if config.temperature is not None:
-            params["temperature"] = config.temperature
-        if config.top_p is not None:
-            params["top_p"] = config.top_p
-        if config.top_k is not None:
-            params["top_k"] = config.top_k
+    def completion_config(
+        self, config: GenerateConfig
+    ) -> tuple[dict[str, Any], dict[str, str], list[str]]:
+        max_tokens = cast(int, config.max_tokens)
+        params = dict(model=self.model_name, max_tokens=max_tokens)
+        headers: dict[str, str] = {}
+        betas: list[str] = []
+        # some params not compatible with thinking models
+        if not self.is_using_thinking(config):
+            if config.temperature is not None:
+                params["temperature"] = config.temperature
+            if config.top_p is not None:
+                params["top_p"] = config.top_p
+            if config.top_k is not None:
+                params["top_k"] = config.top_k
+
+        # some thinking-only stuff
+        if self.is_using_thinking(config):
+            params["thinking"] = dict(
+                type="enabled", budget_tokens=config.reasoning_tokens
+            )
+            headers["anthropic-version"] = "2023-06-01"
+            if max_tokens > 8192:
+                betas.append("output-128k-2025-02-19")
+
+        # config that applies to all models
         if config.timeout is not None:
             params["timeout"] = float(config.timeout)
         if config.stop_seqs is not None:
             params["stop_sequences"] = config.stop_seqs
-        return params
+
+        # return config
+        return params, headers, betas
 
     @override
     def max_tokens(self) -> int | None:
         # anthropic requires you to explicitly specify max_tokens (most others
         # set it to the maximum allowable output tokens for the model).
-        # set to 4096 which is the lowest documented max_tokens for claude models
+        # set to 4096 which is the highest possible for claude 3 (claude 3.5
+        # allows up to 8192)
         return 4096
 
+    @override
+    def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
+        max_tokens = cast(int, self.max_tokens())
+        if self.is_thinking_model() and config.reasoning_tokens is not None:
+            max_tokens = max_tokens + config.reasoning_tokens
+        return max_tokens
+
+    def is_using_thinking(self, config: GenerateConfig) -> bool:
+        return self.is_thinking_model() and config.reasoning_tokens is not None
+
+    def is_thinking_model(self) -> bool:
+        return not self.is_claude_3() and not self.is_claude_3_5()
+
+    def is_claude_3(self) -> bool:
+        return re.search(r"claude-3-[a-zA-Z]", self.model_name) is not None
+
+    def is_claude_3_5(self) -> bool:
+        return "claude-3-5-" in self.model_name
+
     @override
     def connection_key(self) -> str:
         return str(self.api_key)
@@ -295,6 +355,14 @@
     def tool_result_images(self) -> bool:
         return True
 
+    @override
+    def emulate_reasoning_history(self) -> bool:
+        return False
+
+    @override
+    def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
+        return "all"
+
     # convert some common BadRequestError states into 'refusal' model output
     def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | Exception:
         error = exception_message(ex).lower()
@@ -498,7 +566,7 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
     role = a["role"]
     a_content = a["content"]
     b_content = b["content"]
-    if isinstance(a_content, str) and isinstance(a_content, str):
+    if isinstance(a_content, str) and isinstance(b_content, str):
         return MessageParam(role=role, content=f"{a_content}\n{b_content}")
     elif isinstance(a_content, list) and isinstance(b_content, list):
         return MessageParam(role=role, content=a_content + b_content)
@@ -514,9 +582,15 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
         raise ValueError(f"Unexpected content types for messages: {a}, {b}")
 
 
-def message_tool_choice(tool_choice: ToolChoice) -> message_create_params.ToolChoice:
+def message_tool_choice(
+    tool_choice: ToolChoice, thinking_model: bool
+) -> message_create_params.ToolChoice:
     if isinstance(tool_choice, ToolFunction):
-        return {"type": "tool", "name": tool_choice.name}
+        # forced tool use not compatible with thinking models
+        if thinking_model:
+            return {"type": "any"}
+        else:
+            return {"type": "tool", "name": tool_choice.name}
     elif tool_choice == "any":
         return {"type": "any"}
     elif tool_choice == "none":
@@ -544,9 +618,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
     # "tool" means serving a tool call result back to claude
     elif message.role == "tool":
         if message.error is not None:
-            content: str | list[TextBlockParam | ImageBlockParam] = (
-                message.error.message
-            )
+            content: (
+                str
+                | list[
+                    TextBlockParam
+                    | ImageBlockParam
+                    | ThinkingBlockParam
+                    | RedactedThinkingBlockParam
+                ]
+            ) = message.error.message
             # anthropic requires that content be populated when
             # is_error is true (throws bad_request_error when not)
             # so make sure this precondition is met
@@ -567,7 +647,7 @@
                 ToolResultBlockParam(
                     tool_use_id=str(message.tool_call_id),
                     type="tool_result",
-                    content=content,
+                    content=cast(list[TextBlockParam | ImageBlockParam], content),
                     is_error=message.error is not None,
                 )
             ],
@@ -576,7 +656,13 @@
     # tool_calls means claude is attempting to call our tools
     elif message.role == "assistant" and message.tool_calls:
         # first include content (claude <thinking>)
-        tools_content: list[TextBlockParam | ImageBlockParam | ToolUseBlockParam] = (
+        tools_content: list[
+            TextBlockParam
+            | ThinkingBlockParam
+            | RedactedThinkingBlockParam
+            | ImageBlockParam
+            | ToolUseBlockParam
+        ] = (
             [TextBlockParam(type="text", text=message.content or NO_CONTENT)]
             if isinstance(message.content, str)
             else (
@@ -645,6 +731,16 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
                     arguments=content_block.model_dump().get("input", {}),
                 )
             )
+        elif isinstance(content_block, RedactedThinkingBlock):
+            content.append(
+                ContentReasoning(reasoning=content_block.data, redacted=True)
+            )
+        elif isinstance(content_block, ThinkingBlock):
+            content.append(
+                ContentReasoning(
+                    reasoning=content_block.thinking, signature=content_block.signature
+                )
+            )
 
     # resolve choice
     choice = ChatCompletionChoice(
@@ -702,7 +798,7 @@ def split_system_messages(
 
 async def message_param_content(
     content: Content,
-) -> TextBlockParam | ImageBlockParam:
+) -> TextBlockParam | ImageBlockParam | ThinkingBlockParam | RedactedThinkingBlockParam:
     if isinstance(content, ContentText):
         return TextBlockParam(type="text", text=content.text or NO_CONTENT)
     elif isinstance(content, ContentImage):
@@ -720,6 +816,18 @@
             type="image",
             source=dict(type="base64", media_type=cast(Any, media_type), data=image),
         )
+    elif isinstance(content, ContentReasoning):
+        if content.redacted:
+            return RedactedThinkingBlockParam(
+                type="redacted_thinking",
+                data=content.reasoning,
+            )
+        else:
+            if content.signature is None:
+                raise ValueError("Thinking content without signature.")
+            return ThinkingBlockParam(
+                type="thinking", thinking=content.reasoning, signature=content.signature
+            )
     else:
         raise RuntimeError(
            "Anthropic models do not currently support audio or video inputs."

inspect_ai/model/_providers/google.py +27 -8

@@ -38,10 +38,13 @@ from pydantic import JsonValue
 from typing_extensions import override
 
 from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
-from inspect_ai._util.content import Content as InspectContent
+from inspect_ai._util.content import (
+    Content as InspectContent,
+)
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -250,7 +253,10 @@ class GoogleGenAIAPI(ModelAPI):
 
     @override
     def is_rate_limit(self, ex: BaseException) -> bool:
-        return isinstance(ex, APIError) and ex.code in (429, 500, 503, 504)
+        # see https://cloud.google.com/storage/docs/retry-strategy
+        return isinstance(ex, APIError) and (
+            ex.code in (408, 429, 429) or ex.code >= 500
+        )
 
     @override
     def connection_key(self) -> str:
@@ -405,6 +411,8 @@ async def content_part(client: Client, content: InspectContent | str) -> Part:
         return Part.from_text(text=content or NO_CONTENT)
     elif isinstance(content, ContentText):
         return Part.from_text(text=content.text or NO_CONTENT)
+    elif isinstance(content, ContentReasoning):
+        return Part.from_text(text=content.reasoning or NO_CONTENT)
     else:
         return await chat_content_to_part(client, content)
 
@@ -417,7 +425,8 @@ async def chat_content_to_part(
         content_bytes, mime_type = await file_as_data(content.image)
         return Part.from_bytes(mime_type=mime_type, data=content_bytes)
     else:
-        return await file_for_content(client, content)
+        file = await file_for_content(client, content)
+        return Part.from_uri(file_uri=file.uri, mime_type=file.mime_type)
 
 
 async def extract_system_message_as_parts(
@@ -552,11 +561,19 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
     # stop reason
     stop_reason = finish_reason_to_stop_reason(candidate.finish_reason)
 
+    # choice content may include reasoning
+    if reasoning:
+        choice_content: str | list[Content] = [
+            ContentReasoning(reasoning=reasoning),
+            ContentText(text=content),
+        ]
+    else:
+        choice_content = content
+
     # build choice
     choice = ChatCompletionChoice(
         message=ChatMessageAssistant(
-            content=content,
-            reasoning=reasoning,
+            content=choice_content,
             tool_calls=tool_calls if len(tool_calls) > 0 else None,
             source="generate",
         ),
@@ -742,7 +759,7 @@ async def file_for_content(
     uploaded_file = files_db.get(content_sha256)
     if uploaded_file:
         try:
-            upload: File = client.files.get(uploaded_file)
+            upload: File = client.files.get(name=uploaded_file)
             if upload.state.name == "ACTIVE":
                 trace(f"Using uploaded file: {uploaded_file}")
                 return upload
@@ -754,10 +771,12 @@
             trace(f"Error attempting to access uploaded file: {ex}")
             files_db.delete(content_sha256)
     # do the upload (and record it)
-    upload = client.files.upload(BytesIO(content_bytes), mime_type=mime_type)
+    upload = client.files.upload(
+        file=BytesIO(content_bytes), config=dict(mime_type=mime_type)
+    )
     while upload.state.name == "PROCESSING":
         await asyncio.sleep(3)
-        upload = client.files.get(upload.name)
+        upload = client.files.get(name=upload.name)
     if upload.state.name == "FAILED":
         trace(f"Failed to upload file '{upload.name}: {upload.error}")
         raise ValueError(f"Google file upload failed: {upload.error}")
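
At this point in the diff the OpenAI, Anthropic, and Google providers have all stopped passing a separate `reasoning=` argument to `ChatMessageAssistant` and instead place `ContentReasoning` entries inside `message.content` (the Groq hunks below make the same change). A small sketch of how downstream code can read reasoning back out under that convention; the `extract_reasoning()` helper is illustrative, not an API added by this release.

    def extract_reasoning(message) -> list[str]:
        # message is a ChatMessageAssistant; its content is either a plain string
        # or a list of content blocks whose .type field discriminates
        # "text" / "reasoning" / "image" / ...
        if isinstance(message.content, str):
            return []
        return [c.reasoning for c in message.content if c.type == "reasoning"]
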

inspect_ai/model/_providers/groq.py +9 -4

@@ -28,7 +28,7 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
     DEFAULT_MAX_TOKENS,
 )
-from inspect_ai._util.content import Content
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
@@ -326,12 +326,17 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
 def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
     reasoning = getattr(message, "reasoning", None)
     if reasoning is not None:
-        reasoning = str(reasoning)
+        content: str | list[Content] = [
+            ContentReasoning(reasoning=str(reasoning)),
+            ContentText(text=message.content or ""),
+        ]
+    else:
+        content = message.content or ""
+
     return ChatMessageAssistant(
-        content=message.content or "",
+        content=content,
         source="generate",
         tool_calls=chat_tool_calls(message, tools),
-        reasoning=reasoning,
     )
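
The Anthropic hunks earlier in this diff key extended thinking off `GenerateConfig.reasoning_tokens`: when it is set for a thinking-capable model, `completion_config()` adds `thinking=dict(type="enabled", budget_tokens=...)`, drops temperature/top_p/top_k, requests the long-output beta when max_tokens exceeds 8192, streams the response, and `max_tokens_for_config()` raises the effective token cap by the reasoning budget. A hedged usage sketch; the model name and token values below are illustrative assumptions, not values prescribed by the diff.

    from inspect_ai.model import GenerateConfig, get_model

    # assumed: a thinking-capable Claude model, i.e. one not matched by
    # is_claude_3() / is_claude_3_5() in the provider code above
    model = get_model(
        "anthropic/claude-3-7-sonnet-latest",
        config=GenerateConfig(
            reasoning_tokens=4096,  # enables thinking=dict(type="enabled", budget_tokens=4096)
            max_tokens=16384,       # > 8192, so the "output-128k-2025-02-19" beta is requested
        ),
    )

Reasoning returned this way arrives as `ContentReasoning` blocks (with a `signature`, or `redacted=True`) in the assistant message content, as handled by `model_output_from_message()` in the Anthropic provider.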