inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +281 -153
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_common.py +117 -58
  157. inspect_ai/tool/_tools/_computer/_computer.py +80 -57
  158. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
  159. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
  160. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
  161. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
  162. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
  163. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
  164. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  165. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
  166. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
  167. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
  168. inspect_ai/tool/_tools/_computer/test_args.py +151 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  173. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  174. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  180. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  184. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  185. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  186. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  187. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  191. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  192. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  196. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  197. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  198. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  199. inspect_ai/util/__init__.py +2 -1
  200. inspect_ai/util/_display.py +12 -0
  201. inspect_ai/util/_sandbox/events.py +55 -21
  202. inspect_ai/util/_sandbox/self_check.py +131 -43
  203. inspect_ai/util/_subtask.py +11 -0
  204. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
  205. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
  207. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  208. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  209. inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
  210. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  211. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  212. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  214. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  215. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  216. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  217. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
  218. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
  219. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
1
1
  import functools
2
2
  import os
3
+ import re
3
4
  import sys
4
5
  from copy import copy
5
6
  from logging import getLogger
6
- from typing import Any, Literal, Tuple, TypedDict, cast
7
+ from typing import Any, Literal, Optional, Tuple, TypedDict, cast
7
8
 
8
9
  from .util.tracker import HttpxTimeTracker
9
10
 
@@ -28,8 +29,12 @@ from anthropic.types import (
28
29
  ImageBlockParam,
29
30
  Message,
30
31
  MessageParam,
32
+ RedactedThinkingBlock,
33
+ RedactedThinkingBlockParam,
31
34
  TextBlock,
32
35
  TextBlockParam,
36
+ ThinkingBlock,
37
+ ThinkingBlockParam,
33
38
  ToolParam,
34
39
  ToolResultBlockParam,
35
40
  ToolUseBlock,
@@ -44,7 +49,12 @@ from inspect_ai._util.constants import (
44
49
  DEFAULT_MAX_RETRIES,
45
50
  NO_CONTENT,
46
51
  )
47
- from inspect_ai._util.content import Content, ContentImage, ContentText
52
+ from inspect_ai._util.content import (
53
+ Content,
54
+ ContentImage,
55
+ ContentReasoning,
56
+ ContentText,
57
+ )
48
58
  from inspect_ai._util.error import exception_message
49
59
  from inspect_ai._util.images import file_as_data_uri
50
60
  from inspect_ai._util.logger import warn_once
@@ -194,7 +204,7 @@ class AnthropicAPI(ModelAPI):
194
204
  tools_param,
195
205
  messages,
196
206
  computer_use,
197
- ) = await resolve_chat_input(self.model_name, input, tools, config)
207
+ ) = await self.resolve_chat_input(input, tools, config)
198
208
 
199
209
  # prepare request params (assembed this way so we can log the raw model call)
200
210
  request = dict(messages=messages)
@@ -204,23 +214,33 @@ class AnthropicAPI(ModelAPI):
204
214
  request["system"] = system_param
205
215
  request["tools"] = tools_param
206
216
  if len(tools) > 0:
207
- request["tool_choice"] = message_tool_choice(tool_choice)
217
+ request["tool_choice"] = message_tool_choice(
218
+ tool_choice, self.is_using_thinking(config)
219
+ )
208
220
 
209
221
  # additional options
210
- request = request | self.completion_params(config)
222
+ req, headers, betas = self.completion_config(config)
223
+ request = request | req
211
224
 
212
225
  # extra headers (for time tracker and computer use)
213
- extra_headers = {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
226
+ extra_headers = headers | {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
214
227
  if computer_use:
215
- extra_headers["anthropic-beta"] = "computer-use-2024-10-22"
228
+ betas.append("computer-use-2025-01-24")
229
+ if len(betas) > 0:
230
+ extra_headers["anthropic-beta"] = ",".join(betas)
231
+
216
232
  request["extra_headers"] = extra_headers
217
233
 
218
234
  # extra_body
219
235
  if self.extra_body is not None:
220
236
  request["extra_body"] = self.extra_body
221
237
 
222
- # make request
223
- message = await self.client.messages.create(**request, stream=False)
238
+ # make request (stream if we are using reasoning)
239
+ if self.is_using_thinking(config):
240
+ async with self.client.messages.stream(**request) as stream:
241
+ message = await stream.get_final_message()
242
+ else:
243
+ message = await self.client.messages.create(**request, stream=False)
224
244
 
225
245
  # set response for ModelCall
226
246
  response = message.model_dump()
@@ -245,27 +265,70 @@ class AnthropicAPI(ModelAPI):
245
265
  else:
246
266
  raise ex
247
267
 
248
- def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
249
- params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
250
- if config.temperature is not None:
251
- params["temperature"] = config.temperature
252
- if config.top_p is not None:
253
- params["top_p"] = config.top_p
254
- if config.top_k is not None:
255
- params["top_k"] = config.top_k
268
+ def completion_config(
269
+ self, config: GenerateConfig
270
+ ) -> tuple[dict[str, Any], dict[str, str], list[str]]:
271
+ max_tokens = cast(int, config.max_tokens)
272
+ params = dict(model=self.model_name, max_tokens=max_tokens)
273
+ headers: dict[str, str] = {}
274
+ betas: list[str] = []
275
+ # some params not compatible with thinking models
276
+ if not self.is_using_thinking(config):
277
+ if config.temperature is not None:
278
+ params["temperature"] = config.temperature
279
+ if config.top_p is not None:
280
+ params["top_p"] = config.top_p
281
+ if config.top_k is not None:
282
+ params["top_k"] = config.top_k
283
+
284
+ # some thinking-only stuff
285
+ if self.is_using_thinking(config):
286
+ params["thinking"] = dict(
287
+ type="enabled", budget_tokens=config.reasoning_tokens
288
+ )
289
+ headers["anthropic-version"] = "2023-06-01"
290
+ if max_tokens > 8192:
291
+ betas.append("output-128k-2025-02-19")
292
+
293
+ # config that applies to all models
256
294
  if config.timeout is not None:
257
295
  params["timeout"] = float(config.timeout)
258
296
  if config.stop_seqs is not None:
259
297
  params["stop_sequences"] = config.stop_seqs
260
- return params
298
+
299
+ # return config
300
+ return params, headers, betas
261
301
 
262
302
  @override
263
303
  def max_tokens(self) -> int | None:
264
304
  # anthropic requires you to explicitly specify max_tokens (most others
265
305
  # set it to the maximum allowable output tokens for the model).
266
- # set to 4096 which is the lowest documented max_tokens for claude models
306
+ # set to 4096 which is the highest possible for claude 3 (claude 3.5
307
+ # allows up to 8192)
267
308
  return 4096
268
309
 
310
+ @override
311
+ def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
312
+ max_tokens = cast(int, self.max_tokens())
313
+ if self.is_thinking_model() and config.reasoning_tokens is not None:
314
+ max_tokens = max_tokens + config.reasoning_tokens
315
+ return max_tokens
316
+
317
+ def is_using_thinking(self, config: GenerateConfig) -> bool:
318
+ return self.is_thinking_model() and config.reasoning_tokens is not None
319
+
320
+ def is_thinking_model(self) -> bool:
321
+ return not self.is_claude_3() and not self.is_claude_3_5()
322
+
323
+ def is_claude_3(self) -> bool:
324
+ return re.search(r"claude-3-[a-zA-Z]", self.model_name) is not None
325
+
326
+ def is_claude_3_5(self) -> bool:
327
+ return "claude-3-5-" in self.model_name
328
+
329
+ def is_claude_3_7(self) -> bool:
330
+ return "claude-3-7-" in self.model_name
331
+
269
332
  @override
270
333
  def connection_key(self) -> str:
271
334
  return str(self.api_key)
@@ -295,6 +358,14 @@ class AnthropicAPI(ModelAPI):
295
358
  def tool_result_images(self) -> bool:
296
359
  return True
297
360
 
361
+ @override
362
+ def emulate_reasoning_history(self) -> bool:
363
+ return False
364
+
365
+ @override
366
+ def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
367
+ return "all"
368
+
298
369
  # convert some common BadRequestError states into 'refusal' model output
299
370
  def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | Exception:
300
371
  error = exception_message(ex).lower()
@@ -329,6 +400,148 @@ class AnthropicAPI(ModelAPI):
329
400
  else:
330
401
  return ex
331
402
 
403
+ async def resolve_chat_input(
404
+ self,
405
+ input: list[ChatMessage],
406
+ tools: list[ToolInfo],
407
+ config: GenerateConfig,
408
+ ) -> Tuple[
409
+ list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
410
+ ]:
411
+ # extract system message
412
+ system_messages, messages = split_system_messages(input, config)
413
+
414
+ # messages
415
+ message_params = [(await message_param(message)) for message in messages]
416
+
417
+ # collapse user messages (as Inspect 'tool' messages become Claude 'user' messages)
418
+ message_params = functools.reduce(
419
+ consecutive_user_message_reducer, message_params, []
420
+ )
421
+
422
+ # tools
423
+ tools_params, computer_use = self.tool_params_for_tools(tools, config)
424
+
425
+ # system messages
426
+ if len(system_messages) > 0:
427
+ system_param: list[TextBlockParam] | None = [
428
+ TextBlockParam(type="text", text=message.text)
429
+ for message in system_messages
430
+ ]
431
+ else:
432
+ system_param = None
433
+
434
+ # add caching directives if necessary
435
+ cache_prompt = (
436
+ config.cache_prompt
437
+ if isinstance(config.cache_prompt, bool)
438
+ else True
439
+ if len(tools_params)
440
+ else False
441
+ )
442
+
443
+ # only certain claude models qualify
444
+ if cache_prompt:
445
+ if (
446
+ "claude-3-sonnet" in self.model_name
447
+ or "claude-2" in self.model_name
448
+ or "claude-instant" in self.model_name
449
+ ):
450
+ cache_prompt = False
451
+
452
+ if cache_prompt:
453
+ # system
454
+ if system_param:
455
+ add_cache_control(system_param[-1])
456
+ # tools
457
+ if tools_params:
458
+ add_cache_control(tools_params[-1])
459
+ # last 2 user messages
460
+ user_message_params = list(
461
+ filter(lambda m: m["role"] == "user", reversed(message_params))
462
+ )
463
+ for message in user_message_params[:2]:
464
+ if isinstance(message["content"], str):
465
+ text_param = TextBlockParam(type="text", text=message["content"])
466
+ add_cache_control(text_param)
467
+ message["content"] = [text_param]
468
+ else:
469
+ content = list(message["content"])
470
+ add_cache_control(cast(dict[str, Any], content[-1]))
471
+
472
+ # return chat input
473
+ return system_param, tools_params, message_params, computer_use
474
+
475
+ def tool_params_for_tools(
476
+ self, tools: list[ToolInfo], config: GenerateConfig
477
+ ) -> tuple[list["ToolParamDef"], bool]:
478
+ # tool params and computer_use bit to return
479
+ tool_params: list["ToolParamDef"] = []
480
+ computer_use = False
481
+
482
+ # for each tool, check if it has a native computer use implementation and use that
483
+ # when available (noting that we need to set the computer use request header)
484
+ for tool in tools:
485
+ computer_use_tool = (
486
+ self.computer_use_tool_param(tool)
487
+ if config.internal_tools is not False
488
+ else None
489
+ )
490
+ if computer_use_tool:
491
+ tool_params.append(computer_use_tool)
492
+ computer_use = True
493
+ else:
494
+ tool_params.append(
495
+ ToolParam(
496
+ name=tool.name,
497
+ description=tool.description,
498
+ input_schema=tool.parameters.model_dump(exclude_none=True),
499
+ )
500
+ )
501
+
502
+ return tool_params, computer_use
503
+
504
+ def computer_use_tool_param(
505
+ self, tool: ToolInfo
506
+ ) -> Optional["ComputerUseToolParam"]:
507
+ # check for compatible 'computer' tool
508
+ if tool.name == "computer" and (
509
+ sorted(tool.parameters.properties.keys())
510
+ == sorted(
511
+ [
512
+ "action",
513
+ "coordinate",
514
+ "duration",
515
+ "scroll_amount",
516
+ "scroll_direction",
517
+ "start_coordinate",
518
+ "text",
519
+ ]
520
+ )
521
+ ):
522
+ if self.is_claude_3_5():
523
+ warn_once(
524
+ logger,
525
+ "Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
526
+ )
527
+ return None
528
+ return ComputerUseToolParam(
529
+ type="computer_20250124",
530
+ name="computer",
531
+ # Note: The dimensions passed here for display_width_px and display_height_px should
532
+ # match the dimensions of screenshots returned by the tool.
533
+ # Those dimensions will always be one of the values in MAX_SCALING_TARGETS
534
+ # in _x11_client.py.
535
+ # TODO: enhance this code to calculate the dimensions based on the scaled screen
536
+ # size used by the container.
537
+ display_width_px=1366,
538
+ display_height_px=768,
539
+ display_number=1,
540
+ )
541
+ # not a computer_use tool
542
+ else:
543
+ return None
544
+
332
545
 
333
546
  # native anthropic tool definitions for computer use beta
334
547
  # https://docs.anthropic.com/en/docs/build-with-claude/computer-use
@@ -344,131 +557,6 @@ class ComputerUseToolParam(TypedDict):
344
557
  ToolParamDef = ToolParam | ComputerUseToolParam
345
558
 
346
559
 
347
- async def resolve_chat_input(
348
- model: str,
349
- input: list[ChatMessage],
350
- tools: list[ToolInfo],
351
- config: GenerateConfig,
352
- ) -> Tuple[list[TextBlockParam] | None, list[ToolParamDef], list[MessageParam], bool]:
353
- # extract system message
354
- system_messages, messages = split_system_messages(input, config)
355
-
356
- # messages
357
- message_params = [(await message_param(message)) for message in messages]
358
-
359
- # collapse user messages (as Inspect 'tool' messages become Claude 'user' messages)
360
- message_params = functools.reduce(
361
- consecutive_user_message_reducer, message_params, []
362
- )
363
-
364
- # tools
365
- tools_params, computer_use = tool_params_for_tools(tools, config)
366
-
367
- # system messages
368
- if len(system_messages) > 0:
369
- system_param: list[TextBlockParam] | None = [
370
- TextBlockParam(type="text", text=message.text)
371
- for message in system_messages
372
- ]
373
- else:
374
- system_param = None
375
-
376
- # add caching directives if necessary
377
- cache_prompt = (
378
- config.cache_prompt
379
- if isinstance(config.cache_prompt, bool)
380
- else True
381
- if len(tools_params)
382
- else False
383
- )
384
-
385
- # only certain claude models qualify
386
- if cache_prompt:
387
- if (
388
- "claude-3-sonnet" in model
389
- or "claude-2" in model
390
- or "claude-instant" in model
391
- ):
392
- cache_prompt = False
393
-
394
- if cache_prompt:
395
- # system
396
- if system_param:
397
- add_cache_control(system_param[-1])
398
- # tools
399
- if tools_params:
400
- add_cache_control(tools_params[-1])
401
- # last 2 user messages
402
- user_message_params = list(
403
- filter(lambda m: m["role"] == "user", reversed(message_params))
404
- )
405
- for message in user_message_params[:2]:
406
- if isinstance(message["content"], str):
407
- text_param = TextBlockParam(type="text", text=message["content"])
408
- add_cache_control(text_param)
409
- message["content"] = [text_param]
410
- else:
411
- content = list(message["content"])
412
- add_cache_control(cast(dict[str, Any], content[-1]))
413
-
414
- # return chat input
415
- return system_param, tools_params, message_params, computer_use
416
-
417
-
418
- def tool_params_for_tools(
419
- tools: list[ToolInfo], config: GenerateConfig
420
- ) -> tuple[list[ToolParamDef], bool]:
421
- # tool params and computer_use bit to return
422
- tool_params: list[ToolParamDef] = []
423
- computer_use = False
424
-
425
- # for each tool, check if it has a native computer use implementation and use that
426
- # when available (noting that we need to set the computer use request header)
427
- for tool in tools:
428
- computer_use_tool = (
429
- computer_use_tool_param(tool)
430
- if config.internal_tools is not False
431
- else None
432
- )
433
- if computer_use_tool:
434
- tool_params.append(computer_use_tool)
435
- computer_use = True
436
- else:
437
- tool_params.append(
438
- ToolParam(
439
- name=tool.name,
440
- description=tool.description,
441
- input_schema=tool.parameters.model_dump(exclude_none=True),
442
- )
443
- )
444
-
445
- return tool_params, computer_use
446
-
447
-
448
- def computer_use_tool_param(tool: ToolInfo) -> ComputerUseToolParam | None:
449
- # check for compatible 'computer' tool
450
- if tool.name == "computer" and (
451
- sorted(tool.parameters.properties.keys())
452
- == sorted(["action", "coordinate", "text"])
453
- ):
454
- return ComputerUseToolParam(
455
- type="computer_20241022",
456
- name="computer",
457
- # Note: The dimensions passed here for display_width_px and display_height_px should
458
- # match the dimensions of screenshots returned by the tool.
459
- # Those dimensions will always be one of the values in MAX_SCALING_TARGETS
460
- # in _x11_client.py.
461
- # TODO: enhance this code to calculate the dimensions based on the scaled screen
462
- # size used by the container.
463
- display_width_px=1366,
464
- display_height_px=768,
465
- display_number=1,
466
- )
467
- # not a computer_use tool
468
- else:
469
- return None
470
-
471
-
472
560
  def add_cache_control(
473
561
  param: TextBlockParam | ToolParam | ComputerUseToolParam | dict[str, Any],
474
562
  ) -> None:
@@ -498,7 +586,7 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
498
586
  role = a["role"]
499
587
  a_content = a["content"]
500
588
  b_content = b["content"]
501
- if isinstance(a_content, str) and isinstance(a_content, str):
589
+ if isinstance(a_content, str) and isinstance(b_content, str):
502
590
  return MessageParam(role=role, content=f"{a_content}\n{b_content}")
503
591
  elif isinstance(a_content, list) and isinstance(b_content, list):
504
592
  return MessageParam(role=role, content=a_content + b_content)
@@ -514,9 +602,15 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
514
602
  raise ValueError(f"Unexpected content types for messages: {a}, {b}")
515
603
 
516
604
 
517
- def message_tool_choice(tool_choice: ToolChoice) -> message_create_params.ToolChoice:
605
+ def message_tool_choice(
606
+ tool_choice: ToolChoice, thinking_model: bool
607
+ ) -> message_create_params.ToolChoice:
518
608
  if isinstance(tool_choice, ToolFunction):
519
- return {"type": "tool", "name": tool_choice.name}
609
+ # forced tool use not compatible with thinking models
610
+ if thinking_model:
611
+ return {"type": "any"}
612
+ else:
613
+ return {"type": "tool", "name": tool_choice.name}
520
614
  elif tool_choice == "any":
521
615
  return {"type": "any"}
522
616
  elif tool_choice == "none":
@@ -544,9 +638,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
544
638
  # "tool" means serving a tool call result back to claude
545
639
  elif message.role == "tool":
546
640
  if message.error is not None:
547
- content: str | list[TextBlockParam | ImageBlockParam] = (
548
- message.error.message
549
- )
641
+ content: (
642
+ str
643
+ | list[
644
+ TextBlockParam
645
+ | ImageBlockParam
646
+ | ThinkingBlockParam
647
+ | RedactedThinkingBlockParam
648
+ ]
649
+ ) = message.error.message
550
650
  # anthropic requires that content be populated when
551
651
  # is_error is true (throws bad_request_error when not)
552
652
  # so make sure this precondition is met
@@ -567,7 +667,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
567
667
  ToolResultBlockParam(
568
668
  tool_use_id=str(message.tool_call_id),
569
669
  type="tool_result",
570
- content=content,
670
+ content=cast(list[TextBlockParam | ImageBlockParam], content),
571
671
  is_error=message.error is not None,
572
672
  )
573
673
  ],
@@ -576,7 +676,13 @@ async def message_param(message: ChatMessage) -> MessageParam:
576
676
  # tool_calls means claude is attempting to call our tools
577
677
  elif message.role == "assistant" and message.tool_calls:
578
678
  # first include content (claude <thinking>)
579
- tools_content: list[TextBlockParam | ImageBlockParam | ToolUseBlockParam] = (
679
+ tools_content: list[
680
+ TextBlockParam
681
+ | ThinkingBlockParam
682
+ | RedactedThinkingBlockParam
683
+ | ImageBlockParam
684
+ | ToolUseBlockParam
685
+ ] = (
580
686
  [TextBlockParam(type="text", text=message.content or NO_CONTENT)]
581
687
  if isinstance(message.content, str)
582
688
  else (
@@ -645,6 +751,16 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
645
751
  arguments=content_block.model_dump().get("input", {}),
646
752
  )
647
753
  )
754
+ elif isinstance(content_block, RedactedThinkingBlock):
755
+ content.append(
756
+ ContentReasoning(reasoning=content_block.data, redacted=True)
757
+ )
758
+ elif isinstance(content_block, ThinkingBlock):
759
+ content.append(
760
+ ContentReasoning(
761
+ reasoning=content_block.thinking, signature=content_block.signature
762
+ )
763
+ )
648
764
 
649
765
  # resolve choice
650
766
  choice = ChatCompletionChoice(
@@ -702,7 +818,7 @@ def split_system_messages(
702
818
 
703
819
  async def message_param_content(
704
820
  content: Content,
705
- ) -> TextBlockParam | ImageBlockParam:
821
+ ) -> TextBlockParam | ImageBlockParam | ThinkingBlockParam | RedactedThinkingBlockParam:
706
822
  if isinstance(content, ContentText):
707
823
  return TextBlockParam(type="text", text=content.text or NO_CONTENT)
708
824
  elif isinstance(content, ContentImage):
@@ -720,6 +836,18 @@ async def message_param_content(
720
836
  type="image",
721
837
  source=dict(type="base64", media_type=cast(Any, media_type), data=image),
722
838
  )
839
+ elif isinstance(content, ContentReasoning):
840
+ if content.redacted:
841
+ return RedactedThinkingBlockParam(
842
+ type="redacted_thinking",
843
+ data=content.reasoning,
844
+ )
845
+ else:
846
+ if content.signature is None:
847
+ raise ValueError("Thinking content without signature.")
848
+ return ThinkingBlockParam(
849
+ type="thinking", thinking=content.reasoning, signature=content.signature
850
+ )
723
851
  else:
724
852
  raise RuntimeError(
725
853
  "Anthropic models do not currently support audio or video inputs."
@@ -38,10 +38,13 @@ from pydantic import JsonValue
38
38
  from typing_extensions import override
39
39
 
40
40
  from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
41
- from inspect_ai._util.content import Content as InspectContent
41
+ from inspect_ai._util.content import (
42
+ Content as InspectContent,
43
+ )
42
44
  from inspect_ai._util.content import (
43
45
  ContentAudio,
44
46
  ContentImage,
47
+ ContentReasoning,
45
48
  ContentText,
46
49
  ContentVideo,
47
50
  )
@@ -250,7 +253,10 @@ class GoogleGenAIAPI(ModelAPI):
250
253
 
251
254
  @override
252
255
  def is_rate_limit(self, ex: BaseException) -> bool:
253
- return isinstance(ex, APIError) and ex.code in (429, 500, 503, 504)
256
+ # see https://cloud.google.com/storage/docs/retry-strategy
257
+ return isinstance(ex, APIError) and (
258
+ ex.code in (408, 429, 429) or ex.code >= 500
259
+ )
254
260
 
255
261
  @override
256
262
  def connection_key(self) -> str:
@@ -405,6 +411,8 @@ async def content_part(client: Client, content: InspectContent | str) -> Part:
405
411
  return Part.from_text(text=content or NO_CONTENT)
406
412
  elif isinstance(content, ContentText):
407
413
  return Part.from_text(text=content.text or NO_CONTENT)
414
+ elif isinstance(content, ContentReasoning):
415
+ return Part.from_text(text=content.reasoning or NO_CONTENT)
408
416
  else:
409
417
  return await chat_content_to_part(client, content)
410
418
 
@@ -417,7 +425,8 @@ async def chat_content_to_part(
417
425
  content_bytes, mime_type = await file_as_data(content.image)
418
426
  return Part.from_bytes(mime_type=mime_type, data=content_bytes)
419
427
  else:
420
- return await file_for_content(client, content)
428
+ file = await file_for_content(client, content)
429
+ return Part.from_uri(file_uri=file.uri, mime_type=file.mime_type)
421
430
 
422
431
 
423
432
  async def extract_system_message_as_parts(
@@ -552,11 +561,19 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
552
561
  # stop reason
553
562
  stop_reason = finish_reason_to_stop_reason(candidate.finish_reason)
554
563
 
564
+ # choice content may include reasoning
565
+ if reasoning:
566
+ choice_content: str | list[Content] = [
567
+ ContentReasoning(reasoning=reasoning),
568
+ ContentText(text=content),
569
+ ]
570
+ else:
571
+ choice_content = content
572
+
555
573
  # build choice
556
574
  choice = ChatCompletionChoice(
557
575
  message=ChatMessageAssistant(
558
- content=content,
559
- reasoning=reasoning,
576
+ content=choice_content,
560
577
  tool_calls=tool_calls if len(tool_calls) > 0 else None,
561
578
  source="generate",
562
579
  ),
@@ -742,7 +759,7 @@ async def file_for_content(
742
759
  uploaded_file = files_db.get(content_sha256)
743
760
  if uploaded_file:
744
761
  try:
745
- upload: File = client.files.get(uploaded_file)
762
+ upload: File = client.files.get(name=uploaded_file)
746
763
  if upload.state.name == "ACTIVE":
747
764
  trace(f"Using uploaded file: {uploaded_file}")
748
765
  return upload
@@ -754,10 +771,12 @@ async def file_for_content(
754
771
  trace(f"Error attempting to access uploaded file: {ex}")
755
772
  files_db.delete(content_sha256)
756
773
  # do the upload (and record it)
757
- upload = client.files.upload(BytesIO(content_bytes), mime_type=mime_type)
774
+ upload = client.files.upload(
775
+ file=BytesIO(content_bytes), config=dict(mime_type=mime_type)
776
+ )
758
777
  while upload.state.name == "PROCESSING":
759
778
  await asyncio.sleep(3)
760
- upload = client.files.get(upload.name)
779
+ upload = client.files.get(name=upload.name)
761
780
  if upload.state.name == "FAILED":
762
781
  trace(f"Failed to upload file '{upload.name}: {upload.error}")
763
782
  raise ValueError(f"Google file upload failed: {upload.error}")
@@ -28,7 +28,7 @@ from inspect_ai._util.constants import (
28
28
  DEFAULT_MAX_RETRIES,
29
29
  DEFAULT_MAX_TOKENS,
30
30
  )
31
- from inspect_ai._util.content import Content
31
+ from inspect_ai._util.content import Content, ContentReasoning, ContentText
32
32
  from inspect_ai._util.images import file_as_data_uri
33
33
  from inspect_ai._util.url import is_http_url
34
34
  from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
@@ -326,12 +326,17 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
326
326
  def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
327
327
  reasoning = getattr(message, "reasoning", None)
328
328
  if reasoning is not None:
329
- reasoning = str(reasoning)
329
+ content: str | list[Content] = [
330
+ ContentReasoning(reasoning=str(reasoning)),
331
+ ContentText(text=message.content or ""),
332
+ ]
333
+ else:
334
+ content = message.content or ""
335
+
330
336
  return ChatMessageAssistant(
331
- content=message.content or "",
337
+ content=content,
332
338
  source="generate",
333
339
  tool_calls=chat_tool_calls(message, tools),
334
- reasoning=reasoning,
335
340
  )
336
341
 
337
342