inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. inspect_ai/_cli/common.py +2 -1
  2. inspect_ai/_cli/eval.py +2 -2
  3. inspect_ai/_display/core/active.py +3 -0
  4. inspect_ai/_display/core/config.py +1 -0
  5. inspect_ai/_display/core/panel.py +21 -13
  6. inspect_ai/_display/core/results.py +3 -7
  7. inspect_ai/_display/core/rich.py +3 -5
  8. inspect_ai/_display/log/__init__.py +0 -0
  9. inspect_ai/_display/log/display.py +173 -0
  10. inspect_ai/_display/plain/display.py +2 -2
  11. inspect_ai/_display/rich/display.py +2 -4
  12. inspect_ai/_display/textual/app.py +1 -6
  13. inspect_ai/_display/textual/widgets/task_detail.py +3 -14
  14. inspect_ai/_display/textual/widgets/tasks.py +1 -1
  15. inspect_ai/_eval/eval.py +1 -1
  16. inspect_ai/_eval/evalset.py +3 -3
  17. inspect_ai/_eval/registry.py +6 -1
  18. inspect_ai/_eval/run.py +5 -1
  19. inspect_ai/_eval/task/constants.py +1 -0
  20. inspect_ai/_eval/task/log.py +2 -0
  21. inspect_ai/_eval/task/run.py +65 -39
  22. inspect_ai/_util/citation.py +88 -0
  23. inspect_ai/_util/content.py +24 -2
  24. inspect_ai/_util/json.py +17 -2
  25. inspect_ai/_util/registry.py +19 -4
  26. inspect_ai/_view/schema.py +0 -6
  27. inspect_ai/_view/server.py +17 -0
  28. inspect_ai/_view/www/dist/assets/index.css +93 -31
  29. inspect_ai/_view/www/dist/assets/index.js +10639 -10011
  30. inspect_ai/_view/www/log-schema.json +418 -1
  31. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  32. inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
  33. inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
  34. inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
  35. inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
  36. inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
  37. inspect_ai/_view/www/package.json +2 -2
  38. inspect_ai/_view/www/src/@types/log.d.ts +140 -39
  39. inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
  40. inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
  41. inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
  42. inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
  43. inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
  44. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
  45. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
  46. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
  47. inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
  48. inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
  49. inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
  50. inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
  51. inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
  52. inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
  53. inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
  54. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
  55. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
  56. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
  57. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
  58. inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
  59. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
  60. inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
  61. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
  62. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
  63. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
  64. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
  65. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
  66. inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
  67. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
  68. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
  69. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
  70. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
  71. inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
  72. inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
  73. inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
  74. inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
  75. inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
  76. inspect_ai/_view/www/src/client/api/types.ts +3 -0
  77. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
  78. inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
  79. inspect_ai/_view/www/src/tests/README.md +2 -2
  80. inspect_ai/_view/www/src/utils/git.ts +3 -1
  81. inspect_ai/_view/www/src/utils/html.ts +6 -0
  82. inspect_ai/agent/_handoff.py +8 -5
  83. inspect_ai/agent/_react.py +5 -5
  84. inspect_ai/dataset/_dataset.py +1 -1
  85. inspect_ai/log/_condense.py +5 -0
  86. inspect_ai/log/_file.py +4 -1
  87. inspect_ai/log/_log.py +9 -4
  88. inspect_ai/log/_recorders/json.py +4 -2
  89. inspect_ai/log/_samples.py +5 -0
  90. inspect_ai/log/_util.py +2 -0
  91. inspect_ai/model/__init__.py +14 -0
  92. inspect_ai/model/_call_tools.py +17 -8
  93. inspect_ai/model/_chat_message.py +3 -0
  94. inspect_ai/model/_openai_responses.py +80 -34
  95. inspect_ai/model/_providers/_anthropic_citations.py +158 -0
  96. inspect_ai/model/_providers/_google_citations.py +100 -0
  97. inspect_ai/model/_providers/anthropic.py +219 -36
  98. inspect_ai/model/_providers/google.py +98 -22
  99. inspect_ai/model/_providers/mistral.py +20 -7
  100. inspect_ai/model/_providers/openai.py +11 -10
  101. inspect_ai/model/_providers/openai_compatible.py +3 -2
  102. inspect_ai/model/_providers/openai_responses.py +2 -5
  103. inspect_ai/model/_providers/perplexity.py +123 -0
  104. inspect_ai/model/_providers/providers.py +13 -2
  105. inspect_ai/model/_providers/vertex.py +3 -0
  106. inspect_ai/model/_trim.py +5 -0
  107. inspect_ai/tool/__init__.py +14 -0
  108. inspect_ai/tool/_mcp/_mcp.py +5 -2
  109. inspect_ai/tool/_mcp/sampling.py +19 -3
  110. inspect_ai/tool/_mcp/server.py +1 -1
  111. inspect_ai/tool/_tool.py +10 -1
  112. inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
  113. inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
  114. inspect_ai/tool/_tools/_web_search/_google.py +22 -25
  115. inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
  116. inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
  117. inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
  118. inspect_ai/util/__init__.py +8 -0
  119. inspect_ai/util/_background.py +64 -0
  120. inspect_ai/util/_display.py +11 -2
  121. inspect_ai/util/_limit.py +72 -5
  122. inspect_ai/util/_sandbox/__init__.py +2 -0
  123. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  124. inspect_ai/util/_sandbox/service.py +28 -7
  125. inspect_ai/util/_span.py +12 -1
  126. inspect_ai/util/_subprocess.py +51 -38
  127. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +2 -2
  128. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +134 -109
  129. /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
  130. /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
  131. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
  132. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
  133. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
  134. {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,8 @@ from anthropic.types import (
22
22
  MessageParam,
23
23
  RedactedThinkingBlock,
24
24
  RedactedThinkingBlockParam,
25
+ ServerToolUseBlock,
26
+ ServerToolUseBlockParam,
25
27
  TextBlock,
26
28
  TextBlockParam,
27
29
  ThinkingBlock,
@@ -31,11 +33,15 @@ from anthropic.types import (
31
33
  ToolTextEditor20250124Param,
32
34
  ToolUseBlock,
33
35
  ToolUseBlockParam,
36
+ WebSearchTool20250305Param,
37
+ WebSearchToolResultBlock,
38
+ WebSearchToolResultBlockParam,
34
39
  message_create_params,
35
40
  )
36
41
  from anthropic.types.beta import (
37
42
  BetaToolComputerUse20250124Param,
38
43
  BetaToolTextEditor20241022Param,
44
+ BetaToolTextEditor20250429Param,
39
45
  )
40
46
  from pydantic import JsonValue
41
47
  from typing_extensions import override
@@ -43,6 +49,7 @@ from typing_extensions import override
43
49
  from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
44
50
  from inspect_ai._util.content import (
45
51
  Content,
52
+ ContentData,
46
53
  ContentImage,
47
54
  ContentReasoning,
48
55
  ContentText,
@@ -61,6 +68,10 @@ from .._generate_config import GenerateConfig
61
68
  from .._model import ModelAPI
62
69
  from .._model_call import ModelCall
63
70
  from .._model_output import ChatCompletionChoice, ModelOutput, ModelUsage, StopReason
71
+ from .._providers._anthropic_citations import (
72
+ to_anthropic_citation,
73
+ to_inspect_citation,
74
+ )
64
75
  from .util import environment_prerequisite_error, model_base_url
65
76
  from .util.hooks import HttpxHooks
66
77
 
@@ -70,6 +81,14 @@ ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
70
81
 
71
82
  INTERNAL_COMPUTER_TOOL_NAME = "computer"
72
83
 
84
+ WEB_SEARCH_COMPATIBLE_MODELS = [
85
+ "claude-opus-4-20250514",
86
+ "claude-sonnet-4-20250514",
87
+ "claude-3-7-sonnet-20250219",
88
+ "claude-3-5-sonnet-latest",
89
+ "claude-3-5-haiku-latest",
90
+ ]
91
+
73
92
 
74
93
  class AnthropicAPI(ModelAPI):
75
94
  def __init__(
@@ -232,27 +251,19 @@ class AnthropicAPI(ModelAPI):
232
251
  if self.extra_body is not None:
233
252
  request["extra_body"] = self.extra_body
234
253
 
235
- # make request (unless overrideen, stream if we are using reasoning)
254
+ # make request (unless overridden, stream if we are using reasoning)
236
255
  streaming = (
237
256
  self.is_using_thinking(config)
238
257
  if self.streaming == "auto"
239
258
  else self.streaming
240
259
  )
241
- if streaming:
242
- async with self.client.messages.stream(**request) as stream:
243
- message = await stream.get_final_message()
244
- else:
245
- message = await self.client.messages.create(**request, stream=False)
246
260
 
247
- # set response for ModelCall
248
- response = message.model_dump()
249
-
250
- # extract output
251
- output = await model_output_from_message(
252
- self.client, self.service_model_name(), message, tools
261
+ message, output = await self._perform_request_and_continuations(
262
+ request, streaming, tools
253
263
  )
254
264
 
255
- # return output and call
265
+ response = message.model_dump()
266
+
256
267
  return output, model_call()
257
268
 
258
269
  except BadRequestError as ex:
@@ -269,6 +280,50 @@ class AnthropicAPI(ModelAPI):
269
280
  else:
270
281
  raise ex
271
282
 
283
+ async def _perform_request_and_continuations(
284
+ self,
285
+ request: dict[str, Any],
286
+ streaming: bool,
287
+ tools: list[ToolInfo],
288
+ ) -> tuple[Message, ModelOutput]:
289
+ """
290
+ This helper function is split out so that it can easily call itself recursively in cases where the model requires a continuation.
291
+
292
+ It considers the result from the initial request the "head" and the result
293
+ from the continuation the "tail".
294
+ """
295
+ if streaming:
296
+ async with self.client.messages.stream(**request) as stream:
297
+ head_message = await stream.get_final_message()
298
+ else:
299
+ head_message = await self.client.messages.create(**request, stream=False)
300
+
301
+ head_model_output, continuation_required = await model_output_from_message(
302
+ self.client, self.service_model_name(), head_message, tools
303
+ )
304
+
305
+ if continuation_required:
306
+ tail_request = dict(request)
307
+ tail_request["messages"] = request["messages"] + [
308
+ MessageParam(role=head_message.role, content=head_message.content)
309
+ ]
310
+ _, tail_model_output = await self._perform_request_and_continuations(
311
+ tail_request, streaming, tools
312
+ )
313
+
314
+ head_content = _content_list(head_model_output.message.content)
315
+ tail_content = _content_list(tail_model_output.message.content)
316
+ tail_model_output.message.content = head_content + tail_content
317
+
318
+ # TODO:
319
+ # It looks weird to return the head message with the tail output, but
320
+ # the contract for this function is that it returns the head message
321
+ # even when it has needed to recurse. This is because model_call()
322
+ # above doesn't currently support multiple requests
323
+ return head_message, tail_model_output
324
+
325
+ return head_message, head_model_output
326
+
272
327
  def completion_config(
273
328
  self, config: GenerateConfig
274
329
  ) -> tuple[dict[str, Any], dict[str, str], list[str]]:
@@ -343,6 +398,9 @@ class AnthropicAPI(ModelAPI):
343
398
  def is_claude_3_7(self) -> bool:
344
399
  return "claude-3-7-" in self.service_model_name()
345
400
 
401
+ def is_claude_4(self) -> bool:
402
+ return re.search(r"claude-4-[a-zA-Z]", self.service_model_name()) is not None
403
+
346
404
  @override
347
405
  def connection_key(self) -> str:
348
406
  return str(self.api_key)
@@ -521,7 +579,11 @@ class AnthropicAPI(ModelAPI):
521
579
  self, tool: ToolInfo, config: GenerateConfig
522
580
  ) -> Optional["ToolParamDef"]:
523
581
  return (
524
- (self.computer_use_tool_param(tool) or self.text_editor_tool_param(tool))
582
+ (
583
+ self.computer_use_tool_param(tool)
584
+ or self.text_editor_tool_param(tool)
585
+ or self.web_search_tool_param(tool)
586
+ )
525
587
  if config.internal_tools is not False
526
588
  else None
527
589
  )
@@ -569,7 +631,17 @@ class AnthropicAPI(ModelAPI):
569
631
 
570
632
  def text_editor_tool_param(
571
633
  self, tool: ToolInfo
572
- ) -> ToolTextEditor20250124Param | BetaToolTextEditor20241022Param | None:
634
+ ) -> (
635
+ ToolTextEditor20250124Param
636
+ | BetaToolTextEditor20241022Param
637
+ | BetaToolTextEditor20250429Param
638
+ | None
639
+ ):
640
+ # See: https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/text-editor-tool#before-using-the-text-editor-tool
641
+ # TODO: It would be great to enhance our `is_claude_xxx` functions to help here.
642
+ if self.model_name.startswith(("claude-3-5-haiku", "claude-3-opus")):
643
+ return None
644
+
573
645
  # check for compatible 'text editor' tool
574
646
  if tool.name == "text_editor" and (
575
647
  sorted(tool.parameters.properties.keys())
@@ -586,7 +658,11 @@ class AnthropicAPI(ModelAPI):
586
658
  )
587
659
  ):
588
660
  return (
589
- BetaToolTextEditor20241022Param(
661
+ BetaToolTextEditor20250429Param(
662
+ type="text_editor_20250429", name="str_replace_based_edit_tool"
663
+ )
664
+ if self.is_claude_4()
665
+ else BetaToolTextEditor20241022Param(
590
666
  type="text_editor_20241022", name="str_replace_editor"
591
667
  )
592
668
  if self.is_claude_3_5()
@@ -598,6 +674,49 @@ class AnthropicAPI(ModelAPI):
598
674
  else:
599
675
  return None
600
676
 
677
+ def web_search_tool_param(
678
+ self, tool: ToolInfo
679
+ ) -> WebSearchTool20250305Param | None:
680
+ if (
681
+ tool.name == "web_search"
682
+ and tool.options
683
+ and "anthropic" in tool.options
684
+ and self.model_name in WEB_SEARCH_COMPATIBLE_MODELS
685
+ ):
686
+ return _web_search_tool_param(tool.options["anthropic"])
687
+ else:
688
+ return None
689
+
690
+
691
+ def _web_search_tool_param(
692
+ maybe_anthropic_options: object,
693
+ ) -> WebSearchTool20250305Param:
694
+ if maybe_anthropic_options is not None and not isinstance(
695
+ maybe_anthropic_options, dict
696
+ ):
697
+ raise TypeError(
698
+ f"Expected a dictionary for anthropic_options, got {type(maybe_anthropic_options)}"
699
+ )
700
+
701
+ result = WebSearchTool20250305Param(
702
+ name="web_search",
703
+ type="web_search_20250305",
704
+ )
705
+
706
+ if maybe_anthropic_options:
707
+ if "allowed_domains" in maybe_anthropic_options:
708
+ result["allowed_domains"] = maybe_anthropic_options["allowed_domains"]
709
+ if "blocked_domains" in maybe_anthropic_options:
710
+ result["blocked_domains"] = maybe_anthropic_options["blocked_domains"]
711
+ if "cache_control" in maybe_anthropic_options:
712
+ result["cache_control"] = maybe_anthropic_options["cache_control"]
713
+ if "max_uses" in maybe_anthropic_options:
714
+ result["max_uses"] = maybe_anthropic_options["max_uses"]
715
+ if "user_location" in maybe_anthropic_options:
716
+ result["user_location"] = maybe_anthropic_options["user_location"]
717
+
718
+ return result
719
+
601
720
 
602
721
  # tools can be either a stock tool param or a special Anthropic native use tool param
603
722
  ToolParamDef = (
@@ -605,6 +724,8 @@ ToolParamDef = (
605
724
  | BetaToolComputerUse20250124Param
606
725
  | ToolTextEditor20250124Param
607
726
  | BetaToolTextEditor20241022Param
727
+ | BetaToolTextEditor20250429Param
728
+ | WebSearchTool20250305Param
608
729
  )
609
730
 
610
731
 
@@ -614,6 +735,8 @@ def add_cache_control(
614
735
  | BetaToolComputerUse20250124Param
615
736
  | ToolTextEditor20250124Param
616
737
  | BetaToolTextEditor20241022Param
738
+ | BetaToolTextEditor20250429Param
739
+ | WebSearchTool20250305Param
617
740
  | dict[str, Any],
618
741
  ) -> None:
619
742
  cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
@@ -698,6 +821,8 @@ async def message_param(message: ChatMessage) -> MessageParam:
698
821
  | ImageBlockParam
699
822
  | ThinkingBlockParam
700
823
  | RedactedThinkingBlockParam
824
+ | ServerToolUseBlockParam
825
+ | WebSearchToolResultBlockParam
701
826
  ]
702
827
  ) = message.error.message
703
828
  # anthropic requires that content be populated when
@@ -735,6 +860,8 @@ async def message_param(message: ChatMessage) -> MessageParam:
735
860
  | RedactedThinkingBlockParam
736
861
  | ImageBlockParam
737
862
  | ToolUseBlockParam
863
+ | ServerToolUseBlockParam
864
+ | WebSearchToolResultBlockParam
738
865
  ] = (
739
866
  [TextBlockParam(type="text", text=message.content or NO_CONTENT)]
740
867
  if isinstance(message.content, str)
@@ -785,7 +912,7 @@ async def model_output_from_message(
785
912
  model: str,
786
913
  message: Message,
787
914
  tools: list[ToolInfo],
788
- ) -> ModelOutput:
915
+ ) -> tuple[ModelOutput, bool]:
789
916
  # extract content and tool calls
790
917
  content: list[Content] = []
791
918
  reasoning_tokens = 0
@@ -800,7 +927,20 @@ async def model_output_from_message(
800
927
  content_text = content_text.replace("<result>", "").replace(
801
928
  "</result>", ""
802
929
  )
803
- content.append(ContentText(type="text", text=content_text))
930
+ content.append(
931
+ ContentText(
932
+ type="text",
933
+ text=content_text,
934
+ citations=(
935
+ [
936
+ to_inspect_citation(citation)
937
+ for citation in content_block.citations
938
+ ]
939
+ if content_block.citations
940
+ else None
941
+ ),
942
+ )
943
+ )
804
944
  elif isinstance(content_block, ToolUseBlock):
805
945
  tool_calls = tool_calls or []
806
946
  (tool_name, internal_name) = _names_for_tool_call(content_block.name, tools)
@@ -812,6 +952,10 @@ async def model_output_from_message(
812
952
  internal=internal_name,
813
953
  )
814
954
  )
955
+ elif isinstance(content_block, ServerToolUseBlock):
956
+ content.append(ContentData(data=content_block.model_dump()))
957
+ elif isinstance(content_block, WebSearchToolResultBlock):
958
+ content.append(ContentData(data=content_block.model_dump()))
815
959
  elif isinstance(content_block, RedactedThinkingBlock):
816
960
  content.append(
817
961
  ContentReasoning(reasoning=content_block.data, redacted=True)
@@ -827,11 +971,12 @@ async def model_output_from_message(
827
971
  )
828
972
 
829
973
  # resolve choice
974
+ stop_reason, pause_turn = message_stop_reason(message)
830
975
  choice = ChatCompletionChoice(
831
976
  message=ChatMessageAssistant(
832
977
  content=content, tool_calls=tool_calls, model=model, source="generate"
833
978
  ),
834
- stop_reason=message_stop_reason(message),
979
+ stop_reason=stop_reason,
835
980
  )
836
981
 
837
982
  # return ModelOutput
@@ -844,17 +989,20 @@ async def model_output_from_message(
844
989
  + (input_tokens_cache_read or 0)
845
990
  + message.usage.output_tokens # includes reasoning tokens
846
991
  )
847
- return ModelOutput(
848
- model=message.model,
849
- choices=[choice],
850
- usage=ModelUsage(
851
- input_tokens=message.usage.input_tokens,
852
- output_tokens=message.usage.output_tokens,
853
- total_tokens=total_tokens,
854
- input_tokens_cache_write=input_tokens_cache_write,
855
- input_tokens_cache_read=input_tokens_cache_read,
856
- reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
992
+ return (
993
+ ModelOutput(
994
+ model=message.model,
995
+ choices=[choice],
996
+ usage=ModelUsage(
997
+ input_tokens=message.usage.input_tokens,
998
+ output_tokens=message.usage.output_tokens,
999
+ total_tokens=total_tokens,
1000
+ input_tokens_cache_write=input_tokens_cache_write,
1001
+ input_tokens_cache_read=input_tokens_cache_read,
1002
+ reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
1003
+ ),
857
1004
  ),
1005
+ pause_turn,
858
1006
  )
859
1007
 
860
1008
 
@@ -880,6 +1028,7 @@ def _names_for_tool_call(
880
1028
  (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
881
1029
  ("str_replace_editor", "text_editor_20241022", "text_editor"),
882
1030
  ("str_replace_editor", "text_editor_20250124", "text_editor"),
1031
+ ("str_replace_based_edit_tool", "text_editor_20250429", "text_editor"),
883
1032
  ("bash", "bash_20250124", "bash_session"),
884
1033
  )
885
1034
 
@@ -893,16 +1042,18 @@ def _names_for_tool_call(
893
1042
  )
894
1043
 
895
1044
 
896
- def message_stop_reason(message: Message) -> StopReason:
1045
+ def message_stop_reason(message: Message) -> tuple[StopReason, bool]:
897
1046
  match message.stop_reason:
898
1047
  case "end_turn" | "stop_sequence":
899
- return "stop"
1048
+ return "stop", False
900
1049
  case "tool_use":
901
- return "tool_calls"
1050
+ return "tool_calls", False
902
1051
  case "max_tokens":
903
- return message.stop_reason
1052
+ return message.stop_reason, False
1053
+ case "refusal":
1054
+ return "content_filter", False
904
1055
  case _:
905
- return "unknown"
1056
+ return "unknown", message.stop_reason == "pause_turn"
906
1057
 
907
1058
 
908
1059
  def split_system_messages(
@@ -918,9 +1069,24 @@ def split_system_messages(
918
1069
 
919
1070
  async def message_param_content(
920
1071
  content: Content,
921
- ) -> TextBlockParam | ImageBlockParam | ThinkingBlockParam | RedactedThinkingBlockParam:
1072
+ ) -> (
1073
+ TextBlockParam
1074
+ | ImageBlockParam
1075
+ | ThinkingBlockParam
1076
+ | RedactedThinkingBlockParam
1077
+ | ServerToolUseBlockParam
1078
+ | WebSearchToolResultBlockParam
1079
+ ):
922
1080
  if isinstance(content, ContentText):
923
- return TextBlockParam(type="text", text=content.text or NO_CONTENT)
1081
+ citations = (
1082
+ [to_anthropic_citation(citation) for citation in content.citations]
1083
+ if content.citations
1084
+ else None
1085
+ )
1086
+
1087
+ return TextBlockParam(
1088
+ type="text", text=content.text or NO_CONTENT, citations=citations
1089
+ )
924
1090
  elif isinstance(content, ContentImage):
925
1091
  # resolve to url
926
1092
  image = await file_as_data_uri(content.image)
@@ -948,6 +1114,19 @@ async def message_param_content(
948
1114
  return ThinkingBlockParam(
949
1115
  type="thinking", thinking=content.reasoning, signature=content.signature
950
1116
  )
1117
+ elif isinstance(content, ContentData):
1118
+ match content.data.get("type", None):
1119
+ case "server_tool_use":
1120
+ return cast(
1121
+ ServerToolUseBlockParam,
1122
+ ServerToolUseBlock.model_validate(content.data).model_dump(),
1123
+ )
1124
+ case "web_search_tool_result":
1125
+ return cast(
1126
+ WebSearchToolResultBlockParam,
1127
+ WebSearchToolResultBlock.model_validate(content.data).model_dump(),
1128
+ )
1129
+ raise NotImplementedError()
951
1130
  else:
952
1131
  raise RuntimeError(
953
1132
  "Anthropic models do not currently support audio or video inputs."
@@ -990,3 +1169,7 @@ def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
990
1169
  value = copy(value)
991
1170
  value.update(data=BASE_64_DATA_REMOVED)
992
1171
  return value
1172
+
1173
+
1174
+ def _content_list(input: str | list[Content]) -> list[Content]:
1175
+ return [ContentText(text=input)] if isinstance(input, str) else input
@@ -26,6 +26,7 @@ from google.genai.types import (
26
26
  GenerateContentResponse,
27
27
  GenerateContentResponsePromptFeedback,
28
28
  GenerateContentResponseUsageMetadata,
29
+ GoogleSearch,
29
30
  HarmBlockThreshold,
30
31
  HarmCategory,
31
32
  HttpOptions,
@@ -48,6 +49,7 @@ from inspect_ai._util.content import (
48
49
  )
49
50
  from inspect_ai._util.content import (
50
51
  ContentAudio,
52
+ ContentData,
51
53
  ContentImage,
52
54
  ContentReasoning,
53
55
  ContentText,
@@ -74,6 +76,7 @@ from inspect_ai.model import (
74
76
  TopLogprob,
75
77
  )
76
78
  from inspect_ai.model._model_call import ModelCall
79
+ from inspect_ai.model._providers._google_citations import get_candidate_citations
77
80
  from inspect_ai.tool import (
78
81
  ToolCall,
79
82
  ToolChoice,
@@ -247,7 +250,7 @@ class GoogleGenAIAPI(ModelAPI):
247
250
 
248
251
  # Create google-genai types.
249
252
  gemini_contents = await as_chat_messages(client, input)
250
- gemini_tools = chat_tools(tools) if len(tools) > 0 else None
253
+ gemini_tools = self.chat_tools(tools) if len(tools) > 0 else None
251
254
  gemini_tool_config = chat_tool_config(tool_choice) if len(tools) > 0 else None
252
255
  parameters = GenerateContentConfig(
253
256
  http_options=HttpOptions(headers={HttpHooks.REQUEST_ID_HEADER: request_id}),
@@ -362,6 +365,61 @@ class GoogleGenAIAPI(ModelAPI):
362
365
  else:
363
366
  return None
364
367
 
368
+ def _use_native_search(self, tool: ToolInfo) -> bool:
369
+ return (
370
+ tool.name == "web_search"
371
+ and tool.options is not None
372
+ and "gemini" in tool.options
373
+ # Native search is supported starting with Gemini 2.0 (i.e. any Gemini model other than 1.5)
374
+ and (self.is_gemini() and not self.is_gemini_1_5())
375
+ )
376
+
377
+ def _categorize_tool(
378
+ self, acc: tuple[bool, list[FunctionDeclaration]], tool: ToolInfo
379
+ ) -> tuple[bool, list[FunctionDeclaration]]:
380
+ """Reducer function that categorizes tools into native search vs function declarations.
381
+
382
+ Returns:
383
+ Tuple of (has_native_search, function_declarations) where has_native_search
384
+ is True if any tool uses native search, and function_declarations contains
385
+ all non-native-search tools converted to FunctionDeclaration objects.
386
+ """
387
+ return (
388
+ (True, acc[1])
389
+ if self._use_native_search(tool)
390
+ else (
391
+ acc[0],
392
+ acc[1]
393
+ + [
394
+ FunctionDeclaration(
395
+ name=tool.name,
396
+ description=tool.description,
397
+ parameters=schema_from_param(tool.parameters)
398
+ if len(tool.parameters.properties) > 0
399
+ else None,
400
+ )
401
+ ],
402
+ )
403
+ )
404
+
405
+ def chat_tools(self, tools: list[ToolInfo]) -> ToolListUnion:
406
+ has_native_search, function_declarations = functools.reduce(
407
+ self._categorize_tool, tools, (False, list[FunctionDeclaration]())
408
+ )
409
+
410
+ # TODO: Google doesn't yet support native search concurrently with other tools.
411
+ # Revisit this from time to time to adapt when they fix it.
412
+ if has_native_search and function_declarations:
413
+ raise ValueError(
414
+ "Gemini does not yet support native search concurrently with other tools."
415
+ )
416
+
417
+ return (
418
+ [Tool(google_search=GoogleSearch())]
419
+ if has_native_search
420
+ else [Tool(function_declarations=function_declarations)]
421
+ )
422
+
365
423
 
366
424
  def safety_settings_to_list(
367
425
  safety_settings: list[SafetySettingDict],
@@ -500,6 +558,8 @@ async def content_part(client: Client, content: InspectContent | str) -> Part:
500
558
  return Part.from_text(text=content.text or NO_CONTENT)
501
559
  elif isinstance(content, ContentReasoning):
502
560
  return Part.from_text(text=content.reasoning or NO_CONTENT)
561
+ elif isinstance(content, ContentData):
562
+ assert False, "Google provider should never encounter ContentData"
503
563
  else:
504
564
  return await chat_content_to_part(client, content)
505
565
 
@@ -538,20 +598,6 @@ async def extract_system_message_as_parts(
538
598
  return system_parts or None
539
599
 
540
600
 
541
- def chat_tools(tools: list[ToolInfo]) -> ToolListUnion:
542
- declarations = [
543
- FunctionDeclaration(
544
- name=tool.name,
545
- description=tool.description,
546
- parameters=schema_from_param(tool.parameters)
547
- if len(tool.parameters.properties) > 0
548
- else None,
549
- )
550
- for tool in tools
551
- ]
552
- return [Tool(function_declarations=declarations)]
553
-
554
-
555
601
  # https://ai.google.dev/gemini-api/tutorials/extract_structured_data#define_the_schema
556
602
  def schema_from_param(
557
603
  param: ToolParam | ToolParams, nullable: bool | None = False
@@ -656,19 +702,36 @@ def completion_choice_from_candidate(
656
702
  | ContentImage
657
703
  | ContentAudio
658
704
  | ContentVideo
705
+ | ContentData
659
706
  ]
660
707
  ) = ""
661
708
  # content.parts can be None when the finish_reason is MALFORMED_FUNCTION_CALL
662
709
  elif candidate.content.parts is None:
663
710
  content = ""
664
711
  else:
665
- content = []
666
- for part in candidate.content.parts:
667
- if part.text is not None:
668
- if part.thought is True:
669
- content.append(ContentReasoning(reasoning=part.text))
670
- else:
671
- content.append(ContentText(text=part.text))
712
+ # Google's grounded search metadata provides start/end indices for cited
713
+ # text based on the joining of all separate text parts (despite the doc
714
+ # suggesting that they provide part_index). Thankfully, the doc also says:
715
+ #
716
+ # Exactly one field within a Part should be set, representing the specific type
717
+ # of content being conveyed. Using multiple fields within the same `Part`
718
+ # instance is considered invalid.
719
+ #
720
+ # That means that we can safely collapse adjacent parts with a `text` field
721
+ # and not fear that we're breaking other types of content parts
722
+ parts = functools.reduce(
723
+ _combine_text_parts, candidate.content.parts, list[Part]()
724
+ )
725
+
726
+ content = [
727
+ ContentReasoning(reasoning=part.text)
728
+ if part.thought is True
729
+ else ContentText(
730
+ text=part.text, citations=get_candidate_citations(candidate)
731
+ )
732
+ for part in parts
733
+ if part.text is not None
734
+ ]
672
735
 
673
736
  # now tool calls
674
737
  tool_calls: list[ToolCall] = []
@@ -922,3 +985,16 @@ async def file_for_content(
922
985
  files_db.put(content_sha256, str(upload.name))
923
986
  # return the file
924
987
  return upload
988
+
989
+
990
+ def _combine_text_parts(acc: list[Part], part: Part) -> list[Part]:
991
+ """Combine adjacent text parts into a single part."""
992
+ return (
993
+ acc + [part]
994
+ if part.text is None
995
+ or part.thought is True
996
+ or len(acc) == 0
997
+ or acc[-1].text is None
998
+ or acc[-1].thought is True
999
+ else acc[:-1] + [Part(text=acc[-1].text + part.text)]
1000
+ )
@@ -44,9 +44,15 @@ from typing_extensions import override
44
44
  # TODO: Migration guide:
45
45
  # https://github.com/mistralai/client-python/blob/main/MIGRATION.md
46
46
  from inspect_ai._util.constants import NO_CONTENT
47
- from inspect_ai._util.content import Content, ContentImage, ContentText
47
+ from inspect_ai._util.content import (
48
+ Content,
49
+ ContentImage,
50
+ ContentReasoning,
51
+ ContentText,
52
+ )
48
53
  from inspect_ai._util.http import is_retryable_http_status
49
54
  from inspect_ai._util.images import file_as_data_uri
55
+ from inspect_ai.model._reasoning import parse_content_with_reasoning
50
56
  from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
51
57
 
52
58
  from ..._util.httpx import httpx_should_retry
@@ -481,26 +487,33 @@ def completion_content(content: str | list[ContentChunk]) -> str | list[Content]
481
487
  if isinstance(content, str):
482
488
  return content
483
489
  else:
484
- return [completion_content_chunk(c) for c in content]
490
+ return [item for c in content for item in completion_content_chunks(c)]
485
491
 
486
492
 
487
- def completion_content_chunk(content: ContentChunk) -> Content:
493
+ def completion_content_chunks(content: ContentChunk) -> list[Content]:
488
494
  if isinstance(content, ReferenceChunk):
489
495
  raise TypeError("ReferenceChunk content is not supported by Inspect.")
490
496
  elif isinstance(content, TextChunk):
491
- return ContentText(text=content.text)
497
+ parsed = parse_content_with_reasoning(content.text)
498
+ if parsed:
499
+ return [
500
+ ContentReasoning(reasoning=parsed.reasoning),
501
+ ContentText(text=parsed.content),
502
+ ]
503
+ else:
504
+ return [ContentText(text=content.text)]
492
505
  elif isinstance(content, DocumentURLChunk):
493
- return ContentText(text=content.document_url)
506
+ return [ContentText(text=content.document_url)]
494
507
  else:
495
508
  if isinstance(content.image_url, str):
496
- return ContentImage(image=content.image_url)
509
+ return [ContentImage(image=content.image_url)]
497
510
  else:
498
511
  match content.image_url.detail:
499
512
  case "low" | "high":
500
513
  detail: Literal["auto", "low", "high"] = content.image_url.detail
501
514
  case _:
502
515
  detail = "auto"
503
- return ContentImage(image=content.image_url.url, detail=detail)
516
+ return [ContentImage(image=content.image_url.url, detail=detail)]
504
517
 
505
518
 
506
519
  def completion_choices_from_response(