inspect-ai 0.3.99__py3-none-any.whl → 0.3.101__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (138)
  1. inspect_ai/_cli/eval.py +2 -1
  2. inspect_ai/_display/core/config.py +11 -5
  3. inspect_ai/_display/core/panel.py +66 -2
  4. inspect_ai/_display/core/textual.py +5 -2
  5. inspect_ai/_display/plain/display.py +1 -0
  6. inspect_ai/_display/rich/display.py +2 -2
  7. inspect_ai/_display/textual/widgets/transcript.py +37 -9
  8. inspect_ai/_eval/eval.py +13 -1
  9. inspect_ai/_eval/evalset.py +3 -2
  10. inspect_ai/_eval/run.py +2 -0
  11. inspect_ai/_eval/score.py +2 -4
  12. inspect_ai/_eval/task/log.py +3 -1
  13. inspect_ai/_eval/task/run.py +59 -81
  14. inspect_ai/_util/content.py +11 -6
  15. inspect_ai/_util/interrupt.py +2 -2
  16. inspect_ai/_util/text.py +7 -0
  17. inspect_ai/_util/working.py +8 -37
  18. inspect_ai/_view/__init__.py +0 -0
  19. inspect_ai/_view/schema.py +2 -1
  20. inspect_ai/_view/www/CLAUDE.md +15 -0
  21. inspect_ai/_view/www/dist/assets/index.css +307 -171
  22. inspect_ai/_view/www/dist/assets/index.js +24733 -21641
  23. inspect_ai/_view/www/log-schema.json +77 -3
  24. inspect_ai/_view/www/package.json +9 -5
  25. inspect_ai/_view/www/src/@types/log.d.ts +9 -0
  26. inspect_ai/_view/www/src/app/App.tsx +1 -15
  27. inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
  28. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
  29. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
  30. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +220 -205
  31. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
  32. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
  33. inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
  34. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
  35. inspect_ai/_view/www/src/app/routing/url.ts +84 -4
  36. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
  37. inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
  39. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +24 -17
  40. inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
  41. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
  42. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
  43. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
  44. inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
  45. inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
  46. inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
  47. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
  48. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
  49. inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
  50. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -2
  51. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
  52. inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
  53. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
  54. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
  55. inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
  56. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
  57. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
  58. inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
  59. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
  60. inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
  61. inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
  62. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
  63. inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
  64. inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
  65. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
  66. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
  67. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
  68. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
  69. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
  70. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
  71. inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
  72. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
  73. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
  74. inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
  75. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
  76. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
  77. inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
  78. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
  79. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
  80. inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
  81. inspect_ai/_view/www/src/app/types.ts +5 -1
  82. inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
  83. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
  84. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
  85. inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
  86. inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
  87. inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
  88. inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
  89. inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
  90. inspect_ai/_view/www/src/state/hooks.ts +52 -2
  91. inspect_ai/_view/www/src/state/logSlice.ts +4 -3
  92. inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
  93. inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
  94. inspect_ai/_view/www/src/state/scrolling.ts +152 -0
  95. inspect_ai/_view/www/src/utils/attachments.ts +7 -0
  96. inspect_ai/_view/www/src/utils/python.ts +18 -0
  97. inspect_ai/_view/www/yarn.lock +290 -33
  98. inspect_ai/agent/_react.py +12 -7
  99. inspect_ai/agent/_run.py +2 -3
  100. inspect_ai/analysis/beta/__init__.py +2 -0
  101. inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
  102. inspect_ai/dataset/_sources/csv.py +2 -6
  103. inspect_ai/dataset/_sources/hf.py +2 -6
  104. inspect_ai/dataset/_sources/json.py +2 -6
  105. inspect_ai/dataset/_util.py +23 -0
  106. inspect_ai/log/_log.py +1 -1
  107. inspect_ai/log/_recorders/eval.py +4 -3
  108. inspect_ai/log/_recorders/file.py +2 -9
  109. inspect_ai/log/_recorders/json.py +1 -0
  110. inspect_ai/log/_recorders/recorder.py +1 -0
  111. inspect_ai/log/_transcript.py +1 -1
  112. inspect_ai/model/_call_tools.py +6 -2
  113. inspect_ai/model/_openai.py +1 -1
  114. inspect_ai/model/_openai_responses.py +85 -41
  115. inspect_ai/model/_openai_web_search.py +38 -0
  116. inspect_ai/model/_providers/azureai.py +72 -3
  117. inspect_ai/model/_providers/openai.py +4 -1
  118. inspect_ai/model/_providers/openai_responses.py +5 -1
  119. inspect_ai/scorer/_metric.py +1 -2
  120. inspect_ai/scorer/_reducer/reducer.py +1 -1
  121. inspect_ai/solver/_task_state.py +2 -2
  122. inspect_ai/tool/_tool.py +6 -2
  123. inspect_ai/tool/_tool_def.py +27 -4
  124. inspect_ai/tool/_tool_info.py +2 -0
  125. inspect_ai/tool/_tools/_web_search/_google.py +43 -15
  126. inspect_ai/tool/_tools/_web_search/_tavily.py +46 -13
  127. inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
  128. inspect_ai/util/__init__.py +4 -0
  129. inspect_ai/util/_json.py +3 -0
  130. inspect_ai/util/_limit.py +230 -20
  131. inspect_ai/util/_sandbox/docker/compose.py +20 -11
  132. inspect_ai/util/_span.py +1 -1
  133. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +3 -3
  134. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +138 -124
  135. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +1 -1
  136. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
  137. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
  138. {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/dataset/_util.py CHANGED
@@ -13,6 +13,7 @@ from inspect_ai.model import (
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 
 from ._dataset import (
+    Dataset,
     DatasetRecord,
     FieldSpec,
     RecordToSample,
@@ -225,3 +226,25 @@ def read_files(files: Any | None) -> dict[str, str] | None:
             raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
     else:
         return None
+
+
+def shuffle_choices_if_requested(
+    dataset: Dataset, shuffle_choices: bool | int | None
+) -> None:
+    """
+    Shuffle the choices in the dataset if requested.
+
+    The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
+    and `hf_dataset` can be a boolean, an integer, or `None` (default).
+    If it is a boolean, it will shuffle the choices if the value is `True`,
+    and do nothing if it is `False`.
+    If it is an integer, it will shuffle the choices using the integer as the seed.
+    """
+    # Note that `isinstance(x, int)` returns True if x is True or False,
+    # so we need to check for both explicitly
+    if shuffle_choices is True:
+        dataset.shuffle_choices()
+    elif shuffle_choices is False:
+        pass
+    elif isinstance(shuffle_choices, int):
+        dataset.shuffle_choices(seed=shuffle_choices)
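The new helper centralizes choice shuffling for the dataset loaders. For example, loading a multiple-choice dataset with deterministic shuffling (a minimal sketch; the data.jsonl path is hypothetical):

from inspect_ai.dataset import json_dataset

# True shuffles with a random seed; an integer shuffles deterministically
# using that value as the seed; False or None leaves choice order untouched
dataset = json_dataset("data.jsonl", shuffle_choices=42)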
inspect_ai/log/_log.py CHANGED
@@ -165,7 +165,7 @@ class EvalSampleLimit(BaseModel):
     ]
     """The type of limit"""
 
-    limit: int
+    limit: float
     """The limit value"""
 
 
inspect_ai/log/_recorders/eval.py CHANGED
@@ -133,6 +133,7 @@ class EvalRecorder(FileRecorder):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog:
         # get the key and log
         key = self._log_file_key(eval)
@@ -174,7 +175,7 @@ class EvalRecorder(FileRecorder):
 
         # flush and write the results
         await log.flush()
-        return await log.close()
+        return await log.close(header_only)
 
     @classmethod
     @override
@@ -321,12 +322,12 @@ class ZipLogFile:
         # re-open zip file w/ self.temp_file pointer at end
         self._open()
 
-    async def close(self) -> EvalLog:
+    async def close(self, header_only: bool) -> EvalLog:
         async with self._lock:
             # read the log from the temp file then close it
             try:
                 self._temp_file.seek(0)
-                return _read_log(self._temp_file, self._file)
+                return _read_log(self._temp_file, self._file, header_only=header_only)
             finally:
                 self._temp_file.close()
                 if self._zip:
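The header_only flag lets finishing a log return an EvalLog without re-reading every sample body from the zip. On the reading side this mirrors the existing header_only parameter of read_eval_log; a minimal sketch (the log path is hypothetical):

from inspect_ai.log import read_eval_log

# header-only reads populate status/results/stats but skip sample bodies
log = read_eval_log("logs/2025-05-20T12-00-00_task.eval", header_only=True)
print(log.status, log.samples)  # samples stay unset on header-only reads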
inspect_ai/log/_recorders/file.py CHANGED
@@ -67,16 +67,9 @@ class FileRecorder(Recorder):
     async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
         # establish the log to read from (might be cached)
         eval_log = await cls._log_file_maybe_cached(location)
-
-        # throw if no samples
         if not eval_log.samples:
-            raise IndexError(f"No samples found in log {location}")
-
-        summaries: list[EvalSampleSummary] = []
-        for sample in eval_log.samples:
-            summaries.append(sample.summary())
-
-        return summaries
+            return []
+        return [sample.summary() for sample in eval_log.samples]
 
     @classmethod
     async def _log_file_maybe_cached(cls, location: str) -> EvalLog:
inspect_ai/log/_recorders/json.py CHANGED
@@ -96,6 +96,7 @@ class JSONRecorder(FileRecorder):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog:
         log = self.data[self._log_file_key(spec)]
         log.data.status = status
inspect_ai/log/_recorders/recorder.py CHANGED
@@ -46,6 +46,7 @@ class Recorder(abc.ABC):
         results: EvalResults | None,
         reductions: list[EvalSampleReductions] | None,
         error: EvalError | None = None,
+        header_only: bool = False,
     ) -> EvalLog: ...
 
     @classmethod
inspect_ai/log/_transcript.py CHANGED
@@ -111,7 +111,7 @@ class SampleLimitEvent(BaseEvent):
     message: str
     """A message associated with this limit"""
 
-    limit: int | None = Field(default=None)
+    limit: float | None = Field(default=None)
    """The limit value (if any)"""
 
 
inspect_ai/model/_call_tools.py CHANGED
@@ -4,6 +4,7 @@ import types
 from copy import copy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
+from enum import EnumMeta
 from logging import getLogger
 from textwrap import dedent
 from types import UnionType
@@ -172,7 +173,7 @@ async def execute_tools(
         except LimitExceededError as ex:
             tool_error = ToolCallError(
                 "limit",
-                f"The tool exceeded its {ex.type} limit of {ex.limit}.",
+                f"The tool exceeded its {ex.type} limit of {ex.limit_str}.",
             )
         except ToolParsingError as ex:
             tool_error = ToolCallError("parsing", ex.message)
@@ -497,7 +498,7 @@ async def agent_handoff(
             ChatMessageUser(
                 content=(
                     f"The {agent_name} exceeded its {limit_error.type} limit of "
-                    f"{limit_error.limit}."
+                    f"{limit_error.limit_str}."
                 )
             )
         )
@@ -548,6 +549,7 @@ def tools_info(
                 name=tool.name,
                 description=tool.description,
                 parameters=tool.parameters,
+                options=tool.options,
             )
         )
     return tools_info
@@ -652,6 +654,8 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
             return type_hint(**dataclass_data)
         elif issubclass(type_hint, BaseModel):
             return type_hint(**input)
+        elif isinstance(type_hint, EnumMeta):
+            return type_hint(input)
         else:
             return input
     elif origin is list or origin is List:
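With the EnumMeta branch, tool arguments annotated as Enum subclasses are now coerced from the raw value the model supplies. A self-contained sketch of the same coercion (the Color enum is hypothetical):

from enum import Enum, EnumMeta

class Color(Enum):
    RED = "red"
    GREEN = "green"

type_hint = Color
assert isinstance(type_hint, EnumMeta)  # the check tool_param now performs
assert type_hint("red") is Color.RED    # calling the enum coerces the raw value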
inspect_ai/model/_openai.py CHANGED
@@ -594,7 +594,7 @@ def chat_choices_from_openai(
             stop_reason=as_stop_reason(choice.finish_reason),
             logprobs=(
                 Logprobs(**choice.logprobs.model_dump())
-                if choice.logprobs is not None
+                if choice.logprobs and choice.logprobs.content is not None
                 else None
            ),
        )
inspect_ai/model/_openai_responses.py CHANGED
@@ -1,6 +1,5 @@
 import json
-from itertools import chain
-from typing import TypedDict, cast
+from typing import Sequence, TypedDict, cast
 
 from openai.types.responses import (
     FunctionToolParam,
@@ -8,6 +7,8 @@ from openai.types.responses import (
     ResponseComputerToolCallParam,
     ResponseFunctionToolCall,
     ResponseFunctionToolCallParam,
+    ResponseFunctionWebSearch,
+    ResponseFunctionWebSearchParam,
     ResponseInputContentParam,
     ResponseInputImageParam,
     ResponseInputItemParam,
@@ -51,6 +52,7 @@ from inspect_ai.model._openai_computer_use import (
     maybe_computer_use_preview_tool,
     tool_call_from_openai_computer_tool_call,
 )
+from inspect_ai.model._openai_web_search import maybe_web_search_tool
 from inspect_ai.tool._tool_call import ToolCall
 from inspect_ai.tool._tool_choice import ToolChoice
 from inspect_ai.tool._tool_info import ToolInfo
@@ -160,9 +162,9 @@ def openai_responses_tool_choice(
 
 
 def openai_responses_tools(
-    tools: list[ToolInfo], config: GenerateConfig
+    tools: list[ToolInfo], model_name: str, config: GenerateConfig
 ) -> list[ToolParam]:
-    return [_tool_param_for_tool_info(tool, config) for tool in tools]
+    return [_tool_param_for_tool_info(tool, model_name, config) for tool in tools]
 
 
 def openai_responses_chat_choices(
@@ -174,6 +176,14 @@ def openai_responses_chat_choices(
     return [ChatCompletionChoice(message=message, stop_reason=stop_reason)]
 
 
+def is_native_tool_configured(
+    tools: Sequence[ToolInfo], model_name: str, config: GenerateConfig
+) -> bool:
+    return any(
+        _maybe_native_tool_param(tool, model_name, config) is not None for tool in tools
+    )
+
+
 # The next two function perform transformations between OpenAI types an Inspect
 # ChatMessageAssistant. Here is a diagram that helps visualize the transforms.
 # ┌───────────────────────────┐ ┌───────────────────────────┐ ┌───────────────────────────┐
@@ -207,7 +217,6 @@ def openai_responses_chat_choices(
 
 
 class _AssistantInternal(TypedDict):
-    output_message_id: str | None
     tool_message_ids: dict[str, str]
 
 
@@ -237,17 +246,17 @@ def _chat_message_assistant_from_openai_response(
     # collect output and tool calls
     message_content: list[Content] = []
     tool_calls: list[ToolCall] = []
-    internal = _AssistantInternal(output_message_id=None, tool_message_ids={})
+    internal = _AssistantInternal(tool_message_ids={})
     for output in response.output:
         match output:
             case ResponseOutputMessage(content=content, id=id):
-                assert internal["output_message_id"] is None, "Multiple message outputs"
-                internal["output_message_id"] = id
                 message_content.extend(
                     [
-                        ContentText(text=c.text)
+                        ContentText(text=c.text, internal={"id": id})
                         if isinstance(c, ResponseOutputText)
-                        else ContentText(text=c.refusal, refusal=True)
+                        else ContentText(
+                            text=c.refusal, refusal=True, internal={"id": id}
+                        )
                         for c in content
                     ]
                 )
@@ -277,6 +286,13 @@ def _chat_message_assistant_from_openai_response(
                 tool_calls.append(
                     tool_call_from_openai_computer_tool_call(output)
                 )
+            case ResponseFunctionWebSearch():
+                # We don't currently capture this since the model did the
+                # "tool call" internally. It's conceivable that could be
+                # forced to include it in `.internal` in the future, but
+                # for now we just ignore it.
+                # {"id":"ws_682cdcec3fa88198bc10b38fafefbd5e077e89e31fd4a3d5","status":"completed","type":"web_search_call"}
+                pass
             case _:
                 raise ValueError(f"Unexpected output type: {output.__class__}")
 
@@ -304,25 +320,39 @@ def _openai_input_items_from_chat_message_assistant(
     field of the `ChatMessageAssistant` to help it provide the proper id's the
     items in the returned list.
     """
-    (output_message_id, tool_message_ids) = _ids_from_assistant_internal(message)
+    tool_message_ids = _ids_from_assistant_internal(message)
 
     # we want to prevent yielding output messages in the case where we have an
     # 'internal' field (so the message came from the model API as opposed to
-    # being user synthesized) AND there is no output_message_id (indicating that
-    # when reading the message from the server we didn't find output). this could
-    # happen e.g. when a react() agent sets the output.completion in response
+    # being user synthesized) AND there are no ContentText items with message IDs
+    # (indicating that when reading the message from the server we didn't find output).
+    # this could happen e.g. when a react() agent sets the output.completion in response
     # to a submit() tool call
-    suppress_output_message = message.internal is not None and output_message_id is None
+    content_items: list[ContentText | ContentReasoning] = (
+        [ContentText(text=message.content)]
+        if isinstance(message.content, str)
+        else [
+            c for c in message.content if isinstance(c, ContentText | ContentReasoning)
+        ]
+    )
+    has_content_with_ids = any(
+        isinstance(c, ContentText)
+        and isinstance(c.internal, dict)
+        and "id" in c.internal
+        for c in content_items
+    )
+    suppress_output_message = message.internal is not None and not has_content_with_ids
 
     # if we are not storing messages on the server then blank these out
     if not store:
-        output_message_id = None
         tool_message_ids = {}
 
-    # items to return -- ensure we use a single output message (and just chain
-    # additional content on to it)
+    # items to return
     items: list[ResponseInputItemParam] = []
-    output_message: ResponseOutputMessageParam | None = None
+    # group content by message ID
+    messages_by_id: dict[
+        str | None, list[ResponseOutputTextParam | ResponseOutputRefusalParam]
+    ] = {}
 
     for content in (
         list[ContentText | ContentReasoning]([ContentText(text=message.content)])
@@ -352,6 +382,14 @@ def _openai_input_items_from_chat_message_assistant(
         if suppress_output_message:
             continue
 
+        # get the message ID from ContentText.modelJson
+        content_message_id: str | None = None
+        if isinstance(content.internal, dict) and "id" in content.internal:
+            id_value = content.internal["id"]
+            content_message_id = id_value if isinstance(id_value, str) else None
+        else:
+            content_message_id = None
+
         new_content = (
             ResponseOutputRefusalParam(type="refusal", refusal=text)
             if refusal
@@ -359,22 +397,24 @@ def _openai_input_items_from_chat_message_assistant(
                 type="output_text", text=text, annotations=[]
             )
         )
-        if output_message is None:
-            output_message = ResponseOutputMessageParam(
-                type="message",
-                role="assistant",
-                # this actually can be `None`, and it will in fact be `None` when the
-                # assistant message is synthesized by the scaffold as opposed to being
-                # replayed from the model (or when store=False)
-                id=output_message_id,  # type: ignore[typeddict-item]
-                content=[new_content],
-                status="completed",
-            )
-            items.append(output_message)
-        else:
-            output_message["content"] = chain(
-                output_message["content"], [new_content]
-            )
+
+        if content_message_id not in messages_by_id:
+            messages_by_id[content_message_id] = []
+        messages_by_id[content_message_id].append(new_content)
+
+    # create ResponseOutputMessage for each unique ID
+    for msg_id, content_list in messages_by_id.items():
+        output_message = ResponseOutputMessageParam(
+            type="message",
+            role="assistant",
+            # this actually can be `None`, and it will in fact be `None` when the
+            # assistant message is synthesized by the scaffold as opposed to being
+            # replayed from the model (or when store=False)
+            id=msg_id,  # type: ignore[typeddict-item]
+            content=content_list,
+            status="completed",
+        )
+        items.append(output_message)
 
     return items + _tool_call_items_from_assistant_message(message, tool_message_ids)
 
@@ -395,11 +435,13 @@ def _model_tool_call_for_internal(
 
 def _maybe_native_tool_param(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam | None:
     return (
         (
             maybe_computer_use_preview_tool(tool)
+            or maybe_web_search_tool(model_name, tool)
             # or self.text_editor_tool_param(tool)
             # or self.bash_tool_param(tool)
         )
@@ -442,32 +484,34 @@ def _tool_call_items_from_assistant_message(
 
 def _ids_from_assistant_internal(
     message: ChatMessageAssistant,
-) -> tuple[str | None, dict[str, str]]:
+) -> dict[str, str]:
     if message.internal is not None:
         assert isinstance(message.internal, dict), (
             "OpenAI ChatMessageAssistant internal must be an _AssistantInternal"
         )
         internal = cast(_AssistantInternal, message.internal)
-        return (internal["output_message_id"], internal["tool_message_ids"])
+        return internal["tool_message_ids"]
     else:
-        return None, {}
+        return {}
 
 
 _ResponseToolCallParam = (
-    ResponseFunctionToolCallParam | ResponseComputerToolCallParam
+    ResponseFunctionToolCallParam
+    | ResponseComputerToolCallParam
+    | ResponseFunctionWebSearchParam
     # | ResponseFileSearchToolCallParam
     # | ResponseFunctionToolCallParam
-    # | ResponseFunctionWebSearchParam
 )
 
 
 def _tool_param_for_tool_info(
     tool: ToolInfo,
+    model_name: str,
     config: GenerateConfig,
 ) -> ToolParam:
     # Use a native tool implementation when available. Otherwise, use the
     # standard tool implementation
-    return _maybe_native_tool_param(tool, config) or FunctionToolParam(
+    return _maybe_native_tool_param(tool, model_name, config) or FunctionToolParam(
         type="function",
         name=_responses_tool_alias(tool.name),
         description=tool.description,
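Net effect of the changes in this file: instead of chaining all assistant text onto a single output message, content is grouped by the server-assigned message ID carried in ContentText.internal, so multi-message Responses API outputs replay with their original IDs. A standalone sketch of the grouping idiom (plain dicts stand in for the OpenAI param types):

from collections import defaultdict

# hypothetical (message_id, text) pairs recovered from ContentText.internal;
# None marks locally synthesized content with no server id
contents = [("msg_a", "first"), ("msg_a", "second"), (None, "scaffold text")]

messages_by_id: dict[str | None, list[str]] = defaultdict(list)
for msg_id, text in contents:
    messages_by_id[msg_id].append(text)

# one output message per unique id, in insertion order
items = [{"id": msg_id, "content": parts} for msg_id, parts in messages_by_id.items()]
assert len(items) == 2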
inspect_ai/model/_openai_web_search.py ADDED
@@ -0,0 +1,38 @@
+from typing import cast
+
+from openai.types.responses import WebSearchTool, WebSearchToolParam
+
+from inspect_ai.tool._tool_info import ToolInfo
+
+COMPATIBLE_MODELS = ["gpt-4o", "gpt-4o-mini", "gpt-4.1"]
+
+
+def maybe_web_search_tool(model_name: str, tool: ToolInfo) -> WebSearchToolParam | None:
+    return (
+        _web_search_tool(tool.options["openai"])
+        if (
+            tool.name == "web_search"
+            and tool.options
+            and "openai" in tool.options
+            and model_name in COMPATIBLE_MODELS
+        )
+        else None
+    )
+
+
+def _web_search_tool(maybe_openai_options: object) -> WebSearchToolParam:
+    if maybe_openai_options is None:
+        maybe_openai_options = {}
+    elif not isinstance(maybe_openai_options, dict):
+        raise TypeError(
+            f"Expected a dictionary for openai_options, got {type(maybe_openai_options)}"
+        )
+    openai_options = (
+        WebSearchTool.model_validate(
+            {"type": "web_search_preview", **maybe_openai_options}
+        )
+        if maybe_openai_options
+        else WebSearchTool(type="web_search_preview")
+    )
+
+    return cast(WebSearchToolParam, openai_options.model_dump(exclude_none=True))
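This new module maps Inspect's web_search tool onto OpenAI's native web_search_preview tool for the models listed in COMPATIBLE_MODELS. The options supplied under tool.options["openai"] are validated against the OpenAI SDK's WebSearchTool model; a small sketch of that validation step (search_context_size is one of the SDK's documented fields, used here purely as an illustrative option):

from openai.types.responses import WebSearchTool

opts = WebSearchTool.model_validate(
    {"type": "web_search_preview", "search_context_size": "medium"}
)
print(opts.model_dump(exclude_none=True))
# {'type': 'web_search_preview', 'search_context_size': 'medium'}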
inspect_ai/model/_providers/azureai.py CHANGED
@@ -1,3 +1,4 @@
+import functools
 import json
 import os
 from copy import copy
@@ -151,7 +152,7 @@ class AzureAIAPI(ModelAPI):
 
         # prepare request
         request = dict(
-            messages=await chat_request_messages(input, handler),
+            messages=await chat_request_messages(input, handler, self.is_mistral()),
            **self.completion_params(config),
        )
        # newer versions of vllm reject requests with tools or tool_choice if the
@@ -280,9 +281,77 @@ class AzureAIAPI(ModelAPI):
 
 
 async def chat_request_messages(
-    messages: list[ChatMessage], handler: ChatAPIHandler | None
+    messages: list[ChatMessage],
+    handler: ChatAPIHandler | None,
+    is_mistral: bool = False,
+) -> list[ChatRequestMessage]:
+    chat_messages = [
+        await chat_request_message(message, handler) for message in messages
+    ]
+    if is_mistral:
+        chat_messages = functools.reduce(mistral_message_reducer, chat_messages, [])
+
+    return chat_messages
+
+
+def mistral_message_reducer(
+    messages: list[ChatRequestMessage],
+    message: ChatRequestMessage,
 ) -> list[ChatRequestMessage]:
-    return [await chat_request_message(message, handler) for message in messages]
+    """Fold any user messages found immediately after tool messages into the last tool message."""
+    if (
+        len(messages) > 0
+        and isinstance(messages[-1], ToolMessage)
+        and isinstance(message, UserMessage)
+    ):
+        messages[-1] = fold_user_message_into_tool_message(messages[-1], message)
+    else:
+        messages.append(message)
+
+    return messages
+
+
+def fold_user_message_into_tool_message(
+    tool_message: ToolMessage,
+    user_message: UserMessage,
+) -> ToolMessage:
+    def convert_content_items_to_string(list_content: list[ContentItem]) -> str:
+        if not all(
+            isinstance(item, (TextContentItem | ImageContentItem))
+            for item in list_content
+        ):
+            raise TypeError(
+                "Expected all items to be TextContentItem or ImageContentItem"
+            )
+
+        parts = []
+        for item in list_content:
+            if isinstance(item, TextContentItem):
+                parts.append(item.text)
+            elif isinstance(item, ImageContentItem):
+                parts.append(f"[Image: {item.image_url.url}]")
+            else:
+                raise ValueError("Unexpected content item type")
+        return "".join(parts)
+
+    def normalise_content(
+        content: str | list[ContentItem] | None,
+    ) -> str | None:
+        return (
+            None
+            if content is None
+            else convert_content_items_to_string(content)
+            if isinstance(content, list)
+            else content
+        )
+
+    tool_content = normalise_content(tool_message.content)
+    user_content = normalise_content(user_message.content)
+
+    return ToolMessage(
+        content=(tool_content or "") + (user_content or ""),
+        tool_call_id=tool_message.tool_call_id,
+    )
 
 
 async def chat_request_message(
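Because the reducer is applied with functools.reduce, messages fold left to right and a user message that immediately follows a tool message is absorbed into it (apparently an ordering requirement of Mistral models on Azure). A self-contained sketch of the same folding pattern with stand-in types (not the azure-ai-inference classes):

import functools
from dataclasses import dataclass

@dataclass
class ToolMsg:
    content: str

@dataclass
class UserMsg:
    content: str

def reducer(acc: list, msg: object) -> list:
    # mirror mistral_message_reducer: fold user-after-tool into the tool message
    if acc and isinstance(acc[-1], ToolMsg) and isinstance(msg, UserMsg):
        acc[-1] = ToolMsg(content=acc[-1].content + msg.content)
    else:
        acc.append(msg)
    return acc

msgs = [UserMsg("question"), ToolMsg("result: 42"), UserMsg(" please continue")]
folded = functools.reduce(reducer, msgs, [])
assert len(folded) == 2
assert folded[1].content == "result: 42 please continue"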
inspect_ai/model/_providers/openai.py CHANGED
@@ -42,6 +42,7 @@ from .._openai import (
     openai_media_filter,
     openai_should_retry,
 )
+from .._openai_responses import is_native_tool_configured
 from .openai_o1 import generate_o1
 from .util import environment_prerequisite_error, model_base_url
 
@@ -241,7 +242,9 @@ class OpenAIAPI(ModelAPI):
                 tools=tools,
                 **self.completion_params(config, False),
             )
-        elif self.responses_api:
+        elif self.responses_api or is_native_tool_configured(
+            tools, self.model_name, config
+        ):
             return await generate_responses(
                 client=self.client,
                 http_hooks=self._http_hooks,
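Note the routing consequence: configuring any native tool (currently computer use, or the OpenAI web search above) now forces generation through the Responses API even when responses_api was not explicitly enabled. A compact sketch of the branch logic with placeholder values:

# placeholder inputs for the branch shown above
responses_api = False          # user did not opt in to the Responses API
native_tool_configured = True  # e.g. web_search with an "openai" provider

use_responses_api = responses_api or native_tool_configured
assert use_responses_api  # native tools force the Responses API path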
inspect_ai/model/_providers/openai_responses.py CHANGED
@@ -59,7 +59,11 @@ async def generate_responses(
     )
 
     # prepare request (we do this so we can log the ModelCall)
-    tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
+    tool_params = (
+        openai_responses_tools(tools, model_name, config)
+        if len(tools) > 0
+        else NOT_GIVEN
+    )
     request = dict(
         input=await openai_responses_inputs(input, model_name, store),
         tools=tool_params,
inspect_ai/scorer/_metric.py CHANGED
@@ -7,7 +7,6 @@ from typing import (
     Protocol,
     Type,
     Union,
-    cast,
     overload,
     runtime_checkable,
 )
@@ -356,7 +355,7 @@ def metric(
         )
         return metric
 
-    return metric_register(cast(Callable[P, Metric], metric_wrapper), metric_name)
+    return metric_register(metric_wrapper, metric_name)
 
     # for decorators with an explicit name, one more wrapper for the name
     if isinstance(name, str):
inspect_ai/scorer/_reducer/reducer.py CHANGED
@@ -121,7 +121,7 @@ def pass_at(
     def reduce(scores: list[Score]) -> Score:
         def pass_at_k(values: list[float]) -> float:
             total = len(scores)
-            correct = sum(1 for v in values if v == value)
+            correct = sum(1 for v in values if v >= value)
             if total - correct < k:
                 return 1.0
             else:
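The fix makes an epoch count as correct when its score meets or exceeds the target value, rather than only when it is exactly equal. For reference, a standalone version of the unbiased pass@k estimator this reducer is based on (1 - C(n-c, k) / C(n, k), per the Codex paper):

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # n = total epochs, c = epochs scoring at/above the target value
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=10, c=3, k=5))  # ~0.917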
inspect_ai/solver/_task_state.py CHANGED
@@ -290,7 +290,7 @@ class TaskState:
         return self._tools
 
     @tools.setter
-    def tools(self, tools: list[Tool | ToolDef]) -> None:
+    def tools(self, tools: Sequence[Tool | ToolDef]) -> None:
         self._tools.clear()
         for tool in tools:
             self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
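Typing note on the setter: list is invariant, so a caller holding a list[Tool] could not assign it where list[Tool | ToolDef] is expected, while the covariant Sequence accepts it. A minimal illustration (the Tool/ToolDef stand-ins are hypothetical):

from typing import Sequence

class Tool: ...
class ToolDef: ...

def set_tools_list(tools: list[Tool | ToolDef]) -> None: ...
def set_tools_seq(tools: Sequence[Tool | ToolDef]) -> None: ...

my_tools: list[Tool] = [Tool()]
set_tools_list(my_tools)  # rejected by mypy: list is invariant
set_tools_seq(my_tools)   # accepted: Sequence is covariant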
@@ -353,7 +353,7 @@ class TaskState:
     def completed(self) -> bool:
         """Is the task completed.
 
-        Additionally, checks message and token limits and raises if they are exceeded, and also checks for an operator interrupt of the sample.
+        Additionally, checks for an operator interrupt of the sample.
         """
         from inspect_ai.log._samples import set_active_sample_total_messages
 