inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_cli/view.py +4 -0
  3. inspect_ai/_display/textual/widgets/transcript.py +15 -9
  4. inspect_ai/_eval/task/error.py +10 -14
  5. inspect_ai/_eval/task/generate.py +41 -35
  6. inspect_ai/_eval/task/run.py +20 -12
  7. inspect_ai/_util/hooks.py +17 -7
  8. inspect_ai/_util/transcript.py +11 -0
  9. inspect_ai/_view/www/dist/assets/index.css +1 -0
  10. inspect_ai/_view/www/dist/assets/index.js +100 -94
  11. inspect_ai/_view/www/log-schema.json +35 -19
  12. inspect_ai/_view/www/package.json +1 -1
  13. inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
  14. inspect_ai/_view/www/src/types/log.d.ts +6 -4
  15. inspect_ai/log/_recorders/eval.py +1 -1
  16. inspect_ai/model/_chat_message.py +29 -2
  17. inspect_ai/model/_conversation.py +10 -3
  18. inspect_ai/model/_generate_config.py +6 -0
  19. inspect_ai/model/_model.py +164 -25
  20. inspect_ai/model/_openai.py +33 -1
  21. inspect_ai/model/_providers/anthropic.py +12 -3
  22. inspect_ai/model/_providers/groq.py +4 -0
  23. inspect_ai/model/_providers/openai.py +21 -9
  24. inspect_ai/model/_providers/providers.py +1 -1
  25. inspect_ai/model/_reasoning.py +17 -0
  26. inspect_ai/solver/__init__.py +2 -0
  27. inspect_ai/solver/_basic_agent.py +78 -58
  28. inspect_ai/{util → solver}/_limit.py +13 -0
  29. inspect_ai/solver/_task_state.py +37 -7
  30. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
  31. inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
  32. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
  33. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  34. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
  35. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
  36. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
  37. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
  38. inspect_ai/util/__init__.py +0 -2
  39. inspect_ai/util/_sandbox/self_check.py +51 -28
  40. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
  41. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
  42. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
  43. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
  44. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
  45. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
  46. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/log-schema.json

@@ -260,13 +260,26 @@
         ],
         "default": null,
         "title": "Tool Calls"
+      },
+      "reasoning": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Reasoning"
       }
     },
     "required": [
       "content",
       "source",
       "role",
-      "tool_calls"
+      "tool_calls",
+      "reasoning"
     ],
     "title": "ChatMessageAssistant",
     "type": "object",
@@ -486,7 +499,10 @@
     "tool_call_id": {
       "anyOf": [
         {
-          "type": "string"
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
         },
         {
           "type": "null"
@@ -1131,7 +1147,6 @@
     "presence_penalty": null,
     "logit_bias": null,
     "seed": null,
-    "suffix": null,
     "top_k": null,
     "num_choices": null,
     "logprobs": null,
@@ -1140,7 +1155,8 @@
     "internal_tools": null,
     "max_tool_output": null,
     "cache_prompt": null,
-    "reasoning_effort": null
+    "reasoning_effort": null,
+    "reasoning_history": null
   }
 }
 },
@@ -2120,18 +2136,6 @@
     "default": null,
     "title": "Seed"
   },
-  "suffix": {
-    "anyOf": [
-      {
-        "type": "string"
-      },
-      {
-        "type": "null"
-      }
-    ],
-    "default": null,
-    "title": "Suffix"
-  },
   "top_k": {
     "anyOf": [
       {
@@ -2248,6 +2252,18 @@
     ],
     "default": null,
     "title": "Reasoning Effort"
+  },
+  "reasoning_history": {
+    "anyOf": [
+      {
+        "type": "boolean"
+      },
+      {
+        "type": "null"
+      }
+    ],
+    "default": null,
+    "title": "Reasoning History"
   }
 },
 "title": "GenerateConfig",
@@ -2266,7 +2282,6 @@
   "presence_penalty",
   "logit_bias",
   "seed",
-  "suffix",
   "top_k",
   "num_choices",
   "logprobs",
@@ -2275,7 +2290,8 @@
   "internal_tools",
   "max_tool_output",
   "cache_prompt",
-  "reasoning_effort"
+  "reasoning_effort",
+  "reasoning_history"
 ],
 "additionalProperties": false
},
@@ -4247,9 +4263,9 @@
   "parallel_tool_calls": null,
   "presence_penalty": null,
   "reasoning_effort": null,
+  "reasoning_history": null,
   "seed": null,
   "stop_seqs": null,
-  "suffix": null,
   "system_message": null,
   "temperature": null,
   "timeout": null,
inspect_ai/_view/www/package.json

@@ -26,7 +26,7 @@
   },
   "dependencies": {
     "@popperjs/core": "^2.11.8",
-    "asciinema-player": "^3.8.1",
+    "asciinema-player": "^3.8.2",
     "bootstrap": "^5.3.3",
     "bootstrap-icons": "^1.11.3",
     "clipboard": "^2.0.11",
inspect_ai/_view/www/src/components/ChatView.mjs

@@ -8,6 +8,7 @@ import { ExpandablePanel } from "./ExpandablePanel.mjs";
 import { FontSize, TextStyle } from "../appearance/Fonts.mjs";
 import { resolveToolInput, ToolCallView } from "./Tools.mjs";
 import { VirtualList } from "./VirtualList.mjs";
+import { MarkdownDiv } from "./MarkdownDiv.mjs";

 /**
  * Renders the ChatViewVirtualList component.
@@ -282,7 +283,29 @@ const ChatMessage = ({
       <i class="${iconForMsg(message)}"></i>
       ${message.role}
     </div>
+
+    ${
+      message.role === "assistant" && message.reasoning
+        ? html` <div
+            style=${{
+              marginLeft: indented ? "1.1rem" : "0",
+              paddingBottom: "0.8rem",
+            }}
+          >
+            <div style=${{ ...TextStyle.label, ...TextStyle.secondary }}>Reasoning</div>
+            <${ExpandablePanel} collapse=${true}><${MarkdownDiv} markdown=${message.reasoning}/></${ExpandablePanel}>
+          </div>`
+        : undefined
+    }
+
     <div style=${{ marginLeft: indented ? "1.1rem" : "0", paddingBottom: indented ? "0.8rem" : "0" }}>
+      ${
+        message.role === "assistant" && message.reasoning
+          ? html`<div style=${{ ...TextStyle.label, ...TextStyle.secondary }}>
+              Response
+            </div>`
+          : ""
+      }
       <${ExpandablePanel} collapse=${collapse}>
         <${MessageContents}
           key=${`${id}-contents`}
inspect_ai/_view/www/src/types/log.d.ts

@@ -70,7 +70,6 @@ export type LogitBias = {
   [k: string]: number;
 } | null;
 export type Seed = number | null;
-export type Suffix = string | null;
 export type TopK = number | null;
 export type NumChoices = number | null;
 export type Logprobs = boolean | null;
@@ -80,6 +79,7 @@ export type InternalTools = boolean | null;
 export type MaxToolOutput = number | null;
 export type CachePrompt = "auto" | boolean | null;
 export type ReasoningEffort = ("low" | "medium" | "high") | null;
+export type ReasoningHistory = boolean | null;
 export type TotalSamples = number;
 export type CompletedSamples = number;
 export type Name3 = string;
@@ -133,7 +133,7 @@ export type Content1 =
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
-export type ToolCallId = string | null;
+export type ToolCallId = string[] | null;
 export type Content2 =
   | string
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
@@ -147,6 +147,7 @@ export type ParseError = string | null;
 export type Title = string | null;
 export type Format2 = "text" | "markdown";
 export type Content3 = string;
+export type Reasoning = string | null;
 export type Content4 =
   | string
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
@@ -547,7 +548,6 @@ export interface GenerateConfig {
   presence_penalty: PresencePenalty;
   logit_bias: LogitBias;
   seed: Seed;
-  suffix: Suffix;
   top_k: TopK;
   num_choices: NumChoices;
   logprobs: Logprobs;
@@ -557,6 +557,7 @@ export interface GenerateConfig {
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
   reasoning_effort: ReasoningEffort;
+  reasoning_history: ReasoningHistory;
 }
 export interface EvalResults {
   total_samples: TotalSamples;
@@ -658,6 +659,7 @@ export interface ChatMessageAssistant {
   source: Source2;
   role: Role2;
   tool_calls: ToolCalls;
+  reasoning: Reasoning;
 }
 export interface ToolCall {
   id: Id1;
@@ -901,7 +903,6 @@ export interface GenerateConfig1 {
   presence_penalty: PresencePenalty;
   logit_bias: LogitBias;
   seed: Seed;
-  suffix: Suffix;
   top_k: TopK;
   num_choices: NumChoices;
   logprobs: Logprobs;
@@ -911,6 +912,7 @@ export interface GenerateConfig1 {
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
   reasoning_effort: ReasoningEffort;
+  reasoning_history: ReasoningHistory;
 }
 /**
  * Model call (raw request/response data).
inspect_ai/log/_recorders/eval.py

@@ -203,7 +203,7 @@ class EvalRecorder(FileRecorder):
         # of small fetches from the zip file streams)
         temp_log: str | None = None
         fs = filesystem(location)
-        if not fs.is_local():
+        if not fs.is_local() and header_only is False:
             with tempfile.NamedTemporaryFile(delete=False) as temp:
                 temp_log = temp.name
                 fs.get_file(location, temp_log)
inspect_ai/model/_chat_message.py

@@ -7,6 +7,8 @@ from inspect_ai._util.content import Content, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError

+from ._reasoning import parse_content_with_reasoning
+
 logger = getLogger(__name__)


@@ -72,8 +74,8 @@ class ChatMessageUser(ChatMessageBase):
     role: Literal["user"] = Field(default="user")
     """Conversation role."""

-    tool_call_id: str | None = Field(default=None)
-    """ID of tool call this message has the content payload for."""
+    tool_call_id: list[str] | None = Field(default=None)
+    """ID(s) of tool call(s) this message has the content payload for."""


 class ChatMessageAssistant(ChatMessageBase):
@@ -83,6 +85,31 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""

+    reasoning: str | None = Field(default=None)
+    """Reasoning content."""
+
+    # Some OpenAI-compatible REST endpoints include reasoning as a field alongside
+    # content; however, since this field doesn't exist in the OpenAI interface,
+    # hosting providers (so far we've seen this with Together and Groq) may
+    # include the reasoning in a <think></think> tag before the main response.
+    # We expect this pattern to be repeated elsewhere, so include this hook to
+    # automatically extract the reasoning content when the response is prefaced
+    # with a <think> block. If this ends up being an overreach we can fall back
+    # to each provider manually parsing out <think> using a helper function.
+    # The implementation isn't important here; the critical thing to establish
+    # is that Inspect makes reasoning content available separately.
+    @model_validator(mode="before")
+    @classmethod
+    def extract_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            content = data.get("content", None)
+            if isinstance(content, str):
+                parsed = parse_content_with_reasoning(content)
+                if parsed:
+                    data["reasoning"] = parsed.reasoning
+                    data["content"] = parsed.content
+        return data
+

 class ChatMessageTool(ChatMessageBase):
     role: Literal["tool"] = Field(default="tool")
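The `parse_content_with_reasoning` helper imported above lives in the new `inspect_ai/model/_reasoning.py` (+17 lines, not shown in this diff). From the call site it evidently takes a string and returns an object with `reasoning` and `content` attributes, or a falsy value when no `<think>` block is present. A plausible minimal sketch, not the actual implementation:

```python
import re
from typing import NamedTuple


class ContentWithReasoning(NamedTuple):
    reasoning: str
    content: str


def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
    # split a leading <think>...</think> block from the main response text
    match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
    if match:
        return ContentWithReasoning(
            reasoning=match.group(1).strip(),
            content=match.group(2).strip(),
        )
    return None
```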
inspect_ai/model/_conversation.py

@@ -2,7 +2,7 @@ from rich.console import RenderableType
 from rich.text import Text

 from inspect_ai._util.rich import lines_display
-from inspect_ai._util.transcript import transcript_markdown
+from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
 from inspect_ai.util._display import display_type

@@ -38,8 +38,15 @@ def conversation_assistant_message(
         content=transcript_markdown(m.text, escape=True),
     )

-    # start with assistant content
-    content: list[RenderableType] = (
+    # build content
+    content: list[RenderableType] = []
+
+    # reasoning
+    if message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    # message text
+    content.extend(
         [transcript_markdown(message.text, escape=True)] if message.text else []
    )

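`transcript_reasoning` comes from `inspect_ai/_util/transcript.py` (+11 lines, also not shown in this diff); judging from this call site it returns a list of rich renderables that label and display the reasoning block. A plausible sketch of the shape, not the actual implementation:

```python
from rich.console import RenderableType
from rich.text import Text


def transcript_reasoning(reasoning: str) -> list[RenderableType]:
    # a labeled reasoning block followed by a blank separator line
    return [Text("Reasoning", style="bold"), Text(reasoning), Text()]
```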
inspect_ai/model/_generate_config.py

@@ -75,6 +75,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. OpenAI o1 models only."""

+    reasoning_history: bool | None
+    """Include reasoning in chat message history sent to generate."""
+

 class GenerateConfig(BaseModel):
     """Base class for model generation configs."""
@@ -145,6 +148,9 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. OpenAI o1 models only."""

+    reasoning_history: bool | None = Field(default=None)
+    """Include reasoning in chat message history sent to generate."""
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":
inspect_ai/model/_model.py

@@ -33,7 +33,6 @@ from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
-from inspect_ai.util._limit import SampleLimitExceededError

 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
 from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
@@ -169,6 +168,10 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False

+    def has_reasoning_history(self) -> bool:
+        """Chat message assistant messages can include reasoning."""
+        return False
+

 class Model:
     """Model interface."""
@@ -303,6 +306,11 @@
             tools = []
             tool_choice = "none"

+        # handle reasoning history
+        input = resolve_reasoning_history(
+            input, config, self.api.has_reasoning_history()
+        )
+
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)

@@ -727,6 +735,71 @@ def simple_input_messages(
     return messages


+def resolve_reasoning_history(
+    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+) -> list[ChatMessage]:
+    # determine if we are including reasoning history
+    reasoning_history = config.reasoning_history is not False
+
+    # determine up front if we have any reasoning content
+    have_reasoning = any(
+        [
+            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            for m in messages
+        ]
+    )
+    if not have_reasoning:
+        return messages
+
+    # API assistant message format directly supports reasoning history so we will:
+    # (a) remove reasoning content entirely if config says not to include it; or
+    # (b) leave the messages alone if config says to include it
+    if api_has_reasoning_history:
+        # remove reasoning history as per config
+        if not reasoning_history:
+            resolved_messages: list[ChatMessage] = []
+            for message in messages:
+                if isinstance(message, ChatMessageAssistant):
+                    resolved_messages.append(
+                        message.model_copy(update={"reasoning": None})
+                    )
+                else:
+                    resolved_messages.append(message)
+
+            return resolved_messages
+
+        # include reasoning history as per config
+        else:
+            return messages
+
+    # API can't represent reasoning natively so include <think> tags
+    elif reasoning_history:
+        resolved_messages = []
+        for message in messages:
+            if (
+                isinstance(message, ChatMessageAssistant)
+                and message.reasoning is not None
+            ):
+                message = deepcopy(message)
+                if isinstance(message.content, str):
+                    message.content = (
+                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
+                    )
+                else:
+                    message.content.insert(
+                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
+                    )
+                message.reasoning = None
+
+            resolved_messages.append(message)
+
+        return resolved_messages
+
+    # API doesn't handle reasoning and config says no reasoning_history; nothing to do
+    else:
+        return messages
+
+
 def resolve_tool_model_input(
     tdefs: list[ToolDef], messages: list[ChatMessage]
 ) -> list[ChatMessage]:
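For an API that cannot represent reasoning natively, the middle branch folds reasoning back into the message text; an illustrative round trip (hypothetical values, calling the private helper defined above):

```python
from inspect_ai.model import ChatMessageAssistant, GenerateConfig
from inspect_ai.model._model import resolve_reasoning_history  # private module

msg = ChatMessageAssistant(content="Paris.", reasoning="Capital of France.")
resolved = resolve_reasoning_history([msg], GenerateConfig(), False)
print(resolved[0].content)
# <think>
# Capital of France.
# </think>
#
# Paris.
print(resolved[0].reasoning)  # None
```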
@@ -764,40 +837,104 @@ def resolve_tool_model_input(
 def tool_result_images_as_user_message(
     messages: list[ChatMessage],
 ) -> list[ChatMessage]:
-    return functools.reduce(tool_result_images_reducer, messages, [])
+    """
+    To conform to models lacking support for images in tool responses, create an alternate message history that moves images into a fabricated user message.
+
+    Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
+    """
+    init_accum: ImagesAccumulator = ([], [], [])
+    chat_messages, user_message_content, tool_call_ids = functools.reduce(
+        tool_result_images_reducer, messages, init_accum
+    )
+    # if the last message was a tool result, we may need to flush the pending images here
+    return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
+
+
+ImagesAccumulator = tuple[list[ChatMessage], list[Content], list[str]]
+"""
+ImagesAccumulator is a tuple containing three lists:
+- The first list contains ChatMessages that are the result of processing.
+- The second list contains ContentImages that need to be inserted into a fabricated user message.
+- The third list contains the tool_call_id's associated with the tool responses.
+"""


 def tool_result_images_reducer(
-    messages: list[ChatMessage],
+    accum: ImagesAccumulator,
     message: ChatMessage,
-) -> list[ChatMessage]:
+) -> ImagesAccumulator:
+    messages, pending_content, tool_call_ids = accum
     # if there are tool result images, pull them out into a ChatMessageUser
-    if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
-        tool_message = ChatMessageTool(
-            content=message.content.copy(),
-            tool_call_id=message.tool_call_id,
-            function=message.function,
+    if (
+        isinstance(message, ChatMessageTool)
+        and isinstance(message.content, list)
+        and any([isinstance(c, ContentImage) for c in message.content])
+    ):
+        init_accum: ImageContentAccumulator = ([], [])
+        new_user_message_content, edited_tool_message_content = functools.reduce(
+            tool_result_image_content_reducer, message.content, init_accum
         )
-        assert isinstance(tool_message.content, list)
-        messages.append(tool_message)
-
-        user_content: list[Content] = []
-        for i in range(0, len(tool_message.content)):
-            if isinstance(tool_message.content[i], ContentImage):
-                user_content.append(message.content[i])
-                tool_message.content[i] = ContentText(
-                    text="Image content is in the message below."
+
+        return (
+            messages
+            + [
+                ChatMessageTool(
+                    content=edited_tool_message_content,
+                    tool_call_id=message.tool_call_id,
+                    function=message.function,
                 )
-        if len(user_content) > 0:
-            messages.append(
-                ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
-            )
+            ],
+            pending_content + new_user_message_content,
+            tool_call_ids + ([message.tool_call_id] if message.tool_call_id else []),
+        )

     else:
-        messages.append(message)
+        return (
+            maybe_adding_user_message(messages, pending_content, tool_call_ids)
+            + [message],
+            [],
+            [],
+        )

-    # return messages
-    return messages
+
+ImageContentAccumulator = tuple[list[Content], list[Content]]
+"""
+ImageContentAccumulator is a tuple containing two lists of Content objects:
+- The first list contains ContentImages that will be included in a fabricated user message.
+- The second list contains modified content for the tool message with images replaced with text.
+"""
+
+
+def tool_result_image_content_reducer(
+    acc: ImageContentAccumulator, content: Content
+) -> ImageContentAccumulator:
+    """
+    Reduces the message's Content into two separate lists: one for a fabricated user message that will contain the images and one for the modified tool message with the images replaced with text.
+
+    Returns:
+        ImageContentAccumulator: A tuple containing two lists of Content objects.
+        - The first list contains the images that will be included in a fabricated user message.
+        - The second list contains modified content for the tool message with images replaced with text.
+    """
+    new_user_message_content, edited_tool_message_content = acc
+    if isinstance(content, ContentImage):
+        return new_user_message_content + [content], edited_tool_message_content + [
+            ContentText(text="Image content is included below.")
+        ]
+
+    else:
+        return new_user_message_content, edited_tool_message_content + [content]
+
+
+def maybe_adding_user_message(
+    messages: list[ChatMessage], content: list[Content], tool_call_ids: list[str]
+) -> list[ChatMessage]:
+    """If content is empty, return messages; otherwise, create a new ChatMessageUser with it and return a new messages list with that message added."""
+    return (
+        messages + [ChatMessageUser(content=content, tool_call_id=tool_call_ids)]
+        if content
+        else messages
+    )


 # Functions to reduce consecutive user messages to a single user message -> required for some models
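The reducer rewrite changes the output shape: images from tool responses are now pooled into a single fabricated user message carrying all of the associated tool call ids (matching the new list-valued `tool_call_id` on `ChatMessageUser`), rather than one user message per tool response. An illustrative sketch (hypothetical values, calling the private helper above):

```python
from inspect_ai._util.content import ContentImage, ContentText
from inspect_ai.model import ChatMessageTool
from inspect_ai.model._model import tool_result_images_as_user_message  # private

tool_msg = ChatMessageTool(
    content=[
        ContentText(text="Screenshot captured."),
        ContentImage(image="data:image/png;base64,..."),  # truncated placeholder
    ],
    tool_call_id="call_1",  # hypothetical id
    function="screenshot",
)
messages = tool_result_images_as_user_message([tool_msg])
# -> [ChatMessageTool(content=[ContentText("Screenshot captured."),
#                              ContentText("Image content is included below.")], ...),
#     ChatMessageUser(content=[ContentImage(...)], tool_call_id=["call_1"])]
```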
@@ -884,6 +1021,7 @@ def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
         active_sample_message_limit,
         set_active_sample_total_messages,
     )
+    from inspect_ai.solver._limit import SampleLimitExceededError

     total_messages = 1 if isinstance(input, str) else len(input)
     message_limit = active_sample_message_limit()
@@ -910,6 +1048,7 @@ def record_model_usage(model: str, usage: ModelUsage) -> None:
         active_sample_token_limit,
         set_active_sample_total_tokens,
     )
+    from inspect_ai.solver._limit import SampleLimitExceededError

     # record usage
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
inspect_ai/model/_openai.py

@@ -43,10 +43,18 @@ from ._chat_message import (
 from ._model_output import ModelUsage, StopReason, as_stop_reason


+def is_o_series(name: str) -> bool:
+    return is_o1(name) or is_o3(name)
+
+
 def is_o1(name: str) -> bool:
     return name.startswith("o1")


+def is_o3(name: str) -> bool:
+    return name.startswith("o3")
+
+
 def is_o1_full(name: str) -> bool:
     return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)

@@ -55,10 +63,18 @@ def is_o1_mini(name: str) -> bool:
     return name.startswith("o1-mini")


+def is_o3_mini(name: str) -> bool:
+    return name.startswith("o3-mini")
+
+
 def is_o1_preview(name: str) -> bool:
     return name.startswith("o1-preview")


+def is_gpt(name: str) -> bool:
+    return name.startswith("gpt")
+
+
 def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
     return ChatCompletionMessageToolCall(
         type="function",
@@ -296,6 +312,14 @@ def chat_messages_from_openai(
         else:
             content = [content_from_openai(c) for c in asst_content]

+        # resolve reasoning (OpenAI doesn't support this, however OpenAI-compatible
+        # interfaces e.g. DeepSeek do include this field so we pluck it out)
+        reasoning = message.get("reasoning_content", None) or message.get(
+            "reasoning", None
+        )
+        if reasoning is not None:
+            reasoning = str(reasoning)
+
         # return message
         if "tool_calls" in message:
             tool_calls: list[ToolCall] = []
@@ -306,7 +330,8 @@
         else:
             tool_calls = []
         chat_messages.append(
-            ChatMessageAssistant(content=content, tool_calls=tool_calls or None)
+            ChatMessageAssistant(
+                content=content,
+                tool_calls=tool_calls or None,
+                reasoning=reasoning,
+            )
         )
     elif message["role"] == "tool":
         tool_content = message.get("content", None) or ""
@@ -357,10 +385,14 @@ def chat_message_assistant_from_openai(
     message: ChatCompletionMessage, tools: list[ToolInfo]
 ) -> ChatMessageAssistant:
     refusal = getattr(message, "refusal", None)
+    reasoning = getattr(message, "reasoning_content", None) or getattr(
+        message, "reasoning", None
+    )
     return ChatMessageAssistant(
         content=refusal or message.content or "",
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
+        reasoning=reasoning,
     )

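The plucking in chat_messages_from_openai() handles OpenAI-compatible backends that return reasoning as an extra field rather than in-band; a minimal illustration (hypothetical payload):

```python
# hypothetical assistant message dict from an OpenAI-compatible API (e.g. DeepSeek)
message = {
    "role": "assistant",
    "content": "Paris.",
    "reasoning_content": "The user wants the capital of France.",
}

# the same extraction performed above: prefer reasoning_content, fall back to reasoning
reasoning = message.get("reasoning_content", None) or message.get("reasoning", None)
assert reasoning == "The user wants the capital of France."
```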