inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/generate.py +41 -35
- inspect_ai/_eval/task/run.py +20 -12
- inspect_ai/_util/hooks.py +17 -7
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/dist/assets/index.css +1 -0
- inspect_ai/_view/www/dist/assets/index.js +100 -94
- inspect_ai/_view/www/log-schema.json +35 -19
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
- inspect_ai/_view/www/src/types/log.d.ts +6 -4
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/model/_chat_message.py +29 -2
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +164 -25
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -3
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +78 -58
- inspect_ai/{util → solver}/_limit.py +13 -0
- inspect_ai/solver/_task_state.py +37 -7
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/util/__init__.py +0 -2
- inspect_ai/util/_sandbox/self_check.py +51 -28
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/log-schema.json
CHANGED

```diff
@@ -260,13 +260,26 @@
         ],
         "default": null,
         "title": "Tool Calls"
+      },
+      "reasoning": {
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Reasoning"
       }
     },
     "required": [
       "content",
       "source",
       "role",
-      "tool_calls"
+      "tool_calls",
+      "reasoning"
     ],
     "title": "ChatMessageAssistant",
     "type": "object",
@@ -486,7 +499,10 @@
       "tool_call_id": {
         "anyOf": [
           {
-            "type": "string"
+            "items": {
+              "type": "string"
+            },
+            "type": "array"
           },
           {
             "type": "null"
@@ -1131,7 +1147,6 @@
         "presence_penalty": null,
         "logit_bias": null,
         "seed": null,
-        "suffix": null,
         "top_k": null,
         "num_choices": null,
         "logprobs": null,
@@ -1140,7 +1155,8 @@
         "internal_tools": null,
         "max_tool_output": null,
         "cache_prompt": null,
-        "reasoning_effort": null
+        "reasoning_effort": null,
+        "reasoning_history": null
       }
     }
   },
@@ -2120,18 +2136,6 @@
         "default": null,
         "title": "Seed"
       },
-      "suffix": {
-        "anyOf": [
-          {
-            "type": "string"
-          },
-          {
-            "type": "null"
-          }
-        ],
-        "default": null,
-        "title": "Suffix"
-      },
       "top_k": {
         "anyOf": [
           {
@@ -2248,6 +2252,18 @@
         ],
         "default": null,
         "title": "Reasoning Effort"
+      },
+      "reasoning_history": {
+        "anyOf": [
+          {
+            "type": "boolean"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "title": "Reasoning History"
       }
     },
     "title": "GenerateConfig",
@@ -2266,7 +2282,6 @@
       "presence_penalty",
       "logit_bias",
       "seed",
-      "suffix",
       "top_k",
       "num_choices",
       "logprobs",
@@ -2275,7 +2290,8 @@
       "internal_tools",
       "max_tool_output",
       "cache_prompt",
-      "reasoning_effort"
+      "reasoning_effort",
+      "reasoning_history"
     ],
     "additionalProperties": false
   },
@@ -4247,9 +4263,9 @@
       "parallel_tool_calls": null,
       "presence_penalty": null,
       "reasoning_effort": null,
+      "reasoning_history": null,
       "seed": null,
       "stop_seqs": null,
-      "suffix": null,
       "system_message": null,
       "temperature": null,
       "timeout": null,
```
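Taken together, these schema changes add a nullable `reasoning` field to `ChatMessageAssistant`, widen `ChatMessageUser.tool_call_id` to an array of strings, and replace the removed `suffix` generation option with `reasoning_history`. A quick sketch of the new assistant message shape (field values are illustrative):

```python
# Sketch of the updated ChatMessageAssistant shape (values illustrative).
from inspect_ai.model import ChatMessageAssistant

message = ChatMessageAssistant(
    content="Paris.",
    reasoning="The user asked for the capital of France.",
)
# "reasoning" now serializes alongside content/tool_calls (null by default)
print(message.model_dump_json(indent=2))
```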
inspect_ai/_view/www/src/components/ChatView.mjs
CHANGED

```diff
@@ -8,6 +8,7 @@ import { ExpandablePanel } from "./ExpandablePanel.mjs";
 import { FontSize, TextStyle } from "../appearance/Fonts.mjs";
 import { resolveToolInput, ToolCallView } from "./Tools.mjs";
 import { VirtualList } from "./VirtualList.mjs";
+import { MarkdownDiv } from "./MarkdownDiv.mjs";
 
 /**
  * Renders the ChatViewVirtualList component.
@@ -282,7 +283,29 @@ const ChatMessage = ({
     <i class="${iconForMsg(message)}"></i>
     ${message.role}
   </div>
+
+  ${
+    message.role === "assistant" && message.reasoning
+      ? html` <div
+          style=${{
+            marginLeft: indented ? "1.1rem" : "0",
+            paddingBottom: "0.8rem",
+          }}
+        >
+          <div style=${{ ...TextStyle.label, ...TextStyle.secondary }}>Reasoning</div>
+          <${ExpandablePanel} collapse=${true}><${MarkdownDiv} markdown=${message.reasoning}/></${ExpandablePanel}>
+        </div>`
+      : undefined
+  }
+
   <div style=${{ marginLeft: indented ? "1.1rem" : "0", paddingBottom: indented ? "0.8rem" : "0" }}>
+    ${
+      message.role === "assistant" && message.reasoning
+        ? html`<div style=${{ ...TextStyle.label, ...TextStyle.secondary }}>
+            Response
+          </div>`
+        : ""
+    }
     <${ExpandablePanel} collapse=${collapse}>
       <${MessageContents}
         key=${`${id}-contents`}
```
inspect_ai/_view/www/src/types/log.d.ts
CHANGED

```diff
@@ -70,7 +70,6 @@ export type LogitBias = {
   [k: string]: number;
 } | null;
 export type Seed = number | null;
-export type Suffix = string | null;
 export type TopK = number | null;
 export type NumChoices = number | null;
 export type Logprobs = boolean | null;
@@ -80,6 +79,7 @@ export type InternalTools = boolean | null;
 export type MaxToolOutput = number | null;
 export type CachePrompt = "auto" | boolean | null;
 export type ReasoningEffort = ("low" | "medium" | "high") | null;
+export type ReasoningHistory = boolean | null;
 export type TotalSamples = number;
 export type CompletedSamples = number;
 export type Name3 = string;
@@ -133,7 +133,7 @@ export type Content1 =
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
-export type ToolCallId = string | null;
+export type ToolCallId = string[] | null;
 export type Content2 =
   | string
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
@@ -147,6 +147,7 @@ export type ParseError = string | null;
 export type Title = string | null;
 export type Format2 = "text" | "markdown";
 export type Content3 = string;
+export type Reasoning = string | null;
 export type Content4 =
   | string
   | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
@@ -547,7 +548,6 @@ export interface GenerateConfig {
   presence_penalty: PresencePenalty;
   logit_bias: LogitBias;
   seed: Seed;
-  suffix: Suffix;
   top_k: TopK;
   num_choices: NumChoices;
   logprobs: Logprobs;
@@ -557,6 +557,7 @@ export interface GenerateConfig {
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
   reasoning_effort: ReasoningEffort;
+  reasoning_history: ReasoningHistory;
 }
 export interface EvalResults {
   total_samples: TotalSamples;
@@ -658,6 +659,7 @@ export interface ChatMessageAssistant {
   source: Source2;
   role: Role2;
   tool_calls: ToolCalls;
+  reasoning: Reasoning;
 }
 export interface ToolCall {
   id: Id1;
@@ -901,7 +903,6 @@ export interface GenerateConfig1 {
   presence_penalty: PresencePenalty;
   logit_bias: LogitBias;
   seed: Seed;
-  suffix: Suffix;
   top_k: TopK;
   num_choices: NumChoices;
   logprobs: Logprobs;
@@ -911,6 +912,7 @@ export interface GenerateConfig1 {
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
   reasoning_effort: ReasoningEffort;
+  reasoning_history: ReasoningHistory;
 }
 /**
  * Model call (raw request/response data).
```
inspect_ai/log/_recorders/eval.py
CHANGED

```diff
@@ -203,7 +203,7 @@ class EvalRecorder(FileRecorder):
         # of small fetches from the zip file streams)
         temp_log: str | None = None
         fs = filesystem(location)
-        if not fs.is_local():
+        if not fs.is_local() and header_only is False:
            with tempfile.NamedTemporaryFile(delete=False) as temp:
                temp_log = temp.name
                fs.get_file(location, temp_log)
```
inspect_ai/model/_chat_message.py
CHANGED

```diff
@@ -7,6 +7,8 @@ from inspect_ai._util.content import Content, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError
 
+from ._reasoning import parse_content_with_reasoning
+
 logger = getLogger(__name__)
 
 
@@ -72,8 +74,8 @@ class ChatMessageUser(ChatMessageBase):
     role: Literal["user"] = Field(default="user")
     """Conversation role."""
 
-    tool_call_id: str | None = Field(default=None)
-    """ID of tool call this message has the content payload for."""
+    tool_call_id: list[str] | None = Field(default=None)
+    """ID(s) of tool call(s) this message has the content payload for."""
 
 
 class ChatMessageAssistant(ChatMessageBase):
@@ -83,6 +85,31 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""
 
+    reasoning: str | None = Field(default=None)
+    """Reasoning content."""
+
+    # Some OpenAI compatible REST endpoints include reasoning as a field alongside
+    # content, however since this field doesn't exist in the OpenAI interface,
+    # hosting providers (so far we've seen this with Together and Groq) may
+    # include the reasoning in a <think></think> tag before the main response.
+    # We expect this pattern to be repeated elsewhere, so include this hook to
+    # automatically extract the reasoning content when the response is prefaced
+    # with a <think> block. If this ends up being an overreach we can fall back
+    # to each provider manually parsing out <think> using a helper function.
+    # The implementation isn't important here, the critical thing to establish
+    # is that Inspect makes reasoning content available separately.
+    @model_validator(mode="before")
+    @classmethod
+    def extract_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            content = data.get("content", None)
+            if isinstance(content, str):
+                parsed = parse_content_with_reasoning(content)
+                if parsed:
+                    data["reasoning"] = parsed.reasoning
+                    data["content"] = parsed.content
+        return data
+
 
 class ChatMessageTool(ChatMessageBase):
     role: Literal["tool"] = Field(default="tool")
```
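`parse_content_with_reasoning` is the new `inspect_ai/model/_reasoning.py` (+17 lines), which this diff doesn't expand. The validator above pins down its contract: split a leading `<think>...</think>` block from the remaining content, or return `None`. A hypothetical sketch of that contract (the result type and field names are assumptions, not the released code):

```python
# Hypothetical sketch of parse_content_with_reasoning(); the real
# implementation lives in inspect_ai/model/_reasoning.py (not shown in this
# diff), so the result type and field names here are assumptions.
import re
from typing import NamedTuple


class ContentWithReasoning(NamedTuple):
    reasoning: str
    content: str


def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
    # match a leading <think>...</think> block; None means "no reasoning prefix"
    match = re.match(r"\s*<think>(.*?)</think>\s*(.*)", content, re.DOTALL)
    if match:
        return ContentWithReasoning(
            reasoning=match.group(1).strip(), content=match.group(2).strip()
        )
    return None


# e.g. a Groq- or Together-hosted model that inlines reasoning in <think> tags
parsed = parse_content_with_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
assert parsed is not None and parsed.reasoning == "2 + 2 = 4"
```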
inspect_ai/model/_conversation.py
CHANGED

```diff
@@ -2,7 +2,7 @@ from rich.console import RenderableType
 from rich.text import Text
 
 from inspect_ai._util.rich import lines_display
-from inspect_ai._util.transcript import transcript_markdown
+from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
 from inspect_ai.util._display import display_type
 
@@ -38,8 +38,15 @@ def conversation_assistant_message(
         content=transcript_markdown(m.text, escape=True),
     )
 
-    #
-    content: list[RenderableType] = (
+    # build content
+    content: list[RenderableType] = []
+
+    # reasoning
+    if message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    # message text
+    content.extend(
         [transcript_markdown(message.text, escape=True)] if message.text else []
     )
 
```
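`transcript_reasoning` comes from the `inspect_ai/_util/transcript.py` change (+11 lines) that this diff doesn't expand. From the call site above it must map a reasoning string to a list of rich renderables; a hypothetical sketch of such a helper (the real implementation may differ):

```python
# Hypothetical sketch only: the actual transcript_reasoning() in
# inspect_ai/_util/transcript.py is not shown in this diff. Its call site
# implies the signature "str -> list[RenderableType]".
from rich.console import RenderableType
from rich.text import Text

from inspect_ai._util.transcript import transcript_markdown


def transcript_reasoning(reasoning: str) -> list[RenderableType]:
    # label the block, render the reasoning as markdown, leave a trailing gap
    return [
        Text("Reasoning", style="bold"),
        transcript_markdown(reasoning, escape=True),
        Text(),
    ]
```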
inspect_ai/model/_generate_config.py
CHANGED

```diff
@@ -75,6 +75,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
 
+    reasoning_history: bool | None
+    """Include reasoning in chat message history sent to generate."""
+
 
 class GenerateConfig(BaseModel):
     """Base class for model generation configs."""
@@ -145,6 +148,9 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
 
+    reasoning_history: bool | None = Field(default=None)
+    """Include reasoning in chat message history sent to generate."""
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":
```
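Note the tri-state semantics: `reasoning_history` defaults to `None`, and the resolution logic in `_model.py` below treats anything other than an explicit `False` as "include reasoning" (`config.reasoning_history is not False`). A brief usage sketch:

```python
# reasoning_history is tri-state: None (default), True, or False; only an
# explicit False disables replaying reasoning into the chat history.
from inspect_ai.model import GenerateConfig

config = GenerateConfig()
assert config.reasoning_history is None  # default: treated as enabled

# opt out explicitly; merge() layers the specified values of `other` on top
no_history = config.merge(GenerateConfig(reasoning_history=False))
assert no_history.reasoning_history is False
```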
inspect_ai/model/_model.py
CHANGED

```diff
@@ -33,7 +33,6 @@ from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
-from inspect_ai.util._limit import SampleLimitExceededError
 
 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
 from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
@@ -169,6 +168,10 @@ class ModelAPI(abc.ABC):
         """Tool results can contain images"""
         return False
 
+    def has_reasoning_history(self) -> bool:
+        """Chat message assistant messages can include reasoning."""
+        return False
+
 
 class Model:
     """Model interface."""
@@ -303,6 +306,11 @@
             tools = []
             tool_choice = "none"
 
+        # handle reasoning history
+        input = resolve_reasoning_history(
+            input, config, self.api.has_reasoning_history()
+        )
+
         # apply any tool model_input handlers
         input = resolve_tool_model_input(tdefs, input)
 
```
```diff
@@ -727,6 +735,71 @@ def simple_input_messages(
     return messages
 
 
+def resolve_reasoning_history(
+    messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
+) -> list[ChatMessage]:
+    # determine if we are including reasoning history
+    reasoning_history = config.reasoning_history is not False
+
+    # determine up front if we have any reasoning content
+    have_reasoning = any(
+        [
+            isinstance(m, ChatMessageAssistant) and m.reasoning is not None
+            for m in messages
+        ]
+    )
+    if not have_reasoning:
+        return messages
+
+    # API assistant message format directly supports reasoning history so we will:
+    # (a) Remove reasoning content entirely if config says not to include it; or
+    # (b) Leave the messages alone if config says to include it
+    if api_has_reasoning_history:
+        # remove reasoning history as per config
+        if not reasoning_history:
+            resolved_messages: list[ChatMessage] = []
+            for message in messages:
+                if isinstance(message, ChatMessageAssistant):
+                    resolved_messages.append(
+                        message.model_copy(update={"reasoning": None})
+                    )
+                else:
+                    resolved_messages.append(message)
+
+            return resolved_messages
+
+        # include reasoning history as per config
+        else:
+            return messages
+
+    # API can't represent reasoning natively so include <think> tags
+    elif reasoning_history:
+        resolved_messages = []
+        for message in messages:
+            if (
+                isinstance(message, ChatMessageAssistant)
+                and message.reasoning is not None
+            ):
+                message = deepcopy(message)
+                if isinstance(message.content, str):
+                    message.content = (
+                        f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
+                    )
+                else:
+                    message.content.insert(
+                        0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
+                    )
+                message.reasoning = None
+
+            resolved_messages.append(message)
+
+        return resolved_messages
+
+    # api doesn't handle reasoning and config says no reasoning_history, nothing to do
+    else:
+        return messages
+
+
 def resolve_tool_model_input(
     tdefs: list[ToolDef], messages: list[ChatMessage]
 ) -> list[ChatMessage]:
```
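To see the fallback branch concretely (a provider without native reasoning support, `reasoning_history` left at its default), reasoning gets folded back into the message content inside `<think>` tags. This calls the private helper directly, purely for illustration:

```python
# Illustrative only: resolve_reasoning_history is a private helper in
# inspect_ai/model/_model.py, not a public API.
from inspect_ai.model import ChatMessageAssistant, ChatMessageUser, GenerateConfig
from inspect_ai.model._model import resolve_reasoning_history

messages = [
    ChatMessageUser(content="What is 2+2?"),
    ChatMessageAssistant(content="4", reasoning="2 + 2 = 4"),
]

# api_has_reasoning_history=False: provider can't represent reasoning natively
resolved = resolve_reasoning_history(messages, GenerateConfig(), False)
print(resolved[1].content)
# <think>
# 2 + 2 = 4
# </think>
#
# 4
```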
```diff
@@ -764,40 +837,104 @@ def resolve_tool_model_input(
 def tool_result_images_as_user_message(
     messages: list[ChatMessage],
 ) -> list[ChatMessage]:
-
+    """
+    To conform to models lacking support for images in tool responses, create an alternate message history that moves images into a fabricated user message.
+
+    Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
+    """
+    init_accum: ImagesAccumulator = ([], [], [])
+    chat_messages, user_message_content, tool_call_ids = functools.reduce(
+        tool_result_images_reducer, messages, init_accum
+    )
+    # if the last message was a tool result, we may need to flush the pending stuff here
+    return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
+
+
+ImagesAccumulator = tuple[list[ChatMessage], list[Content], list[str]]
+"""
+ImagesAccumulator is a tuple containing three lists:
+  - The first list contains ChatMessages that are the result of processing.
+  - The second list contains ContentImages that need to be inserted into a fabricated user message.
+  - The third list contains the tool_call_id's associated with the tool responses.
+"""
 
 
 def tool_result_images_reducer(
-
+    accum: ImagesAccumulator,
     message: ChatMessage,
-) ->
+) -> ImagesAccumulator:
+    messages, pending_content, tool_call_ids = accum
     # if there are tool result images, pull them out into a ChatUserMessage
-    if
-
-
-
-
+    if (
+        isinstance(message, ChatMessageTool)
+        and isinstance(message.content, list)
+        and any([isinstance(c, ContentImage) for c in message.content])
+    ):
+        init_accum: ImageContentAccumulator = ([], [])
+        new_user_message_content, edited_tool_message_content = functools.reduce(
+            tool_result_image_content_reducer, message.content, init_accum
         )
-
-
-
-
-
-
-
-
-                    text="Image content is in the message below."
+
+        return (
+            messages
+            + [
+                ChatMessageTool(
+                    content=edited_tool_message_content,
+                    tool_call_id=message.tool_call_id,
+                    function=message.function,
                 )
-
-
-
-
+            ],
+            pending_content + new_user_message_content,
+            tool_call_ids + ([message.tool_call_id] if message.tool_call_id else []),
+        )
 
     else:
-
+        return (
+            maybe_adding_user_message(messages, pending_content, tool_call_ids)
+            + [message],
+            [],
+            [],
+        )
 
-
-
+
+ImageContentAccumulator = tuple[list[Content], list[Content]]
+"""
+ImageContentAccumulator is a tuple containing two lists of Content objects:
+  - The first list contains ContentImages that will be included in a fabricated user message.
+  - The second list contains modified content for the tool message with images replaced with text.
+"""
+
+
+def tool_result_image_content_reducer(
+    acc: ImageContentAccumulator, content: Content
+) -> ImageContentAccumulator:
+    """
+    Reduces the messages Content into two separate lists: one for a fabricated user message that will contain the images and one for modified tool message with the images replaced with text.
+
+    Returns:
+        ImageContentReducer: A tuple containing two lists of Content objects.
+        - The first list contains the images that will be included in a fabricated user message.
+        - The second list contains modified content for the tool message with images replaced with text.
+    """
+    new_user_message_content, edited_tool_message_content = acc
+    if isinstance(content, ContentImage):
+        return new_user_message_content + [content], edited_tool_message_content + [
+            ContentText(text="Image content is included below.")
+        ]
+
+    else:
+        return new_user_message_content, edited_tool_message_content + [content]
+
+
+def maybe_adding_user_message(
+    messages: list[ChatMessage], content: list[Content], tool_call_ids: list[str]
+) -> list[ChatMessage]:
+    """If content is empty, return messages, otherwise, create a new ChatMessageUser with it and return a new messages list with that message added."""
+    return (
+        messages + [ChatMessageUser(content=content, tool_call_id=tool_call_ids)]
+        if content
+        else messages
+    )
 
 
 # Functions to reduce consecutive user messages to a single user message -> required for some models
```
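The reduce pipeline is easiest to see end to end: a tool message whose content contains an image gets the image swapped for placeholder text, and the image is re-delivered in a synthesized user message carrying the originating `tool_call_id` (which is why `ChatMessageUser.tool_call_id` became a list above). An illustrative call against the private helper:

```python
# Illustrative only: tool_result_images_as_user_message is a private helper in
# inspect_ai/model/_model.py, and the image payload below is a stand-in.
from inspect_ai._util.content import ContentImage, ContentText
from inspect_ai.model import ChatMessageTool
from inspect_ai.model._model import tool_result_images_as_user_message

tool_message = ChatMessageTool(
    content=[
        ContentText(text="chart rendered"),
        ContentImage(image="data:image/png;base64,..."),
    ],
    tool_call_id="call_1",
    function="render_chart",
)

resolved = tool_result_images_as_user_message([tool_message])
# resolved[0]: the tool message with the image replaced by
#   "Image content is included below."
# resolved[1]: a fabricated user message carrying the image, tagged with
#   tool_call_id=["call_1"]
assert len(resolved) == 2
```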
```diff
@@ -884,6 +1021,7 @@ def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
         active_sample_message_limit,
         set_active_sample_total_messages,
     )
+    from inspect_ai.solver._limit import SampleLimitExceededError
 
     total_messages = 1 if isinstance(input, str) else len(input)
     message_limit = active_sample_message_limit()
@@ -910,6 +1048,7 @@ def record_model_usage(model: str, usage: ModelUsage) -> None:
         active_sample_token_limit,
         set_active_sample_total_tokens,
     )
+    from inspect_ai.solver._limit import SampleLimitExceededError
 
     # record usage
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
```
inspect_ai/model/_openai.py
CHANGED

```diff
@@ -43,10 +43,18 @@ from ._chat_message import (
 from ._model_output import ModelUsage, StopReason, as_stop_reason
 
 
+def is_o_series(name: str) -> bool:
+    return is_o1(name) or is_o3(name)
+
+
 def is_o1(name: str) -> bool:
     return name.startswith("o1")
 
 
+def is_o3(name: str) -> bool:
+    return name.startswith("o3")
+
+
 def is_o1_full(name: str) -> bool:
     return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
 
@@ -55,10 +63,18 @@ def is_o1_mini(name: str) -> bool:
     return name.startswith("o1-mini")
 
 
+def is_o3_mini(name: str) -> bool:
+    return name.startswith("o3-mini")
+
+
 def is_o1_preview(name: str) -> bool:
     return name.startswith("o1-preview")
 
 
+def is_gpt(name: str) -> bool:
+    return name.startswith("gpt")
+
+
 def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
     return ChatCompletionMessageToolCall(
         type="function",
```
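The new prefix predicates are trivial, but note that `startswith` deliberately matches dated snapshot names as well:

```python
# Quick sanity check of the new model-name predicates (private helpers
# in inspect_ai/model/_openai.py).
from inspect_ai.model._openai import is_gpt, is_o3_mini, is_o_series

assert is_o_series("o3-mini-2025-01-31")  # dated snapshots share the prefix
assert is_o3_mini("o3-mini")
assert is_gpt("gpt-4o")
assert not is_o_series("gpt-4o")
```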
```diff
@@ -296,6 +312,14 @@ def chat_messages_from_openai(
         else:
             content = [content_from_openai(c) for c in asst_content]
 
+        # resolve reasoning (OpenAI doesn't support this however OpenAI-compatible
+        # interfaces e.g. DeepSeek do include this field so we pluck it out)
+        reasoning = message.get("reasoning_content", None) or message.get(
+            "reasoning", None
+        )
+        if reasoning is not None:
+            reasoning = str(reasoning)
+
         # return message
         if "tool_calls" in message:
             tool_calls: list[ToolCall] = []
@@ -306,7 +330,11 @@
         else:
             tool_calls = []
         chat_messages.append(
-            ChatMessageAssistant(content=content, tool_calls=tool_calls or None)
+            ChatMessageAssistant(
+                content=content,
+                tool_calls=tool_calls or None,
+                reasoning=reasoning,
+            )
         )
     elif message["role"] == "tool":
         tool_content = message.get("content", None) or ""
@@ -357,10 +385,14 @@
     message: ChatCompletionMessage, tools: list[ToolInfo]
 ) -> ChatMessageAssistant:
     refusal = getattr(message, "refusal", None)
+    reasoning = getattr(message, "reasoning_content", None) or getattr(
+        message, "reasoning", None
+    )
     return ChatMessageAssistant(
         content=refusal or message.content or "",
         source="generate",
         tool_calls=chat_tool_calls_from_openai(message, tools),
+        reasoning=reasoning,
     )
 
 
```
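As the comment in the first hunk notes, OpenAI's own API has no reasoning field, but OpenAI-compatible endpoints (e.g. DeepSeek) return one. A sketch of how such a raw message maps onto the new field, mirroring the plucking logic above:

```python
# Sketch of the reasoning plucking shown above, applied to a DeepSeek-style
# assistant message (an OpenAI-compatible dict with "reasoning_content").
from inspect_ai.model import ChatMessageAssistant

raw = {
    "role": "assistant",
    "content": "Paris.",
    "reasoning_content": "The user asked for the capital of France.",
}

reasoning = raw.get("reasoning_content", None) or raw.get("reasoning", None)
message = ChatMessageAssistant(
    content=str(raw["content"]),
    reasoning=str(reasoning) if reasoning is not None else None,
)
assert message.reasoning == "The user asked for the capital of France."
```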