inspect-ai 0.3.70__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -8
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +6 -1
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +10 -6
- inspect_ai/_eval/task/run.py +5 -8
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/transcript.py +10 -4
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +115 -87
- inspect_ai/_view/www/dist/assets/index.js +5324 -2276
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +283 -20
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/types/log.d.ts +129 -34
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +1 -1
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/_condense.py +3 -0
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_transcript.py +58 -1
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +7 -0
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +10 -8
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +133 -57
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +134 -26
- inspect_ai/model/_providers/google.py +27 -8
- inspect_ai/model/_providers/groq.py +9 -4
- inspect_ai/model/_providers/openai.py +57 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/events.py +55 -21
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +197 -182
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
inspect_ai/model/_openai.py
CHANGED
@@ -27,11 +27,18 @@ from openai.types.chat.chat_completion_message_tool_call import Function
|
|
27
27
|
from openai.types.completion_usage import CompletionUsage
|
28
28
|
from openai.types.shared_params.function_definition import FunctionDefinition
|
29
29
|
|
30
|
-
from inspect_ai._util.content import
|
30
|
+
from inspect_ai._util.content import (
|
31
|
+
Content,
|
32
|
+
ContentAudio,
|
33
|
+
ContentImage,
|
34
|
+
ContentReasoning,
|
35
|
+
ContentText,
|
36
|
+
)
|
31
37
|
from inspect_ai._util.images import file_as_data_uri
|
32
38
|
from inspect_ai._util.url import is_http_url
|
33
39
|
from inspect_ai.model._call_tools import parse_tool_call
|
34
40
|
from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
|
41
|
+
from inspect_ai.model._reasoning import parse_content_with_reasoning
|
35
42
|
from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
|
36
43
|
|
37
44
|
from ._chat_message import (
|
@@ -148,14 +155,14 @@ async def openai_chat_message(
|
|
148
155
|
if message.tool_calls:
|
149
156
|
return ChatCompletionAssistantMessageParam(
|
150
157
|
role=message.role,
|
151
|
-
content=message
|
158
|
+
content=openai_assistant_content(message),
|
152
159
|
tool_calls=[
|
153
160
|
openai_chat_tool_call_param(call) for call in message.tool_calls
|
154
161
|
],
|
155
162
|
)
|
156
163
|
else:
|
157
164
|
return ChatCompletionAssistantMessageParam(
|
158
|
-
role=message.role, content=message
|
165
|
+
role=message.role, content=openai_assistant_content(message)
|
159
166
|
)
|
160
167
|
elif message.role == "tool":
|
161
168
|
return ChatCompletionToolMessageParam(
|
@@ -175,16 +182,29 @@ async def openai_chat_messages(
|
|
175
182
|
return [await openai_chat_message(message, model) for message in messages]
|
176
183
|
|
177
184
|
|
185
|
+
def openai_assistant_content(message: ChatMessageAssistant) -> str:
|
186
|
+
if isinstance(message.content, str):
|
187
|
+
content = message.content
|
188
|
+
else:
|
189
|
+
content = ""
|
190
|
+
for c in message.content:
|
191
|
+
if c.type == "reasoning":
|
192
|
+
attribs = ""
|
193
|
+
if c.signature is not None:
|
194
|
+
attribs = f'{attribs} signature="{c.signature}"'
|
195
|
+
if c.redacted:
|
196
|
+
attribs = f'{attribs} redacted="true"'
|
197
|
+
content = f"{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n"
|
198
|
+
elif c.type == "text":
|
199
|
+
content = f"{content}\n{c.text}"
|
200
|
+
return content
|
201
|
+
|
202
|
+
|
178
203
|
def openai_chat_choices(choices: list[ChatCompletionChoice]) -> list[Choice]:
|
179
204
|
oai_choices: list[Choice] = []
|
180
205
|
|
181
206
|
for index, choice in enumerate(choices):
|
182
|
-
|
183
|
-
content = choice.message.content
|
184
|
-
else:
|
185
|
-
content = "\n".join(
|
186
|
-
[c.text for c in choice.message.content if c.type == "text"]
|
187
|
-
)
|
207
|
+
content = openai_assistant_content(choice.message)
|
188
208
|
if choice.message.tool_calls:
|
189
209
|
tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
|
190
210
|
else:
|
@@ -274,35 +294,47 @@ def chat_messages_from_openai(
|
|
274
294
|
chat_messages: list[ChatMessage] = []
|
275
295
|
|
276
296
|
for message in messages:
|
297
|
+
content: str | list[Content] = []
|
277
298
|
if message["role"] == "system" or message["role"] == "developer":
|
278
299
|
sys_content = message["content"]
|
279
300
|
if isinstance(sys_content, str):
|
280
301
|
chat_messages.append(ChatMessageSystem(content=sys_content))
|
281
302
|
else:
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
)
|
303
|
+
content = []
|
304
|
+
for sc in sys_content:
|
305
|
+
content.extend(content_from_openai(sc))
|
306
|
+
chat_messages.append(ChatMessageSystem(content=content))
|
287
307
|
elif message["role"] == "user":
|
288
308
|
user_content = message["content"]
|
289
309
|
if isinstance(user_content, str):
|
290
310
|
chat_messages.append(ChatMessageUser(content=user_content))
|
291
311
|
else:
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
)
|
312
|
+
content = []
|
313
|
+
for uc in user_content:
|
314
|
+
content.extend(content_from_openai(uc))
|
315
|
+
chat_messages.append(ChatMessageUser(content=content))
|
297
316
|
elif message["role"] == "assistant":
|
298
317
|
# resolve content
|
299
|
-
asst_content = message
|
318
|
+
asst_content = message.get("content", None)
|
300
319
|
if isinstance(asst_content, str):
|
301
|
-
|
320
|
+
result = parse_content_with_reasoning(asst_content)
|
321
|
+
if result is not None:
|
322
|
+
content = [
|
323
|
+
ContentReasoning(
|
324
|
+
reasoning=result.reasoning,
|
325
|
+
signature=result.signature,
|
326
|
+
redacted=result.redacted,
|
327
|
+
),
|
328
|
+
ContentText(text=result.content),
|
329
|
+
]
|
330
|
+
else:
|
331
|
+
content = asst_content
|
302
332
|
elif asst_content is None:
|
303
333
|
content = message.get("refusal", None) or ""
|
304
334
|
else:
|
305
|
-
content = [
|
335
|
+
content = []
|
336
|
+
for ac in asst_content:
|
337
|
+
content.extend(content_from_openai(ac, parse_reasoning=True))
|
306
338
|
|
307
339
|
# resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
|
308
340
|
# interfaces e.g. DeepSeek do include this field so we pluck it out)
|
@@ -310,22 +342,25 @@ def chat_messages_from_openai(
|
|
310
342
|
"reasoning", None
|
311
343
|
)
|
312
344
|
if reasoning is not None:
|
313
|
-
|
345
|
+
if isinstance(content, str):
|
346
|
+
content = [ContentText(text=content)]
|
347
|
+
else:
|
348
|
+
content.insert(0, ContentReasoning(reasoning=str(reasoning)))
|
314
349
|
|
315
350
|
# return message
|
316
351
|
if "tool_calls" in message:
|
317
352
|
tool_calls: list[ToolCall] = []
|
318
|
-
for
|
319
|
-
tool_calls.append(tool_call_from_openai(
|
320
|
-
tool_names[
|
353
|
+
for call in message["tool_calls"]:
|
354
|
+
tool_calls.append(tool_call_from_openai(call))
|
355
|
+
tool_names[call["id"]] = call["function"]["name"]
|
321
356
|
|
322
357
|
else:
|
323
358
|
tool_calls = []
|
359
|
+
|
324
360
|
chat_messages.append(
|
325
361
|
ChatMessageAssistant(
|
326
362
|
content=content,
|
327
363
|
tool_calls=tool_calls or None,
|
328
|
-
reasoning=reasoning,
|
329
364
|
)
|
330
365
|
)
|
331
366
|
elif message["role"] == "tool":
|
@@ -333,7 +368,9 @@ def chat_messages_from_openai(
|
|
333
368
|
if isinstance(tool_content, str):
|
334
369
|
content = tool_content
|
335
370
|
else:
|
336
|
-
content = [
|
371
|
+
content = []
|
372
|
+
for tc in tool_content:
|
373
|
+
content.extend(content_from_openai(tc))
|
337
374
|
chat_messages.append(
|
338
375
|
ChatMessageTool(
|
339
376
|
content=content,
|
@@ -357,20 +394,40 @@ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> Tool
|
|
357
394
|
|
358
395
|
def content_from_openai(
|
359
396
|
content: ChatCompletionContentPartParam | ChatCompletionContentPartRefusalParam,
|
360
|
-
|
397
|
+
parse_reasoning: bool = False,
|
398
|
+
) -> list[Content]:
|
361
399
|
if content["type"] == "text":
|
362
|
-
|
400
|
+
text = content["text"]
|
401
|
+
if parse_reasoning:
|
402
|
+
result = parse_content_with_reasoning(text)
|
403
|
+
if result:
|
404
|
+
return [
|
405
|
+
ContentReasoning(
|
406
|
+
reasoning=result.reasoning,
|
407
|
+
signature=result.signature,
|
408
|
+
redacted=result.redacted,
|
409
|
+
),
|
410
|
+
ContentText(text=result.content),
|
411
|
+
]
|
412
|
+
else:
|
413
|
+
return [ContentText(text=text)]
|
414
|
+
else:
|
415
|
+
return [ContentText(text=text)]
|
363
416
|
elif content["type"] == "image_url":
|
364
|
-
return
|
365
|
-
|
366
|
-
|
417
|
+
return [
|
418
|
+
ContentImage(
|
419
|
+
image=content["image_url"]["url"], detail=content["image_url"]["detail"]
|
420
|
+
)
|
421
|
+
]
|
367
422
|
elif content["type"] == "input_audio":
|
368
|
-
return
|
369
|
-
|
370
|
-
|
371
|
-
|
423
|
+
return [
|
424
|
+
ContentAudio(
|
425
|
+
audio=content["input_audio"]["data"],
|
426
|
+
format=content["input_audio"]["format"],
|
427
|
+
)
|
428
|
+
]
|
372
429
|
elif content["type"] == "refusal":
|
373
|
-
return ContentText(text=content["refusal"])
|
430
|
+
return [ContentText(text=content["refusal"])]
|
374
431
|
|
375
432
|
|
376
433
|
def chat_message_assistant_from_openai(
|
@@ -380,11 +437,20 @@ def chat_message_assistant_from_openai(
|
|
380
437
|
reasoning = getattr(message, "reasoning_content", None) or getattr(
|
381
438
|
message, "reasoning", None
|
382
439
|
)
|
440
|
+
|
441
|
+
msg_content = refusal or message.content or ""
|
442
|
+
if reasoning is not None:
|
443
|
+
content: str | list[Content] = [
|
444
|
+
ContentReasoning(reasoning=str(reasoning)),
|
445
|
+
ContentText(text=msg_content),
|
446
|
+
]
|
447
|
+
else:
|
448
|
+
content = msg_content
|
449
|
+
|
383
450
|
return ChatMessageAssistant(
|
384
|
-
content=
|
451
|
+
content=content,
|
385
452
|
source="generate",
|
386
453
|
tool_calls=chat_tool_calls_from_openai(message, tools),
|
387
|
-
reasoning=reasoning,
|
388
454
|
)
|
389
455
|
|
390
456
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import functools
|
2
2
|
import os
|
3
|
+
import re
|
3
4
|
import sys
|
4
5
|
from copy import copy
|
5
6
|
from logging import getLogger
|
@@ -28,8 +29,12 @@ from anthropic.types import (
|
|
28
29
|
ImageBlockParam,
|
29
30
|
Message,
|
30
31
|
MessageParam,
|
32
|
+
RedactedThinkingBlock,
|
33
|
+
RedactedThinkingBlockParam,
|
31
34
|
TextBlock,
|
32
35
|
TextBlockParam,
|
36
|
+
ThinkingBlock,
|
37
|
+
ThinkingBlockParam,
|
33
38
|
ToolParam,
|
34
39
|
ToolResultBlockParam,
|
35
40
|
ToolUseBlock,
|
@@ -44,7 +49,12 @@ from inspect_ai._util.constants import (
|
|
44
49
|
DEFAULT_MAX_RETRIES,
|
45
50
|
NO_CONTENT,
|
46
51
|
)
|
47
|
-
from inspect_ai._util.content import
|
52
|
+
from inspect_ai._util.content import (
|
53
|
+
Content,
|
54
|
+
ContentImage,
|
55
|
+
ContentReasoning,
|
56
|
+
ContentText,
|
57
|
+
)
|
48
58
|
from inspect_ai._util.error import exception_message
|
49
59
|
from inspect_ai._util.images import file_as_data_uri
|
50
60
|
from inspect_ai._util.logger import warn_once
|
@@ -204,23 +214,33 @@ class AnthropicAPI(ModelAPI):
|
|
204
214
|
request["system"] = system_param
|
205
215
|
request["tools"] = tools_param
|
206
216
|
if len(tools) > 0:
|
207
|
-
request["tool_choice"] = message_tool_choice(
|
217
|
+
request["tool_choice"] = message_tool_choice(
|
218
|
+
tool_choice, self.is_using_thinking(config)
|
219
|
+
)
|
208
220
|
|
209
221
|
# additional options
|
210
|
-
|
222
|
+
req, headers, betas = self.completion_config(config)
|
223
|
+
request = request | req
|
211
224
|
|
212
225
|
# extra headers (for time tracker and computer use)
|
213
|
-
extra_headers = {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
|
226
|
+
extra_headers = headers | {HttpxTimeTracker.REQUEST_ID_HEADER: request_id}
|
214
227
|
if computer_use:
|
215
|
-
|
228
|
+
betas.append("computer-use-2024-10-22")
|
229
|
+
if len(betas) > 0:
|
230
|
+
extra_headers["anthropic-beta"] = ",".join(betas)
|
231
|
+
|
216
232
|
request["extra_headers"] = extra_headers
|
217
233
|
|
218
234
|
# extra_body
|
219
235
|
if self.extra_body is not None:
|
220
236
|
request["extra_body"] = self.extra_body
|
221
237
|
|
222
|
-
# make request
|
223
|
-
|
238
|
+
# make request (stream if we are using reasoning)
|
239
|
+
if self.is_using_thinking(config):
|
240
|
+
async with self.client.messages.stream(**request) as stream:
|
241
|
+
message = await stream.get_final_message()
|
242
|
+
else:
|
243
|
+
message = await self.client.messages.create(**request, stream=False)
|
224
244
|
|
225
245
|
# set response for ModelCall
|
226
246
|
response = message.model_dump()
|
@@ -245,27 +265,67 @@ class AnthropicAPI(ModelAPI):
|
|
245
265
|
else:
|
246
266
|
raise ex
|
247
267
|
|
248
|
-
def
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
268
|
+
def completion_config(
|
269
|
+
self, config: GenerateConfig
|
270
|
+
) -> tuple[dict[str, Any], dict[str, str], list[str]]:
|
271
|
+
max_tokens = cast(int, config.max_tokens)
|
272
|
+
params = dict(model=self.model_name, max_tokens=max_tokens)
|
273
|
+
headers: dict[str, str] = {}
|
274
|
+
betas: list[str] = []
|
275
|
+
# some params not compatible with thinking models
|
276
|
+
if not self.is_using_thinking(config):
|
277
|
+
if config.temperature is not None:
|
278
|
+
params["temperature"] = config.temperature
|
279
|
+
if config.top_p is not None:
|
280
|
+
params["top_p"] = config.top_p
|
281
|
+
if config.top_k is not None:
|
282
|
+
params["top_k"] = config.top_k
|
283
|
+
|
284
|
+
# some thinking-only stuff
|
285
|
+
if self.is_using_thinking(config):
|
286
|
+
params["thinking"] = dict(
|
287
|
+
type="enabled", budget_tokens=config.reasoning_tokens
|
288
|
+
)
|
289
|
+
headers["anthropic-version"] = "2023-06-01"
|
290
|
+
if max_tokens > 8192:
|
291
|
+
betas.append("output-128k-2025-02-19")
|
292
|
+
|
293
|
+
# config that applies to all models
|
256
294
|
if config.timeout is not None:
|
257
295
|
params["timeout"] = float(config.timeout)
|
258
296
|
if config.stop_seqs is not None:
|
259
297
|
params["stop_sequences"] = config.stop_seqs
|
260
|
-
|
298
|
+
|
299
|
+
# return config
|
300
|
+
return params, headers, betas
|
261
301
|
|
262
302
|
@override
|
263
303
|
def max_tokens(self) -> int | None:
|
264
304
|
# anthropic requires you to explicitly specify max_tokens (most others
|
265
305
|
# set it to the maximum allowable output tokens for the model).
|
266
|
-
# set to 4096 which is the
|
306
|
+
# set to 4096 which is the highest possible for claude 3 (claude 3.5
|
307
|
+
# allows up to 8192)
|
267
308
|
return 4096
|
268
309
|
|
310
|
+
@override
|
311
|
+
def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
|
312
|
+
max_tokens = cast(int, self.max_tokens())
|
313
|
+
if self.is_thinking_model() and config.reasoning_tokens is not None:
|
314
|
+
max_tokens = max_tokens + config.reasoning_tokens
|
315
|
+
return max_tokens
|
316
|
+
|
317
|
+
def is_using_thinking(self, config: GenerateConfig) -> bool:
|
318
|
+
return self.is_thinking_model() and config.reasoning_tokens is not None
|
319
|
+
|
320
|
+
def is_thinking_model(self) -> bool:
|
321
|
+
return not self.is_claude_3() and not self.is_claude_3_5()
|
322
|
+
|
323
|
+
def is_claude_3(self) -> bool:
|
324
|
+
return re.search(r"claude-3-[a-zA-Z]", self.model_name) is not None
|
325
|
+
|
326
|
+
def is_claude_3_5(self) -> bool:
|
327
|
+
return "claude-3-5-" in self.model_name
|
328
|
+
|
269
329
|
@override
|
270
330
|
def connection_key(self) -> str:
|
271
331
|
return str(self.api_key)
|
@@ -295,6 +355,14 @@ class AnthropicAPI(ModelAPI):
|
|
295
355
|
def tool_result_images(self) -> bool:
|
296
356
|
return True
|
297
357
|
|
358
|
+
@override
|
359
|
+
def emulate_reasoning_history(self) -> bool:
|
360
|
+
return False
|
361
|
+
|
362
|
+
@override
|
363
|
+
def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
|
364
|
+
return "all"
|
365
|
+
|
298
366
|
# convert some common BadRequestError states into 'refusal' model output
|
299
367
|
def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | Exception:
|
300
368
|
error = exception_message(ex).lower()
|
@@ -498,7 +566,7 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
|
|
498
566
|
role = a["role"]
|
499
567
|
a_content = a["content"]
|
500
568
|
b_content = b["content"]
|
501
|
-
if isinstance(a_content, str) and isinstance(
|
569
|
+
if isinstance(a_content, str) and isinstance(b_content, str):
|
502
570
|
return MessageParam(role=role, content=f"{a_content}\n{b_content}")
|
503
571
|
elif isinstance(a_content, list) and isinstance(b_content, list):
|
504
572
|
return MessageParam(role=role, content=a_content + b_content)
|
@@ -514,9 +582,15 @@ def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
|
|
514
582
|
raise ValueError(f"Unexpected content types for messages: {a}, {b}")
|
515
583
|
|
516
584
|
|
517
|
-
def message_tool_choice(
|
585
|
+
def message_tool_choice(
|
586
|
+
tool_choice: ToolChoice, thinking_model: bool
|
587
|
+
) -> message_create_params.ToolChoice:
|
518
588
|
if isinstance(tool_choice, ToolFunction):
|
519
|
-
|
589
|
+
# forced tool use not compatible with thinking models
|
590
|
+
if thinking_model:
|
591
|
+
return {"type": "any"}
|
592
|
+
else:
|
593
|
+
return {"type": "tool", "name": tool_choice.name}
|
520
594
|
elif tool_choice == "any":
|
521
595
|
return {"type": "any"}
|
522
596
|
elif tool_choice == "none":
|
@@ -544,9 +618,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
|
|
544
618
|
# "tool" means serving a tool call result back to claude
|
545
619
|
elif message.role == "tool":
|
546
620
|
if message.error is not None:
|
547
|
-
content:
|
548
|
-
|
549
|
-
|
621
|
+
content: (
|
622
|
+
str
|
623
|
+
| list[
|
624
|
+
TextBlockParam
|
625
|
+
| ImageBlockParam
|
626
|
+
| ThinkingBlockParam
|
627
|
+
| RedactedThinkingBlockParam
|
628
|
+
]
|
629
|
+
) = message.error.message
|
550
630
|
# anthropic requires that content be populated when
|
551
631
|
# is_error is true (throws bad_request_error when not)
|
552
632
|
# so make sure this precondition is met
|
@@ -567,7 +647,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
|
|
567
647
|
ToolResultBlockParam(
|
568
648
|
tool_use_id=str(message.tool_call_id),
|
569
649
|
type="tool_result",
|
570
|
-
content=content,
|
650
|
+
content=cast(list[TextBlockParam | ImageBlockParam], content),
|
571
651
|
is_error=message.error is not None,
|
572
652
|
)
|
573
653
|
],
|
@@ -576,7 +656,13 @@ async def message_param(message: ChatMessage) -> MessageParam:
|
|
576
656
|
# tool_calls means claude is attempting to call our tools
|
577
657
|
elif message.role == "assistant" and message.tool_calls:
|
578
658
|
# first include content (claude <thinking>)
|
579
|
-
tools_content: list[
|
659
|
+
tools_content: list[
|
660
|
+
TextBlockParam
|
661
|
+
| ThinkingBlockParam
|
662
|
+
| RedactedThinkingBlockParam
|
663
|
+
| ImageBlockParam
|
664
|
+
| ToolUseBlockParam
|
665
|
+
] = (
|
580
666
|
[TextBlockParam(type="text", text=message.content or NO_CONTENT)]
|
581
667
|
if isinstance(message.content, str)
|
582
668
|
else (
|
@@ -645,6 +731,16 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
|
|
645
731
|
arguments=content_block.model_dump().get("input", {}),
|
646
732
|
)
|
647
733
|
)
|
734
|
+
elif isinstance(content_block, RedactedThinkingBlock):
|
735
|
+
content.append(
|
736
|
+
ContentReasoning(reasoning=content_block.data, redacted=True)
|
737
|
+
)
|
738
|
+
elif isinstance(content_block, ThinkingBlock):
|
739
|
+
content.append(
|
740
|
+
ContentReasoning(
|
741
|
+
reasoning=content_block.thinking, signature=content_block.signature
|
742
|
+
)
|
743
|
+
)
|
648
744
|
|
649
745
|
# resolve choice
|
650
746
|
choice = ChatCompletionChoice(
|
@@ -702,7 +798,7 @@ def split_system_messages(
|
|
702
798
|
|
703
799
|
async def message_param_content(
|
704
800
|
content: Content,
|
705
|
-
) -> TextBlockParam | ImageBlockParam:
|
801
|
+
) -> TextBlockParam | ImageBlockParam | ThinkingBlockParam | RedactedThinkingBlockParam:
|
706
802
|
if isinstance(content, ContentText):
|
707
803
|
return TextBlockParam(type="text", text=content.text or NO_CONTENT)
|
708
804
|
elif isinstance(content, ContentImage):
|
@@ -720,6 +816,18 @@ async def message_param_content(
|
|
720
816
|
type="image",
|
721
817
|
source=dict(type="base64", media_type=cast(Any, media_type), data=image),
|
722
818
|
)
|
819
|
+
elif isinstance(content, ContentReasoning):
|
820
|
+
if content.redacted:
|
821
|
+
return RedactedThinkingBlockParam(
|
822
|
+
type="redacted_thinking",
|
823
|
+
data=content.reasoning,
|
824
|
+
)
|
825
|
+
else:
|
826
|
+
if content.signature is None:
|
827
|
+
raise ValueError("Thinking content without signature.")
|
828
|
+
return ThinkingBlockParam(
|
829
|
+
type="thinking", thinking=content.reasoning, signature=content.signature
|
830
|
+
)
|
723
831
|
else:
|
724
832
|
raise RuntimeError(
|
725
833
|
"Anthropic models do not currently support audio or video inputs."
|
@@ -38,10 +38,13 @@ from pydantic import JsonValue
|
|
38
38
|
from typing_extensions import override
|
39
39
|
|
40
40
|
from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
|
41
|
-
from inspect_ai._util.content import
|
41
|
+
from inspect_ai._util.content import (
|
42
|
+
Content as InspectContent,
|
43
|
+
)
|
42
44
|
from inspect_ai._util.content import (
|
43
45
|
ContentAudio,
|
44
46
|
ContentImage,
|
47
|
+
ContentReasoning,
|
45
48
|
ContentText,
|
46
49
|
ContentVideo,
|
47
50
|
)
|
@@ -250,7 +253,10 @@ class GoogleGenAIAPI(ModelAPI):
|
|
250
253
|
|
251
254
|
@override
|
252
255
|
def is_rate_limit(self, ex: BaseException) -> bool:
|
253
|
-
|
256
|
+
# see https://cloud.google.com/storage/docs/retry-strategy
|
257
|
+
return isinstance(ex, APIError) and (
|
258
|
+
ex.code in (408, 429, 429) or ex.code >= 500
|
259
|
+
)
|
254
260
|
|
255
261
|
@override
|
256
262
|
def connection_key(self) -> str:
|
@@ -405,6 +411,8 @@ async def content_part(client: Client, content: InspectContent | str) -> Part:
|
|
405
411
|
return Part.from_text(text=content or NO_CONTENT)
|
406
412
|
elif isinstance(content, ContentText):
|
407
413
|
return Part.from_text(text=content.text or NO_CONTENT)
|
414
|
+
elif isinstance(content, ContentReasoning):
|
415
|
+
return Part.from_text(text=content.reasoning or NO_CONTENT)
|
408
416
|
else:
|
409
417
|
return await chat_content_to_part(client, content)
|
410
418
|
|
@@ -417,7 +425,8 @@ async def chat_content_to_part(
|
|
417
425
|
content_bytes, mime_type = await file_as_data(content.image)
|
418
426
|
return Part.from_bytes(mime_type=mime_type, data=content_bytes)
|
419
427
|
else:
|
420
|
-
|
428
|
+
file = await file_for_content(client, content)
|
429
|
+
return Part.from_uri(file_uri=file.uri, mime_type=file.mime_type)
|
421
430
|
|
422
431
|
|
423
432
|
async def extract_system_message_as_parts(
|
@@ -552,11 +561,19 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
|
|
552
561
|
# stop reason
|
553
562
|
stop_reason = finish_reason_to_stop_reason(candidate.finish_reason)
|
554
563
|
|
564
|
+
# choice content may include reasoning
|
565
|
+
if reasoning:
|
566
|
+
choice_content: str | list[Content] = [
|
567
|
+
ContentReasoning(reasoning=reasoning),
|
568
|
+
ContentText(text=content),
|
569
|
+
]
|
570
|
+
else:
|
571
|
+
choice_content = content
|
572
|
+
|
555
573
|
# build choice
|
556
574
|
choice = ChatCompletionChoice(
|
557
575
|
message=ChatMessageAssistant(
|
558
|
-
content=
|
559
|
-
reasoning=reasoning,
|
576
|
+
content=choice_content,
|
560
577
|
tool_calls=tool_calls if len(tool_calls) > 0 else None,
|
561
578
|
source="generate",
|
562
579
|
),
|
@@ -742,7 +759,7 @@ async def file_for_content(
|
|
742
759
|
uploaded_file = files_db.get(content_sha256)
|
743
760
|
if uploaded_file:
|
744
761
|
try:
|
745
|
-
upload: File = client.files.get(uploaded_file)
|
762
|
+
upload: File = client.files.get(name=uploaded_file)
|
746
763
|
if upload.state.name == "ACTIVE":
|
747
764
|
trace(f"Using uploaded file: {uploaded_file}")
|
748
765
|
return upload
|
@@ -754,10 +771,12 @@ async def file_for_content(
|
|
754
771
|
trace(f"Error attempting to access uploaded file: {ex}")
|
755
772
|
files_db.delete(content_sha256)
|
756
773
|
# do the upload (and record it)
|
757
|
-
upload = client.files.upload(
|
774
|
+
upload = client.files.upload(
|
775
|
+
file=BytesIO(content_bytes), config=dict(mime_type=mime_type)
|
776
|
+
)
|
758
777
|
while upload.state.name == "PROCESSING":
|
759
778
|
await asyncio.sleep(3)
|
760
|
-
upload = client.files.get(upload.name)
|
779
|
+
upload = client.files.get(name=upload.name)
|
761
780
|
if upload.state.name == "FAILED":
|
762
781
|
trace(f"Failed to upload file '{upload.name}: {upload.error}")
|
763
782
|
raise ValueError(f"Google file upload failed: {upload.error}")
|
@@ -28,7 +28,7 @@ from inspect_ai._util.constants import (
|
|
28
28
|
DEFAULT_MAX_RETRIES,
|
29
29
|
DEFAULT_MAX_TOKENS,
|
30
30
|
)
|
31
|
-
from inspect_ai._util.content import Content
|
31
|
+
from inspect_ai._util.content import Content, ContentReasoning, ContentText
|
32
32
|
from inspect_ai._util.images import file_as_data_uri
|
33
33
|
from inspect_ai._util.url import is_http_url
|
34
34
|
from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
|
@@ -326,12 +326,17 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
|
|
326
326
|
def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
|
327
327
|
reasoning = getattr(message, "reasoning", None)
|
328
328
|
if reasoning is not None:
|
329
|
-
|
329
|
+
content: str | list[Content] = [
|
330
|
+
ContentReasoning(reasoning=str(reasoning)),
|
331
|
+
ContentText(text=message.content or ""),
|
332
|
+
]
|
333
|
+
else:
|
334
|
+
content = message.content or ""
|
335
|
+
|
330
336
|
return ChatMessageAssistant(
|
331
|
-
content=
|
337
|
+
content=content,
|
332
338
|
source="generate",
|
333
339
|
tool_calls=chat_tool_calls(message, tools),
|
334
|
-
reasoning=reasoning,
|
335
340
|
)
|
336
341
|
|
337
342
|
|