inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -8
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +6 -1
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +10 -6
- inspect_ai/_eval/task/run.py +5 -8
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/transcript.py +10 -4
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +115 -87
- inspect_ai/_view/www/dist/assets/index.js +5324 -2276
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +283 -20
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/types/log.d.ts +129 -34
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +1 -1
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/_condense.py +3 -0
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_transcript.py +58 -1
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +7 -0
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +10 -8
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +133 -57
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +281 -153
- inspect_ai/model/_providers/google.py +27 -8
- inspect_ai/model/_providers/groq.py +9 -4
- inspect_ai/model/_providers/openai.py +57 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_common.py +117 -58
- inspect_ai/tool/_tools/_computer/_computer.py +80 -57
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_computer/test_args.py +151 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/events.py +55 -21
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union
|
|
3
3
|
|
4
4
|
from pydantic import BaseModel, Field, model_validator
|
5
5
|
|
6
|
-
from inspect_ai._util.content import Content, ContentText
|
6
|
+
from inspect_ai._util.content import Content, ContentReasoning, ContentText
|
7
7
|
from inspect_ai.tool import ToolCall
|
8
8
|
from inspect_ai.tool._tool_call import ToolCallError
|
9
9
|
|
@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
|
|
64
64
|
self.content = text
|
65
65
|
else:
|
66
66
|
all_other = [content for content in self.content if content.type != "text"]
|
67
|
-
self.content = [ContentText(text=text)]
|
67
|
+
self.content = all_other + [ContentText(text=text)]
|
68
68
|
|
69
69
|
|
70
70
|
class ChatMessageSystem(ChatMessageBase):
|
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
|
|
93
93
|
tool_calls: list[ToolCall] | None = Field(default=None)
|
94
94
|
"""Tool calls made by the model."""
|
95
95
|
|
96
|
-
reasoning: str | None = Field(default=None)
|
97
|
-
"""Reasoning content."""
|
98
|
-
|
99
96
|
# Some OpenAI compatible REST endpoints include reasoning as a field alongside
|
100
97
|
# content, however since this field doesn't exist in the OpenAI interface,
|
101
98
|
# hosting providers (so far we've seen this with Together and Groq) may
|
@@ -110,12 +107,30 @@ class ChatMessageAssistant(ChatMessageBase):
|
|
110
107
|
@classmethod
|
111
108
|
def extract_reasoning(cls, data: Any) -> Any:
|
112
109
|
if isinstance(data, dict):
|
110
|
+
# cleave apart <think> blocks
|
113
111
|
content = data.get("content", None)
|
114
112
|
if isinstance(content, str):
|
115
113
|
parsed = parse_content_with_reasoning(content)
|
116
114
|
if parsed:
|
117
|
-
data["
|
118
|
-
|
115
|
+
data["content"] = [
|
116
|
+
ContentReasoning(reasoning=parsed.reasoning),
|
117
|
+
ContentText(text=parsed.content),
|
118
|
+
]
|
119
|
+
# migrate messages that have an explicit 'reasoning' field
|
120
|
+
# (which was our original representation of reasoning)
|
121
|
+
reasoning = data.get("reasoning", None)
|
122
|
+
if isinstance(reasoning, str):
|
123
|
+
# ensure that content is a list
|
124
|
+
content = data.get("content", None)
|
125
|
+
if content is None:
|
126
|
+
data["content"] = []
|
127
|
+
elif isinstance(content, str):
|
128
|
+
data["content"] = [ContentText(text=content)]
|
129
|
+
elif not isinstance(content, list):
|
130
|
+
data["content"] = []
|
131
|
+
data["content"].insert(0, ContentReasoning(reasoning=reasoning))
|
132
|
+
|
133
|
+
del data["reasoning"]
|
119
134
|
return data
|
120
135
|
|
121
136
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from rich.console import RenderableType
|
2
2
|
from rich.text import Text
|
3
3
|
|
4
|
+
from inspect_ai._util.content import ContentReasoning, ContentText
|
4
5
|
from inspect_ai._util.rich import lines_display
|
5
6
|
from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
|
6
7
|
from inspect_ai.util._conversation import conversation_panel
|
@@ -41,14 +42,15 @@ def conversation_assistant_message(
|
|
41
42
|
# build content
|
42
43
|
content: list[RenderableType] = []
|
43
44
|
|
44
|
-
#
|
45
|
-
if message.
|
46
|
-
content.extend(
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
45
|
+
# deal with plain text or with content blocks
|
46
|
+
if isinstance(message.content, str):
|
47
|
+
content.extend([transcript_markdown(message.text.strip(), escape=True)])
|
48
|
+
else:
|
49
|
+
for c in message.content:
|
50
|
+
if isinstance(c, ContentReasoning):
|
51
|
+
content.extend(transcript_reasoning(c))
|
52
|
+
elif isinstance(c, ContentText) and c.text:
|
53
|
+
content.extend([transcript_markdown(c.text.strip(), escape=True)])
|
52
54
|
|
53
55
|
# print tool calls
|
54
56
|
if message.tool_calls:
|
@@ -1,8 +1,8 @@
|
|
1
1
|
from contextvars import ContextVar
|
2
2
|
from copy import deepcopy
|
3
|
-
from typing import Literal, Union
|
3
|
+
from typing import Any, Literal, Union
|
4
4
|
|
5
|
-
from pydantic import BaseModel, Field
|
5
|
+
from pydantic import BaseModel, Field, model_validator
|
6
6
|
from typing_extensions import TypedDict
|
7
7
|
|
8
8
|
|
@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
|
|
75
75
|
reasoning_effort: Literal["low", "medium", "high"] | None
|
76
76
|
"""Constrains effort on reasoning for reasoning models. OpenAI o1 models only."""
|
77
77
|
|
78
|
-
|
78
|
+
reasoning_tokens: int | None
|
79
|
+
"""Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
|
80
|
+
|
81
|
+
reasoning_history: Literal["none", "all", "last", "auto"] | None
|
79
82
|
"""Include reasoning in chat message history sent to generate."""
|
80
83
|
|
81
84
|
|
@@ -148,9 +151,27 @@ class GenerateConfig(BaseModel):
|
|
148
151
|
reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
|
149
152
|
"""Constrains effort on reasoning for reasoning models. OpenAI o1 models only."""
|
150
153
|
|
151
|
-
|
154
|
+
reasoning_tokens: int | None = Field(default=None)
|
155
|
+
"""Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
|
156
|
+
|
157
|
+
reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
|
158
|
+
default=None
|
159
|
+
)
|
152
160
|
"""Include reasoning in chat message history sent to generate."""
|
153
161
|
|
162
|
+
# migrate reasoning_history as a bool
|
163
|
+
@model_validator(mode="before")
|
164
|
+
@classmethod
|
165
|
+
def migrate_reasoning(cls, data: Any) -> Any:
|
166
|
+
if isinstance(data, dict):
|
167
|
+
reasoning_history = data.get("reasoning_history", None)
|
168
|
+
if reasoning_history is True:
|
169
|
+
data["reasoning_history"] = "all"
|
170
|
+
elif reasoning_history is False:
|
171
|
+
data["reasoning_history"] = "none"
|
172
|
+
|
173
|
+
return data
|
174
|
+
|
154
175
|
def merge(
|
155
176
|
self, other: Union["GenerateConfig", GenerateConfigArgs]
|
156
177
|
) -> "GenerateConfig":
|
inspect_ai/model/_model.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7
7
|
import time
|
8
8
|
from contextvars import ContextVar
|
9
9
|
from copy import deepcopy
|
10
|
+
from datetime import datetime
|
10
11
|
from types import TracebackType
|
11
12
|
from typing import Any, AsyncIterator, Callable, Literal, Type, cast
|
12
13
|
|
@@ -21,7 +22,12 @@ from tenacity import (
|
|
21
22
|
)
|
22
23
|
|
23
24
|
from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
|
24
|
-
from inspect_ai._util.content import
|
25
|
+
from inspect_ai._util.content import (
|
26
|
+
Content,
|
27
|
+
ContentImage,
|
28
|
+
ContentReasoning,
|
29
|
+
ContentText,
|
30
|
+
)
|
25
31
|
from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
|
26
32
|
from inspect_ai._util.interrupt import check_sample_interrupt
|
27
33
|
from inspect_ai._util.platform import platform_init
|
@@ -33,7 +39,7 @@ from inspect_ai._util.registry import (
|
|
33
39
|
)
|
34
40
|
from inspect_ai._util.retry import log_rate_limit_retry
|
35
41
|
from inspect_ai._util.trace import trace_action
|
36
|
-
from inspect_ai._util.working import report_sample_waiting_time
|
42
|
+
from inspect_ai._util.working import report_sample_waiting_time, sample_working_time
|
37
43
|
from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
|
38
44
|
from inspect_ai.tool._tool_def import ToolDef, tool_defs
|
39
45
|
from inspect_ai.util import concurrency
|
@@ -148,6 +154,17 @@ class ModelAPI(abc.ABC):
|
|
148
154
|
"""Default max_tokens."""
|
149
155
|
return None
|
150
156
|
|
157
|
+
def max_tokens_for_config(self, config: GenerateConfig) -> int | None:
|
158
|
+
"""Default max_tokens for a given config.
|
159
|
+
|
160
|
+
Args:
|
161
|
+
config: Generation config.
|
162
|
+
|
163
|
+
Returns:
|
164
|
+
Default maximum tokens for specified configuration.
|
165
|
+
"""
|
166
|
+
return None
|
167
|
+
|
151
168
|
def max_connections(self) -> int:
|
152
169
|
"""Default max_connections."""
|
153
170
|
return DEFAULT_MAX_CONNECTIONS
|
@@ -180,9 +197,17 @@ class ModelAPI(abc.ABC):
|
|
180
197
|
"""Tool results can contain images"""
|
181
198
|
return False
|
182
199
|
|
183
|
-
def
|
184
|
-
"""Chat message assistant messages
|
185
|
-
return
|
200
|
+
def emulate_reasoning_history(self) -> bool:
|
201
|
+
"""Chat message assistant messages with reasoning should play back reasoning with emulation (e.g. <think> tags)"""
|
202
|
+
return True
|
203
|
+
|
204
|
+
def force_reasoning_history(self) -> Literal["none", "all", "last"] | None:
|
205
|
+
"""Force a specific reasoning history behavior for this provider."""
|
206
|
+
return None
|
207
|
+
|
208
|
+
def auto_reasoning_history(self) -> Literal["none", "all", "last"]:
|
209
|
+
"""Behavior to use for reasoning_history='auto'"""
|
210
|
+
return "all"
|
186
211
|
|
187
212
|
|
188
213
|
class Model:
|
@@ -285,9 +310,10 @@ class Model:
|
|
285
310
|
config = base_config.merge(config)
|
286
311
|
|
287
312
|
# provide max_tokens from the model api if required
|
288
|
-
config.max_tokens
|
289
|
-
config.max_tokens
|
290
|
-
|
313
|
+
if config.max_tokens is None:
|
314
|
+
config.max_tokens = self.api.max_tokens_for_config(config)
|
315
|
+
if config.max_tokens is None:
|
316
|
+
config.max_tokens = self.api.max_tokens()
|
291
317
|
|
292
318
|
# disable parallel tool calls if requested by any of our tools
|
293
319
|
if disable_parallel_tools(tools):
|
@@ -302,8 +328,11 @@ class Model:
|
|
302
328
|
input = [ChatMessageSystem(content=config.system_message)] + input
|
303
329
|
|
304
330
|
# enforce concurrency limits
|
331
|
+
start_time = datetime.now()
|
332
|
+
working_start = sample_working_time()
|
305
333
|
async with self._connection_concurrency(config):
|
306
|
-
|
334
|
+
# generate
|
335
|
+
output = await self._generate(
|
307
336
|
input=input,
|
308
337
|
tools=tools,
|
309
338
|
tool_choice=tool_choice,
|
@@ -311,6 +340,28 @@ class Model:
|
|
311
340
|
cache=cache,
|
312
341
|
)
|
313
342
|
|
343
|
+
# update the most recent ModelEvent with the actual start/completed
|
344
|
+
# times as well as a computation of working time (events are
|
345
|
+
# created _after_ the call to _generate, potentially in response
|
346
|
+
# to retries, so they need their timestamp updated so it accurately
|
347
|
+
# reflects the full start/end time which we know here)
|
348
|
+
from inspect_ai.log._transcript import ModelEvent, transcript
|
349
|
+
|
350
|
+
last_model_event = transcript().find_last_event(ModelEvent)
|
351
|
+
if last_model_event:
|
352
|
+
last_model_event.timestamp = start_time
|
353
|
+
last_model_event.working_start = working_start
|
354
|
+
completed = datetime.now()
|
355
|
+
last_model_event.completed = completed
|
356
|
+
last_model_event.working_time = (
|
357
|
+
output.time
|
358
|
+
if output.time is not None
|
359
|
+
else (completed - start_time).total_seconds()
|
360
|
+
)
|
361
|
+
|
362
|
+
# return output
|
363
|
+
return output
|
364
|
+
|
314
365
|
async def _generate(
|
315
366
|
self,
|
316
367
|
input: list[ChatMessage],
|
@@ -349,9 +400,7 @@ class Model:
|
|
349
400
|
tool_choice = "none"
|
350
401
|
|
351
402
|
# handle reasoning history
|
352
|
-
input = resolve_reasoning_history(
|
353
|
-
input, config, self.api.has_reasoning_history()
|
354
|
-
)
|
403
|
+
input = resolve_reasoning_history(input, config, self.api)
|
355
404
|
|
356
405
|
# apply any tool model_input handlers
|
357
406
|
input = resolve_tool_model_input(tdefs, input)
|
@@ -849,68 +898,91 @@ def simple_input_messages(
|
|
849
898
|
|
850
899
|
|
851
900
|
def resolve_reasoning_history(
|
852
|
-
messages: list[ChatMessage],
|
901
|
+
messages: list[ChatMessage],
|
902
|
+
config: GenerateConfig,
|
903
|
+
model_api: ModelAPI,
|
853
904
|
) -> list[ChatMessage]:
|
854
|
-
# determine if we are including reasoning history
|
855
|
-
reasoning_history = config.reasoning_history is not False
|
856
|
-
|
857
905
|
# determine up front if we have any reasoning content
|
858
906
|
have_reasoning = any(
|
859
907
|
[
|
860
|
-
isinstance(m, ChatMessageAssistant)
|
908
|
+
isinstance(m, ChatMessageAssistant)
|
909
|
+
and isinstance(m.content, list)
|
910
|
+
and any([c for c in m.content if isinstance(c, ContentReasoning)])
|
861
911
|
for m in messages
|
862
912
|
]
|
863
913
|
)
|
864
914
|
if not have_reasoning:
|
865
915
|
return messages
|
866
916
|
|
867
|
-
#
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
# remove reasoning history as per config
|
872
|
-
if not reasoning_history:
|
873
|
-
resolved_messages: list[ChatMessage] = []
|
874
|
-
for message in messages:
|
875
|
-
if isinstance(message, ChatMessageAssistant):
|
876
|
-
resolved_messages.append(
|
877
|
-
message.model_copy(update={"reasoning": None})
|
878
|
-
)
|
879
|
-
else:
|
880
|
-
resolved_messages.append(message)
|
881
|
-
|
882
|
-
return resolved_messages
|
883
|
-
|
884
|
-
# include reasoning history as per config
|
885
|
-
else:
|
886
|
-
return messages
|
917
|
+
# determine reasoning history configuration
|
918
|
+
reasoning_history = (
|
919
|
+
config.reasoning_history if config.reasoning_history is not None else "auto"
|
920
|
+
)
|
887
921
|
|
888
|
-
#
|
889
|
-
|
922
|
+
# see if the provider is forcing a reasoning history
|
923
|
+
force = model_api.force_reasoning_history()
|
924
|
+
if force is not None:
|
925
|
+
reasoning_history = force
|
926
|
+
# if it's 'auto' then defer to the provider
|
927
|
+
elif reasoning_history == "auto":
|
928
|
+
reasoning_history = model_api.auto_reasoning_history()
|
929
|
+
|
930
|
+
# generate a version of message history with the correct history
|
931
|
+
if reasoning_history == "all":
|
932
|
+
resolved_messages: list[ChatMessage] = messages
|
933
|
+
else:
|
934
|
+
found_last = False
|
890
935
|
resolved_messages = []
|
891
|
-
for message in messages:
|
892
|
-
if (
|
893
|
-
|
894
|
-
and message.reasoning is not None
|
936
|
+
for message in reversed(messages):
|
937
|
+
if isinstance(message, ChatMessageAssistant) and isinstance(
|
938
|
+
message.content, list
|
895
939
|
):
|
896
|
-
|
897
|
-
|
898
|
-
message.content
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
940
|
+
# is there reasoning in this message?
|
941
|
+
has_reasoning = any(
|
942
|
+
isinstance(c, ContentReasoning) for c in message.content
|
943
|
+
)
|
944
|
+
# remove it unless we are in "last" mode and haven't yet found last
|
945
|
+
if has_reasoning:
|
946
|
+
if reasoning_history == "none" or found_last:
|
947
|
+
message = message.model_copy(
|
948
|
+
update={
|
949
|
+
"content": [
|
950
|
+
content
|
951
|
+
for content in message.content
|
952
|
+
if not isinstance(content, ContentReasoning)
|
953
|
+
]
|
954
|
+
}
|
955
|
+
)
|
956
|
+
found_last = True
|
906
957
|
|
907
958
|
resolved_messages.append(message)
|
908
959
|
|
909
|
-
|
960
|
+
# reverse them back
|
961
|
+
resolved_messages.reverse()
|
910
962
|
|
911
|
-
# api
|
912
|
-
|
913
|
-
|
963
|
+
# api can't represent reasoning natively so emulate it
|
964
|
+
if model_api.emulate_reasoning_history():
|
965
|
+
emulated_messages: list[ChatMessage] = []
|
966
|
+
for message in resolved_messages:
|
967
|
+
if isinstance(message, ChatMessageAssistant) and isinstance(
|
968
|
+
message.content, list
|
969
|
+
):
|
970
|
+
content: list[Content] = []
|
971
|
+
for c in message.content:
|
972
|
+
if isinstance(c, ContentReasoning):
|
973
|
+
content.append(
|
974
|
+
ContentText(text=f"<think>\n{c.reasoning}\n</think>")
|
975
|
+
)
|
976
|
+
else:
|
977
|
+
content.append(c)
|
978
|
+
message = message.model_copy(update={"content": content})
|
979
|
+
|
980
|
+
emulated_messages.append(message)
|
981
|
+
|
982
|
+
resolved_messages = emulated_messages
|
983
|
+
|
984
|
+
# return messages
|
985
|
+
return resolved_messages
|
914
986
|
|
915
987
|
|
916
988
|
def resolve_tool_model_input(
|
@@ -1200,6 +1272,10 @@ def set_model_usage(
|
|
1200
1272
|
if total_usage.input_tokens_cache_read is None:
|
1201
1273
|
total_usage.input_tokens_cache_read = 0
|
1202
1274
|
total_usage.input_tokens_cache_read += usage.input_tokens_cache_read
|
1275
|
+
if usage.reasoning_tokens is not None:
|
1276
|
+
if total_usage.reasoning_tokens is None:
|
1277
|
+
total_usage.reasoning_tokens = 0
|
1278
|
+
total_usage.reasoning_tokens += usage.reasoning_tokens
|
1203
1279
|
|
1204
1280
|
model_usage[model] = total_usage
|
1205
1281
|
|
@@ -26,6 +26,9 @@ class ModelUsage(BaseModel):
|
|
26
26
|
input_tokens_cache_read: int | None = Field(default=None)
|
27
27
|
"""Number of tokens retrieved from the cache."""
|
28
28
|
|
29
|
+
reasoning_tokens: int | None = Field(default=None)
|
30
|
+
"""Number of tokens used for reasoning."""
|
31
|
+
|
29
32
|
|
30
33
|
StopReason = Literal[
|
31
34
|
"stop",
|