inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only and reflects the package contents exactly as released.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import LOG_SCHEMA_VERSION
 from inspect_ai._util.content import (
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -252,7 +253,11 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
         for message in inputs:
             if not isinstance(message.content, str):
                 filtered_content: list[
-                    ContentText
+                    ContentText
+                    | ContentReasoning
+                    | ContentImage
+                    | ContentAudio
+                    | ContentVideo
                 ] = []
                 for content in message.content:
                     if content.type == "text":
inspect_ai/log/_samples.py
CHANGED
@@ -23,6 +23,7 @@ class ActiveSample:
         message_limit: int | None,
         token_limit: int | None,
         time_limit: int | None,
+        working_limit: int | None,
         fails_on_error: bool,
         transcript: Transcript,
         sandboxes: dict[str, SandboxConnection],
@@ -37,6 +38,7 @@ class ActiveSample:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.fails_on_error = fails_on_error
         self.total_messages = 0
         self.total_tokens = 0
@@ -45,7 +47,7 @@ class ActiveSample:
         self._interrupt_action: Literal["score", "error"] | None = None

     @property
-    def
+    def running_time(self) -> float:
         if self.started is not None:
             completed = (
                 self.completed
@@ -78,6 +80,7 @@ async def active_sample(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     fails_on_error: bool,
     transcript: Transcript,
 ) -> AsyncGenerator[ActiveSample, None]:
@@ -90,6 +93,7 @@ async def active_sample(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         sandboxes=await sandbox_connections(),
         fails_on_error=fails_on_error,
         transcript=transcript,
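Note on the new working_limit plumbing: a sample's working time is its elapsed clock time minus time spent waiting (on semaphores, rate limits, and model retries), and the new limit is enforced against that value. A minimal sketch of that relationship, using a hypothetical helper rather than the package source:

# Illustrative sketch only (hypothetical helper, not package code): working time
# is clock time with waiting time subtracted, which is the quantity the new
# working_limit is compared against.
def working_time(clock_time: float, waiting_time: float) -> float:
    return clock_time - waiting_time

assert working_time(clock_time=120.0, waiting_time=45.0) == 75.0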
inspect_ai/log/_transcript.py
CHANGED
@@ -8,7 +8,9 @@ from typing import (
     Iterator,
     Literal,
     Sequence,
+    Type,
     TypeAlias,
+    TypeVar,
     Union,
 )

@@ -17,6 +19,7 @@ from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
 from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.json import JsonChange, json_changes
+from inspect_ai._util.working import sample_working_time
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.log._message import LoggingMessage
 from inspect_ai.model._chat_message import ChatMessage
@@ -41,7 +44,10 @@ logger = getLogger(__name__)

 class BaseEvent(BaseModel):
     timestamp: datetime = Field(default_factory=datetime.now)
-    """
+    """Clock time at which event occurred."""
+
+    working_start: float = Field(default_factory=sample_working_time)
+    """Working time (within sample) at which the event occurred."""

     pending: bool | None = Field(default=None)
     """Is this event pending?"""
@@ -70,7 +76,7 @@ class SampleLimitEvent(BaseEvent):
     event: Literal["sample_limit"] = Field(default="sample_limit")
     """Event type."""

-    type: Literal["message", "time", "token", "operator", "custom"]
+    type: Literal["message", "time", "working", "token", "operator", "custom"]
     """Type of limit that halted processing"""

     message: str
@@ -133,6 +139,18 @@ class ModelEvent(BaseEvent):
     call: ModelCall | None = Field(default=None)
     """Raw call made to model API."""

+    completed: datetime | None = Field(default=None)
+    """Time that model call completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """working time for model call that succeeded (i.e. was not retried)."""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        if dt is None:
+            return None
+        return dt.astimezone().isoformat()
+

 class ToolEvent(BaseEvent):
     """Call to a tool."""
@@ -167,18 +185,28 @@ class ToolEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for tool."""

+    completed: datetime | None = Field(default=None)
+    """Time that tool call completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """Working time for tool call (i.e. time not spent waiting on semaphores)."""
+
     def _set_result(
         self,
         result: ToolResult,
         truncated: tuple[int, int] | None,
         error: ToolCallError | None,
         events: list["Event"],
+        waiting_time: float,
     ) -> None:
         self.result = result
         self.truncated = truncated
         self.error = error
         self.events = events
         self.pending = None
+        completed = datetime.now()
+        self.completed = completed
+        self.working_time = (completed - self.timestamp).total_seconds() - waiting_time

     # mechanism for operator to cancel the tool call

@@ -206,6 +234,45 @@ class ToolEvent(BaseEvent):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     """Required so that we can include '_task' as a member."""

+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+
+
+class SandboxEvent(BaseEvent):
+    """Sandbox execution or I/O"""
+
+    event: Literal["sandbox"] = Field(default="sandbox")
+    """Event type"""
+
+    action: Literal["exec", "read_file", "write_file"]
+    """Sandbox action"""
+
+    cmd: str | None = Field(default=None)
+    """Command (for exec)"""
+
+    options: dict[str, JsonValue] | None = Field(default=None)
+    """Options (for exec)"""
+
+    file: str | None = Field(default=None)
+    """File (for read_file and write_file)"""
+
+    input: str | None = Field(default=None)
+    """Input (for cmd and write_file). Truncated to 100 lines."""
+
+    result: int | None = Field(default=None)
+    """Result (for exec)"""
+
+    output: str | None = Field(default=None)
+    """Output (for exec and read_file). Truncated to 100 lines."""
+
+    completed: datetime | None = Field(default=None)
+    """Time that sandbox action completed (see `timestamp` for started)"""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+

 class ApprovalEvent(BaseEvent):
     """Tool approval."""
@@ -338,14 +405,26 @@ class SubtaskEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for subtask."""

+    completed: datetime | None = Field(default=None)
+    """Time that subtask completed (see `timestamp` for started)"""
+
+    working_time: float | None = Field(default=None)
+    """Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""
+
+    @field_serializer("completed")
+    def serialize_completed(self, dt: datetime) -> str:
+        return dt.astimezone().isoformat()
+

 Event: TypeAlias = Union[
     SampleInitEvent
     | SampleLimitEvent
+    | SandboxEvent
     | StateEvent
     | StoreEvent
     | ModelEvent
     | ToolEvent
+    | SandboxEvent
     | ApprovalEvent
     | InputEvent
     | ScoreEvent
@@ -357,6 +436,8 @@ Event: TypeAlias = Union[
 ]
 """Event in a transcript."""

+ET = TypeVar("ET", bound=BaseEvent)
+

 class Transcript:
     """Transcript of events."""
@@ -396,6 +477,12 @@ class Transcript:
     def events(self) -> Sequence[Event]:
         return self._events

+    def find_last_event(self, event_cls: Type[ET]) -> ET | None:
+        for event in reversed(self.events):
+            if isinstance(event, event_cls):
+                return event
+        return None
+
     def _event(self, event: Event) -> None:
         self._events.append(event)
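The transcript changes above add completed/working_time fields to model, tool, and subtask events, introduce SandboxEvent, and add Transcript.find_last_event(). A hedged usage sketch (it assumes it runs inside an active sample, and imports the event classes from the private _transcript module; public export paths may differ):

# Illustrative sketch only: query the current sample transcript for the most
# recent event of a given type. find_last_event() scans events in reverse and
# returns None if no event of that class has been recorded.
from inspect_ai.log import transcript
from inspect_ai.log._transcript import ModelEvent, SandboxEvent

last_model = transcript().find_last_event(ModelEvent)
if last_model is not None and last_model.working_time is not None:
    print(f"last model call used {last_model.working_time:.2f}s of working time")

last_sandbox = transcript().find_last_event(SandboxEvent)
if last_sandbox is not None:
    print(f"last sandbox action: {last_sandbox.action}")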
inspect_ai/model/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from inspect_ai._util.content import (
     Content,
     ContentAudio,
     ContentImage,
+    ContentReasoning,
     ContentText,
     ContentVideo,
 )
@@ -51,6 +52,7 @@ __all__ = [
     "CachePolicy",
     "ContentAudio",
     "ContentImage",
+    "ContentReasoning",
     "ContentText",
     "ContentVideo",
     "Content",
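With ContentReasoning now exported from inspect_ai.model, assistant messages can carry an explicit reasoning block ahead of their visible text. A minimal sketch (constructor arguments beyond content are left at their defaults):

# Illustrative sketch only: build an assistant message whose content holds a
# reasoning block followed by the visible answer text.
from inspect_ai.model import ChatMessageAssistant, ContentReasoning, ContentText

message = ChatMessageAssistant(
    content=[
        ContentReasoning(reasoning="First check the file, then summarize it."),
        ContentText(text="The file contains three sections."),
    ]
)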
inspect_ai/model/_call_tools.py
CHANGED
@@ -36,6 +36,7 @@ from inspect_ai._util.content import (
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.text import truncate_string_to_bytes
 from inspect_ai._util.trace import trace_action
+from inspect_ai._util.working import sample_waiting_time
 from inspect_ai.model._conversation import conversation_tool_mesage
 from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
 from inspect_ai.tool._tool import ToolApprovalError, ToolParsingError
@@ -180,6 +181,10 @@ async def call_tools(
         task = asyncio.create_task(call_tool_task(call))

         # create pending tool event and add it to the transcript
+        # (record the waiting time for the sample so we can compare
+        # it at the end to deduce total waiting time inside the tool
+        # call (in turn used to calculate working time)
+        waiting_time_start = sample_waiting_time()
         event = ToolEvent(
             id=call.id,
             function=call.function,
@@ -227,11 +232,13 @@ async def call_tools(
         conversation_tool_mesage(tool_message)

         # update the event with the results
+        waiting_time_end = sample_waiting_time()
         event._set_result(
             result=result_event.result,
             truncated=result_event.truncated,
             error=result_event.error,
             events=result_event.events,
+            waiting_time=waiting_time_end - waiting_time_start,
         )

         # return tool messages
@@ -407,7 +414,7 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
         return tuple(input)
     elif origin is dict or origin is Dict:
         if args and len(args) > 1:
-            return {k: tool_param(args[1], v) for k, v in input}
+            return {k: tool_param(args[1], v) for k, v in input.items()}
         else:
             return input
     elif origin is Union or origin is types.UnionType:
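The tool_param() change is a straight bug fix: iterating a dict directly yields only its keys, so unpacking k, v over the dict itself fails, while .items() yields the key/value pairs. A minimal illustration:

# Minimal illustration of the tool_param() fix above.
input = {"a": 1, "b": 2}

try:
    broken = {k: str(v) for k, v in input}  # unpacking keys raises ValueError
except ValueError as ex:
    print(ex)  # not enough values to unpack (expected 2, got 1)

fixed = {k: str(v) for k, v in input.items()}
print(fixed)  # {'a': '1', 'b': '2'}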
inspect_ai/model/_chat_message.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union

 from pydantic import BaseModel, Field, model_validator

-from inspect_ai._util.content import Content, ContentText
+from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError

@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
             self.content = text
         else:
             all_other = [content for content in self.content if content.type != "text"]
-            self.content = [ContentText(text=text)]
+            self.content = all_other + [ContentText(text=text)]


 class ChatMessageSystem(ChatMessageBase):
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
     tool_calls: list[ToolCall] | None = Field(default=None)
     """Tool calls made by the model."""

-    reasoning: str | None = Field(default=None)
-    """Reasoning content."""
-
     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
     # content, however since this field doesn't exist in the OpenAI interface,
     # hosting providers (so far we've seen this with Together and Groq) may
@@ -110,12 +107,30 @@ class ChatMessageAssistant(ChatMessageBase):
     @classmethod
     def extract_reasoning(cls, data: Any) -> Any:
         if isinstance(data, dict):
+            # cleave apart <think> blocks
             content = data.get("content", None)
             if isinstance(content, str):
                 parsed = parse_content_with_reasoning(content)
                 if parsed:
-                    data["
-
+                    data["content"] = [
+                        ContentReasoning(reasoning=parsed.reasoning),
+                        ContentText(text=parsed.content),
+                    ]
+            # migrate messages that has explicit 'reasoning' field
+            # (which was our original representation of reasoning)
+            reasoning = data.get("reasoning", None)
+            if isinstance(reasoning, str):
+                # ensure that content is a list
+                content = data.get("content", None)
+                if content is None:
+                    data["content"] = []
+                elif isinstance(content, str):
+                    data["content"] = [ContentText(text=content)]
+                elif not isinstance(content, list):
+                    data["content"] = []
+                data["content"].insert(0, ContentReasoning(reasoning=reasoning))
+
+                del data["reasoning"]
         return data
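The extract_reasoning validator above both splits <think> blocks out of string content and migrates the legacy top-level reasoning field into a leading ContentReasoning block. A sketch of the migration path, assuming the validator runs in "before" mode on ChatMessageAssistant as in this release:

# Illustrative sketch only: validate a legacy message dict that still carries
# the old 'reasoning' field and observe the migrated content structure.
from inspect_ai.model import ChatMessageAssistant, ContentReasoning

legacy = {
    "role": "assistant",
    "content": "The answer is 42.",
    "reasoning": "Multiply 6 by 7.",
}

message = ChatMessageAssistant.model_validate(legacy)
# content is now a list: a ContentReasoning block first, then the text
assert isinstance(message.content[0], ContentReasoning)
assert message.content[0].reasoning == "Multiply 6 by 7."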
inspect_ai/model/_conversation.py
CHANGED
@@ -1,6 +1,7 @@
 from rich.console import RenderableType
 from rich.text import Text

+from inspect_ai._util.content import ContentReasoning, ContentText
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
 from inspect_ai.util._conversation import conversation_panel
@@ -19,7 +20,7 @@ def conversation_tool_mesage(message: ChatMessageTool) -> None:
         message.error.message.strip() if message.error else message.text.strip()
     )
     if output:
-        content = lines_display(output,
+        content = lines_display(output, 50)

         conversation_panel(
             title=f"Tool Output: {message.function}",
@@ -41,14 +42,15 @@ def conversation_assistant_message(
     # build content
     content: list[RenderableType] = []

-    #
-    if message.
-        content.extend(
-
-
-
-
-
+    # deal with plain text or with content blocks
+    if isinstance(message.content, str):
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
+    else:
+        for c in message.content:
+            if isinstance(c, ContentReasoning):
+                content.extend(transcript_reasoning(c))
+            elif isinstance(c, ContentText) and c.text:
+                content.extend([transcript_markdown(c.text.strip(), escape=True)])

     # print tool calls
     if message.tool_calls:
inspect_ai/model/_generate_config.py
CHANGED
@@ -1,8 +1,8 @@
 from contextvars import ContextVar
 from copy import deepcopy
-from typing import Literal, Union
+from typing import Any, Literal, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing_extensions import TypedDict


@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
     reasoning_effort: Literal["low", "medium", "high"] | None
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""

-
+    reasoning_tokens: int | None
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None
     """Include reasoning in chat message history sent to generate."""


@@ -148,9 +151,27 @@ class GenerateConfig(BaseModel):
     reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
     """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""

-
+    reasoning_tokens: int | None = Field(default=None)
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
+        default=None
+    )
     """Include reasoning in chat message history sent to generate."""

+    # migrate reasoning_history as a bool
+    @model_validator(mode="before")
+    @classmethod
+    def migrate_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            reasoning_history = data.get("reasoning_history", None)
+            if reasoning_history is True:
+                data["reasoning_history"] = "all"
+            elif reasoning_history is False:
+                data["reasoning_history"] = "none"
+
+        return data
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":