inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import LOG_SCHEMA_VERSION
16
16
  from inspect_ai._util.content import (
17
17
  ContentAudio,
18
18
  ContentImage,
19
+ ContentReasoning,
19
20
  ContentText,
20
21
  ContentVideo,
21
22
  )
@@ -252,7 +253,11 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
252
253
  for message in inputs:
253
254
  if not isinstance(message.content, str):
254
255
  filtered_content: list[
255
- ContentText | ContentImage | ContentAudio | ContentVideo
256
+ ContentText
257
+ | ContentReasoning
258
+ | ContentImage
259
+ | ContentAudio
260
+ | ContentVideo
256
261
  ] = []
257
262
  for content in message.content:
258
263
  if content.type == "text":
@@ -23,6 +23,7 @@ class ActiveSample:
23
23
  message_limit: int | None,
24
24
  token_limit: int | None,
25
25
  time_limit: int | None,
26
+ working_limit: int | None,
26
27
  fails_on_error: bool,
27
28
  transcript: Transcript,
28
29
  sandboxes: dict[str, SandboxConnection],
@@ -37,6 +38,7 @@ class ActiveSample:
37
38
  self.message_limit = message_limit
38
39
  self.token_limit = token_limit
39
40
  self.time_limit = time_limit
41
+ self.working_limit = working_limit
40
42
  self.fails_on_error = fails_on_error
41
43
  self.total_messages = 0
42
44
  self.total_tokens = 0
@@ -45,7 +47,7 @@ class ActiveSample:
45
47
  self._interrupt_action: Literal["score", "error"] | None = None
46
48
 
47
49
  @property
48
- def execution_time(self) -> float:
50
+ def running_time(self) -> float:
49
51
  if self.started is not None:
50
52
  completed = (
51
53
  self.completed
@@ -78,6 +80,7 @@ async def active_sample(
78
80
  message_limit: int | None,
79
81
  token_limit: int | None,
80
82
  time_limit: int | None,
83
+ working_limit: int | None,
81
84
  fails_on_error: bool,
82
85
  transcript: Transcript,
83
86
  ) -> AsyncGenerator[ActiveSample, None]:
@@ -90,6 +93,7 @@ async def active_sample(
90
93
  message_limit=message_limit,
91
94
  token_limit=token_limit,
92
95
  time_limit=time_limit,
96
+ working_limit=working_limit,
93
97
  sandboxes=await sandbox_connections(),
94
98
  fails_on_error=fails_on_error,
95
99
  transcript=transcript,
@@ -8,7 +8,9 @@ from typing import (
8
8
  Iterator,
9
9
  Literal,
10
10
  Sequence,
11
+ Type,
11
12
  TypeAlias,
13
+ TypeVar,
12
14
  Union,
13
15
  )
14
16
 
@@ -17,6 +19,7 @@ from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
17
19
  from inspect_ai._util.constants import SAMPLE_SUBTASK
18
20
  from inspect_ai._util.error import EvalError
19
21
  from inspect_ai._util.json import JsonChange, json_changes
22
+ from inspect_ai._util.working import sample_working_time
20
23
  from inspect_ai.dataset._dataset import Sample
21
24
  from inspect_ai.log._message import LoggingMessage
22
25
  from inspect_ai.model._chat_message import ChatMessage
@@ -41,7 +44,10 @@ logger = getLogger(__name__)
41
44
 
42
45
  class BaseEvent(BaseModel):
43
46
  timestamp: datetime = Field(default_factory=datetime.now)
44
- """Time at which event occurred."""
47
+ """Clock time at which event occurred."""
48
+
49
+ working_start: float = Field(default_factory=sample_working_time)
50
+ """Working time (within sample) at which the event occurred."""
45
51
 
46
52
  pending: bool | None = Field(default=None)
47
53
  """Is this event pending?"""
@@ -70,7 +76,7 @@ class SampleLimitEvent(BaseEvent):
70
76
  event: Literal["sample_limit"] = Field(default="sample_limit")
71
77
  """Event type."""
72
78
 
73
- type: Literal["message", "time", "token", "operator", "custom"]
79
+ type: Literal["message", "time", "working", "token", "operator", "custom"]
74
80
  """Type of limit that halted processing"""
75
81
 
76
82
  message: str
@@ -133,6 +139,18 @@ class ModelEvent(BaseEvent):
133
139
  call: ModelCall | None = Field(default=None)
134
140
  """Raw call made to model API."""
135
141
 
142
+ completed: datetime | None = Field(default=None)
143
+ """Time that model call completed (see `timestamp` for started)"""
144
+
145
+ working_time: float | None = Field(default=None)
146
+ """working time for model call that succeeded (i.e. was not retried)."""
147
+
148
+ @field_serializer("completed")
149
+ def serialize_completed(self, dt: datetime) -> str:
150
+ if dt is None:
151
+ return None
152
+ return dt.astimezone().isoformat()
153
+
136
154
 
137
155
  class ToolEvent(BaseEvent):
138
156
  """Call to a tool."""
@@ -167,18 +185,28 @@ class ToolEvent(BaseEvent):
167
185
  events: list["Event"] = Field(default_factory=list)
168
186
  """Transcript of events for tool."""
169
187
 
188
+ completed: datetime | None = Field(default=None)
189
+ """Time that tool call completed (see `timestamp` for started)"""
190
+
191
+ working_time: float | None = Field(default=None)
192
+ """Working time for tool call (i.e. time not spent waiting on semaphores)."""
193
+
170
194
  def _set_result(
171
195
  self,
172
196
  result: ToolResult,
173
197
  truncated: tuple[int, int] | None,
174
198
  error: ToolCallError | None,
175
199
  events: list["Event"],
200
+ waiting_time: float,
176
201
  ) -> None:
177
202
  self.result = result
178
203
  self.truncated = truncated
179
204
  self.error = error
180
205
  self.events = events
181
206
  self.pending = None
207
+ completed = datetime.now()
208
+ self.completed = completed
209
+ self.working_time = (completed - self.timestamp).total_seconds() - waiting_time
182
210
 
183
211
  # mechanism for operator to cancel the tool call
184
212
 
@@ -206,6 +234,45 @@ class ToolEvent(BaseEvent):
206
234
  model_config = ConfigDict(arbitrary_types_allowed=True)
207
235
  """Required so that we can include '_task' as a member."""
208
236
 
237
+ @field_serializer("completed")
238
+ def serialize_completed(self, dt: datetime) -> str:
239
+ return dt.astimezone().isoformat()
240
+
241
+
242
+ class SandboxEvent(BaseEvent):
243
+ """Sandbox execution or I/O"""
244
+
245
+ event: Literal["sandbox"] = Field(default="sandbox")
246
+ """Event type"""
247
+
248
+ action: Literal["exec", "read_file", "write_file"]
249
+ """Sandbox action"""
250
+
251
+ cmd: str | None = Field(default=None)
252
+ """Command (for exec)"""
253
+
254
+ options: dict[str, JsonValue] | None = Field(default=None)
255
+ """Options (for exec)"""
256
+
257
+ file: str | None = Field(default=None)
258
+ """File (for read_file and write_file)"""
259
+
260
+ input: str | None = Field(default=None)
261
+ """Input (for cmd and write_file). Truncated to 100 lines."""
262
+
263
+ result: int | None = Field(default=None)
264
+ """Result (for exec)"""
265
+
266
+ output: str | None = Field(default=None)
267
+ """Output (for exec and read_file). Truncated to 100 lines."""
268
+
269
+ completed: datetime | None = Field(default=None)
270
+ """Time that sandbox action completed (see `timestamp` for started)"""
271
+
272
+ @field_serializer("completed")
273
+ def serialize_completed(self, dt: datetime) -> str:
274
+ return dt.astimezone().isoformat()
275
+
209
276
 
210
277
  class ApprovalEvent(BaseEvent):
211
278
  """Tool approval."""
@@ -338,14 +405,26 @@ class SubtaskEvent(BaseEvent):
338
405
  events: list["Event"] = Field(default_factory=list)
339
406
  """Transcript of events for subtask."""
340
407
 
408
+ completed: datetime | None = Field(default=None)
409
+ """Time that subtask completed (see `timestamp` for started)"""
410
+
411
+ working_time: float | None = Field(default=None)
412
+ """Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""
413
+
414
+ @field_serializer("completed")
415
+ def serialize_completed(self, dt: datetime) -> str:
416
+ return dt.astimezone().isoformat()
417
+
341
418
 
342
419
  Event: TypeAlias = Union[
343
420
  SampleInitEvent
344
421
  | SampleLimitEvent
422
+ | SandboxEvent
345
423
  | StateEvent
346
424
  | StoreEvent
347
425
  | ModelEvent
348
426
  | ToolEvent
427
+ | SandboxEvent
349
428
  | ApprovalEvent
350
429
  | InputEvent
351
430
  | ScoreEvent
@@ -357,6 +436,8 @@ Event: TypeAlias = Union[
357
436
  ]
358
437
  """Event in a transcript."""
359
438
 
439
+ ET = TypeVar("ET", bound=BaseEvent)
440
+
360
441
 
361
442
  class Transcript:
362
443
  """Transcript of events."""
@@ -396,6 +477,12 @@ class Transcript:
396
477
  def events(self) -> Sequence[Event]:
397
478
  return self._events
398
479
 
480
+ def find_last_event(self, event_cls: Type[ET]) -> ET | None:
481
+ for event in reversed(self.events):
482
+ if isinstance(event, event_cls):
483
+ return event
484
+ return None
485
+
399
486
  def _event(self, event: Event) -> None:
400
487
  self._events.append(event)
401
488
 
@@ -4,6 +4,7 @@ from inspect_ai._util.content import (
4
4
  Content,
5
5
  ContentAudio,
6
6
  ContentImage,
7
+ ContentReasoning,
7
8
  ContentText,
8
9
  ContentVideo,
9
10
  )
@@ -51,6 +52,7 @@ __all__ = [
51
52
  "CachePolicy",
52
53
  "ContentAudio",
53
54
  "ContentImage",
55
+ "ContentReasoning",
54
56
  "ContentText",
55
57
  "ContentVideo",
56
58
  "Content",
@@ -36,6 +36,7 @@ from inspect_ai._util.content import (
36
36
  from inspect_ai._util.format import format_function_call
37
37
  from inspect_ai._util.text import truncate_string_to_bytes
38
38
  from inspect_ai._util.trace import trace_action
39
+ from inspect_ai._util.working import sample_waiting_time
39
40
  from inspect_ai.model._conversation import conversation_tool_mesage
40
41
  from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
41
42
  from inspect_ai.tool._tool import ToolApprovalError, ToolParsingError
@@ -180,6 +181,10 @@ async def call_tools(
180
181
  task = asyncio.create_task(call_tool_task(call))
181
182
 
182
183
  # create pending tool event and add it to the transcript
184
+ # (record the waiting time for the sample so we can compare
185
+ # it at the end to deduce total waiting time inside the tool
186
+ # call (in turn used to calculate working time)
187
+ waiting_time_start = sample_waiting_time()
183
188
  event = ToolEvent(
184
189
  id=call.id,
185
190
  function=call.function,
@@ -227,11 +232,13 @@ async def call_tools(
227
232
  conversation_tool_mesage(tool_message)
228
233
 
229
234
  # update the event with the results
235
+ waiting_time_end = sample_waiting_time()
230
236
  event._set_result(
231
237
  result=result_event.result,
232
238
  truncated=result_event.truncated,
233
239
  error=result_event.error,
234
240
  events=result_event.events,
241
+ waiting_time=waiting_time_end - waiting_time_start,
235
242
  )
236
243
 
237
244
  # return tool messages
@@ -407,7 +414,7 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
407
414
  return tuple(input)
408
415
  elif origin is dict or origin is Dict:
409
416
  if args and len(args) > 1:
410
- return {k: tool_param(args[1], v) for k, v in input}
417
+ return {k: tool_param(args[1], v) for k, v in input.items()}
411
418
  else:
412
419
  return input
413
420
  elif origin is Union or origin is types.UnionType:
@@ -3,7 +3,7 @@ from typing import Any, Literal, Type, Union
3
3
 
4
4
  from pydantic import BaseModel, Field, model_validator
5
5
 
6
- from inspect_ai._util.content import Content, ContentText
6
+ from inspect_ai._util.content import Content, ContentReasoning, ContentText
7
7
  from inspect_ai.tool import ToolCall
8
8
  from inspect_ai.tool._tool_call import ToolCallError
9
9
 
@@ -64,7 +64,7 @@ class ChatMessageBase(BaseModel):
64
64
  self.content = text
65
65
  else:
66
66
  all_other = [content for content in self.content if content.type != "text"]
67
- self.content = [ContentText(text=text)] + all_other
67
+ self.content = all_other + [ContentText(text=text)]
68
68
 
69
69
 
70
70
  class ChatMessageSystem(ChatMessageBase):
@@ -93,9 +93,6 @@ class ChatMessageAssistant(ChatMessageBase):
93
93
  tool_calls: list[ToolCall] | None = Field(default=None)
94
94
  """Tool calls made by the model."""
95
95
 
96
- reasoning: str | None = Field(default=None)
97
- """Reasoning content."""
98
-
99
96
  # Some OpenAI compatible REST endpoints include reasoning as a field alongside
100
97
  # content, however since this field doesn't exist in the OpenAI interface,
101
98
  # hosting providers (so far we've seen this with Together and Groq) may
@@ -110,12 +107,30 @@ class ChatMessageAssistant(ChatMessageBase):
110
107
  @classmethod
111
108
  def extract_reasoning(cls, data: Any) -> Any:
112
109
  if isinstance(data, dict):
110
+ # cleave apart <think> blocks
113
111
  content = data.get("content", None)
114
112
  if isinstance(content, str):
115
113
  parsed = parse_content_with_reasoning(content)
116
114
  if parsed:
117
- data["reasoning"] = parsed.reasoning
118
- data["content"] = parsed.content
115
+ data["content"] = [
116
+ ContentReasoning(reasoning=parsed.reasoning),
117
+ ContentText(text=parsed.content),
118
+ ]
119
+ # migrate messages that has explicit 'reasoning' field
120
+ # (which was our original representation of reasoning)
121
+ reasoning = data.get("reasoning", None)
122
+ if isinstance(reasoning, str):
123
+ # ensure that content is a list
124
+ content = data.get("content", None)
125
+ if content is None:
126
+ data["content"] = []
127
+ elif isinstance(content, str):
128
+ data["content"] = [ContentText(text=content)]
129
+ elif not isinstance(content, list):
130
+ data["content"] = []
131
+ data["content"].insert(0, ContentReasoning(reasoning=reasoning))
132
+
133
+ del data["reasoning"]
119
134
  return data
120
135
 
121
136
 
@@ -1,6 +1,7 @@
1
1
  from rich.console import RenderableType
2
2
  from rich.text import Text
3
3
 
4
+ from inspect_ai._util.content import ContentReasoning, ContentText
4
5
  from inspect_ai._util.rich import lines_display
5
6
  from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
6
7
  from inspect_ai.util._conversation import conversation_panel
@@ -19,7 +20,7 @@ def conversation_tool_mesage(message: ChatMessageTool) -> None:
19
20
  message.error.message.strip() if message.error else message.text.strip()
20
21
  )
21
22
  if output:
22
- content = lines_display(output, 100)
23
+ content = lines_display(output, 50)
23
24
 
24
25
  conversation_panel(
25
26
  title=f"Tool Output: {message.function}",
@@ -41,14 +42,15 @@ def conversation_assistant_message(
41
42
  # build content
42
43
  content: list[RenderableType] = []
43
44
 
44
- # reasoning
45
- if message.reasoning:
46
- content.extend(transcript_reasoning(message.reasoning))
47
-
48
- # message text
49
- content.extend(
50
- [transcript_markdown(message.text, escape=True)] if message.text else []
51
- )
45
+ # deal with plain text or with content blocks
46
+ if isinstance(message.content, str):
47
+ content.extend([transcript_markdown(message.text.strip(), escape=True)])
48
+ else:
49
+ for c in message.content:
50
+ if isinstance(c, ContentReasoning):
51
+ content.extend(transcript_reasoning(c))
52
+ elif isinstance(c, ContentText) and c.text:
53
+ content.extend([transcript_markdown(c.text.strip(), escape=True)])
52
54
 
53
55
  # print tool calls
54
56
  if message.tool_calls:
@@ -1,8 +1,8 @@
1
1
  from contextvars import ContextVar
2
2
  from copy import deepcopy
3
- from typing import Literal, Union
3
+ from typing import Any, Literal, Union
4
4
 
5
- from pydantic import BaseModel, Field
5
+ from pydantic import BaseModel, Field, model_validator
6
6
  from typing_extensions import TypedDict
7
7
 
8
8
 
@@ -75,7 +75,10 @@ class GenerateConfigArgs(TypedDict, total=False):
75
75
  reasoning_effort: Literal["low", "medium", "high"] | None
76
76
  """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
77
77
 
78
- reasoning_history: bool | None
78
+ reasoning_tokens: int | None
79
+ """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
80
+
81
+ reasoning_history: Literal["none", "all", "last", "auto"] | None
79
82
  """Include reasoning in chat message history sent to generate."""
80
83
 
81
84
 
@@ -148,9 +151,27 @@ class GenerateConfig(BaseModel):
148
151
  reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
149
152
  """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
150
153
 
151
- reasoning_history: bool | None = Field(default=None)
154
+ reasoning_tokens: int | None = Field(default=None)
155
+ """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
156
+
157
+ reasoning_history: Literal["none", "all", "last", "auto"] | None = Field(
158
+ default=None
159
+ )
152
160
  """Include reasoning in chat message history sent to generate."""
153
161
 
162
+ # migrate reasoning_history as a bool
163
+ @model_validator(mode="before")
164
+ @classmethod
165
+ def migrate_reasoning(cls, data: Any) -> Any:
166
+ if isinstance(data, dict):
167
+ reasoning_history = data.get("reasoning_history", None)
168
+ if reasoning_history is True:
169
+ data["reasoning_history"] = "all"
170
+ elif reasoning_history is False:
171
+ data["reasoning_history"] = "none"
172
+
173
+ return data
174
+
154
175
  def merge(
155
176
  self, other: Union["GenerateConfig", GenerateConfigArgs]
156
177
  ) -> "GenerateConfig":