inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +281 -153
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_common.py +117 -58
  157. inspect_ai/tool/_tools/_computer/_computer.py +80 -57
  158. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
  159. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
  160. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
  161. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
  162. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
  163. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
  164. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  165. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
  166. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
  167. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
  168. inspect_ai/tool/_tools/_computer/test_args.py +151 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  173. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  174. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  180. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  184. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  185. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  186. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  187. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  191. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  192. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  196. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  197. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  198. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  199. inspect_ai/util/__init__.py +2 -1
  200. inspect_ai/util/_display.py +12 -0
  201. inspect_ai/util/_sandbox/events.py +55 -21
  202. inspect_ai/util/_sandbox/self_check.py +131 -43
  203. inspect_ai/util/_subtask.py +11 -0
  204. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
  205. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
  207. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  208. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  209. inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
  210. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  211. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  212. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  214. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  215. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  216. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  217. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
  218. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
  219. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,12 @@
1
1
  import os
2
+ import socket
2
3
  from logging import getLogger
3
4
  from typing import Any
4
5
 
6
+ import httpx
5
7
  from openai import (
8
+ DEFAULT_CONNECTION_LIMITS,
9
+ DEFAULT_TIMEOUT,
6
10
  APIConnectionError,
7
11
  APITimeoutError,
8
12
  AsyncAzureOpenAI,
@@ -102,6 +106,9 @@ class OpenAIAPI(ModelAPI):
102
106
  ],
103
107
  )
104
108
 
109
+ # create async http client
110
+ http_client = OpenAIAsyncHttpxClient()
111
+
105
112
  # azure client
106
113
  if self.is_azure():
107
114
  # resolve base_url
@@ -126,6 +133,7 @@ class OpenAIAPI(ModelAPI):
126
133
  max_retries=(
127
134
  config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
128
135
  ),
136
+ http_client=http_client,
129
137
  **model_args,
130
138
  )
131
139
  else:
@@ -135,6 +143,7 @@ class OpenAIAPI(ModelAPI):
135
143
  max_retries=(
136
144
  config.max_retries if config.max_retries else DEFAULT_MAX_RETRIES
137
145
  ),
146
+ http_client=http_client,
138
147
  **model_args,
139
148
  )
140
149
 
@@ -231,6 +240,16 @@ class OpenAIAPI(ModelAPI):
231
240
  ModelUsage(
232
241
  input_tokens=completion.usage.prompt_tokens,
233
242
  output_tokens=completion.usage.completion_tokens,
243
+ input_tokens_cache_read=(
244
+ completion.usage.prompt_tokens_details.cached_tokens
245
+ if completion.usage.prompt_tokens_details is not None
246
+ else None # openai only have cache read stats/pricing.
247
+ ),
248
+ reasoning_tokens=(
249
+ completion.usage.completion_tokens_details.reasoning_tokens
250
+ if completion.usage.completion_tokens_details is not None
251
+ else None
252
+ ),
234
253
  total_tokens=completion.usage.total_tokens,
235
254
  )
236
255
  if completion.usage
@@ -250,10 +269,8 @@ class OpenAIAPI(ModelAPI):
250
269
  def is_rate_limit(self, ex: BaseException) -> bool:
251
270
  if isinstance(ex, RateLimitError):
252
271
  # Do not retry on these rate limit errors
253
- if (
254
- "Request too large" not in ex.message
255
- and "You exceeded your current quota" not in ex.message
256
- ):
272
+ # The quota exceeded one is related to monthly account quotas.
273
+ if "You exceeded your current quota" not in ex.message:
257
274
  return True
258
275
  elif isinstance(
259
276
  ex, (APIConnectionError | APITimeoutError | InternalServerError)
@@ -342,3 +359,39 @@ class OpenAIAPI(ModelAPI):
342
359
  )
343
360
  else:
344
361
  return e
362
+
363
+
364
+ class OpenAIAsyncHttpxClient(httpx.AsyncClient):
365
+ """Custom async client that deals better with long running Async requests.
366
+
367
+ Based on Anthropic DefaultAsyncHttpClient implementation that they
368
+ released along with Claude 3.7 as well as the OpenAI DefaultAsyncHttpxClient
369
+
370
+ """
371
+
372
+ def __init__(self, **kwargs: Any) -> None:
373
+ # This is based on the openai DefaultAsyncHttpxClient:
374
+ # https://github.com/openai/openai-python/commit/347363ed67a6a1611346427bb9ebe4becce53f7e
375
+ kwargs.setdefault("timeout", DEFAULT_TIMEOUT)
376
+ kwargs.setdefault("limits", DEFAULT_CONNECTION_LIMITS)
377
+ kwargs.setdefault("follow_redirects", True)
378
+
379
+ # This is based on the anthropic changes for claude 3.7:
380
+ # https://github.com/anthropics/anthropic-sdk-python/commit/c5387e69e799f14e44006ea4e54fdf32f2f74393#diff-3acba71f89118b06b03f2ba9f782c49ceed5bb9f68d62727d929f1841b61d12bR1387-R1403
381
+
382
+ # set socket options to deal with long running reasoning requests
383
+ socket_options = [
384
+ (socket.SOL_SOCKET, socket.SO_KEEPALIVE, True),
385
+ (socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 60),
386
+ (socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 5),
387
+ ]
388
+ TCP_KEEPIDLE = getattr(socket, "TCP_KEEPIDLE", None)
389
+ if TCP_KEEPIDLE is not None:
390
+ socket_options.append((socket.IPPROTO_TCP, TCP_KEEPIDLE, 60))
391
+
392
+ kwargs["transport"] = httpx.AsyncHTTPTransport(
393
+ limits=DEFAULT_CONNECTION_LIMITS,
394
+ socket_options=socket_options,
395
+ )
396
+
397
+ super().__init__(**kwargs)
@@ -69,6 +69,16 @@ async def generate_o1(
69
69
  usage=ModelUsage(
70
70
  input_tokens=completion.usage.prompt_tokens,
71
71
  output_tokens=completion.usage.completion_tokens,
72
+ input_tokens_cache_read=(
73
+ completion.usage.prompt_tokens_details.cached_tokens
74
+ if completion.usage.prompt_tokens_details is not None
75
+ else None # openai only have cache read stats/pricing.
76
+ ),
77
+ reasoning_tokens=(
78
+ completion.usage.completion_tokens_details.reasoning_tokens
79
+ if completion.usage.completion_tokens_details is not None
80
+ else None
81
+ ),
72
82
  total_tokens=completion.usage.total_tokens,
73
83
  )
74
84
  if completion.usage
@@ -48,7 +48,7 @@ def openai() -> type[ModelAPI]:
48
48
  def anthropic() -> type[ModelAPI]:
49
49
  FEATURE = "Anthropic API"
50
50
  PACKAGE = "anthropic"
51
- MIN_VERSION = "0.29.0"
51
+ MIN_VERSION = "0.47.1"
52
52
 
53
53
  # verify we have the package
54
54
  try:
@@ -5,13 +5,26 @@ from typing import NamedTuple
5
5
  class ContentWithReasoning(NamedTuple):
6
6
  content: str
7
7
  reasoning: str
8
+ signature: str | None = None
9
+ redacted: bool = False
8
10
 
9
11
 
10
12
  def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
11
- match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
13
+ # Match <think> tag with optional attributes
14
+ pattern = r'\s*<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>(.*)'
15
+ match = re.match(pattern, content, re.DOTALL)
16
+
12
17
  if match:
18
+ signature = match.group(1) # This will be None if not present
19
+ redacted_value = match.group(2) # This will be "true" or None
20
+ reasoning = match.group(3).strip()
21
+ content_text = match.group(4).strip()
22
+
13
23
  return ContentWithReasoning(
14
- content=match.group(2).strip(), reasoning=match.group(1).strip()
24
+ content=content_text,
25
+ reasoning=reasoning,
26
+ signature=signature,
27
+ redacted=redacted_value == "true",
15
28
  )
16
29
  else:
17
30
  return None
@@ -274,25 +274,29 @@ def chat_history(state: TaskState) -> str:
274
274
 
275
275
  # begin history with text of first message (it will come right after
276
276
  # 'Task' or 'Question' in the template)
277
- history: list[str] = [messages[0].text]
278
-
279
- # for subsequent messages present with e.g. Assistant: {message.text}
280
- for message in messages[1:]:
281
- if isinstance(message, ChatMessageUser):
282
- history.append(f"User: {message.text}")
283
- elif isinstance(message, ChatMessageAssistant):
284
- assistant_message = [message.text] if message.text else []
285
- if message.tool_calls:
286
- assistant_message.extend(
287
- [
288
- format_function_call(tool_call.function, tool_call.arguments)
289
- for tool_call in message.tool_calls
290
- ]
277
+ history: list[str] = []
278
+ if len(messages) > 0:
279
+ history.append(messages[0].text)
280
+
281
+ # for subsequent messages present with e.g. Assistant: {message.text}
282
+ for message in messages[1:]:
283
+ if isinstance(message, ChatMessageUser):
284
+ history.append(f"User: {message.text}")
285
+ elif isinstance(message, ChatMessageAssistant):
286
+ assistant_message = [message.text] if message.text else []
287
+ if message.tool_calls:
288
+ assistant_message.extend(
289
+ [
290
+ format_function_call(
291
+ tool_call.function, tool_call.arguments
292
+ )
293
+ for tool_call in message.tool_calls
294
+ ]
295
+ )
296
+ history.append("Assistant: " + "\n\n".join(assistant_message))
297
+ elif isinstance(message, ChatMessageTool):
298
+ history.append(
299
+ f"Tool ({message.function}): {message.tool_error or ''}{message.text}"
291
300
  )
292
- history.append("Assistant: " + "\n\n".join(assistant_message))
293
- elif isinstance(message, ChatMessageTool):
294
- history.append(
295
- f"Tool ({message.function}): {message.tool_error or ''}{message.text}"
296
- )
297
301
 
298
302
  return "\n\n".join(history)
@@ -1,6 +1,8 @@
1
1
  import asyncio
2
+ from typing import cast
2
3
 
3
4
  from inspect_ai.util import display_type, input_panel, sandbox
5
+ from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
4
6
 
5
7
  from .._solver import Generate, Solver, solver
6
8
  from .._task_state import TaskState
@@ -56,19 +58,21 @@ def human_agent(
56
58
 
57
59
  # helper function to run the agent (called for fullscreen vs. fallback below)
58
60
  async def run_human_agent(view: HumanAgentView) -> TaskState:
59
- # create agent commands
60
- commands = human_agent_commands(
61
- state, answer, intermediate_scoring, record_session
62
- )
61
+ sandbox_proxy = cast(SandboxEnvironmentProxy, sandbox())
62
+ with sandbox_proxy.no_events():
63
+ # create agent commands
64
+ commands = human_agent_commands(
65
+ state, answer, intermediate_scoring, record_session
66
+ )
63
67
 
64
- # install agent tools
65
- await install_human_agent(state, commands, record_session)
68
+ # install agent tools
69
+ await install_human_agent(state, commands, record_session)
66
70
 
67
- # hookup the view ui
68
- view.connect(connection)
71
+ # hookup the view ui
72
+ view.connect(connection)
69
73
 
70
- # run sandbox service
71
- return await run_human_agent_service(state, commands, view)
74
+ # run sandbox service
75
+ return await run_human_agent_service(state, commands, view)
72
76
 
73
77
  # support both fullscreen ui and fallback
74
78
  if display_type() == "full":
@@ -6,7 +6,7 @@ from .instructions import InstructionsCommand
6
6
  from .note import NoteCommand
7
7
  from .score import ScoreCommand
8
8
  from .status import StatusCommand
9
- from .submit import SubmitCommand, ValidateCommand
9
+ from .submit import QuitCommand, SubmitCommand, ValidateCommand
10
10
 
11
11
 
12
12
  def human_agent_commands(
@@ -15,8 +15,12 @@ def human_agent_commands(
15
15
  intermediate_scoring: bool,
16
16
  record_session: bool,
17
17
  ) -> list[HumanAgentCommand]:
18
- # base submit and validate
19
- commands = [SubmitCommand(record_session), ValidateCommand(answer)]
18
+ # base submit, validate, and quit
19
+ commands = [
20
+ SubmitCommand(record_session),
21
+ ValidateCommand(answer),
22
+ QuitCommand(record_session),
23
+ ]
20
24
 
21
25
  # optional intermediate scoring
22
26
  if intermediate_scoring:
@@ -16,22 +16,89 @@ from .command import HumanAgentCommand, call_human_agent
16
16
  logger = getLogger(__name__)
17
17
 
18
18
 
19
- class SubmitCommand(HumanAgentCommand):
19
+ class SessionEndCommand(HumanAgentCommand):
20
20
  def __init__(self, record_session: bool):
21
21
  super().__init__()
22
22
  self._record_session = record_session
23
23
 
24
+ @property
25
+ def group(self) -> Literal[1, 2, 3]:
26
+ return 1
27
+
28
+ async def _read_session_logs(self) -> dict[str, str]:
29
+ # retrieve session logs (don't fail)
30
+ sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
31
+ result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
32
+ if not result.success:
33
+ logger.warning(f"Error listing human agent session logs: {result.stderr}")
34
+ return {}
35
+
36
+ # read logs
37
+ session_logs: dict[str, str] = {}
38
+ for session_log in result.stdout.strip().splitlines():
39
+ try:
40
+ session_logs[session_log] = await sandbox().read_file(
41
+ (sessions_dir / session_log).as_posix()
42
+ )
43
+ except Exception as ex:
44
+ logger.warning(f"Error reading human agent session log: {ex}")
45
+
46
+ return session_logs
47
+
48
+
49
+ class QuitCommand(SessionEndCommand):
24
50
  @property
25
51
  def name(self) -> str:
26
- return "submit"
52
+ return "quit"
27
53
 
28
54
  @property
29
55
  def description(self) -> str:
30
- return "Submit your final answer for the task."
56
+ return "Quit the task without submitting an answer."
57
+
58
+ def cli(self, args: Namespace) -> None:
59
+ # verify that the user wants to proceed
60
+ action = "quit the task without submitting an answer (ending the exercise)"
61
+ while True:
62
+ response = (
63
+ input(
64
+ f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
65
+ )
66
+ .lower()
67
+ .strip()
68
+ )
69
+ if response in ["yes", "y"]:
70
+ break
71
+ elif response in ["no", "n"]:
72
+ return
73
+ else:
74
+ print("Please enter yes or no.")
31
75
 
76
+ # thank the user!
77
+ print(
78
+ "\nThank you for working on this task!\n\n"
79
+ + "Your task will now be scored and you will be disconnected from this container.\n"
80
+ )
81
+
82
+ call_human_agent("quit")
83
+
84
+ def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
85
+ async def submit() -> None:
86
+ if self._record_session:
87
+ state.logs = await self._read_session_logs()
88
+ state.running = False
89
+ state.answer = ""
90
+
91
+ return submit
92
+
93
+
94
+ class SubmitCommand(SessionEndCommand):
32
95
  @property
33
- def group(self) -> Literal[1, 2, 3]:
34
- return 1
96
+ def name(self) -> str:
97
+ return "submit"
98
+
99
+ @property
100
+ def description(self) -> str:
101
+ return "Submit your final answer for the task."
35
102
 
36
103
  @property
37
104
  def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
@@ -55,10 +122,12 @@ class SubmitCommand(HumanAgentCommand):
55
122
  # verify that the user wants to proceed
56
123
  answer = call_args.get("answer", None)
57
124
  answer_text = f" '{answer}'" if answer else ""
125
+ action = f"end the task and submit{answer_text}"
126
+
58
127
  while True:
59
128
  response = (
60
129
  input(
61
- f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
130
+ f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
62
131
  )
63
132
  .lower()
64
133
  .strip()
@@ -76,13 +145,10 @@ class SubmitCommand(HumanAgentCommand):
76
145
  + "Your task will now be scored and you will be disconnected from this container.\n"
77
146
  )
78
147
 
79
- # submit the task
80
148
  call_human_agent("submit", **call_args)
81
149
 
82
150
  def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
83
- async def submit(
84
- answer: str | None, session_logs: dict[str, str] | None = None
85
- ) -> None:
151
+ async def submit(answer: str) -> None:
86
152
  if self._record_session:
87
153
  state.logs = await self._read_session_logs()
88
154
  state.running = False
@@ -90,26 +156,6 @@ class SubmitCommand(HumanAgentCommand):
90
156
 
91
157
  return submit
92
158
 
93
- async def _read_session_logs(self) -> dict[str, str]:
94
- # retrieve session logs (don't fail)
95
- sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
96
- result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
97
- if not result.success:
98
- logger.warning(f"Error listing human agent session logs: {result.stderr}")
99
- return {}
100
-
101
- # read logs
102
- session_logs: dict[str, str] = {}
103
- for session_log in result.stdout.strip().splitlines():
104
- try:
105
- session_logs[session_log] = await sandbox().read_file(
106
- (sessions_dir / session_log).as_posix()
107
- )
108
- except Exception as ex:
109
- logger.warning(f"Error reading human agent session log: {ex}")
110
-
111
- return session_logs
112
-
113
159
 
114
160
  class ValidateCommand(HumanAgentCommand):
115
161
  def __init__(self, answer: bool | str) -> None:
@@ -2,6 +2,7 @@ from inspect_ai._util.content import (
2
2
  Content,
3
3
  ContentAudio,
4
4
  ContentImage,
5
+ ContentReasoning,
5
6
  ContentText,
6
7
  ContentVideo,
7
8
  )
@@ -41,6 +42,7 @@ __all__ = [
41
42
  "Content",
42
43
  "ContentAudio",
43
44
  "ContentImage",
45
+ "ContentReasoning",
44
46
  "ContentText",
45
47
  "ContentVideo",
46
48
  "ToolCall",
inspect_ai/tool/_tool.py CHANGED
@@ -14,6 +14,7 @@ from typing import (
14
14
  from inspect_ai._util.content import (
15
15
  ContentAudio,
16
16
  ContentImage,
17
+ ContentReasoning,
17
18
  ContentText,
18
19
  ContentVideo,
19
20
  )
@@ -35,10 +36,11 @@ ToolResult = (
35
36
  | float
36
37
  | bool
37
38
  | ContentText
39
+ | ContentReasoning
38
40
  | ContentImage
39
41
  | ContentAudio
40
42
  | ContentVideo
41
- | list[ContentText | ContentImage | ContentAudio | ContentVideo]
43
+ | list[ContentText | ContentReasoning | ContentImage | ContentAudio | ContentVideo]
42
44
  )
43
45
  """Valid types for results from tool calls."""
44
46
 
@@ -11,19 +11,6 @@ from inspect_ai.tool import ToolError, ToolResult
11
11
  from inspect_ai.util._sandbox.context import sandbox_with
12
12
  from inspect_ai.util._sandbox.environment import SandboxEnvironment
13
13
 
14
- Action = Literal[
15
- "key",
16
- "type",
17
- "mouse_move",
18
- "left_click",
19
- "left_click_drag",
20
- "right_click",
21
- "middle_click",
22
- "double_click",
23
- "screenshot",
24
- "cursor_position",
25
- ]
26
-
27
14
 
28
15
  class ToolExecResult(BaseModel):
29
16
  output: str | None = Field(default=None)
@@ -31,6 +18,122 @@ class ToolExecResult(BaseModel):
31
18
  base64_image: str | None = Field(default=None)
32
19
 
33
20
 
21
+ async def cursor_position(timeout: int | None = None) -> ToolResult:
22
+ return await _send_cmd(["cursor_position"], timeout=timeout)
23
+
24
+
25
+ async def screenshot(timeout: int | None = None) -> ToolResult:
26
+ return await _send_cmd(["screenshot"], timeout=timeout)
27
+
28
+
29
+ async def wait(duration: int, timeout: int | None = None) -> ToolResult:
30
+ return await _send_cmd(["wait", "--duration", f"{duration}"], timeout=timeout)
31
+
32
+
33
+ async def mouse_move(coordinate: list[int], timeout: int | None = None) -> ToolResult:
34
+ return await _send_cmd(
35
+ ["mouse_move", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
36
+ timeout=timeout,
37
+ )
38
+
39
+
40
+ async def left_mouse_down(timeout: int | None = None) -> ToolResult:
41
+ return await _send_cmd(["left_mouse_down"], timeout=timeout)
42
+
43
+
44
+ async def left_mouse_up(timeout: int | None = None) -> ToolResult:
45
+ return await _send_cmd(["left_mouse_up"], timeout=timeout)
46
+
47
+
48
+ async def left_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
49
+ return await _send_cmd(
50
+ ["left_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
51
+ timeout=timeout,
52
+ )
53
+
54
+
55
+ async def left_click_drag(
56
+ start_coordinate: list[int], coordinate: list[int], timeout: int | None = None
57
+ ) -> ToolResult:
58
+ return await _send_cmd(
59
+ [
60
+ "left_click_drag",
61
+ "--start_coordinate",
62
+ f"{start_coordinate[0]}",
63
+ f"{start_coordinate[1]}",
64
+ "--coordinate",
65
+ f"{coordinate[0]}",
66
+ f"{coordinate[1]}",
67
+ ],
68
+ timeout=timeout,
69
+ )
70
+
71
+
72
+ async def right_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
73
+ return await _send_cmd(
74
+ ["right_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
75
+ timeout=timeout,
76
+ )
77
+
78
+
79
+ async def middle_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
80
+ return await _send_cmd(
81
+ ["middle_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
82
+ timeout=timeout,
83
+ )
84
+
85
+
86
+ async def double_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
87
+ return await _send_cmd(
88
+ ["double_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
89
+ timeout=timeout,
90
+ )
91
+
92
+
93
+ async def triple_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
94
+ return await _send_cmd(
95
+ ["triple_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
96
+ timeout=timeout,
97
+ )
98
+
99
+
100
+ async def scroll(
101
+ scroll_amount: int,
102
+ scroll_direction: Literal["up", "down", "left", "right"],
103
+ coordinate: list[int] | None,
104
+ timeout: int | None = None,
105
+ ) -> ToolResult:
106
+ return await _send_cmd(
107
+ [
108
+ "scroll",
109
+ "--scroll_amount",
110
+ f"{scroll_amount}",
111
+ "--scroll_direction",
112
+ f"{scroll_direction}",
113
+ ]
114
+ + (
115
+ ["--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"]
116
+ if coordinate
117
+ else []
118
+ ),
119
+ timeout=timeout,
120
+ )
121
+
122
+
123
+ async def press_key(key: str, timeout: int | None = None) -> ToolResult:
124
+ return await _send_cmd(["key", "--text", key], timeout=timeout)
125
+
126
+
127
+ async def hold_key(key: str, duration: int, timeout: int | None = None) -> ToolResult:
128
+ return await _send_cmd(
129
+ ["hold_key", "--text", key, "--duration", f"{duration}"], timeout=timeout
130
+ )
131
+
132
+
133
+ async def type(text: str, timeout: int | None = None) -> ToolResult:
134
+ return await _send_cmd(["type", "--text", text], timeout=timeout)
135
+
136
+
34
137
  async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
35
138
  from inspect_ai.log._samples import sample_active
36
139
 
@@ -39,7 +142,7 @@ async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResul
39
142
  sample_id = sample.sample.id
40
143
  assert sample_id
41
144
 
42
- cmd = ["python3", "/opt/inspect/tool/computer_tool.py", "--action"] + cmdTail
145
+ cmd = ["python3", "/opt/inspect/tool/computer_tool.py"] + cmdTail
43
146
 
44
147
  raw_exec_result = await (await computer_sandbox()).exec(cmd, timeout=timeout)
45
148
 
@@ -72,50 +175,6 @@ async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResul
72
175
  return "OK"
73
176
 
74
177
 
75
- async def cursor_position(timeout: int | None = None) -> ToolResult:
76
- return await _send_cmd(["cursor_position"], timeout=timeout)
77
-
78
-
79
- async def screenshot(timeout: int | None = None) -> ToolResult:
80
- return await _send_cmd(["screenshot"], timeout=timeout)
81
-
82
-
83
- async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
84
- return await _send_cmd(
85
- ["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout
86
- )
87
-
88
-
89
- async def left_click(timeout: int | None = None) -> ToolResult:
90
- return await _send_cmd(["left_click"], timeout=timeout)
91
-
92
-
93
- async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
94
- return await _send_cmd(
95
- ["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout
96
- )
97
-
98
-
99
- async def right_click(timeout: int | None = None) -> ToolResult:
100
- return await _send_cmd(["right_click"], timeout=timeout)
101
-
102
-
103
- async def middle_click(timeout: int | None = None) -> ToolResult:
104
- return await _send_cmd(["middle_click"], timeout=timeout)
105
-
106
-
107
- async def double_click(timeout: int | None = None) -> ToolResult:
108
- return await _send_cmd(["double_click"], timeout=timeout)
109
-
110
-
111
- async def press_key(key: str, timeout: int | None = None) -> ToolResult:
112
- return await _send_cmd(["key", "--text", key], timeout=timeout)
113
-
114
-
115
- async def type(text: str, timeout: int | None = None) -> ToolResult:
116
- return await _send_cmd(["type", "--text", text], timeout=timeout)
117
-
118
-
119
178
  async def computer_sandbox() -> SandboxEnvironment:
120
179
  sb = await sandbox_with("/opt/inspect/tool/computer_tool.py")
121
180
  if sb: