inspect-ai 0.3.70__py3-none-any.whl → 0.3.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. inspect_ai/_cli/eval.py +14 -8
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +6 -1
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/transcript.py +10 -6
  9. inspect_ai/_eval/task/run.py +5 -8
  10. inspect_ai/_util/content.py +20 -1
  11. inspect_ai/_util/transcript.py +10 -4
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/App.css +6 -0
  14. inspect_ai/_view/www/dist/assets/index.css +115 -87
  15. inspect_ai/_view/www/dist/assets/index.js +5324 -2276
  16. inspect_ai/_view/www/eslint.config.mjs +24 -1
  17. inspect_ai/_view/www/log-schema.json +283 -20
  18. inspect_ai/_view/www/package.json +8 -3
  19. inspect_ai/_view/www/src/App.tsx +2 -2
  20. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  21. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  22. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  23. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  24. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  25. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  26. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  27. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  28. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  29. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  30. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  31. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  32. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  33. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  34. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  35. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  36. inspect_ai/_view/www/src/index.tsx +2 -2
  37. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  38. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  39. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  40. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -0
  41. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  42. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  43. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  44. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  45. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  46. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  47. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  48. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  49. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +2 -2
  50. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  51. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  52. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  53. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  54. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  55. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  56. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  57. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  58. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  59. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  60. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  64. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  65. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  66. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  67. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  68. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  69. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  70. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  71. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  72. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  73. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  74. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  75. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  76. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  77. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  78. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  79. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  80. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  83. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  84. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  85. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  86. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  87. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  88. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +5 -4
  89. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  90. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +8 -7
  91. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  92. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +3 -3
  93. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  94. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  95. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +34 -15
  96. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  97. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  98. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  99. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  100. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  101. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  102. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  103. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  104. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  105. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  106. inspect_ai/_view/www/src/types/log.d.ts +129 -34
  107. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  108. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  109. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  110. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  111. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  112. inspect_ai/_view/www/src/utils/format.ts +1 -1
  113. inspect_ai/_view/www/src/utils/json.ts +24 -0
  114. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  115. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -2
  116. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  117. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  118. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  119. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  120. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  121. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  122. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  123. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  124. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  125. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  126. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  127. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  128. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  129. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  130. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  131. inspect_ai/_view/www/yarn.lock +241 -5
  132. inspect_ai/log/_condense.py +3 -0
  133. inspect_ai/log/_recorders/eval.py +6 -1
  134. inspect_ai/log/_transcript.py +58 -1
  135. inspect_ai/model/__init__.py +2 -0
  136. inspect_ai/model/_call_tools.py +7 -0
  137. inspect_ai/model/_chat_message.py +22 -7
  138. inspect_ai/model/_conversation.py +10 -8
  139. inspect_ai/model/_generate_config.py +25 -4
  140. inspect_ai/model/_model.py +133 -57
  141. inspect_ai/model/_model_output.py +3 -0
  142. inspect_ai/model/_openai.py +106 -40
  143. inspect_ai/model/_providers/anthropic.py +281 -153
  144. inspect_ai/model/_providers/google.py +27 -8
  145. inspect_ai/model/_providers/groq.py +9 -4
  146. inspect_ai/model/_providers/openai.py +57 -4
  147. inspect_ai/model/_providers/openai_o1.py +10 -0
  148. inspect_ai/model/_providers/providers.py +1 -1
  149. inspect_ai/model/_reasoning.py +15 -2
  150. inspect_ai/scorer/_model.py +23 -19
  151. inspect_ai/solver/_human_agent/agent.py +14 -10
  152. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  153. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  154. inspect_ai/tool/__init__.py +2 -0
  155. inspect_ai/tool/_tool.py +3 -1
  156. inspect_ai/tool/_tools/_computer/_common.py +117 -58
  157. inspect_ai/tool/_tools/_computer/_computer.py +80 -57
  158. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +7 -1
  159. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +91 -0
  160. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +8 -0
  161. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +12 -0
  162. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +78 -0
  163. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +20 -0
  164. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  165. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +175 -113
  166. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +76 -20
  167. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +65 -0
  168. inspect_ai/tool/_tools/_computer/test_args.py +151 -0
  169. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  170. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  171. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  172. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  173. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  174. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  175. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  176. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  177. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  178. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  179. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  180. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  181. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  182. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  183. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  184. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  185. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  186. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  187. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  191. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  192. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  196. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  197. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  198. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  199. inspect_ai/util/__init__.py +2 -1
  200. inspect_ai/util/_display.py +12 -0
  201. inspect_ai/util/_sandbox/events.py +55 -21
  202. inspect_ai/util/_sandbox/self_check.py +131 -43
  203. inspect_ai/util/_subtask.py +11 -0
  204. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/METADATA +1 -1
  205. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/RECORD +209 -186
  206. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/WHEEL +1 -1
  207. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  208. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  209. inspect_ai/tool/_tools/_computer/_computer_split.py +0 -198
  210. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  211. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  212. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  214. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  215. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  216. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  217. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/LICENSE +0 -0
  218. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/entry_points.txt +0 -0
  219. {inspect_ai-0.3.70.dist-info → inspect_ai-0.3.72.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,14 @@ from inspect_ai.util import (
9
9
  SandboxEnvironmentLimits,
10
10
  )
11
11
 
12
+ # If you're wondering why these tests are not using pytest fixtures,
13
+ # see the discussion https://github.com/UKGovernmentBEIS/inspect_ai/pull/347
14
+ # It's not ideal, so a PR to fix this would be welcome.
15
+ #
16
+ # If you are struggling to debug one of these tests when it fails, here are two tips:
17
+ # 1. Comment out everything apart from the failing test in the list in the `self_check` function
18
+ # 2. Get rid of the try/except in check_test_fn (the body can just be `await fn(sandbox_env); return True`)
19
+
12
20
 
13
21
  async def check_test_fn(
14
22
  fn: Callable[[SandboxEnvironment], Coroutine[Any, Any, None]],
@@ -20,7 +28,7 @@ async def check_test_fn(
20
28
  except AssertionError as e:
21
29
  return f"FAILED: [{str(e)}]"
22
30
  except Exception as e:
23
- return f"ERROR: {repr(e)}"
31
+ return f"ERROR: [{repr(e)}]"
24
32
 
25
33
 
26
34
  async def self_check(sandbox_env: SandboxEnvironment) -> dict[str, bool | str]:
@@ -92,8 +100,12 @@ async def test_write_file_text_utf(sandbox_env: SandboxEnvironment) -> None:
92
100
  file_name = "test_write_file_text_utf.file"
93
101
  await sandbox_env.write_file(file_name, utf_content)
94
102
  file_with_utf_content = await sandbox_env.read_file(file_name, text=True)
95
- assert isinstance(file_with_utf_content, str)
96
- assert file_with_utf_content == utf_content
103
+ assert isinstance(file_with_utf_content, str), (
104
+ f"Expected file content to be a string, got {type(file_with_utf_content)}"
105
+ )
106
+ assert file_with_utf_content == utf_content, (
107
+ f"UTF-8 content should match, got {file_with_utf_content=}; expected {utf_content=}"
108
+ )
97
109
  await _cleanup_file(sandbox_env, file_name)
98
110
 
99
111
 
@@ -104,7 +116,7 @@ async def test_read_and_write_file_binary(sandbox_env: SandboxEnvironment) -> No
104
116
  ) # invalid UTF-8 from https://stackoverflow.com/a/17199164/116509
105
117
 
106
118
  written_file_bytes = await sandbox_env.read_file(file_name, text=False)
107
- assert b"\xc3\x28" == written_file_bytes
119
+ assert b"\xc3\x28" == written_file_bytes, "Binary content should match"
108
120
  await _cleanup_file(sandbox_env, file_name)
109
121
 
110
122
 
@@ -115,7 +127,7 @@ async def test_read_and_write_large_file_binary(
115
127
  long_bytes = b"\xc3" * 5_000_000
116
128
  await sandbox_env.write_file(file_name, long_bytes)
117
129
  written_file_bytes = await sandbox_env.read_file(file_name, text=False)
118
- assert long_bytes == written_file_bytes
130
+ assert long_bytes == written_file_bytes, "Large binary content should match"
119
131
  await _cleanup_file(sandbox_env, file_name)
120
132
 
121
133
 
@@ -125,7 +137,9 @@ async def test_read_and_write_file_including_directory_absolute(
125
137
  file_name = "/tmp/test_rw_including_directory_absolute/test.file"
126
138
  await sandbox_env.write_file(file_name, "absolutely enjoying being in a directory")
127
139
  written_file_string = await sandbox_env.read_file(file_name, text=True)
128
- assert "absolutely enjoying being in a directory" == written_file_string
140
+ assert "absolutely enjoying being in a directory" == written_file_string, (
141
+ f"Absolute directory content should match, got {written_file_string=}"
142
+ )
129
143
  await _cleanup_file(sandbox_env, file_name)
130
144
  await sandbox_env.exec(["rmdir", "/tmp/test_rw_including_directory_absolute"])
131
145
 
@@ -136,7 +150,9 @@ async def test_read_and_write_file_including_directory_relative(
136
150
  file_name = "test_rw_including_directory_relative/test.file"
137
151
  await sandbox_env.write_file(file_name, "relatively enjoying being in a directory")
138
152
  written_file_string = await sandbox_env.read_file(file_name, text=True)
139
- assert "relatively enjoying being in a directory" == written_file_string
153
+ assert "relatively enjoying being in a directory" == written_file_string, (
154
+ f"Relative directory content should match, got {written_file_string=}"
155
+ )
140
156
  await _cleanup_file(sandbox_env, file_name)
141
157
  await sandbox_env.exec(["rmdir", "test_rw_including_directory_relative"])
142
158
 
@@ -145,8 +161,12 @@ async def test_read_file_zero_length(sandbox_env: SandboxEnvironment) -> None:
145
161
  file_name = "zero_length_file.file"
146
162
  await sandbox_env.exec(["touch", file_name])
147
163
  zero_length = await sandbox_env.read_file(file_name, text=True)
148
- assert isinstance(zero_length, str)
149
- assert zero_length == ""
164
+ assert isinstance(zero_length, str), (
165
+ f"Zero-length file should return a string, got {type(zero_length)}"
166
+ )
167
+ assert zero_length == "", (
168
+ f"Zero-length file should be an empty string, got {zero_length=}"
169
+ )
150
170
  await _cleanup_file(sandbox_env, file_name)
151
171
 
152
172
 
@@ -154,7 +174,10 @@ async def test_read_file_not_found(sandbox_env: SandboxEnvironment) -> None:
154
174
  file_name = "nonexistent"
155
175
  with Raises(FileNotFoundError) as e_info:
156
176
  await sandbox_env.read_file(file_name, text=True)
157
- assert file_name in str(e_info.value)
177
+ assert e_info is not None, "FileNotFoundError should be raised"
178
+ assert file_name in str(e_info.value), (
179
+ f"FileNotFoundError should contain the filename, got {e_info.value=}"
180
+ )
158
181
 
159
182
 
160
183
  async def test_read_file_not_allowed(sandbox_env: SandboxEnvironment) -> None:
@@ -163,7 +186,10 @@ async def test_read_file_not_allowed(sandbox_env: SandboxEnvironment) -> None:
163
186
  await sandbox_env.exec(["chmod", "-r", file_name])
164
187
  with Raises(PermissionError) as e_info:
165
188
  await sandbox_env.read_file(file_name, text=True)
166
- assert file_name in str(e_info.value)
189
+ assert e_info is not None, "PermissionError should be raised"
190
+ assert file_name in str(e_info.value), (
191
+ f"PermissionError should contain the filename, got {e_info.value=}"
192
+ )
167
193
  await sandbox_env.exec(["chmod", "+r", file_name])
168
194
  await _cleanup_file(sandbox_env, file_name)
169
195
 
@@ -172,7 +198,10 @@ async def test_read_file_is_directory(sandbox_env: SandboxEnvironment) -> None:
172
198
  file_name = "/etc"
173
199
  with Raises(IsADirectoryError) as e_info:
174
200
  await sandbox_env.read_file(file_name, text=True)
175
- assert "directory" in str(e_info.value)
201
+ assert e_info is not None, "IsADirectoryError should be raised"
202
+ assert "directory" in str(e_info.value), (
203
+ f"IsADirectoryError should mention 'directory', got {e_info.value=}"
204
+ )
176
205
 
177
206
 
178
207
  async def test_read_file_nonsense_name(
@@ -181,7 +210,10 @@ async def test_read_file_nonsense_name(
181
210
  file_name = "https:/en.wikipedia.org/wiki/Bart%C5%82omiej_Kasprzykowski"
182
211
  with Raises(FileNotFoundError) as e_info:
183
212
  await sandbox_env.read_file(file_name, text=True)
184
- assert "wikipedia" in str(e_info.value)
213
+ assert e_info is not None, "FileNotFoundError should be raised"
214
+ assert "wikipedia" in str(e_info.value), (
215
+ f"FileNotFoundError should contain the filename, got {e_info.value=}"
216
+ )
185
217
 
186
218
 
187
219
  async def test_read_file_limit(sandbox_env: SandboxEnvironment) -> None:
@@ -191,7 +223,10 @@ async def test_read_file_limit(sandbox_env: SandboxEnvironment) -> None:
191
223
  with mock.patch.object(SandboxEnvironmentLimits, "MAX_READ_FILE_SIZE", 1024):
192
224
  with Raises(OutputLimitExceededError) as e_info:
193
225
  await sandbox_env.read_file(file_name, text=True)
194
- assert "limit of 100 MiB was exceeded" in str(e_info.value)
226
+ assert e_info is not None, "OutputLimitExceededError should be raised"
227
+ assert "limit of 100 MiB was exceeded" in str(e_info.value), (
228
+ f"OutputLimitExceededError should mention the limit, got {e_info.value=}"
229
+ )
195
230
  await _cleanup_file(sandbox_env, file_name)
196
231
 
197
232
 
@@ -199,8 +234,12 @@ async def test_write_text_file_zero_length(sandbox_env: SandboxEnvironment) -> N
199
234
  file_name = "zero_length_file.file"
200
235
  await sandbox_env.write_file(file_name, "")
201
236
  zero_length = await sandbox_env.read_file(file_name, text=True)
202
- assert isinstance(zero_length, str)
203
- assert zero_length == ""
237
+ assert isinstance(zero_length, str), (
238
+ f"Zero-length file should return a string, got {type(zero_length)}"
239
+ )
240
+ assert zero_length == "", (
241
+ f"Zero-length file should be an empty string, got {zero_length=}"
242
+ )
204
243
  await _cleanup_file(sandbox_env, file_name)
205
244
 
206
245
 
@@ -209,8 +248,12 @@ async def test_write_text_file_space(sandbox_env: SandboxEnvironment) -> None:
209
248
  file_name = "file with space.file"
210
249
  await sandbox_env.write_file(file_name, space)
211
250
  file_with_space = await sandbox_env.read_file(file_name, text=True)
212
- assert isinstance(file_with_space, str)
213
- assert file_with_space == space
251
+ assert isinstance(file_with_space, str), (
252
+ f"File with space should return a string, got {type(file_with_space)}"
253
+ )
254
+ assert file_with_space == space, (
255
+ f"File with space content should match, got {file_with_space=}; expected {space=}"
256
+ )
214
257
  await _cleanup_file(sandbox_env, file_name)
215
258
 
216
259
 
@@ -226,7 +269,10 @@ async def test_write_text_file_is_directory(
226
269
  "/tmp/inspect_ai_test_write_text_file_is_directory",
227
270
  "content cannot go in a directory, dummy",
228
271
  )
229
- assert "directory" in str(e_info.value)
272
+ assert e_info is not None, "IsADirectoryError should be raised"
273
+ assert "directory" in str(e_info.value), (
274
+ f"IsADirectoryError should mention 'directory', got {e_info.value=}"
275
+ )
230
276
  await sandbox_env.exec(
231
277
  ["rm", "-rf", "/tmp/inspect_ai_test_write_text_file_is_directory"]
232
278
  )
@@ -240,7 +286,10 @@ async def test_write_text_file_without_permissions(
240
286
  await sandbox_env.exec(["chmod", "-w", file_name])
241
287
  with Raises(PermissionError) as e_info:
242
288
  await sandbox_env.write_file(file_name, "this won't stick")
243
- assert file_name in str(e_info.value)
289
+ assert e_info is not None, "PermissionError should be raised"
290
+ assert file_name in str(e_info.value), (
291
+ f"PermissionError should contain the filename, got {e_info.value=}"
292
+ )
244
293
  await sandbox_env.exec(["chmod", "+w", file_name])
245
294
  await _cleanup_file(sandbox_env, file_name)
246
295
 
@@ -252,7 +301,9 @@ async def test_write_text_file_exists(
252
301
  await sandbox_env.write_file(file_name, "mundane content")
253
302
  await sandbox_env.write_file(file_name, "altered content")
254
303
  altered_content = await sandbox_env.read_file(file_name, text=True)
255
- assert altered_content == "altered content"
304
+ assert altered_content == "altered content", (
305
+ f"Existing file content should be overwritten, got {altered_content=}"
306
+ )
256
307
  await _cleanup_file(sandbox_env, file_name)
257
308
 
258
309
 
@@ -260,8 +311,12 @@ async def test_write_binary_file_zero_length(sandbox_env: SandboxEnvironment) ->
260
311
  file_name = "zero_length_file.file"
261
312
  await sandbox_env.write_file(file_name, b"")
262
313
  zero_length = await sandbox_env.read_file(file_name, text=False)
263
- assert isinstance(zero_length, bytes)
264
- assert zero_length == b""
314
+ assert isinstance(zero_length, bytes), (
315
+ f"Zero-length file should return bytes, got {type(zero_length)}"
316
+ )
317
+ assert zero_length == b"", (
318
+ f"Zero-length file should be empty bytes, got {zero_length=}"
319
+ )
265
320
  await _cleanup_file(sandbox_env, file_name)
266
321
 
267
322
 
@@ -270,8 +325,10 @@ async def test_write_binary_file_space(sandbox_env: SandboxEnvironment) -> None:
270
325
  file_name = "file with space.file"
271
326
  await sandbox_env.write_file(file_name, binary_content)
272
327
  file_with_space = await sandbox_env.read_file(file_name, text=False)
273
- assert isinstance(file_with_space, bytes)
274
- assert file_with_space == binary_content
328
+ assert isinstance(file_with_space, bytes), (
329
+ f"File with space should return bytes, got {type(file_with_space)}"
330
+ )
331
+ assert file_with_space == binary_content, "File with space content should match"
275
332
  await _cleanup_file(sandbox_env, file_name)
276
333
 
277
334
 
@@ -287,7 +344,10 @@ async def test_write_binary_file_is_directory(
287
344
  "/tmp/inspect_ai_test_write_binary_file_is_directory",
288
345
  b"\xc3\x28",
289
346
  )
290
- assert "directory" in str(e_info.value)
347
+ assert e_info is not None, "IsADirectoryError should be raised"
348
+ assert "directory" in str(e_info.value), (
349
+ f"IsADirectoryError should mention 'directory', got {e_info.value=}"
350
+ )
291
351
  await sandbox_env.exec(
292
352
  ["rm", "-rf", "/tmp/inspect_ai_test_write_binary_file_is_directory"]
293
353
  )
@@ -301,7 +361,10 @@ async def test_write_binary_file_without_permissions(
301
361
  await sandbox_env.exec(["chmod", "-w", file_name])
302
362
  with Raises(PermissionError) as e_info:
303
363
  await sandbox_env.write_file(file_name, b"\xc3\x28")
304
- assert file_name in str(e_info.value)
364
+ assert e_info is not None, "PermissionError should be raised"
365
+ assert file_name in str(e_info.value), (
366
+ f"PermissionError should contain the filename, got {e_info.value=}"
367
+ )
305
368
  await sandbox_env.exec(["chmod", "+w", file_name])
306
369
  await _cleanup_file(sandbox_env, file_name)
307
370
 
@@ -313,7 +376,7 @@ async def test_write_binary_file_exists(
313
376
  await sandbox_env.write_file(file_name, b"\xc3\x28")
314
377
  await sandbox_env.write_file(file_name, b"\xc3\x29")
315
378
  altered_content = await sandbox_env.read_file(file_name, text=False)
316
- assert altered_content == b"\xc3\x29"
379
+ assert altered_content == b"\xc3\x29", "Existing file content should be overwritten"
317
380
  await _cleanup_file(sandbox_env, file_name)
318
381
 
319
382
 
@@ -328,12 +391,16 @@ async def test_exec_output(sandbox_env: SandboxEnvironment) -> None:
328
391
 
329
392
  async def test_exec_stderr(sandbox_env: SandboxEnvironment) -> None:
330
393
  exec_result = await sandbox_env.exec(["sh", "-c", "echo boof; echo baz >&2"])
331
- assert exec_result.stderr == "baz\n"
394
+ assert exec_result.stderr == "baz\n", (
395
+ f"stderr output should match; got {exec_result.stderr=}, expected 'baz\n'"
396
+ )
332
397
 
333
398
 
334
399
  async def test_exec_returncode(sandbox_env: SandboxEnvironment) -> None:
335
400
  exec_result = await sandbox_env.exec(["sh", "-c", "echo foo; exit 70"])
336
- assert exec_result.returncode == 70
401
+ assert exec_result.returncode == 70, (
402
+ f"Return code should match, got {exec_result.returncode=}, expected 70"
403
+ )
337
404
 
338
405
 
339
406
  async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None:
@@ -391,13 +458,13 @@ async def test_exec_as_user(sandbox_env: SandboxEnvironment) -> None:
391
458
 
392
459
 
393
460
  async def test_exec_as_nonexistent_user(sandbox_env: SandboxEnvironment) -> None:
394
- result = await sandbox_env.exec(["whoami"], user="nonexistent")
461
+ nonexistent_username = "nonexistent"
462
+ result = await sandbox_env.exec(["whoami"], user=nonexistent_username)
395
463
  assert not result.success, "Command should have failed for nonexistent user"
396
- expected_error = (
397
- "unable to find user nonexistent: no matching entries in passwd file"
398
- )
399
- assert expected_error in result.stdout, (
400
- f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
464
+ assert (
465
+ nonexistent_username in result.stdout or nonexistent_username in result.stderr
466
+ ), (
467
+ f"Error not found in command output: '{result.stdout}' nor stderr '{result.stderr}"
401
468
  )
402
469
 
403
470
 
@@ -405,13 +472,17 @@ async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None:
405
472
  file_name = "test_cwd_unspecified.file"
406
473
  await sandbox_env.write_file(file_name, "ls me plz")
407
474
  current_dir_contents = (await sandbox_env.exec(["ls", "-1"])).stdout
408
- assert file_name in current_dir_contents
475
+ assert file_name in current_dir_contents, (
476
+ f"File should be in current directory contents; got {current_dir_contents=}"
477
+ )
409
478
  await _cleanup_file(sandbox_env, file_name)
410
479
 
411
480
 
412
481
  async def test_cwd_custom(sandbox_env: SandboxEnvironment) -> None:
413
482
  current_dir_contents = (await sandbox_env.exec(["ls"], cwd="/usr/bin")).stdout
414
- assert "env" in current_dir_contents
483
+ assert "env" in current_dir_contents, (
484
+ f"env should be in /usr/bin; got {current_dir_contents=}"
485
+ )
415
486
 
416
487
 
417
488
  async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None:
@@ -433,7 +504,9 @@ async def test_cwd_absolute(sandbox_env: SandboxEnvironment) -> None:
433
504
  file_name = "/tmp/test_cwd_absolute/test_cwd_absolute.file"
434
505
  await sandbox_env.write_file(file_name, "ls me plz")
435
506
  current_dir_contents = (await sandbox_env.exec(["ls"], cwd=cwd_directory)).stdout
436
- assert "test_cwd_absolute.file" in current_dir_contents
507
+ assert "test_cwd_absolute.file" in current_dir_contents, (
508
+ f"File should be in current directory contents, got {current_dir_contents=}"
509
+ )
437
510
  await _cleanup_file(sandbox_env, file_name)
438
511
  await sandbox_env.exec(["rmdir", cwd_directory])
439
512
 
@@ -442,20 +515,35 @@ async def test_exec_stdout_is_limited(sandbox_env: SandboxEnvironment) -> None:
442
515
  output_size = 10 * 1024**2 + 1024 # 10 MiB + 1 KiB
443
516
  with pytest.raises(OutputLimitExceededError) as e_info:
444
517
  await sandbox_env.exec(["sh", "-c", f"yes | head -c {output_size}"])
445
- assert "limit of 10 MiB was exceeded" in str(e_info.value)
518
+ assert e_info is not None, "OutputLimitExceededError should be raised"
519
+ assert "limit of 10 MiB was exceeded" in str(e_info.value), (
520
+ f"OutputLimitExceededError should mention the limit; got {e_info.value=}"
521
+ )
446
522
  truncated_output = e_info.value.truncated_output
447
523
  # `yes` outputs 'y\n' (ASCII) so the size equals the string length.
448
524
  # some shells additionally output 'canceled\n' so we add fudge factor for that
449
- assert truncated_output and (len(truncated_output) - 10 * 1024**2) < 10
525
+ assert truncated_output and (len(truncated_output) - 10 * 1024**2) < 10, (
526
+ f"output not truncated or wrong length; start of truncated output = {'' if not truncated_output else truncated_output[:10]}; len(truncated_output): {'n/a' if not truncated_output else len(truncated_output)}"
527
+ )
450
528
 
451
529
 
452
530
  async def test_exec_stderr_is_limited(sandbox_env: SandboxEnvironment) -> None:
453
531
  output_size = 10 * 1024**2 + 1024 # 10 MiB + 1 KiB
454
532
  with pytest.raises(OutputLimitExceededError) as e_info:
455
533
  await sandbox_env.exec(["sh", "-c", f"yes | head -c {output_size} 1>&2"])
456
- assert "limit of 10 MiB was exceeded" in str(e_info.value)
534
+ assert e_info is not None, "OutputLimitExceededError should be raised"
535
+ assert "limit of 10 MiB was exceeded" in str(e_info.value), (
536
+ f"OutputLimitExceededError should mention the limit; got {e_info.value=}"
537
+ )
457
538
  truncated_output = e_info.value.truncated_output
458
- assert truncated_output and len(truncated_output) == 10 * 1024**2
539
+ assert (
540
+ truncated_output
541
+ and truncated_output[0] == "y"
542
+ and len(truncated_output) <= 10 * 1024**2
543
+ and len(truncated_output) > 0
544
+ ), (
545
+ f"output not truncated or wrong length; start of truncated output = {'' if not truncated_output else truncated_output[:10]}; len(truncated_output): {'n/a' if not truncated_output else len(truncated_output)}"
546
+ )
459
547
 
460
548
 
461
549
  # TODO: write a test for when cwd doesn't exist
@@ -1,5 +1,6 @@
1
1
  import asyncio
2
2
  import inspect
3
+ from datetime import datetime
3
4
  from functools import wraps
4
5
  from logging import getLogger
5
6
  from typing import (
@@ -15,6 +16,7 @@ from typing import (
15
16
  from inspect_ai._util._async import is_callable_coroutine
16
17
  from inspect_ai._util.content import Content
17
18
  from inspect_ai._util.trace import trace_action
19
+ from inspect_ai._util.working import sample_waiting_time
18
20
  from inspect_ai.util._store import Store, dict_jsonable, init_subtask_store
19
21
 
20
22
  SubtaskResult = str | int | float | bool | list[Content]
@@ -130,6 +132,7 @@ def subtask(
130
132
  return result, list(transcript().events)
131
133
 
132
134
  # create subtask event
135
+ waiting_time_start = sample_waiting_time()
133
136
  event = SubtaskEvent(
134
137
  name=subtask_name, input=log_input, type=type, pending=True
135
138
  )
@@ -139,6 +142,14 @@ def subtask(
139
142
  asyncio_task = asyncio.create_task(run())
140
143
  result, events = await asyncio_task
141
144
 
145
+ # time accounting
146
+ completed = datetime.now()
147
+ waiting_time_end = sample_waiting_time()
148
+ event.completed = completed
149
+ event.working_time = (completed - event.timestamp).total_seconds() - (
150
+ waiting_time_end - waiting_time_start
151
+ )
152
+
142
153
  # update event
143
154
  event.result = result
144
155
  event.events = events
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: inspect_ai
3
- Version: 0.3.70
3
+ Version: 0.3.72
4
4
  Summary: Framework for large language model evaluations
5
5
  Author: UK AI Security Institute
6
6
  License: MIT License