inspect-ai 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (127)
  1. inspect_ai/_cli/common.py +3 -1
  2. inspect_ai/_cli/eval.py +15 -2
  3. inspect_ai/_display/core/active.py +4 -1
  4. inspect_ai/_display/core/config.py +3 -3
  5. inspect_ai/_display/core/panel.py +7 -3
  6. inspect_ai/_display/plain/__init__.py +0 -0
  7. inspect_ai/_display/plain/display.py +203 -0
  8. inspect_ai/_display/rich/display.py +0 -5
  9. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  10. inspect_ai/_display/textual/widgets/samples.py +78 -11
  11. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  12. inspect_ai/_eval/score.py +1 -0
  13. inspect_ai/_eval/task/results.py +50 -22
  14. inspect_ai/_eval/task/run.py +41 -7
  15. inspect_ai/_eval/task/sandbox.py +10 -5
  16. inspect_ai/_util/constants.py +1 -0
  17. inspect_ai/_util/port_names.py +61 -0
  18. inspect_ai/_util/text.py +23 -0
  19. inspect_ai/_view/www/App.css +31 -1
  20. inspect_ai/_view/www/dist/assets/index.css +31 -1
  21. inspect_ai/_view/www/dist/assets/index.js +25344 -1849
  22. inspect_ai/_view/www/log-schema.json +32 -2
  23. inspect_ai/_view/www/package.json +2 -0
  24. inspect_ai/_view/www/src/App.mjs +8 -10
  25. inspect_ai/_view/www/src/Types.mjs +0 -1
  26. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  27. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  28. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  29. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  30. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  31. inspect_ai/_view/www/src/index.js +75 -2
  32. inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
  33. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
  34. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  35. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  36. inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
  37. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  38. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +24 -12
  39. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
  40. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  41. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  42. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  43. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  44. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  45. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  46. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  47. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  48. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  49. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  50. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  51. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  52. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  53. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  54. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  55. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  56. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  57. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  58. inspect_ai/_view/www/src/types/log.d.ts +13 -2
  59. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  60. inspect_ai/_view/www/src/utils/Json.mjs +12 -6
  61. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
  62. inspect_ai/_view/www/vite.config.js +7 -0
  63. inspect_ai/_view/www/yarn.lock +116 -0
  64. inspect_ai/approval/_human/__init__.py +0 -0
  65. inspect_ai/approval/_policy.py +12 -6
  66. inspect_ai/log/_log.py +1 -1
  67. inspect_ai/log/_samples.py +16 -0
  68. inspect_ai/log/_transcript.py +4 -1
  69. inspect_ai/model/_call_tools.py +4 -0
  70. inspect_ai/model/_conversation.py +20 -8
  71. inspect_ai/model/_generate_config.py +10 -4
  72. inspect_ai/model/_model.py +117 -18
  73. inspect_ai/model/_model_output.py +7 -2
  74. inspect_ai/model/_providers/anthropic.py +100 -44
  75. inspect_ai/model/_providers/azureai.py +20 -20
  76. inspect_ai/model/_providers/bedrock.py +37 -40
  77. inspect_ai/model/_providers/google.py +46 -54
  78. inspect_ai/model/_providers/mistral.py +11 -11
  79. inspect_ai/model/_providers/openai.py +15 -16
  80. inspect_ai/model/_providers/openai_o1.py +9 -8
  81. inspect_ai/model/_providers/providers.py +1 -1
  82. inspect_ai/model/_providers/together.py +8 -8
  83. inspect_ai/model/_providers/vertex.py +1 -4
  84. inspect_ai/scorer/_reducer/reducer.py +1 -1
  85. inspect_ai/scorer/_scorer.py +2 -2
  86. inspect_ai/solver/__init__.py +2 -5
  87. inspect_ai/solver/_prompt.py +35 -5
  88. inspect_ai/solver/_task_state.py +80 -38
  89. inspect_ai/tool/__init__.py +2 -0
  90. inspect_ai/tool/_tool.py +12 -1
  91. inspect_ai/tool/_tool_call.py +10 -0
  92. inspect_ai/tool/_tool_def.py +16 -5
  93. inspect_ai/tool/_tool_with.py +21 -4
  94. inspect_ai/tool/beta/__init__.py +5 -0
  95. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  96. inspect_ai/tool/beta/_computer/_common.py +133 -0
  97. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  98. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  99. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  100. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  101. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  102. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  103. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  104. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  105. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  106. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  107. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  108. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  109. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  110. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  111. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  112. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  113. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  114. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  115. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  116. inspect_ai/util/__init__.py +2 -0
  117. inspect_ai/util/_limit.py +26 -0
  118. inspect_ai/util/_sandbox/docker/docker.py +64 -1
  119. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  120. inspect_ai/util/_sandbox/environment.py +14 -0
  121. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
  122. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +126 -98
  123. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  124. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
  125. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
  126. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
  127. {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/yarn.lock CHANGED
@@ -169,6 +169,70 @@
     "@babel/helper-validator-identifier" "^7.24.7"
     to-fast-properties "^2.0.0"

+"@codemirror/autocomplete@^6.0.0":
+  version "6.18.3"
+  resolved "https://registry.yarnpkg.com/@codemirror/autocomplete/-/autocomplete-6.18.3.tgz#f9ea79a2f369662516f71bc0b2f819454d3c8e00"
+  integrity sha512-1dNIOmiM0z4BIBwxmxEfA1yoxh1MF/6KPBbh20a5vphGV0ictKlgQsbJs6D6SkR6iJpGbpwRsa6PFMNlg9T9pQ==
+  dependencies:
+    "@codemirror/language" "^6.0.0"
+    "@codemirror/state" "^6.0.0"
+    "@codemirror/view" "^6.17.0"
+    "@lezer/common" "^1.0.0"
+
+"@codemirror/commands@^6.0.0":
+  version "6.7.1"
+  resolved "https://registry.yarnpkg.com/@codemirror/commands/-/commands-6.7.1.tgz#04561e95bc0779eaa49efd63e916c4efb3bbf6d6"
+  integrity sha512-llTrboQYw5H4THfhN4U3qCnSZ1SOJ60ohhz+SzU0ADGtwlc533DtklQP0vSFaQuCPDn3BPpOd1GbbnUtwNjsrw==
+  dependencies:
+    "@codemirror/language" "^6.0.0"
+    "@codemirror/state" "^6.4.0"
+    "@codemirror/view" "^6.27.0"
+    "@lezer/common" "^1.1.0"
+
+"@codemirror/language@^6.0.0":
+  version "6.10.6"
+  resolved "https://registry.yarnpkg.com/@codemirror/language/-/language-6.10.6.tgz#3770aa55fce575b45b1037b390b576907f0061c7"
+  integrity sha512-KrsbdCnxEztLVbB5PycWXFxas4EOyk/fPAfruSOnDDppevQgid2XZ+KbJ9u+fDikP/e7MW7HPBTvTb8JlZK9vA==
+  dependencies:
+    "@codemirror/state" "^6.0.0"
+    "@codemirror/view" "^6.23.0"
+    "@lezer/common" "^1.1.0"
+    "@lezer/highlight" "^1.0.0"
+    "@lezer/lr" "^1.0.0"
+    style-mod "^4.0.0"
+
+"@codemirror/lint@^6.0.0":
+  version "6.8.4"
+  resolved "https://registry.yarnpkg.com/@codemirror/lint/-/lint-6.8.4.tgz#7d8aa5d1a6dec89ffcc23ad45ddca2e12e90982d"
+  integrity sha512-u4q7PnZlJUojeRe8FJa/njJcMctISGgPQ4PnWsd9268R4ZTtU+tfFYmwkBvgcrK2+QQ8tYFVALVb5fVJykKc5A==
+  dependencies:
+    "@codemirror/state" "^6.0.0"
+    "@codemirror/view" "^6.35.0"
+    crelt "^1.0.5"
+
+"@codemirror/search@^6.0.0":
+  version "6.5.8"
+  resolved "https://registry.yarnpkg.com/@codemirror/search/-/search-6.5.8.tgz#b59b3659b46184cc75d6108d7c050a4ca344c3a0"
+  integrity sha512-PoWtZvo7c1XFeZWmmyaOp2G0XVbOnm+fJzvghqGAktBW3cufwJUWvSCcNG0ppXiBEM05mZu6RhMtXPv2hpllig==
+  dependencies:
+    "@codemirror/state" "^6.0.0"
+    "@codemirror/view" "^6.0.0"
+    crelt "^1.0.5"
+
+"@codemirror/state@^6.0.0", "@codemirror/state@^6.4.0":
+  version "6.4.1"
+  resolved "https://registry.yarnpkg.com/@codemirror/state/-/state-6.4.1.tgz#da57143695c056d9a3c38705ed34136e2b68171b"
+  integrity sha512-QkEyUiLhsJoZkbumGZlswmAhA7CBU02Wrz7zvH4SrcifbsqwlXShVXg65f3v/ts57W3dqyamEriMhij1Z3Zz4A==
+
+"@codemirror/view@^6.0.0", "@codemirror/view@^6.17.0", "@codemirror/view@^6.23.0", "@codemirror/view@^6.27.0", "@codemirror/view@^6.35.0":
+  version "6.35.0"
+  resolved "https://registry.yarnpkg.com/@codemirror/view/-/view-6.35.0.tgz#890e8e31a58edf65cdf193049fe9f3fdec20cc82"
+  integrity sha512-I0tYy63q5XkaWsJ8QRv5h6ves7kvtrBWjBcnf/bzohFJQc5c14a1AQRdE8QpPF9eMp5Mq2FMm59TCj1gDfE7kw==
+  dependencies:
+    "@codemirror/state" "^6.4.0"
+    style-mod "^4.1.0"
+    w3c-keyname "^2.2.4"
+
 "@esbuild/aix-ppc64@0.21.5":
   version "0.21.5"
   resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz#c7184a326533fcdf1b8ee0733e21c713b975575f"
@@ -372,6 +436,25 @@
     "@jridgewell/resolve-uri" "^3.1.0"
     "@jridgewell/sourcemap-codec" "^1.4.14"

+"@lezer/common@^1.0.0", "@lezer/common@^1.1.0":
+  version "1.2.3"
+  resolved "https://registry.yarnpkg.com/@lezer/common/-/common-1.2.3.tgz#138fcddab157d83da557554851017c6c1e5667fd"
+  integrity sha512-w7ojc8ejBqr2REPsWxJjrMFsA/ysDCFICn8zEOR9mrqzOu2amhITYuLD8ag6XZf0CFXDrhKqw7+tW8cX66NaDA==
+
+"@lezer/highlight@^1.0.0":
+  version "1.2.1"
+  resolved "https://registry.yarnpkg.com/@lezer/highlight/-/highlight-1.2.1.tgz#596fa8f9aeb58a608be0a563e960c373cbf23f8b"
+  integrity sha512-Z5duk4RN/3zuVO7Jq0pGLJ3qynpxUVsh7IbUbGj88+uV2ApSAn6kWg2au3iJb+0Zi7kKtqffIESgNcRXWZWmSA==
+  dependencies:
+    "@lezer/common" "^1.0.0"
+
+"@lezer/lr@^1.0.0":
+  version "1.4.2"
+  resolved "https://registry.yarnpkg.com/@lezer/lr/-/lr-1.4.2.tgz#931ea3dea8e9de84e90781001dae30dea9ff1727"
+  integrity sha512-pu0K1jCIdnQ12aWNaAVU5bzi7Bd1w54J3ECgANPmYLtQKP0HBj2cE/5coBD66MT10xbtIuUr7tg0Shbsvk0mDA==
+  dependencies:
+    "@lezer/common" "^1.0.0"
+
 "@nodelib/fs.scandir@2.1.5":
   version "2.1.5"
   resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5"
@@ -619,6 +702,19 @@ clipboard@^2.0.11:
     select "^1.1.2"
     tiny-emitter "^2.0.0"

+codemirror@^6.0.1:
+  version "6.0.1"
+  resolved "https://registry.yarnpkg.com/codemirror/-/codemirror-6.0.1.tgz#62b91142d45904547ee3e0e0e4c1a79158035a29"
+  integrity sha512-J8j+nZ+CdWmIeFIGXEFbFPtpiYacFMDR8GlHK3IyHQJMCaVRfGx9NT+Hxivv1ckLWPvNdZqndbr/7lVhrf/Svg==
+  dependencies:
+    "@codemirror/autocomplete" "^6.0.0"
+    "@codemirror/commands" "^6.0.0"
+    "@codemirror/language" "^6.0.0"
+    "@codemirror/lint" "^6.0.0"
+    "@codemirror/search" "^6.0.0"
+    "@codemirror/state" "^6.0.0"
+    "@codemirror/view" "^6.0.0"
+
 color-convert@^1.9.0:
   version "1.9.3"
   resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-1.9.3.tgz#bb71850690e1f136567de629d2d5471deda4c1e8"
@@ -653,6 +749,11 @@ convert-source-map@^2.0.0:
   resolved "https://registry.yarnpkg.com/convert-source-map/-/convert-source-map-2.0.0.tgz#4b560f649fc4e918dd0ab75cf4961e8bc882d82a"
   integrity sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==

+crelt@^1.0.5:
+  version "1.0.6"
+  resolved "https://registry.yarnpkg.com/crelt/-/crelt-1.0.6.tgz#7cc898ea74e190fb6ef9dae57f8f81cf7302df72"
+  integrity sha512-VQ2MBenTq1fWZUH9DJNGti7kKv6EeAuYr3cLwxUWhIu1baTaXh4Ib5W2CqHVqib4/MqbYGJqiL3Zb8GJZr3l4g==
+
 cross-spawn@^7.0.2:
   version "7.0.3"
   resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
@@ -885,6 +986,11 @@ file-entry-cache@^8.0.0:
   dependencies:
     flat-cache "^4.0.0"

+filtrex@^3.1.0:
+  version "3.1.0"
+  resolved "https://registry.yarnpkg.com/filtrex/-/filtrex-3.1.0.tgz#5ec00994615ff10e5e09c89bb290c855cb408c21"
+  integrity sha512-mHzZ2wUISETF1OaEcNRiGz1ljuIV8c/C9td9qyAZ+wTwigkAk5RO9YrCxQKk5H9v7joDRFIBik9U5RTK9eXZ/A==
+
 find-up@^5.0.0:
   version "5.0.0"
   resolved "https://registry.yarnpkg.com/find-up/-/find-up-5.0.0.tgz#4c92819ecb7083561e4f4a240a86be5198f536fc"
@@ -1367,6 +1473,11 @@ strip-json-comments@^3.1.1:
   resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz#31f1281b3832630434831c310c01cccda8cbe006"
   integrity sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==

+style-mod@^4.0.0, style-mod@^4.1.0:
+  version "4.1.2"
+  resolved "https://registry.yarnpkg.com/style-mod/-/style-mod-4.1.2.tgz#ca238a1ad4786520f7515a8539d5a63691d7bf67"
+  integrity sha512-wnD1HyVqpJUI2+eKZ+eo1UwghftP6yuFheBqqe+bWCotBjC2K1YnteJILRMs3SM4V/0dLEW1SC27MWP5y+mwmw==
+
 supports-color@^5.3.0:
   version "5.5.0"
   resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-5.5.0.tgz#e2e69a44ac8772f78a1ec0b35b689df6530efc8f"
@@ -1442,6 +1553,11 @@ vite@^5.3.2:
   optionalDependencies:
     fsevents "~2.3.3"

+w3c-keyname@^2.2.4:
+  version "2.2.8"
+  resolved "https://registry.yarnpkg.com/w3c-keyname/-/w3c-keyname-2.2.8.tgz#7b17c8c6883d4e8b86ac8aba79d39e880f8869c5"
+  integrity sha512-dpojBhNsCNN7T82Tm7k26A6G9ML3NkhDsnw9n/eoxSRlVBB4CEtIQ/KTCLI2Fwf3ataSXRhYFkQi3SlnFwPvPQ==
+
 which@^2.0.1:
   version "2.0.2"
   resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1"
inspect_ai/approval/_human/__init__.py (file without changes)
inspect_ai/approval/_policy.py CHANGED
@@ -1,13 +1,13 @@
 import fnmatch
-import re
+import sys
 from dataclasses import dataclass
 from pathlib import Path
-from re import Pattern
 from typing import Any, Generator, cast

 from pydantic import BaseModel, Field, model_validator

 from inspect_ai._util.config import read_config_object
+from inspect_ai._util.format import format_function_call
 from inspect_ai._util.registry import registry_create, registry_lookup
 from inspect_ai.solver._task_state import TaskState
 from inspect_ai.tool._tool_call import ToolCall, ToolCallView
@@ -30,17 +30,23 @@ def policy_approver(policies: str | list[ApprovalPolicy]) -> Approver:
     policies = approval_policies_from_config(policies)

     # compile policy into approvers and regexes for matching
-    policy_matchers: list[tuple[list[Pattern[str]], Approver]] = []
+    policy_matchers: list[tuple[list[str], Approver]] = []
     for policy in policies:
         tools = [policy.tools] if isinstance(policy.tools, str) else policy.tools
-        patterns = [re.compile(fnmatch.translate(tool)) for tool in tools]
-        policy_matchers.append((patterns, policy.approver))
+        globs = [f"{tool}*" for tool in tools]
+        policy_matchers.append((globs, policy.approver))

     # generator for policies that match a tool_call
     def tool_approvers(tool_call: ToolCall) -> Generator[Approver, None, None]:
         for policy_matcher in iter(policy_matchers):
+            function_call = format_function_call(
+                tool_call.function, tool_call.arguments, width=sys.maxsize
+            )
             if any(
-                [pattern.match(tool_call.function) for pattern in policy_matcher[0]]
+                [
+                    fnmatch.fnmatch(function_call, pattern)
+                    for pattern in policy_matcher[0]
+                ]
             ):
                 yield policy_matcher[1]

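Note on the change above: 0.3.58 compiled `fnmatch.translate` regexes over the bare tool name, while 0.3.59 glob-matches against the full formatted call, so a policy pattern can constrain arguments as well as the tool name. A minimal sketch of the new matching behavior, with a hypothetical `format_call` standing in for `inspect_ai._util.format.format_function_call`:

import fnmatch

# hypothetical stand-in for format_function_call, which renders a tool
# call roughly as: bash(cmd='sudo ls')
def format_call(function: str, arguments: dict) -> str:
    args = ", ".join(f"{k}={v!r}" for k, v in arguments.items())
    return f"{function}({args})"

call = format_call("bash", {"cmd": "sudo ls"})

# a trailing "*" is appended to each configured tool pattern (see the
# globs = [f"{tool}*" ...] line above), so a bare tool name still matches
# every call to that tool
print(fnmatch.fnmatch(call, "bash*"))       # True: any bash call
print(fnmatch.fnmatch(call, "bash*sudo*"))  # True: bash calls mentioning sudo
print(fnmatch.fnmatch(call, "python*"))     # False
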
inspect_ai/log/_log.py CHANGED
@@ -114,7 +114,7 @@ class EvalConfig(BaseModel):


 class EvalSampleLimit(BaseModel):
-    type: Literal["context", "time", "message", "token", "operator"]
+    type: Literal["context", "time", "message", "token", "operator", "custom"]
     """The type of limit"""

     limit: int
inspect_ai/log/_samples.py CHANGED
@@ -113,6 +113,14 @@ def sample_active() -> ActiveSample | None:
     return _sample_active.get(None)


+def active_sample_token_limit() -> int | None:
+    active = sample_active()
+    if active:
+        return active.token_limit
+    else:
+        return None
+
+
 def set_active_sample_token_limit(token_limit: int | None) -> None:
     active = sample_active()
     if active:
@@ -125,6 +133,14 @@ def set_active_sample_total_tokens(total_tokens: int) -> None:
         active.total_tokens = total_tokens


+def active_sample_message_limit() -> int | None:
+    active = sample_active()
+    if active:
+        return active.message_limit
+    else:
+        return None
+
+
 def set_active_sample_message_limit(message_limit: int | None) -> None:
     active = sample_active()
     if active:
inspect_ai/log/_transcript.py CHANGED
@@ -70,7 +70,7 @@ class SampleLimitEvent(BaseEvent):
     event: Literal["sample_limit"] = Field(default="sample_limit")
     """Event type."""

-    type: Literal["message", "time", "token", "operator"]
+    type: Literal["message", "time", "token", "operator", "custom"]
     """Type of limit that halted processing"""

     message: str
@@ -124,6 +124,9 @@ class ModelEvent(BaseEvent):
     output: ModelOutput
     """Output from model."""

+    error: str | None = Field(default=None)
+    """Error which occurred during model call."""
+
     cache: Literal["read", "write"] | None = Field(default=None)
     """Was this a cache read or write."""

inspect_ai/model/_call_tools.py CHANGED
@@ -328,6 +328,10 @@ def tool_params(input: dict[str, Any], func: Callable[..., Any]) -> dict[str, An
     type_hints = get_type_hints(func)
     docstring = inspect.getdoc(func)

+    # if the function takes **kwargs: Any then just pass the tool arguments through
+    if "kwargs" in type_hints and type_hints["kwargs"] == Any:
+        return input
+
     # build params
     params: dict[str, Any] = {}
     for param_name, param in signature.parameters.items():
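The early return added to `tool_params` relies on `get_type_hints` reporting a `**kwargs: Any` parameter under the key "kwargs". A small self-contained sketch of that detection (the `my_tool` function is made up for illustration):

from typing import Any, get_type_hints

def my_tool(x: int, **kwargs: Any) -> str:
    return f"{x} {kwargs}"

hints = get_type_hints(my_tool)
# the **kwargs annotation surfaces as a plain Any, which is exactly what
# the new check tests before skipping per-parameter conversion
print(hints["kwargs"] is Any)  # True: tool arguments pass through unmodified
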
inspect_ai/model/_conversation.py CHANGED
@@ -1,6 +1,7 @@
 from rich.console import RenderableType
 from rich.text import Text

+from inspect_ai._util.constants import NO_CONTENT
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import transcript_markdown
 from inspect_ai.util._conversation import conversation_panel
@@ -15,13 +16,16 @@ MESSAGE_TITLE = "Message"
 def conversation_tool_mesage(message: ChatMessageTool) -> None:
     if display_type() == "conversation":
         # truncate output to 100 lines
-        output = message.error.message if message.error else message.text.strip()
-        content = lines_display(output, 100)
-
-        conversation_panel(
-            title=f"Tool Output: {message.function}",
-            content=content,
+        output = (
+            message.error.message.strip() if message.error else message.text.strip()
         )
+        if output:
+            content = lines_display(output, 100)
+
+            conversation_panel(
+                title=f"Tool Output: {message.function}",
+                content=content,
+            )


 def conversation_assistant_message(
@@ -37,13 +41,21 @@ def conversation_assistant_message(

     # start with assistant content
     content: list[RenderableType] = (
-        [transcript_markdown(message.text, escape=True)] if message.text else []
+        [transcript_markdown(message.text, escape=True)]
+        if message.text and message.text != NO_CONTENT
+        else []
     )

     # print tool calls
     if message.tool_calls:
-        content.append(Text())
+        if content:
+            content.append(Text())
         content.extend(render_tool_calls(message.tool_calls))

     # print the assistant message
     conversation_panel(title="Assistant", content=content)
+
+
+def conversation_assistant_error(error: Exception) -> None:
+    if display_type() == "conversation":
+        conversation_panel(title="Assistant", content=repr(error))
inspect_ai/model/_generate_config.py CHANGED
@@ -58,14 +58,17 @@ class GenerateConfigArgs(TypedDict, total=False):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, and TogetherAI only."""

     logprobs: bool | None
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
+    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""

     top_logprobs: int | None
-    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, and Huggingface only."""
+    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, and Huggingface only."""

     parallel_tool_calls: bool | None
     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""

+    internal_tools: bool | None
+    """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""
+
     max_tool_output: int | None
     """Maximum tool output (in bytes). Defaults to 16 * 1024."""

@@ -128,14 +131,17 @@ class GenerateConfig(BaseModel):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and vLLM only."""

     logprobs: bool | None = Field(default=None)
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
+    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""

     top_logprobs: int | None = Field(default=None)
-    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, Huggingface, and vLLM only."""
+    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, and vLLM only."""

     parallel_tool_calls: bool | None = Field(default=None)
     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""

+    internal_tools: bool | None = Field(default=None)
+    """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""
+
     max_tool_output: int | None = Field(default=None)
     """Maximum tool output (in bytes). Defaults to 16 * 1024."""

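The new `internal_tools` option defaults to None. Going by its docstring above, setting it to False should keep tools as ordinary function definitions rather than mapping them to a provider-native implementation such as Anthropic's computer tool. A hedged sketch of opting out via config (the model name is illustrative only):

from inspect_ai.model import GenerateConfig, get_model

# opt out of provider-internal tool mappings for this generation
config = GenerateConfig(internal_tools=False)

model = get_model("anthropic/claude-3-5-sonnet-latest")
# inside an async context, the passed config merges over the model's base
# config (see the "merge passed config" path in _model.py below):
# output = await model.generate("hello", config=config)
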
inspect_ai/model/_model.py CHANGED
@@ -33,6 +33,7 @@ from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
+from inspect_ai.util._limit import SampleLimitExceededError

 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
 from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
@@ -43,7 +44,7 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
-from ._conversation import conversation_assistant_message
+from ._conversation import conversation_assistant_error, conversation_assistant_message
 from ._generate_config import (
     GenerateConfig,
     active_generate_config,
@@ -116,7 +117,7 @@ class ModelAPI(abc.ABC):
         tools: list[ToolInfo],
         tool_choice: ToolChoice,
         config: GenerateConfig,
-    ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
+    ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
        """Generate output from the model.

        Args:
@@ -165,7 +166,7 @@ class ModelAPI(abc.ABC):
         return False

     def tool_result_images(self) -> bool:
-        """Tool results can containe images"""
+        """Tool results can contain images"""
         return False


@@ -222,11 +223,17 @@ class Model:
        Returns:
           ModelOutput
        """
+        # if we are the default model then enforce message limit if it
+        # exists (raise an exception if it is exceeded)
+        is_active_model = self == active_model()
+        if is_active_model:
+            handle_sample_message_limit(input)
+
         # base config for this model
         base_config = self.config

         # if we are the active_model then merge active generate config
-        if self == active_model():
+        if is_active_model:
             base_config = base_config.merge(active_generate_config())

         # merge passed config
@@ -296,6 +303,9 @@ class Model:
             tools = []
             tool_choice = "none"

+        # apply any tool model_input handlers
+        input = resolve_tool_model_input(tdefs, input)
+
         # break tool image content out into user messages if the model doesn't
         # support tools returning images
         if not self.api.tool_result_images():
@@ -389,6 +399,17 @@ class Model:
                 output = result
                 call = None

+            # raise error
+            if isinstance(output, Exception):
+                complete(output, call)
+
+                # Wrap the error in a runtime error which will show the
+                # request which caused the error
+                error = repr(output)
+                request = json.dumps(call.request, indent=2) if call is not None else ""
+                error_message = f"{error}\n\nRequest:\n{request}"
+                raise RuntimeError(error_message)
+
             # update output with time elapsed
             output.time = time_elapsed

@@ -464,7 +485,7 @@ class Model:
         cache: Literal["read", "write"] | None,
         output: ModelOutput | None = None,
         call: ModelCall | None = None,
-    ) -> Callable[[ModelOutput, ModelCall | None], None]:
+    ) -> Callable[[ModelOutput | Exception, ModelCall | None], None]:
         from inspect_ai.log._transcript import ModelEvent, transcript

         # create event and add it to the transcript
@@ -484,13 +505,16 @@ class Model:

         # callable that can be used to update the interaction w/ output
         def complete(
-            updated_output: ModelOutput, updated_call: ModelCall | None
+            result: ModelOutput | Exception, updated_call: ModelCall | None
         ) -> None:
             # trace
-            conversation_assistant_message(input, updated_output.choices[0].message)
+            if isinstance(result, ModelOutput):
+                conversation_assistant_message(input, result.choices[0].message)
+                event.output = result
+            else:
+                conversation_assistant_error(result)
+                event.error = repr(result)

-            # update event
-            event.output = updated_output
             event.call = updated_call
             event.pending = None

@@ -703,6 +727,40 @@ def simple_input_messages(
     return messages


+def resolve_tool_model_input(
+    tdefs: list[ToolDef], messages: list[ChatMessage]
+) -> list[ChatMessage]:
+    # filter on tooldefs that have a model input handler
+    tdefs = [tdef for tdef in tdefs if tdef.model_input is not None]
+
+    # bail if there are no handlers
+    if len(tdefs) == 0:
+        return messages
+
+    # don't mutate the original messages
+    messages = deepcopy(messages)
+
+    # extract tool messages
+    tool_messages = [
+        message for message in messages if isinstance(message, ChatMessageTool)
+    ]
+    # run model_input handlers over all tool_messages with the same function name
+    for tdef in tdefs:
+        assert tdef.model_input
+        # filter messages down to just this tool
+        tdef_tool_messages = [
+            message for message in tool_messages if message.function == tdef.name
+        ]
+        # call the function for each tool, passing the index, total, and content
+        for index, message in enumerate(tdef_tool_messages):
+            message.content = tdef.model_input(
+                index, len(tool_messages), message.content
+            )
+
+    # return modified messages
+    return messages
+
+
 def tool_result_images_as_user_message(
     messages: list[ChatMessage],
 ) -> list[ChatMessage]:
@@ -713,16 +771,21 @@ def tool_result_images_reducer(
     messages: list[ChatMessage],
     message: ChatMessage,
 ) -> list[ChatMessage]:
-    # append the message
-    messages.append(message)
-
     # if there are tool result images, pull them out into a ChatUserMessage
     if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
+        tool_message = ChatMessageTool(
+            content=message.content.copy(),
+            tool_call_id=message.tool_call_id,
+            function=message.function,
+        )
+        assert isinstance(tool_message.content, list)
+        messages.append(tool_message)
+
         user_content: list[Content] = []
-        for i in range(0, len(message.content)):
-            if isinstance(message.content[i], ContentImage):
+        for i in range(0, len(tool_message.content)):
+            if isinstance(tool_message.content[i], ContentImage):
                 user_content.append(message.content[i])
-                message.content[i] = ContentText(
+                tool_message.content[i] = ContentText(
                     text="Image content is in the message below."
                 )
         if len(user_content) > 0:
@@ -730,6 +793,9 @@ def tool_result_images_reducer(
                 ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
             )

+    else:
+        messages.append(message)
+
     # return messages
     return messages

@@ -813,6 +879,24 @@ def active_model() -> Model | None:
 active_model_context_var: ContextVar[Model] = ContextVar("active_model")


+def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
+    from inspect_ai.log._samples import (
+        active_sample_message_limit,
+        set_active_sample_total_messages,
+    )
+
+    total_messages = 1 if isinstance(input, str) else len(input)
+    message_limit = active_sample_message_limit()
+    if message_limit is not None:
+        if total_messages >= message_limit:
+            raise SampleLimitExceededError(
+                "message", value=total_messages, limit=message_limit
+            )
+
+    # set total messages
+    set_active_sample_total_messages(total_messages)
+
+
 def init_model_usage() -> None:
     model_usage_context_var.set({})

@@ -822,13 +906,28 @@ def init_sample_model_usage() -> None:


 def record_model_usage(model: str, usage: ModelUsage) -> None:
+    from inspect_ai.log._samples import (
+        active_sample_token_limit,
+        set_active_sample_total_tokens,
+    )
+
+    # record usage
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
     set_model_usage(model, usage, model_usage_context_var.get(None))

-    # update active sample
-    from inspect_ai.log._samples import set_active_sample_total_tokens
+    # compute total tokens
+    total_tokens = sample_total_tokens()

-    set_active_sample_total_tokens(sample_total_tokens())
+    # update active sample
+    set_active_sample_total_tokens(total_tokens)
+
+    # check for token limit overflow and raise
+    token_limit = active_sample_token_limit()
+    if token_limit is not None:
+        if total_tokens > token_limit:
+            raise SampleLimitExceededError(
+                "token", value=total_tokens, limit=token_limit
+            )


 def set_model_usage(
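Taken together with the `active_sample_token_limit` and `active_sample_message_limit` accessors added in `_samples.py`, the hunks above make generation raise `SampleLimitExceededError` as soon as a sample's budget is exhausted. A condensed sketch of the two guard conditions, with totals and limits passed as parameters rather than read from the active sample context as the real code does:

from inspect_ai.util._limit import SampleLimitExceededError

def check_sample_limits(
    total_tokens: int, token_limit: int | None,
    total_messages: int, message_limit: int | None,
) -> None:
    # tokens overflow strictly (>) after usage is recorded, while messages
    # are checked before the call with >=, mirroring the hunks above
    if token_limit is not None and total_tokens > token_limit:
        raise SampleLimitExceededError("token", value=total_tokens, limit=token_limit)
    if message_limit is not None and total_messages >= message_limit:
        raise SampleLimitExceededError("message", value=total_messages, limit=message_limit)
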
inspect_ai/model/_model_output.py CHANGED
@@ -26,9 +26,14 @@ class ModelUsage(BaseModel):


 StopReason = Literal[
-    "stop", "max_tokens", "model_length", "tool_calls", "content_filter", "unknown"
+    "stop",
+    "max_tokens",
+    "model_length",
+    "tool_calls",
+    "content_filter",
+    "unknown",
 ]
-"""Reason that the model stopped generating."""
+"""Reason that the model stopped or failed to generate."""


 class TopLogprob(BaseModel):