inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (95)
  1. inspect_ai/__init__.py +1 -0
  2. inspect_ai/_cli/common.py +1 -1
  3. inspect_ai/_cli/trace.py +33 -20
  4. inspect_ai/_display/core/active.py +1 -1
  5. inspect_ai/_display/core/display.py +1 -1
  6. inspect_ai/_display/core/footer.py +1 -1
  7. inspect_ai/_display/core/progress.py +0 -6
  8. inspect_ai/_display/core/rich.py +1 -1
  9. inspect_ai/_display/rich/display.py +2 -2
  10. inspect_ai/_display/textual/app.py +15 -17
  11. inspect_ai/_display/textual/widgets/clock.py +3 -3
  12. inspect_ai/_display/textual/widgets/samples.py +6 -13
  13. inspect_ai/_eval/context.py +9 -1
  14. inspect_ai/_eval/score.py +4 -10
  15. inspect_ai/_eval/task/results.py +5 -4
  16. inspect_ai/_eval/task/run.py +6 -12
  17. inspect_ai/_eval/task/task.py +10 -0
  18. inspect_ai/_util/ansi.py +31 -0
  19. inspect_ai/_util/format.py +7 -0
  20. inspect_ai/_util/logger.py +12 -12
  21. inspect_ai/_util/throttle.py +10 -1
  22. inspect_ai/_util/trace.py +43 -47
  23. inspect_ai/_util/transcript.py +4 -0
  24. inspect_ai/_util/vscode.py +51 -0
  25. inspect_ai/_view/notify.py +2 -1
  26. inspect_ai/_view/www/App.css +22 -1
  27. inspect_ai/_view/www/dist/assets/index.css +2374 -2
  28. inspect_ai/_view/www/dist/assets/index.js +29622 -24424
  29. inspect_ai/_view/www/log-schema.json +138 -90
  30. inspect_ai/_view/www/package.json +1 -0
  31. inspect_ai/_view/www/src/App.mjs +1 -0
  32. inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
  33. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
  34. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
  35. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
  36. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
  37. inspect_ai/_view/www/src/components/Tools.mjs +11 -3
  38. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
  39. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
  40. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
  41. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
  42. inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
  43. inspect_ai/_view/www/src/types/log.d.ts +26 -12
  44. inspect_ai/_view/www/yarn.lock +44 -0
  45. inspect_ai/approval/_apply.py +4 -0
  46. inspect_ai/approval/_human/panel.py +5 -8
  47. inspect_ai/dataset/_dataset.py +51 -10
  48. inspect_ai/dataset/_util.py +31 -3
  49. inspect_ai/log/__init__.py +2 -0
  50. inspect_ai/log/_log.py +5 -2
  51. inspect_ai/model/_call_tools.py +4 -2
  52. inspect_ai/model/_chat_message.py +3 -0
  53. inspect_ai/model/_model.py +42 -1
  54. inspect_ai/model/_providers/anthropic.py +4 -0
  55. inspect_ai/model/_render.py +9 -2
  56. inspect_ai/scorer/_metric.py +12 -1
  57. inspect_ai/solver/__init__.py +2 -0
  58. inspect_ai/solver/_human_agent/agent.py +83 -0
  59. inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
  60. inspect_ai/solver/_human_agent/commands/clock.py +70 -0
  61. inspect_ai/solver/_human_agent/commands/command.py +59 -0
  62. inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
  63. inspect_ai/solver/_human_agent/commands/note.py +42 -0
  64. inspect_ai/solver/_human_agent/commands/score.py +80 -0
  65. inspect_ai/solver/_human_agent/commands/status.py +62 -0
  66. inspect_ai/solver/_human_agent/commands/submit.py +151 -0
  67. inspect_ai/solver/_human_agent/install.py +222 -0
  68. inspect_ai/solver/_human_agent/panel.py +252 -0
  69. inspect_ai/solver/_human_agent/service.py +45 -0
  70. inspect_ai/solver/_human_agent/state.py +55 -0
  71. inspect_ai/solver/_human_agent/view.py +24 -0
  72. inspect_ai/solver/_task_state.py +28 -2
  73. inspect_ai/tool/_tool.py +10 -2
  74. inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
  75. inspect_ai/util/__init__.py +8 -4
  76. inspect_ai/{_util/display.py → util/_display.py} +6 -0
  77. inspect_ai/util/_panel.py +31 -9
  78. inspect_ai/util/_sandbox/__init__.py +0 -3
  79. inspect_ai/util/_sandbox/context.py +5 -1
  80. inspect_ai/util/_sandbox/docker/compose.py +16 -10
  81. inspect_ai/util/_sandbox/docker/docker.py +9 -6
  82. inspect_ai/util/_sandbox/docker/internal.py +1 -1
  83. inspect_ai/util/_sandbox/docker/util.py +2 -2
  84. inspect_ai/util/_sandbox/environment.py +6 -5
  85. inspect_ai/util/_sandbox/local.py +1 -1
  86. inspect_ai/util/_sandbox/service.py +22 -7
  87. inspect_ai/util/_store.py +5 -6
  88. inspect_ai/util/_store_model.py +110 -0
  89. inspect_ai/util/_throttle.py +32 -0
  90. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
  91. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
  92. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
  93. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
  94. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
  95. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0

inspect_ai/_view/www/src/types/log.d.ts CHANGED
@@ -29,6 +29,7 @@ export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown];
  export type Model = string;
  export type ModelBaseUrl = string | null;
  export type Limit = number | [unknown, unknown] | null;
+ export type SampleId = string | number | (string | number)[] | null;
  export type Epochs = number | null;
  export type EpochsReducer = string[] | null;
  export type Trace = boolean | null;
@@ -42,10 +43,12 @@ export type TimeLimit = number | null;
  export type MaxSamples = number | null;
  export type MaxTasks = number | null;
  export type MaxSubprocesses = number | null;
+ export type MaxSandboxes = number | null;
  export type SandboxCleanup = boolean | null;
  export type LogSamples = boolean | null;
  export type LogImages = boolean | null;
  export type LogBuffer = number | null;
+ export type ScoreDisplay = boolean | null;
  export type Type = "git";
  export type Origin = string;
  export type Commit = string;
@@ -76,6 +79,7 @@ export type TopLogprobs = number | null;
  export type ParallelToolCalls = boolean | null;
  export type MaxToolOutput = number | null;
  export type CachePrompt = "auto" | boolean | null;
+ export type ReasoningEffort = ("low" | "medium" | "high") | null;
  export type TotalSamples = number;
  export type CompletedSamples = number;
  export type Name3 = string;
@@ -119,6 +123,7 @@ export type Role = "system";
  export type Content1 = string | (ContentText | ContentImage)[];
  export type Source1 = ("input" | "generate") | null;
  export type Role1 = "user";
+ export type ToolCallId = string | null;
  export type Content2 = string | (ContentText | ContentImage)[];
  export type Source2 = ("input" | "generate") | null;
  export type Role2 = "assistant";
@@ -133,7 +138,7 @@ export type Content3 = string;
  export type Content4 = string | (ContentText | ContentImage)[];
  export type Source3 = ("input" | "generate") | null;
  export type Role3 = "tool";
- export type ToolCallId = string | null;
+ export type ToolCallId1 = string | null;
  export type Function1 = string | null;
  export type Type4 =
  | "parsing"
@@ -241,14 +246,10 @@ export type Name5 = string;
  export type Description = string;
  export type Type6 = "object";
  export type Type7 =
- | "string"
- | "integer"
- | "number"
- | "boolean"
- | "array"
- | "object"
- | "null";
+ | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
+ | null;
  export type Description1 = string | null;
+ export type Enum = unknown[] | null;
  export type Properties1 = {
  [k: string]: ToolParam;
  } | null;
@@ -267,7 +268,13 @@ export type Event5 = "tool";
  export type Type8 = "function";
  export type Id3 = string;
  export type Function2 = string;
- export type Result = string | number | boolean | (ContentText | ContentImage)[];
+ export type Result =
+ | string
+ | number
+ | boolean
+ | ContentText
+ | ContentImage
+ | (ContentText | ContentImage)[];
  export type Truncated = [unknown, unknown] | null;
  export type Timestamp6 = string;
  export type Pending6 = boolean | null;
@@ -388,7 +395,7 @@ export type Value2 =
  export type Answer1 = string | null;
  export type Explanation2 = string | null;
  export type Metadata8 = {} | null;
- export type SampleId = string | number | null;
+ export type SampleId1 = string | number | null;
  export type Samples2 = SampleScore[];
  export type Location1 = string;

@@ -438,6 +445,7 @@ export interface EvalDataset {
  export interface ModelArgs {}
  export interface EvalConfig {
  limit: Limit;
+ sample_id: SampleId;
  epochs: Epochs;
  epochs_reducer: EpochsReducer;
  trace: Trace;
@@ -449,10 +457,12 @@ export interface EvalConfig {
  max_samples: MaxSamples;
  max_tasks: MaxTasks;
  max_subprocesses: MaxSubprocesses;
+ max_sandboxes: MaxSandboxes;
  sandbox_cleanup: SandboxCleanup;
  log_samples: LogSamples;
  log_images: LogImages;
  log_buffer: LogBuffer;
+ score_display: ScoreDisplay;
  }
  export interface ApprovalPolicyConfig {
  approvers: Approvers;
@@ -523,6 +533,7 @@ export interface GenerateConfig {
  parallel_tool_calls: ParallelToolCalls;
  max_tool_output: MaxToolOutput;
  cache_prompt: CachePrompt;
+ reasoning_effort: ReasoningEffort;
  }
  export interface EvalResults {
  total_samples: TotalSamples;
@@ -607,6 +618,7 @@ export interface ChatMessageUser {
  content: Content1;
  source: Source1;
  role: Role1;
+ tool_call_id: ToolCallId;
  }
  export interface ChatMessageAssistant {
  content: Content2;
@@ -635,7 +647,7 @@ export interface ChatMessageTool {
  content: Content4;
  source: Source3;
  role: Role3;
- tool_call_id: ToolCallId;
+ tool_call_id: ToolCallId1;
  function: Function1;
  error: ToolCallError | null;
  }
@@ -825,6 +837,7 @@ export interface ToolParam {
  type: Type7;
  description: Description1;
  default: Default;
+ enum: Enum;
  items: ToolParam | null;
  properties: Properties1;
  additionalProperties: Additionalproperties;
@@ -862,6 +875,7 @@ export interface GenerateConfig1 {
  parallel_tool_calls: ParallelToolCalls;
  max_tool_output: MaxToolOutput;
  cache_prompt: CachePrompt;
+ reasoning_effort: ReasoningEffort;
  }
  /**
  * Model call (raw request/response data).
@@ -1031,5 +1045,5 @@ export interface SampleScore {
  answer: Answer1;
  explanation: Explanation2;
  metadata: Metadata8;
- sample_id: SampleId;
+ sample_id: SampleId1;
  }

inspect_ai/_view/www/yarn.lock CHANGED
@@ -131,6 +131,13 @@
  dependencies:
  "@babel/types" "^7.25.2"

+ "@babel/runtime@^7.21.0":
+ version "7.26.0"
+ resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.0.tgz#8600c2f595f277c60815256418b85356a65173c1"
+ integrity sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==
+ dependencies:
+ regenerator-runtime "^0.14.0"
+
  "@babel/template@^7.25.0":
  version "7.25.0"
  resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.25.0.tgz#e733dc3134b4fede528c15bc95e89cb98c52592a"
@@ -525,6 +532,14 @@ argparse@^2.0.1:
  resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
  integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==

+ asciinema-player@^3.8.1:
+ version "3.8.1"
+ resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.8.1.tgz#d56ccc04a85570559900b2297cf44c2a7453d118"
+ integrity sha512-NkpbFg81Y6iJFpDRndakLCQ0G26XSpvuT3vJTFjMRgHb26lqHgRNY9gun54e5MehZ4fEDNYkMZv+z6MfZ8c2aA==
+ dependencies:
+ "@babel/runtime" "^7.21.0"
+ solid-js "^1.3.0"
+
  babel-plugin-prismjs@^2.1.0:
  version "2.1.0"
  resolved "https://registry.yarnpkg.com/babel-plugin-prismjs/-/babel-plugin-prismjs-2.1.0.tgz#ade627896106326ad04d6d77fba92877618de571"
@@ -647,6 +662,11 @@ cross-spawn@^7.0.2:
  shebang-command "^2.0.0"
  which "^2.0.1"

+ csstype@^3.1.0:
+ version "3.1.3"
+ resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81"
+ integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==
+
  cuint@^0.2.2:
  version "0.2.2"
  resolved "https://registry.yarnpkg.com/cuint/-/cuint-0.2.2.tgz#408086d409550c2631155619e9fa7bcadc3b991b"
@@ -1242,6 +1262,11 @@ queue-microtask@^1.2.2:
  resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
  integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==

+ regenerator-runtime@^0.14.0:
+ version "0.14.1"
+ resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
+ integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==
+
  resolve-from@^4.0.0:
  version "4.0.0"
  resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-4.0.0.tgz#4abcd852ad32dd7baabfe9b40e00a36db5f392e6"
@@ -1294,6 +1319,16 @@ semver@^6.0.0, semver@^6.3.1:
  resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4"
  integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==

+ seroval-plugins@^1.1.0:
+ version "1.1.1"
+ resolved "https://registry.yarnpkg.com/seroval-plugins/-/seroval-plugins-1.1.1.tgz#1e0c175e13bb4c620d4ce5916fbbb63de70c31f9"
+ integrity sha512-qNSy1+nUj7hsCOon7AO4wdAIo9P0jrzAMp18XhiOzA6/uO5TKtP7ScozVJ8T293oRIvi5wyCHSM4TrJo/c/GJA==
+
+ seroval@^1.1.0:
+ version "1.1.1"
+ resolved "https://registry.yarnpkg.com/seroval/-/seroval-1.1.1.tgz#7630e0c17a3efa6be43f17ad6bcf9f966a61b443"
+ integrity sha512-rqEO6FZk8mv7Hyv4UCj3FD3b6Waqft605TLfsCe/BiaylRpyyMC0b+uA5TJKawX3KzMrdi3wsLbCaLplrQmBvQ==
+
  shebang-command@^2.0.0:
  version "2.0.0"
  resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea"
@@ -1306,6 +1341,15 @@ shebang-regex@^3.0.0:
  resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172"
  integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==

+ solid-js@^1.3.0:
+ version "1.9.3"
+ resolved "https://registry.yarnpkg.com/solid-js/-/solid-js-1.9.3.tgz#078f026fe32f6b9b48e8e0557be150f0c2d610a9"
+ integrity sha512-5ba3taPoZGt9GY3YlsCB24kCg0Lv/rie/HTD4kG6h4daZZz7+yK02xn8Vx8dLYBc9i6Ps5JwAbEiqjmKaLB3Ag==
+ dependencies:
+ csstype "^3.1.0"
+ seroval "^1.1.0"
+ seroval-plugins "^1.1.0"
+
  source-map-js@^1.2.0:
  version "1.2.0"
  resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.2.0.tgz#16b809c162517b5b8c3e7dcd315a2a5c2612b2af"

inspect_ai/approval/_apply.py CHANGED
@@ -75,4 +75,8 @@ def init_tool_approval(approval: list[ApprovalPolicy] | None) -> None:
  _tool_approver.set(None)


+ def have_tool_approval() -> bool:
+ return _tool_approver.get(None) is not None
+
+
  _tool_approver: ContextVar[Approver | None] = ContextVar("tool_approver", default=None)

inspect_ai/approval/_human/panel.py CHANGED
@@ -24,8 +24,6 @@ from .util import (
  render_tool_approval,
  )

- PANEL_TITLE = "Approvals"
-

  async def panel_approval(
  message: str,
@@ -35,7 +33,7 @@ async def panel_approval(
  choices: list[ApprovalDecision],
  ) -> Approval:
  # ensure the approvals panel is shown
- await input_panel(PANEL_TITLE, ApprovalInputPanel)
+ await input_panel(ApprovalInputPanel)

  # submit to human approval manager (will be picked up by panel)
  approvals = human_approval_manager()
@@ -52,11 +50,10 @@


  class ApprovalInputPanel(InputPanel):
+ DEFAULT_TITLE = "Approval"
+
  DEFAULT_CSS = """
  ApprovalInputPanel {
- width: 1fr;
- height: 1fr;
- padding: 0 1 1 1;
  layout: grid;
  grid-size: 1 3;
  grid-rows: auto 1fr auto;
@@ -88,7 +85,7 @@ class ApprovalInputPanel(InputPanel):
  self._approvals = human_approval_manager().approval_requests()
  if len(self._approvals) > 0:
  approval_id, approval_request = self._approvals[0]
- self.title = f"{PANEL_TITLE} ({len(self._approvals):,})"
+ self.title = f"{self.DEFAULT_TITLE} ({len(self._approvals):,})"
  heading.request = approval_request
  content.approval = approval_request.request
  actions.approval_request = approval_id, approval_request
@@ -97,7 +94,7 @@
  actions.activate()
  self.visible = True
  else:
- self.title = PANEL_TITLE
+ self.title = self.DEFAULT_TITLE
  heading.request = None
  content.approval = None
  actions.approval_request = None
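
The approval panel changes above rely on the updated input_panel() helper, which now takes only the panel type and reads the title from the panel's DEFAULT_TITLE class attribute. A minimal sketch of a custom panel under that assumption (MyToolPanel and its title are hypothetical):

from inspect_ai.util import InputPanel, input_panel

class MyToolPanel(InputPanel):
    # panels now declare their title as a class attribute rather than
    # passing it to input_panel()
    DEFAULT_TITLE = "My Tool"

async def show_my_tool_panel() -> None:
    # show (or focus) the panel; its title comes from DEFAULT_TITLE
    await input_panel(MyToolPanel)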

inspect_ai/dataset/_dataset.py CHANGED
@@ -1,16 +1,19 @@
  import abc
  import random
+ from dataclasses import dataclass, field
  from typing import (
  TYPE_CHECKING,
  Any,
  Callable,
  Iterator,
  Sequence,
+ Type,
+ TypeVar,
  Union,
  overload,
  )

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, ValidationError
  from typing_extensions import override

  from inspect_ai.model import ChatMessage
@@ -20,6 +23,8 @@ from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
  if TYPE_CHECKING:
  from _typeshed import SupportsRichComparison

+ MT = TypeVar("MT", bound=BaseModel)
+

  class Sample(BaseModel):
  def __init__(
@@ -76,6 +81,20 @@ class Sample(BaseModel):
  metadata: dict[str, Any] | None = Field(default=None)
  """Arbitrary metadata associated with the sample."""

+ def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+ """Metadata as a Pydantic model.
+
+ Args:
+ metadata_cls: BaseModel derived class.
+
+ Returns:
+ BaseModel: Instance of metadata_cls.
+ """
+ if self.metadata is None:
+ raise ValueError("Sample does not have metadata")
+
+ return metadata_as(self.metadata, metadata_cls)
+
  sandbox: SandboxEnvironmentSpec | None = Field(default=None)
  """Sandbox environment type and optional config file."""

@@ -177,7 +196,8 @@ class Dataset(Sequence[Sample], abc.ABC):
  """


- class FieldSpec(BaseModel):
+ @dataclass
+ class FieldSpec:
  r"""Specification for mapping data source fields to sample fields.

  Args:
@@ -191,28 +211,28 @@
  setup (str): Optional. Setup script to run for sample .
  """

- input: str = Field(default="input")
+ input: str = field(default="input")
  """Name of the field containing the sample input."""

- target: str = Field(default="target")
+ target: str = field(default="target")
  """Name of the field containing the sample target."""

- choices: str = Field(default="choices")
+ choices: str = field(default="choices")
  """Name of field containing the list of answer choices."""

- id: str = Field(default="id")
+ id: str = field(default="id")
  """ Unique identifier for the sample."""

- metadata: list[str] | None = Field(default=None)
+ metadata: list[str] | Type[BaseModel] | None = field(default=None)
  """List of additional field names that should be read as metadata."""

- sandbox: str = Field(default="sandbox")
+ sandbox: str = field(default="sandbox")
  """Sandbox type along with optional config file."""

- files: str = Field(default="files")
+ files: str = field(default="files")
  """Files that go along wtih the sample."""

- setup: str = Field(default="setup")
+ setup: str = field(default="setup")
  """Setup script to run for sample (run within default SandboxEnvironment)."""


@@ -313,3 +333,24 @@ class MemoryDataset(Dataset):
  samples=[sample for sample in self if predicate(sample)],
  shuffled=self.shuffled,
  )
+
+
+ def metadata_as(metadata: dict[str, Any], metadata_cls: Type[MT]) -> MT:
+ # validate that metadata_cls is frozen
+ if not metadata_cls.model_config.get("frozen", False):
+ raise ValueError(
+ f"Metadata model {metadata_cls.__name__} must have frozen=True"
+ )
+
+ # filter to only fields in the model
+ model_fields = {
+ k: v
+ for k, v in metadata.items()
+ if k in metadata_cls.__pydantic_fields__.keys()
+ }
+
+ # parse and return model instance
+ try:
+ return metadata_cls(**model_fields)
+ except ValidationError as ex:
+ raise ValueError(f"Could not parse metadata into {metadata_cls.__name__}: {ex}")
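
The new Sample.metadata_as() reads sample metadata into a typed, frozen Pydantic model, filtering out keys the model does not declare. A minimal usage sketch (the DomainMetadata model and sample values are illustrative):

from pydantic import BaseModel

from inspect_ai.dataset import Sample

class DomainMetadata(BaseModel, frozen=True):
    # frozen=True is required by metadata_as()
    domain: str
    difficulty: int = 1

sample = Sample(
    input="What is 2 + 2?",
    target="4",
    metadata={"domain": "arithmetic", "difficulty": 1, "source": "textbook"},
)

# keys not declared on the model (here "source") are filtered out
meta = sample.metadata_as(DomainMetadata)
print(meta.domain, meta.difficulty)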

inspect_ai/dataset/_util.py CHANGED
@@ -1,6 +1,8 @@
  import json
  from typing import Any, Iterable, cast

+ from pydantic import ValidationError
+
  from inspect_ai.model import (
  ChatMessage,
  ChatMessageAssistant,
@@ -33,9 +35,35 @@ def record_to_sample_fn(
  # collect metadata if specified
  metadata: dict[str, Any] | None = None
  if sample_fields.metadata:
- metadata = {}
- for name in sample_fields.metadata:
- metadata[name] = record.get(name)
+ if isinstance(sample_fields.metadata, list):
+ metadata = {}
+ for name in sample_fields.metadata:
+ metadata[name] = record.get(name)
+ else:
+ # must be frozen
+ if not sample_fields.metadata.model_config.get("frozen", False):
+ raise ValueError(
+ f"Metadata model {sample_fields.metadata.__name__} must have frozen=True"
+ )
+
+ # filter to only fields in the model
+ model_fields = record.get("metadata", None)
+ if isinstance(model_fields, str):
+ model_fields = json.loads(model_fields)
+ elif model_fields is None:
+ model_fields = {
+ k: v
+ for k, v in record.items()
+ if k in sample_fields.metadata.__pydantic_fields__.keys()
+ }
+
+ # parse and return metadata
+ try:
+ metadata = sample_fields.metadata(**model_fields).model_dump()
+ except ValidationError as ex:
+ raise ValueError(
+ f"Could not parse metadata into {sample_fields.metadata.__name__}: {ex}"
+ )
  elif "metadata" in record:
  metadata_field = record.get("metadata")
  if isinstance(metadata_field, str):
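
FieldSpec.metadata correspondingly accepts a BaseModel type as well as a list of field names; each record is then validated against the model, taken either from a "metadata" field (dict or JSON string) or from matching top-level keys. A sketch assuming a hypothetical questions.jsonl file:

from pydantic import BaseModel

from inspect_ai.dataset import FieldSpec, json_dataset

class DomainMetadata(BaseModel, frozen=True):
    domain: str
    difficulty: int = 1

# validation errors surface as ValueError when the dataset is read
dataset = json_dataset(
    "questions.jsonl",
    sample_fields=FieldSpec(metadata=DomainMetadata),
)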

inspect_ai/log/__init__.py CHANGED
@@ -23,6 +23,7 @@ from ._log import (
  EvalRevision,
  EvalSample,
  EvalSampleReductions,
+ EvalSampleScore,
  EvalScore,
  EvalSpec,
  EvalStats,
@@ -60,6 +61,7 @@ __all__ = [
  "EvalResults",
  "EvalRevision",
  "EvalSample",
+ "EvalSampleScore",
  "EvalSampleReductions",
  "EvalScore",
  "EvalSpec",

inspect_ai/log/_log.py CHANGED
@@ -23,7 +23,6 @@ from inspect_ai.model import (
  ModelUsage,
  )
  from inspect_ai.scorer import Score
- from inspect_ai.scorer._metric import SampleScore
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec

  from ._transcript import Event
@@ -301,6 +300,10 @@ class EvalScore(BaseModel):
  """Additional scorer metadata."""


+ class EvalSampleScore(Score):
+ sample_id: str | int | None = Field(default=None)
+
+
  class EvalSampleReductions(BaseModel):
  scorer: str
  """Name the of scorer"""
@@ -308,7 +311,7 @@ class EvalSampleReductions(BaseModel):
  reducer: str | None = Field(default=None)
  """Name the of reducer"""

- samples: list[SampleScore]
+ samples: list[EvalSampleScore]
  """List of reduced scores"""


inspect_ai/model/_call_tools.py CHANGED
@@ -118,10 +118,12 @@ async def call_tools(
  # massage result, leave list[Content] alone, convert all other
  # types to string as that is what the model APIs accept
  truncated: tuple[int, int] | None = None
- if isinstance(result, list) and (
+ if isinstance(result, ContentText | ContentImage):
+ content: str | list[Content] = [result]
+ elif isinstance(result, list) and (
  isinstance(result[0], ContentText | ContentImage)
  ):
- content: str | list[Content] = result
+ content = result
  else:
  content = str(result)

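call_tools() now also accepts a bare ContentText or ContentImage result and wraps it in a list before handing it back to the model. A sketch of a tool that returns a single image (the screenshot tool and its data URI are hypothetical):

from inspect_ai.model import ContentImage
from inspect_ai.tool import tool

@tool
def screenshot():
    async def execute() -> ContentImage:
        """Capture a screenshot of the current page."""
        # placeholder data URI; a real tool would capture an actual image
        return ContentImage(image="data:image/png;base64,...")

    return execute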

inspect_ai/model/_chat_message.py CHANGED
@@ -74,6 +74,9 @@ class ChatMessageUser(ChatMessageBase):
  role: Literal["user"] = Field(default="user")
  """Conversation role."""

+ tool_call_id: str | None = Field(default=None)
+ """ID of tool call this message has the content payload for."""
+

  class ChatMessageAssistant(ChatMessageBase):
  role: Literal["assistant"] = Field(default="assistant")

inspect_ai/model/_model.py CHANGED
@@ -19,7 +19,7 @@ from tenacity import (
  )

  from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
- from inspect_ai._util.content import ContentText
+ from inspect_ai._util.content import Content, ContentImage, ContentText
  from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
  from inspect_ai._util.platform import platform_init
  from inspect_ai._util.registry import (
@@ -40,6 +40,7 @@ from ._chat_message import (
  ChatMessage,
  ChatMessageAssistant,
  ChatMessageSystem,
+ ChatMessageTool,
  ChatMessageUser,
  )
  from ._generate_config import (
@@ -163,6 +164,10 @@ class ModelAPI(abc.ABC):
  """Any tool use in a message stream means that tools must be passed."""
  return False

+ def tool_result_images(self) -> bool:
+ """Tool results can containe images"""
+ return False
+

  class Model:
  """Model interface."""
@@ -291,6 +296,11 @@ class Model:
  tools = []
  tool_choice = "none"

+ # break tool image content out into user messages if the model doesn't
+ # support tools returning images
+ if not self.api.tool_result_images():
+ input = tool_result_images_as_user_message(input)
+
  # optionally collapse *consecutive* messages into one -
  # (some apis e.g. anthropic require this)
  if self.api.collapse_user_messages():
@@ -693,6 +703,37 @@ def simple_input_messages(
  return messages


+ def tool_result_images_as_user_message(
+ messages: list[ChatMessage],
+ ) -> list[ChatMessage]:
+ return functools.reduce(tool_result_images_reducer, messages, [])
+
+
+ def tool_result_images_reducer(
+ messages: list[ChatMessage],
+ message: ChatMessage,
+ ) -> list[ChatMessage]:
+ # append the message
+ messages.append(message)
+
+ # if there are tool result images, pull them out into a ChatUserMessage
+ if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
+ user_content: list[Content] = []
+ for i in range(0, len(message.content)):
+ if isinstance(message.content[i], ContentImage):
+ user_content.append(message.content[i])
+ message.content[i] = ContentText(
+ text="Image content is in the message below."
+ )
+ if len(user_content) > 0:
+ messages.append(
+ ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
+ )
+
+ # return messages
+ return messages
+
+
  # Functions to reduce consecutive user messages to a single user message -> required for some models
  def collapse_consecutive_user_messages(
  messages: list[ChatMessage],
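
With tool_result_images_as_user_message(), providers whose ModelAPI.tool_result_images() returns False never see images inside tool messages: the image is swapped for a placeholder ContentText and re-sent as a user message carrying the same tool_call_id. A sketch of the message shapes involved (contents are illustrative):

from inspect_ai.model import (
    ChatMessageTool,
    ChatMessageUser,
    ContentImage,
    ContentText,
)

# a tool result that mixes text and an image
tool_message = ChatMessageTool(
    content=[
        ContentText(text="Here is the chart you asked for."),
        ContentImage(image="data:image/png;base64,..."),
    ],
    tool_call_id="call_1",
    function="make_chart",
)

# after the reducer runs, the tool message keeps a placeholder ContentText and
# the image arrives in a follow-up user message tagged with the tool_call_id
follow_up = ChatMessageUser(
    content=[ContentImage(image="data:image/png;base64,...")],
    tool_call_id="call_1",
)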

inspect_ai/model/_providers/anthropic.py CHANGED
@@ -229,6 +229,10 @@ class AnthropicAPI(ModelAPI):
  def tools_required(self) -> bool:
  return True

+ @override
+ def tool_result_images(self) -> bool:
+ return True
+
  # convert some common BadRequestError states into 'refusal' model output
  def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | None:
  error = exception_message(ex).lower()

inspect_ai/model/_render.py CHANGED
@@ -3,13 +3,20 @@ from rich.console import RenderableType
  from inspect_ai.tool._tool_call import ToolCall
  from inspect_ai.tool._tool_transcript import transcript_tool_call

- from ._chat_message import ChatMessage, ChatMessageAssistant, ChatMessageTool
+ from ._chat_message import (
+ ChatMessage,
+ ChatMessageAssistant,
+ ChatMessageTool,
+ ChatMessageUser,
+ )


  def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessage]:
  preceding: list[ChatMessage] = []
  for m in reversed(messages):
- if not isinstance(m, ChatMessageTool | ChatMessageAssistant):
+ if not isinstance(m, ChatMessageTool | ChatMessageAssistant) and not (
+ isinstance(m, ChatMessageUser) and m.tool_call_id
+ ):
  preceding.append(m)
  else:
  break

inspect_ai/scorer/_metric.py CHANGED
@@ -90,6 +90,13 @@ class Score(BaseModel):
  """Read the score as a boolean."""
  return bool(self._as_scalar())

+ def as_list(self) -> list[str | int | float | bool]:
+ """Read the score as a list."""
+ if isinstance(self.value, list):
+ return self.value
+ else:
+ raise ValueError("This score is not a list")
+
  def as_dict(self) -> dict[str, str | int | float | bool | None]:
  """Read the score as a dictionary."""
  if isinstance(self.value, dict):
@@ -104,13 +111,17 @@
  raise ValueError("This score is not a scalar")


- class SampleScore(Score):
+ class SampleScore(BaseModel):
  """Score for a Sample

  Args:
+ score: Score
  sample_id: (str | int | None) Unique id of a sample
  """

+ score: Score
+ """A score"""
+
  sample_id: str | int | None = Field(default=None)
  """A sample id"""
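
SampleScore now wraps a Score in a score field (plus sample_id) rather than subclassing it, and Score gains as_list() for list-valued scores. A minimal sketch, importing SampleScore from the private module where the diff defines it (values are illustrative):

from inspect_ai.scorer import Score
from inspect_ai.scorer._metric import SampleScore

sample_score = SampleScore(
    score=Score(value=[1, 0, 1], answer="B"),
    sample_id="sample-001",
)

print(sample_score.score.as_list())  # [1, 0, 1]
print(sample_score.sample_id)        # sample-001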