inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +4 -2
  3. inspect_ai/_cli/eval.py +2 -0
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +0 -2
  6. inspect_ai/_display/core/panel.py +1 -1
  7. inspect_ai/_display/rich/display.py +4 -4
  8. inspect_ai/_display/textual/app.py +4 -1
  9. inspect_ai/_display/textual/widgets/samples.py +41 -5
  10. inspect_ai/_eval/eval.py +32 -20
  11. inspect_ai/_eval/evalset.py +7 -5
  12. inspect_ai/_eval/run.py +16 -11
  13. inspect_ai/_eval/task/__init__.py +2 -2
  14. inspect_ai/_eval/task/images.py +40 -25
  15. inspect_ai/_eval/task/run.py +141 -119
  16. inspect_ai/_eval/task/task.py +140 -25
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/content.py +23 -1
  19. inspect_ai/_util/datetime.py +1 -1
  20. inspect_ai/_util/deprecation.py +1 -1
  21. inspect_ai/_util/images.py +20 -17
  22. inspect_ai/_util/json.py +11 -1
  23. inspect_ai/_util/kvstore.py +73 -0
  24. inspect_ai/_util/logger.py +2 -1
  25. inspect_ai/_util/notgiven.py +18 -0
  26. inspect_ai/_util/thread.py +5 -0
  27. inspect_ai/_util/trace.py +39 -3
  28. inspect_ai/_util/transcript.py +36 -7
  29. inspect_ai/_view/www/.prettierrc.js +12 -0
  30. inspect_ai/_view/www/dist/assets/index.js +322 -226
  31. inspect_ai/_view/www/log-schema.json +221 -138
  32. inspect_ai/_view/www/src/App.mjs +18 -9
  33. inspect_ai/_view/www/src/Types.mjs +0 -1
  34. inspect_ai/_view/www/src/api/Types.mjs +15 -4
  35. inspect_ai/_view/www/src/api/api-http.mjs +2 -0
  36. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
  37. inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
  38. inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
  39. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  40. inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
  41. inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
  42. inspect_ai/_view/www/src/components/Tools.mjs +18 -3
  43. inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
  44. inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
  45. inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
  46. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
  47. inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
  48. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
  49. inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
  50. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
  51. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
  52. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
  53. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
  54. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
  55. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
  56. inspect_ai/_view/www/src/types/log.d.ts +53 -35
  57. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  58. inspect_ai/approval/_human/util.py +2 -2
  59. inspect_ai/dataset/_sources/csv.py +2 -1
  60. inspect_ai/dataset/_sources/json.py +2 -1
  61. inspect_ai/dataset/_sources/util.py +15 -7
  62. inspect_ai/log/_condense.py +11 -1
  63. inspect_ai/log/_log.py +27 -5
  64. inspect_ai/log/_recorders/eval.py +21 -8
  65. inspect_ai/log/_samples.py +10 -5
  66. inspect_ai/log/_transcript.py +28 -1
  67. inspect_ai/model/__init__.py +10 -2
  68. inspect_ai/model/_call_tools.py +82 -17
  69. inspect_ai/model/_chat_message.py +2 -4
  70. inspect_ai/model/{_trace.py → _conversation.py} +9 -8
  71. inspect_ai/model/_model.py +2 -2
  72. inspect_ai/model/_providers/anthropic.py +9 -7
  73. inspect_ai/model/_providers/azureai.py +6 -4
  74. inspect_ai/model/_providers/bedrock.py +6 -4
  75. inspect_ai/model/_providers/google.py +103 -14
  76. inspect_ai/model/_providers/groq.py +7 -5
  77. inspect_ai/model/_providers/hf.py +11 -6
  78. inspect_ai/model/_providers/mistral.py +6 -9
  79. inspect_ai/model/_providers/openai.py +34 -8
  80. inspect_ai/model/_providers/openai_o1.py +10 -12
  81. inspect_ai/model/_providers/vertex.py +17 -4
  82. inspect_ai/scorer/__init__.py +13 -2
  83. inspect_ai/scorer/_metrics/__init__.py +2 -2
  84. inspect_ai/scorer/_metrics/std.py +3 -3
  85. inspect_ai/tool/__init__.py +9 -1
  86. inspect_ai/tool/_tool.py +9 -2
  87. inspect_ai/tool/_tool_info.py +2 -1
  88. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
  89. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
  90. inspect_ai/util/__init__.py +4 -3
  91. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  92. inspect_ai/util/_display.py +14 -4
  93. inspect_ai/util/_sandbox/context.py +12 -13
  94. inspect_ai/util/_sandbox/docker/compose.py +24 -13
  95. inspect_ai/util/_sandbox/docker/docker.py +20 -13
  96. inspect_ai/util/_sandbox/docker/util.py +2 -1
  97. inspect_ai/util/_sandbox/environment.py +13 -1
  98. inspect_ai/util/_sandbox/local.py +1 -0
  99. inspect_ai/util/_sandbox/self_check.py +18 -18
  100. inspect_ai/util/_store.py +2 -2
  101. inspect_ai/util/_subprocess.py +3 -3
  102. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
  103. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
  104. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
  105. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
  106. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
  107. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs CHANGED
@@ -37,7 +37,7 @@ export const ToolEventView = ({ id, event, style, depth }) => {
  functionCall=${functionCall}
  input=${input}
  inputType=${inputType}
- output=${event.result}
+ output=${event.error?.message || event.result}
  mode="compact"
  view=${event.view}
  />
inspect_ai/_view/www/src/types/log.d.ts CHANGED
@@ -32,7 +32,6 @@ export type Limit = number | [unknown, unknown] | null;
  export type SampleId = string | number | (string | number)[] | null;
  export type Epochs = number | null;
  export type EpochsReducer = string[] | null;
- export type Trace = boolean | null;
  export type Name1 = string;
  export type Tools = string | string[];
  export type Approvers = ApproverPolicyConfig[];
@@ -112,35 +111,49 @@ export type Input =
  | ChatMessageAssistant
  | ChatMessageTool
  )[];
- export type Content = string | (ContentText | ContentImage)[];
+ export type Content =
+ | string
+ | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
  export type Type1 = "text";
  export type Text = string;
  export type Type2 = "image";
  export type Image = string;
  export type Detail = "auto" | "low" | "high";
+ export type Type3 = "audio";
+ export type Audio = string;
+ export type Format = "wav" | "mp3";
+ export type Type4 = "video";
+ export type Video = string;
+ export type Format1 = "mp4" | "mpeg" | "mov";
  export type Source = ("input" | "generate") | null;
  export type Role = "system";
- export type Content1 = string | (ContentText | ContentImage)[];
+ export type Content1 =
+ | string
+ | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
  export type Source1 = ("input" | "generate") | null;
  export type Role1 = "user";
  export type ToolCallId = string | null;
- export type Content2 = string | (ContentText | ContentImage)[];
+ export type Content2 =
+ | string
+ | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
  export type Source2 = ("input" | "generate") | null;
  export type Role2 = "assistant";
  export type ToolCalls = ToolCall[] | null;
  export type Id1 = string;
  export type Function = string;
- export type Type3 = "function";
+ export type Type5 = "function";
  export type ParseError = string | null;
  export type Title = string | null;
- export type Format = "text" | "markdown";
+ export type Format2 = "text" | "markdown";
  export type Content3 = string;
- export type Content4 = string | (ContentText | ContentImage)[];
+ export type Content4 =
+ | string
+ | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
  export type Source3 = ("input" | "generate") | null;
  export type Role3 = "tool";
  export type ToolCallId1 = string | null;
  export type Function1 = string | null;
- export type Type4 =
+ export type Type6 =
  | "parsing"
  | "timeout"
  | "unicode_decode"
@@ -218,7 +231,7 @@ export type JsonValue = unknown;
  export type Timestamp1 = string;
  export type Pending1 = boolean | null;
  export type Event1 = "sample_limit";
- export type Type5 = "message" | "time" | "token" | "operator";
+ export type Type7 = "message" | "time" | "token" | "operator";
  export type Message2 = string;
  export type Limit1 = number | null;
  export type Timestamp2 = string;
@@ -244,8 +257,8 @@ export type Input2 = (
  )[];
  export type Name5 = string;
  export type Description = string;
- export type Type6 = "object";
- export type Type7 =
+ export type Type8 = "object";
+ export type Type9 =
  | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
  | null;
  export type Description1 = string | null;
@@ -265,7 +278,7 @@ export type Cache = ("read" | "write") | null;
  export type Timestamp5 = string;
  export type Pending5 = boolean | null;
  export type Event5 = "tool";
- export type Type8 = "function";
+ export type Type10 = "function";
  export type Id3 = string;
  export type Function2 = string;
  export type Result =
@@ -274,7 +287,9 @@ export type Result =
  | boolean
  | ContentText
  | ContentImage
- | (ContentText | ContentImage)[];
+ | ContentAudio
+ | ContentVideo
+ | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
  export type Truncated = [unknown, unknown] | null;
  export type Timestamp6 = string;
  export type Pending6 = boolean | null;
@@ -324,13 +339,13 @@ export type Timestamp12 = string;
  export type Pending12 = boolean | null;
  export type Event12 = "step";
  export type Action = "begin" | "end";
- export type Type9 = string | null;
+ export type Type11 = string | null;
  export type Name8 = string;
  export type Timestamp13 = string;
  export type Pending13 = boolean | null;
  export type Event13 = "subtask";
  export type Name9 = string;
- export type Type10 = string | null;
+ export type Type12 = string | null;
  export type Events2 = (
  | SampleInitEvent
  | SampleLimitEvent
@@ -379,7 +394,7 @@ export type Events = (
  | StepEvent
  | SubtaskEvent
  )[];
- export type Type11 = "context" | "time" | "message" | "token" | "operator";
+ export type Type13 = "context" | "time" | "message" | "token" | "operator";
  export type Limit2 = number;
  export type Reductions = EvalSampleReductions[] | null;
  export type Scorer1 = string;
@@ -396,7 +411,7 @@ export type Answer1 = string | null;
  export type Explanation2 = string | null;
  export type Metadata8 = {} | null;
  export type SampleId1 = string | number | null;
- export type Samples2 = SampleScore[];
+ export type Samples2 = EvalSampleScore[];
  export type Location1 = string;

  export interface EvalLog {
@@ -448,7 +463,6 @@ export interface EvalConfig {
  sample_id: SampleId;
  epochs: Epochs;
  epochs_reducer: EpochsReducer;
- trace: Trace;
  approval: ApprovalPolicyConfig | null;
  fail_on_error: FailOnError;
  message_limit: MessageLimit;
@@ -614,6 +628,16 @@ export interface ContentImage {
  image: Image;
  detail: Detail;
  }
+ export interface ContentAudio {
+ type: Type3;
+ audio: Audio;
+ format: Format;
+ }
+ export interface ContentVideo {
+ type: Type4;
+ video: Video;
+ format: Format1;
+ }
  export interface ChatMessageUser {
  content: Content1;
  source: Source1;
@@ -630,7 +654,7 @@ export interface ToolCall {
  id: Id1;
  function: Function;
  arguments: Arguments;
- type: Type3;
+ type: Type5;
  parse_error: ParseError;
  view: ToolCallContent | null;
  }
@@ -640,7 +664,7 @@ export interface Arguments {}
  */
  export interface ToolCallContent {
  title: Title;
- format: Format;
+ format: Format2;
  content: Content3;
  }
  export interface ChatMessageTool {
@@ -652,7 +676,7 @@ export interface ChatMessageTool {
  error: ToolCallError | null;
  }
  export interface ToolCallError {
- type: Type4;
+ type: Type6;
  message: Message1;
  }
  export interface ModelOutput {
@@ -735,7 +759,7 @@ export interface SampleLimitEvent {
  timestamp: Timestamp1;
  pending: Pending1;
  event: Event1;
- type: Type5;
+ type: Type7;
  message: Message2;
  limit: Limit1;
  }
@@ -822,7 +846,7 @@ export interface ToolInfo {
  * Description of tool parameters object in JSON Schema format.
  */
  export interface ToolParams {
- type: Type6;
+ type: Type8;
  properties: Properties;
  required: Required1;
  additionalProperties: Additionalproperties1;
@@ -834,7 +858,7 @@ export interface Properties {
  * Description of tool parameter in JSON Schema format.
  */
  export interface ToolParam {
- type: Type7;
+ type: Type9;
  description: Description1;
  default: Default;
  enum: Enum;
@@ -897,7 +921,7 @@ export interface ToolEvent {
  timestamp: Timestamp5;
  pending: Pending5;
  event: Event5;
- type: Type8;
+ type: Type10;
  id: Id3;
  function: Function2;
  arguments: Arguments1;
@@ -999,7 +1023,7 @@ export interface StepEvent {
  pending: Pending12;
  event: Event12;
  action: Action;
- type: Type9;
+ type: Type11;
  name: Name8;
  }
  /**
@@ -1010,7 +1034,7 @@ export interface SubtaskEvent {
  pending: Pending13;
  event: Event13;
  name: Name9;
- type: Type10;
+ type: Type12;
  input: Input4;
  result: Result1;
  events: Events2;
@@ -1026,7 +1050,7 @@ export interface Attachments {
  [k: string]: string;
  }
  export interface EvalSampleLimit {
- type: Type11;
+ type: Type13;
  limit: Limit2;
  }
  export interface EvalSampleReductions {
@@ -1034,13 +1058,7 @@ export interface EvalSampleReductions {
  reducer: Reducer1;
  samples: Samples2;
  }
- /**
- * Score for a Sample
- *
- * Args:
- * sample_id: (str | int | None) Unique id of a sample
- */
- export interface SampleScore {
+ export interface EvalSampleScore {
  value: Value2;
  answer: Answer1;
  explanation: Explanation2;
inspect_ai/_view/www/src/workspace/WorkSpace.mjs CHANGED
@@ -150,7 +150,7 @@ export const WorkSpace = ({

  // The samples tab
  // Currently only appears when the result is successful
- if (evalStatus !== "error" && sampleMode !== "none") {
+ if (sampleMode !== "none") {
  resolvedTabs.samples = {
  id: kEvalWorkspaceTabId,
  scrollable: samples.length === 1,
inspect_ai/approval/_human/util.py CHANGED
@@ -5,7 +5,7 @@ from rich.text import Text

  from inspect_ai._util.transcript import transcript_markdown
  from inspect_ai.tool._tool_call import ToolCallContent, ToolCallView
- from inspect_ai.util._trace import trace_enabled
+ from inspect_ai.util._display import display_type

  HUMAN_APPROVED = "Human operator approved tool call."
  HUMAN_REJECTED = "Human operator rejected the tool call."
@@ -18,7 +18,7 @@ def render_tool_approval(message: str, view: ToolCallView) -> list[RenderableTyp
  text_highlighter = ReprHighlighter()

  # ignore content if trace enabled
- message = message.strip() if not trace_enabled() else ""
+ message = message.strip() if display_type() != "conversation" else ""

  def add_view_content(view_content: ToolCallContent) -> None:
  if view_content.title:
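
The two hunks above swap the removed trace_enabled() check for the new display_type() helper. A minimal sketch of the same guard in isolation, assuming display_type is imported from inspect_ai.util._display exactly as in the hunk (the wrapper function itself is illustrative, not part of the package):

    from inspect_ai.util._display import display_type

    def approval_text(message: str) -> str:
        # in "conversation" display mode the transcript already shows the
        # message, so the approval prompt omits it (mirrors the hunk above)
        return message.strip() if display_type() != "conversation" else ""
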
inspect_ai/dataset/_sources/csv.py CHANGED
@@ -1,4 +1,5 @@
  import csv
+ import os
  from io import TextIOWrapper
  from pathlib import Path
  from typing import Any
@@ -75,7 +76,7 @@ def csv_dataset(
  dataset = MemoryDataset(
  samples=data_to_samples(valid_data, data_to_sample, auto_id),
  name=name,
- location=csv_file,
+ location=os.path.abspath(csv_file),
  )

  # resolve relative file paths
inspect_ai/dataset/_sources/json.py CHANGED
@@ -1,4 +1,5 @@
  import json
+ import os
  from io import TextIOWrapper
  from pathlib import Path
  from typing import Any, cast
@@ -75,7 +76,7 @@ def json_dataset(
  dataset = MemoryDataset(
  samples=data_to_samples(dataset_reader(f), data_to_sample, auto_id),
  name=name,
- location=json_file,
+ location=os.path.abspath(json_file),
  )

  # resolve relative file paths
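
Both loaders now record the dataset location as an absolute path. A minimal sketch of the visible effect, assuming a local data.csv (the file name is illustrative):

    from inspect_ai.dataset import csv_dataset

    dataset = csv_dataset("data.csv")
    # previously recorded exactly as passed ("data.csv"); after this change
    # it is absolute, e.g. "/home/user/project/data.csv"
    print(dataset.location)
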
inspect_ai/dataset/_sources/util.py CHANGED
@@ -1,6 +1,6 @@
  from typing import Callable

- from inspect_ai._util.content import Content, ContentImage
+ from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentVideo
  from inspect_ai._util.file import filesystem
  from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
@@ -44,24 +44,28 @@ def resolve_sample_files(dataset: Dataset) -> None:
  for path in sample.files.keys():
  sample.files[path] = resolve_file(sample.files[path])

+ # check for setup script
+ if sample.setup is not None:
+ sample.setup = resolve_file(sample.setup)
+
  # check for image paths
  if not isinstance(sample.input, str):
- sample.input = messages_with_resolved_images(sample.input, resolve_file)
+ sample.input = messages_with_resolved_content(sample.input, resolve_file)


- def messages_with_resolved_images(
+ def messages_with_resolved_content(
  messages: list[ChatMessage], resolver: Callable[[str], str]
  ) -> list[ChatMessage]:
- return [message_with_resolved_image(message, resolver) for message in messages]
+ return [message_with_resolved_content(message, resolver) for message in messages]


- def message_with_resolved_image(
+ def message_with_resolved_content(
  message: ChatMessage, resolver: Callable[[str], str]
  ) -> ChatMessage:
  if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
  return ChatMessageUser(
  content=[
- chat_content_with_resolved_image(content, resolver)
+ chat_content_with_resolved_content(content, resolver)
  for content in message.content
  ],
  source=message.source,
@@ -70,7 +74,7 @@ def message_with_resolved_image(
  return message


- def chat_content_with_resolved_image(
+ def chat_content_with_resolved_content(
  content: Content, resolver: Callable[[str], str]
  ) -> Content:
  if isinstance(content, ContentImage):
@@ -78,5 +82,9 @@ def chat_content_with_resolved_image(
  image=resolver(content.image),
  detail=content.detail,
  )
+ elif isinstance(content, ContentAudio):
+ return ContentAudio(audio=resolver(content.audio), format=content.format)
+ elif isinstance(content, ContentVideo):
+ return ContentVideo(video=resolver(content.video), format=content.format)
  else:
  return content
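
Besides renaming the image-only helpers to cover every content type, the hunks above also resolve a sample's setup script relative to the dataset source, the same way files entries are resolved. A minimal sketch of a sample that relies on this (the input text and paths are illustrative):

    from inspect_ai.dataset import Sample

    sample = Sample(
        input="Run the benchmark and report the score.",
        files={"bench.py": "assets/bench.py"},  # resolved relative to the dataset source
        setup="assets/setup.sh",                # now resolved the same way
    )
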
inspect_ai/log/_condense.py CHANGED
@@ -6,7 +6,13 @@ from typing import (
  from pydantic import JsonValue

  from inspect_ai._util.constants import BASE_64_DATA_REMOVED
- from inspect_ai._util.content import Content, ContentImage, ContentText
+ from inspect_ai._util.content import (
+ Content,
+ ContentAudio,
+ ContentImage,
+ ContentText,
+ ContentVideo,
+ )
  from inspect_ai._util.hash import mm3_hash
  from inspect_ai._util.json import JsonChange
  from inspect_ai._util.url import is_data_uri
@@ -304,3 +310,7 @@ def walk_content(content: Content, content_fn: Callable[[str], str]) -> Content:
  return content.model_copy(update=dict(text=content_fn(content.text)))
  elif isinstance(content, ContentImage):
  return content.model_copy(update=dict(image=content_fn(content.image)))
+ elif isinstance(content, ContentAudio):
+ return content.model_copy(update=dict(audio=content_fn(content.audio)))
+ elif isinstance(content, ContentVideo):
+ return content.model_copy(update=dict(video=content_fn(content.video)))
inspect_ai/log/_log.py CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
  from inspect_ai._util.error import EvalError, exception_message
  from inspect_ai._util.logger import warn_once
  from inspect_ai.approval._policy import ApprovalPolicyConfig
+ from inspect_ai.dataset._dataset import MT, metadata_as
  from inspect_ai.model import (
  ChatMessage,
  GenerateConfig,
@@ -24,6 +25,8 @@ from inspect_ai.model import (
  )
  from inspect_ai.scorer import Score
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+ from inspect_ai.util._store import Store
+ from inspect_ai.util._store_model import SMT

  from ._transcript import Event

@@ -45,9 +48,6 @@ class EvalConfig(BaseModel):
  epochs_reducer: list[str] | None = Field(default=None)
  """Reducers for aggregating per-sample scores."""

- trace: bool | None = Field(default=None)
- """Trace message interactions with evaluated model to terminal."""
-
  approval: ApprovalPolicyConfig | None = Field(default=None)
  """Approval policy for tool use."""

@@ -158,9 +158,31 @@ class EvalSample(BaseModel):
  metadata: dict[str, Any]
  """Additional sample metadata."""

+ def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+ """Pydantic model interface to metadata.
+
+ Args:
+ metadata_cls: Pydantic model type
+
+ Returns:
+ BaseModel: Instance of metadata_cls bound to sample metadata.
+ """
+ return metadata_as(self.metadata, metadata_cls)
+
  store: dict[str, Any] = Field(default_factory=dict)
  """State at end of sample execution."""

+ def store_as(self, model_cls: Type[SMT]) -> SMT:
+ """Pydantic model interface to the store.
+
+ Args:
+ model_cls: Pydantic model type (must derive from StoreModel)
+
+ Returns:
+ StoreModel: Instance of model_cls bound to sample store data.
+ """
+ return model_cls(store=Store(self.store))
+
  events: list[Event] = Field(default_factory=list)
  """Events that occurred during sample execution."""

@@ -330,7 +352,7 @@ class EvalResults(BaseModel):
  """Scorer used to compute results (deprecated)."""
  warn_once(
  logger,
- "The 'scorer' field is deprecated. Use 'scorers' instead.",
+ "The 'scorer' field is deprecated. Use 'scores' instead.",
  )
  return self.scores[0] if self.scores else None

@@ -339,7 +361,7 @@ class EvalResults(BaseModel):
  """Metrics computed (deprecated)."""
  warn_once(
  logger,
- "The 'metrics' field is deprecated. Access metrics through 'scorers' instead.",
+ "The 'metrics' field is deprecated. Access metrics through 'scores' instead.",
  )
  return self.scores[0].metrics if self.scores else {}
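
The new EvalSample.metadata_as() and EvalSample.store_as() methods give typed, Pydantic-backed access to a sample's metadata and store when reading logs. A minimal sketch, assuming a log path and model classes of your own (the class definitions and path below are illustrative, and the metadata fields must match what the sample actually recorded):

    from pydantic import BaseModel

    from inspect_ai.log import read_eval_log
    from inspect_ai.util import StoreModel

    class TaskMetadata(BaseModel):
        difficulty: str = "unknown"

    class AgentState(StoreModel):
        steps: int = 0

    log = read_eval_log("logs/example.eval")
    if log.samples:
        sample = log.samples[0]
        metadata = sample.metadata_as(TaskMetadata)  # typed view of sample.metadata
        state = sample.store_as(AgentState)          # typed view of sample.store
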
inspect_ai/log/_recorders/eval.py CHANGED
@@ -13,7 +13,12 @@ from pydantic_core import to_json
  from typing_extensions import override

  from inspect_ai._util.constants import LOG_SCHEMA_VERSION
- from inspect_ai._util.content import ContentImage, ContentText
+ from inspect_ai._util.content import (
+ ContentAudio,
+ ContentImage,
+ ContentText,
+ ContentVideo,
+ )
  from inspect_ai._util.error import EvalError
  from inspect_ai._util.file import FileSystem, async_fileystem, dirname, file, filesystem
  from inspect_ai._util.json import jsonable_python
@@ -90,9 +95,11 @@ class EvalRecorder(FileRecorder):
  self.data: dict[str, ZipLogFile] = {}

  @override
- async def log_init(self, eval: EvalSpec, location: str | None = None) -> str:
+ async def log_init(
+ self, eval: EvalSpec, location: str | None = None, *, clean: bool = False
+ ) -> str:
  # if the file exists then read summaries
- if location is not None and self.fs.exists(location):
+ if not clean and location is not None and self.fs.exists(location):
  with file(location, "rb") as f:
  with ZipFile(f, "r") as zip:
  log_start = _read_start(zip)
@@ -229,7 +236,7 @@ class EvalRecorder(FileRecorder):
  async def write_log(cls, location: str, log: EvalLog) -> None:
  # write using the recorder (so we get all of the extra streams)
  recorder = EvalRecorder(dirname(location))
- await recorder.log_init(log.eval, location)
+ await recorder.log_init(log.eval, location, clean=True)
  await recorder.log_start(log.eval, log.plan)
  for sample in log.samples or []:
  await recorder.log_sample(log.eval, sample)
@@ -244,14 +251,20 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
  input: list[ChatMessage] = []
  for message in inputs:
  if not isinstance(message.content, str):
- filtered_content: list[ContentText | ContentImage] = []
+ filtered_content: list[
+ ContentText | ContentImage | ContentAudio | ContentVideo
+ ] = []
  for content in message.content:
- if content.type != "image":
+ if content.type == "text":
  filtered_content.append(content)
- if len(filtered_content) == 0:
- filtered_content.append(ContentText(text="(Image)"))
+ else:
+ filtered_content.append(
+ ContentText(text=f"({content.type.capitalize()})")
+ )
  message.content = filtered_content
  input.append(message)
+ else:
+ input.append(message)

  return input
  else:
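
The text_inputs() change above generalizes the old image-only filtering: any non-text content item is now replaced by a placeholder named after its type. A minimal sketch of just that placeholder expression in isolation (standalone, not the recorder code itself):

    def placeholder(content_type: str) -> str:
        # "image" -> "(Image)", "audio" -> "(Audio)", "video" -> "(Video)"
        return f"({content_type.capitalize()})"

    assert placeholder("audio") == "(Audio)"
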
inspect_ai/log/_samples.py CHANGED
@@ -29,7 +29,7 @@ class ActiveSample:
  sandboxes: dict[str, SandboxConnection],
  ) -> None:
  self.id = uuid()
- self.started = datetime.now().timestamp()
+ self.started: float | None = None
  self.completed: float | None = None
  self.task = task
  self.model = model
@@ -48,10 +48,15 @@

  @property
  def execution_time(self) -> float:
- completed = (
- self.completed if self.completed is not None else datetime.now().timestamp()
- )
- return completed - self.started
+ if self.started is not None:
+ completed = (
+ self.completed
+ if self.completed is not None
+ else datetime.now().timestamp()
+ )
+ return completed - self.started
+ else:
+ return 0

  def interrupt(self, action: Literal["score", "error"]) -> None:
  self._interrupt_action = action
inspect_ai/log/_transcript.py CHANGED
@@ -1,3 +1,4 @@
+ import asyncio
  import contextlib
  from contextvars import ContextVar
  from datetime import datetime
@@ -11,7 +12,7 @@ from typing import (
  Union,
  )

- from pydantic import BaseModel, Field, JsonValue, field_serializer
+ from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer

  from inspect_ai._util.constants import SAMPLE_SUBTASK
  from inspect_ai._util.error import EvalError
@@ -176,6 +177,32 @@ class ToolEvent(BaseEvent):
  self.events = events
  self.pending = None

+ # mechanism for operator to cancel the tool call
+
+ def set_task(self, task: asyncio.Task[Any]) -> None:
+ """Set the tool task (for possible cancellation)"""
+ self._task = task
+
+ def cancel(self) -> None:
+ """Cancel the tool task."""
+ if self._task:
+ self._cancelled = True
+ self._task.cancel()
+
+ @property
+ def cancelled(self) -> bool:
+ """Was the task cancelled?"""
+ return self._cancelled is True
+
+ _cancelled: bool | None = None
+ """Was this tool call cancelled?"""
+
+ _task: asyncio.Task[Any] | None = None
+ """Handle to task (used for cancellation)"""
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+ """Required so that we can include '_task' as a member."""
+

  class ApprovalEvent(BaseEvent):
  """Tool approval."""
inspect_ai/model/__init__.py CHANGED
@@ -1,6 +1,12 @@
  # ruff: noqa: F401 F403 F405

- from inspect_ai._util.content import Content, ContentImage, ContentText
+ from inspect_ai._util.content import (
+ Content,
+ ContentAudio,
+ ContentImage,
+ ContentText,
+ ContentVideo,
+ )
  from inspect_ai._util.deprecation import relocated_module_attribute

  from ._cache import (
@@ -42,8 +48,10 @@ __all__ = [
  "GenerateConfig",
  "GenerateConfigArgs",
  "CachePolicy",
- "ContentText",
+ "ContentAudio",
  "ContentImage",
+ "ContentText",
+ "ContentVideo",
  "Content",
  "ChatMessage",
  "ChatMessageSystem",