inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +4 -2
- inspect_ai/_cli/eval.py +2 -0
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +0 -2
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/rich/display.py +4 -4
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/samples.py +41 -5
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/run.py +141 -119
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +322 -226
- inspect_ai/_view/www/log-schema.json +221 -138
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +53 -35
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +27 -5
- inspect_ai/log/_recorders/eval.py +21 -8
- inspect_ai/log/_samples.py +10 -5
- inspect_ai/log/_transcript.py +28 -1
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +82 -17
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/{_trace.py → _conversation.py} +9 -8
- inspect_ai/model/_model.py +2 -2
- inspect_ai/model/_providers/anthropic.py +9 -7
- inspect_ai/model/_providers/azureai.py +6 -4
- inspect_ai/model/_providers/bedrock.py +6 -4
- inspect_ai/model/_providers/google.py +103 -14
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +6 -9
- inspect_ai/model/_providers/openai.py +34 -8
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_providers/vertex.py +17 -4
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/tool/__init__.py +9 -1
- inspect_ai/tool/_tool.py +9 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -13
- inspect_ai/util/_sandbox/docker/docker.py +20 -13
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/environment.py +13 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs
CHANGED
@@ -37,7 +37,7 @@ export const ToolEventView = ({ id, event, style, depth }) => {
       functionCall=${functionCall}
       input=${input}
       inputType=${inputType}
-      output=${event.result}
+      output=${event.error?.message || event.result}
       mode="compact"
       view=${event.view}
     />
inspect_ai/_view/www/src/types/log.d.ts
CHANGED
@@ -32,7 +32,6 @@ export type Limit = number | [unknown, unknown] | null;
 export type SampleId = string | number | (string | number)[] | null;
 export type Epochs = number | null;
 export type EpochsReducer = string[] | null;
-export type Trace = boolean | null;
 export type Name1 = string;
 export type Tools = string | string[];
 export type Approvers = ApproverPolicyConfig[];
@@ -112,35 +111,49 @@ export type Input =
   | ChatMessageAssistant
   | ChatMessageTool
 )[];
-export type Content =
+export type Content =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Type1 = "text";
 export type Text = string;
 export type Type2 = "image";
 export type Image = string;
 export type Detail = "auto" | "low" | "high";
+export type Type3 = "audio";
+export type Audio = string;
+export type Format = "wav" | "mp3";
+export type Type4 = "video";
+export type Video = string;
+export type Format1 = "mp4" | "mpeg" | "mov";
 export type Source = ("input" | "generate") | null;
 export type Role = "system";
-export type Content1 =
+export type Content1 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
 export type ToolCallId = string | null;
-export type Content2 =
+export type Content2 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source2 = ("input" | "generate") | null;
 export type Role2 = "assistant";
 export type ToolCalls = ToolCall[] | null;
 export type Id1 = string;
 export type Function = string;
-export type
+export type Type5 = "function";
 export type ParseError = string | null;
 export type Title = string | null;
-export type
+export type Format2 = "text" | "markdown";
 export type Content3 = string;
-export type Content4 =
+export type Content4 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source3 = ("input" | "generate") | null;
 export type Role3 = "tool";
 export type ToolCallId1 = string | null;
 export type Function1 = string | null;
-export type
+export type Type6 =
   | "parsing"
   | "timeout"
   | "unicode_decode"
@@ -218,7 +231,7 @@ export type JsonValue = unknown;
 export type Timestamp1 = string;
 export type Pending1 = boolean | null;
 export type Event1 = "sample_limit";
-export type
+export type Type7 = "message" | "time" | "token" | "operator";
 export type Message2 = string;
 export type Limit1 = number | null;
 export type Timestamp2 = string;
@@ -244,8 +257,8 @@ export type Input2 = (
 )[];
 export type Name5 = string;
 export type Description = string;
-export type
-export type
+export type Type8 = "object";
+export type Type9 =
   | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
   | null;
 export type Description1 = string | null;
@@ -265,7 +278,7 @@ export type Cache = ("read" | "write") | null;
 export type Timestamp5 = string;
 export type Pending5 = boolean | null;
 export type Event5 = "tool";
-export type
+export type Type10 = "function";
 export type Id3 = string;
 export type Function2 = string;
 export type Result =
@@ -274,7 +287,9 @@ export type Result =
   | boolean
   | ContentText
   | ContentImage
-  |
+  | ContentAudio
+  | ContentVideo
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Truncated = [unknown, unknown] | null;
 export type Timestamp6 = string;
 export type Pending6 = boolean | null;
@@ -324,13 +339,13 @@ export type Timestamp12 = string;
 export type Pending12 = boolean | null;
 export type Event12 = "step";
 export type Action = "begin" | "end";
-export type
+export type Type11 = string | null;
 export type Name8 = string;
 export type Timestamp13 = string;
 export type Pending13 = boolean | null;
 export type Event13 = "subtask";
 export type Name9 = string;
-export type
+export type Type12 = string | null;
 export type Events2 = (
   | SampleInitEvent
   | SampleLimitEvent
@@ -379,7 +394,7 @@ export type Events = (
   | StepEvent
   | SubtaskEvent
 )[];
-export type
+export type Type13 = "context" | "time" | "message" | "token" | "operator";
 export type Limit2 = number;
 export type Reductions = EvalSampleReductions[] | null;
 export type Scorer1 = string;
@@ -396,7 +411,7 @@ export type Answer1 = string | null;
 export type Explanation2 = string | null;
 export type Metadata8 = {} | null;
 export type SampleId1 = string | number | null;
-export type Samples2 =
+export type Samples2 = EvalSampleScore[];
 export type Location1 = string;

 export interface EvalLog {
@@ -448,7 +463,6 @@ export interface EvalConfig {
   sample_id: SampleId;
   epochs: Epochs;
   epochs_reducer: EpochsReducer;
-  trace: Trace;
   approval: ApprovalPolicyConfig | null;
   fail_on_error: FailOnError;
   message_limit: MessageLimit;
@@ -614,6 +628,16 @@ export interface ContentImage {
   image: Image;
   detail: Detail;
 }
+export interface ContentAudio {
+  type: Type3;
+  audio: Audio;
+  format: Format;
+}
+export interface ContentVideo {
+  type: Type4;
+  video: Video;
+  format: Format1;
+}
 export interface ChatMessageUser {
   content: Content1;
   source: Source1;
@@ -630,7 +654,7 @@ export interface ToolCall {
   id: Id1;
   function: Function;
   arguments: Arguments;
-  type:
+  type: Type5;
   parse_error: ParseError;
   view: ToolCallContent | null;
 }
@@ -640,7 +664,7 @@ export interface Arguments {}
  */
 export interface ToolCallContent {
   title: Title;
-  format:
+  format: Format2;
   content: Content3;
 }
 export interface ChatMessageTool {
@@ -652,7 +676,7 @@ export interface ChatMessageTool {
   error: ToolCallError | null;
 }
 export interface ToolCallError {
-  type:
+  type: Type6;
   message: Message1;
 }
 export interface ModelOutput {
@@ -735,7 +759,7 @@ export interface SampleLimitEvent {
   timestamp: Timestamp1;
   pending: Pending1;
   event: Event1;
-  type:
+  type: Type7;
   message: Message2;
   limit: Limit1;
 }
@@ -822,7 +846,7 @@ export interface ToolInfo {
  * Description of tool parameters object in JSON Schema format.
  */
 export interface ToolParams {
-  type:
+  type: Type8;
   properties: Properties;
   required: Required1;
   additionalProperties: Additionalproperties1;
@@ -834,7 +858,7 @@ export interface Properties {
  * Description of tool parameter in JSON Schema format.
  */
 export interface ToolParam {
-  type:
+  type: Type9;
   description: Description1;
   default: Default;
   enum: Enum;
@@ -897,7 +921,7 @@ export interface ToolEvent {
   timestamp: Timestamp5;
   pending: Pending5;
   event: Event5;
-  type:
+  type: Type10;
   id: Id3;
   function: Function2;
   arguments: Arguments1;
@@ -999,7 +1023,7 @@ export interface StepEvent {
   pending: Pending12;
   event: Event12;
   action: Action;
-  type:
+  type: Type11;
   name: Name8;
 }
 /**
@@ -1010,7 +1034,7 @@ export interface SubtaskEvent {
   pending: Pending13;
   event: Event13;
   name: Name9;
-  type:
+  type: Type12;
   input: Input4;
   result: Result1;
   events: Events2;
@@ -1026,7 +1050,7 @@ export interface Attachments {
   [k: string]: string;
 }
 export interface EvalSampleLimit {
-  type:
+  type: Type13;
   limit: Limit2;
 }
 export interface EvalSampleReductions {
@@ -1034,13 +1058,7 @@ export interface EvalSampleReductions {
   reducer: Reducer1;
   samples: Samples2;
 }
-
- * Score for a Sample
- *
- * Args:
- *   sample_id: (str | int | None) Unique id of a sample
- */
-export interface SampleScore {
+export interface EvalSampleScore {
   value: Value2;
   answer: Answer1;
   explanation: Explanation2;
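The schema changes above add audio and video content types alongside text and images. A minimal sketch (not part of the diff) of the corresponding Python classes, which this release exports from inspect_ai.model; the file names are illustrative:

from inspect_ai.model import ContentAudio, ContentVideo

# the serialized shape mirrors the new ContentAudio/ContentVideo schema
# entries above (type/audio/format and type/video/format)
audio = ContentAudio(audio="clip.wav", format="wav")
video = ContentVideo(video="demo.mp4", format="mp4")
print(audio.model_dump_json())  # e.g. {"type":"audio","audio":"clip.wav","format":"wav"}
print(video.model_dump_json())  # e.g. {"type":"video","video":"demo.mp4","format":"mp4"}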
inspect_ai/_view/www/src/workspace/WorkSpace.mjs
CHANGED
@@ -150,7 +150,7 @@ export const WorkSpace = ({

   // The samples tab
   // Currently only appears when the result is successful
-  if (
+  if (sampleMode !== "none") {
     resolvedTabs.samples = {
       id: kEvalWorkspaceTabId,
       scrollable: samples.length === 1,
inspect_ai/approval/_human/util.py
CHANGED
@@ -5,7 +5,7 @@ from rich.text import Text

 from inspect_ai._util.transcript import transcript_markdown
 from inspect_ai.tool._tool_call import ToolCallContent, ToolCallView
-from inspect_ai.util.
+from inspect_ai.util._display import display_type

 HUMAN_APPROVED = "Human operator approved tool call."
 HUMAN_REJECTED = "Human operator rejected the tool call."
@@ -18,7 +18,7 @@ def render_tool_approval(message: str, view: ToolCallView) -> list[RenderableTyp
     text_highlighter = ReprHighlighter()

     # ignore content if trace enabled
-    message = message.strip() if
+    message = message.strip() if display_type() != "conversation" else ""

     def add_view_content(view_content: ToolCallContent) -> None:
         if view_content.title:
inspect_ai/dataset/_sources/csv.py
CHANGED
@@ -1,4 +1,5 @@
 import csv
+import os
 from io import TextIOWrapper
 from pathlib import Path
 from typing import Any
@@ -75,7 +76,7 @@ def csv_dataset(
        dataset = MemoryDataset(
            samples=data_to_samples(valid_data, data_to_sample, auto_id),
            name=name,
-           location=csv_file,
+           location=os.path.abspath(csv_file),
        )

        # resolve relative file paths
inspect_ai/dataset/_sources/json.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 from io import TextIOWrapper
 from pathlib import Path
 from typing import Any, cast
@@ -75,7 +76,7 @@ def json_dataset(
        dataset = MemoryDataset(
            samples=data_to_samples(dataset_reader(f), data_to_sample, auto_id),
            name=name,
-           location=json_file,
+           location=os.path.abspath(json_file),
        )

        # resolve relative file paths
inspect_ai/dataset/_sources/util.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import Callable

-from inspect_ai._util.content import Content, ContentImage
+from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentVideo
 from inspect_ai._util.file import filesystem
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
@@ -44,24 +44,28 @@ def resolve_sample_files(dataset: Dataset) -> None:
            for path in sample.files.keys():
                sample.files[path] = resolve_file(sample.files[path])

+        # check for setup script
+        if sample.setup is not None:
+            sample.setup = resolve_file(sample.setup)
+
        # check for image paths
        if not isinstance(sample.input, str):
-           sample.input =
+           sample.input = messages_with_resolved_content(sample.input, resolve_file)


-def
+def messages_with_resolved_content(
     messages: list[ChatMessage], resolver: Callable[[str], str]
 ) -> list[ChatMessage]:
-    return [
+    return [message_with_resolved_content(message, resolver) for message in messages]


-def
+def message_with_resolved_content(
     message: ChatMessage, resolver: Callable[[str], str]
 ) -> ChatMessage:
     if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
         return ChatMessageUser(
             content=[
-
+                chat_content_with_resolved_content(content, resolver)
                 for content in message.content
             ],
             source=message.source,
@@ -70,7 +74,7 @@ def message_with_resolved_image(
         return message


-def
+def chat_content_with_resolved_content(
     content: Content, resolver: Callable[[str], str]
 ) -> Content:
     if isinstance(content, ContentImage):
@@ -78,5 +82,9 @@ def chat_content_with_resolved_image(
             image=resolver(content.image),
             detail=content.detail,
         )
+    elif isinstance(content, ContentAudio):
+        return ContentAudio(audio=resolver(content.audio), format=content.format)
+    elif isinstance(content, ContentVideo):
+        return ContentVideo(video=resolver(content.video), format=content.format)
     else:
         return content
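resolve_sample_files() now resolves setup scripts and the new audio/video content alongside images. A hedged sketch of a dataset sample that relies on this behavior; the file names are hypothetical, and relative paths are resolved against the dataset's location:

from inspect_ai.dataset import Sample
from inspect_ai.model import ChatMessageUser, ContentAudio, ContentText

sample = Sample(
    input=[
        ChatMessageUser(
            content=[
                ContentText(text="Transcribe the attached recording."),
                # resolved relative to the dataset location on load
                ContentAudio(audio="recordings/clip-01.mp3", format="mp3"),
            ]
        )
    ],
    target="hello world",
    # the setup script path is now resolved as well
    setup="setup.sh",
)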
inspect_ai/log/_condense.py
CHANGED
@@ -6,7 +6,13 @@ from typing import (
 from pydantic import JsonValue

 from inspect_ai._util.constants import BASE_64_DATA_REMOVED
-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.hash import mm3_hash
 from inspect_ai._util.json import JsonChange
 from inspect_ai._util.url import is_data_uri
@@ -304,3 +310,7 @@ def walk_content(content: Content, content_fn: Callable[[str], str]) -> Content:
         return content.model_copy(update=dict(text=content_fn(content.text)))
     elif isinstance(content, ContentImage):
         return content.model_copy(update=dict(image=content_fn(content.image)))
+    elif isinstance(content, ContentAudio):
+        return content.model_copy(update=dict(audio=content_fn(content.audio)))
+    elif isinstance(content, ContentVideo):
+        return content.model_copy(update=dict(video=content_fn(content.video)))
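A small illustration (a sketch, not from the diff) of the extended walk_content(), which now rewrites audio and video payloads in addition to text and images; the placeholder string is arbitrary:

from inspect_ai._util.content import ContentAudio
from inspect_ai.log._condense import walk_content

# replace data-URI payloads with a marker, as log condensing does
condensed = walk_content(
    ContentAudio(audio="data:audio/mp3;base64,AAAA", format="mp3"),
    lambda value: "<data removed>" if value.startswith("data:") else value,
)
print(condensed.audio)  # <data removed>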
inspect_ai/log/_log.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
+from inspect_ai.dataset._dataset import MT, metadata_as
 from inspect_ai.model import (
     ChatMessage,
     GenerateConfig,
@@ -24,6 +25,8 @@ from inspect_ai.model import (
 )
 from inspect_ai.scorer import Score
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+from inspect_ai.util._store import Store
+from inspect_ai.util._store_model import SMT

 from ._transcript import Event

@@ -45,9 +48,6 @@ class EvalConfig(BaseModel):
     epochs_reducer: list[str] | None = Field(default=None)
     """Reducers for aggregating per-sample scores."""

-    trace: bool | None = Field(default=None)
-    """Trace message interactions with evaluated model to terminal."""
-
     approval: ApprovalPolicyConfig | None = Field(default=None)
     """Approval policy for tool use."""

@@ -158,9 +158,31 @@ class EvalSample(BaseModel):
     metadata: dict[str, Any]
     """Additional sample metadata."""

+    def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+        """Pydantic model interface to metadata.
+
+        Args:
+            metadata_cls: Pydantic model type
+
+        Returns:
+            BaseModel: Instance of metadata_cls bound to sample metadata.
+        """
+        return metadata_as(self.metadata, metadata_cls)
+
     store: dict[str, Any] = Field(default_factory=dict)
     """State at end of sample execution."""

+    def store_as(self, model_cls: Type[SMT]) -> SMT:
+        """Pydantic model interface to the store.
+
+        Args:
+            model_cls: Pydantic model type (must derive from StoreModel)
+
+        Returns:
+            StoreModel: Instance of model_cls bound to sample store data.
+        """
+        return model_cls(store=Store(self.store))
+
     events: list[Event] = Field(default_factory=list)
     """Events that occurred during sample execution."""

@@ -330,7 +352,7 @@ class EvalResults(BaseModel):
         """Scorer used to compute results (deprecated)."""
         warn_once(
             logger,
-            "The 'scorer' field is deprecated. Use '
+            "The 'scorer' field is deprecated. Use 'scores' instead.",
         )
         return self.scores[0] if self.scores else None

@@ -339,7 +361,7 @@ class EvalResults(BaseModel):
         """Metrics computed (deprecated)."""
         warn_once(
             logger,
-            "The 'metrics' field is deprecated. Access metrics through '
+            "The 'metrics' field is deprecated. Access metrics through 'scores' instead.",
         )
         return self.scores[0].metrics if self.scores else {}
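EvalSample gains typed accessors for its metadata and store. A minimal usage sketch; the log path and both model classes are hypothetical:

from pydantic import BaseModel

from inspect_ai.log import read_eval_log
from inspect_ai.util import StoreModel


class TaskMetadata(BaseModel):
    category: str
    difficulty: str


class AgentState(StoreModel):
    attempts: int = 0


log = read_eval_log("logs/example.eval")
for sample in log.samples or []:
    metadata = sample.metadata_as(TaskMetadata)  # typed view of sample.metadata
    state = sample.store_as(AgentState)          # typed view of sample.store
    print(sample.id, metadata.category, state.attempts)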
inspect_ai/log/_recorders/eval.py
CHANGED
@@ -13,7 +13,12 @@ from pydantic_core import to_json
 from typing_extensions import override

 from inspect_ai._util.constants import LOG_SCHEMA_VERSION
-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.file import FileSystem, async_fileystem, dirname, file, filesystem
 from inspect_ai._util.json import jsonable_python
@@ -90,9 +95,11 @@ class EvalRecorder(FileRecorder):
         self.data: dict[str, ZipLogFile] = {}

     @override
-    async def log_init(
+    async def log_init(
+        self, eval: EvalSpec, location: str | None = None, *, clean: bool = False
+    ) -> str:
         # if the file exists then read summaries
-        if location is not None and self.fs.exists(location):
+        if not clean and location is not None and self.fs.exists(location):
             with file(location, "rb") as f:
                 with ZipFile(f, "r") as zip:
                     log_start = _read_start(zip)
@@ -229,7 +236,7 @@ class EvalRecorder(FileRecorder):
     async def write_log(cls, location: str, log: EvalLog) -> None:
         # write using the recorder (so we get all of the extra streams)
         recorder = EvalRecorder(dirname(location))
-        await recorder.log_init(log.eval, location)
+        await recorder.log_init(log.eval, location, clean=True)
         await recorder.log_start(log.eval, log.plan)
         for sample in log.samples or []:
             await recorder.log_sample(log.eval, sample)
@@ -244,14 +251,20 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
         input: list[ChatMessage] = []
         for message in inputs:
             if not isinstance(message.content, str):
-                filtered_content: list[
+                filtered_content: list[
+                    ContentText | ContentImage | ContentAudio | ContentVideo
+                ] = []
                 for content in message.content:
-                    if content.type
+                    if content.type == "text":
                         filtered_content.append(content)
-
-
+                    else:
+                        filtered_content.append(
+                            ContentText(text=f"({content.type.capitalize()})")
+                        )
                 message.content = filtered_content
                 input.append(message)
+            else:
+                input.append(message)

         return input
     else:
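With clean=True, EvalRecorder.write_log now rewrites the target file from scratch instead of reusing summaries from an existing file at that location. A brief sketch using the public log API (paths are hypothetical):

from inspect_ai.log import read_eval_log, write_eval_log

log = read_eval_log("logs/example.eval")
# rewrites the destination .eval file even if one already exists there
write_eval_log(log, "logs/example-copy.eval")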
inspect_ai/log/_samples.py
CHANGED
@@ -29,7 +29,7 @@ class ActiveSample:
         sandboxes: dict[str, SandboxConnection],
     ) -> None:
         self.id = uuid()
-        self.started =
+        self.started: float | None = None
         self.completed: float | None = None
         self.task = task
         self.model = model
@@ -48,10 +48,15 @@ class ActiveSample:

     @property
     def execution_time(self) -> float:
-
-
-
-
+        if self.started is not None:
+            completed = (
+                self.completed
+                if self.completed is not None
+                else datetime.now().timestamp()
+            )
+            return completed - self.started
+        else:
+            return 0

     def interrupt(self, action: Literal["score", "error"]) -> None:
         self._interrupt_action = action
inspect_ai/log/_transcript.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import contextlib
 from contextvars import ContextVar
 from datetime import datetime
@@ -11,7 +12,7 @@ from typing import (
     Union,
 )

-from pydantic import BaseModel, Field, JsonValue, field_serializer
+from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer

 from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai._util.error import EvalError
@@ -176,6 +177,32 @@ class ToolEvent(BaseEvent):
         self.events = events
         self.pending = None

+    # mechanism for operator to cancel the tool call
+
+    def set_task(self, task: asyncio.Task[Any]) -> None:
+        """Set the tool task (for possible cancellation)"""
+        self._task = task
+
+    def cancel(self) -> None:
+        """Cancel the tool task."""
+        if self._task:
+            self._cancelled = True
+            self._task.cancel()
+
+    @property
+    def cancelled(self) -> bool:
+        """Was the task cancelled?"""
+        return self._cancelled is True
+
+    _cancelled: bool | None = None
+    """Was this tool call cancelled?"""
+
+    _task: asyncio.Task[Any] | None = None
+    """Handle to task (used for cancellation)"""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    """Required so that we can include '_task' as a member."""
+

 class ApprovalEvent(BaseEvent):
     """Tool approval."""
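The new ToolEvent hooks let an operator cancel an in-flight tool call: the runner registers the asyncio task via set_task(), and cancel() both flags the event and cancels the task. A rough sketch of the wiring (the wrapper function is hypothetical):

import asyncio
from typing import Any, Coroutine

from inspect_ai.log._transcript import ToolEvent


async def run_cancellable_tool(event: ToolEvent, call: Coroutine[Any, Any, Any]) -> Any:
    # register the task on the event so event.cancel() can interrupt it
    task = asyncio.create_task(call)
    event.set_task(task)
    try:
        return await task
    except asyncio.CancelledError:
        if event.cancelled:
            # operator-initiated cancellation: record and continue
            return None
        raise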
inspect_ai/model/__init__.py
CHANGED
@@ -1,6 +1,12 @@
 # ruff: noqa: F401 F403 F405

-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.deprecation import relocated_module_attribute

 from ._cache import (
@@ -42,8 +48,10 @@ __all__ = [
     "GenerateConfig",
     "GenerateConfigArgs",
     "CachePolicy",
-    "
+    "ContentAudio",
     "ContentImage",
+    "ContentText",
+    "ContentVideo",
     "Content",
     "ChatMessage",
     "ChatMessageSystem",