inspect-ai 0.3.72__py3-none-any.whl → 0.3.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -3
- inspect_ai/_cli/sandbox.py +3 -3
- inspect_ai/_cli/score.py +6 -4
- inspect_ai/_cli/trace.py +53 -6
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +2 -1
- inspect_ai/_display/core/footer.py +6 -6
- inspect_ai/_display/plain/display.py +11 -6
- inspect_ai/_display/rich/display.py +23 -13
- inspect_ai/_display/textual/app.py +10 -9
- inspect_ai/_display/textual/display.py +2 -2
- inspect_ai/_display/textual/widgets/footer.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +14 -5
- inspect_ai/_eval/context.py +1 -2
- inspect_ai/_eval/eval.py +54 -41
- inspect_ai/_eval/loader.py +9 -2
- inspect_ai/_eval/run.py +148 -81
- inspect_ai/_eval/score.py +13 -8
- inspect_ai/_eval/task/images.py +31 -21
- inspect_ai/_eval/task/run.py +62 -59
- inspect_ai/_eval/task/rundir.py +16 -9
- inspect_ai/_eval/task/sandbox.py +7 -8
- inspect_ai/_eval/task/util.py +7 -0
- inspect_ai/_util/_async.py +118 -10
- inspect_ai/_util/constants.py +0 -2
- inspect_ai/_util/file.py +15 -29
- inspect_ai/_util/future.py +37 -0
- inspect_ai/_util/http.py +3 -99
- inspect_ai/_util/httpx.py +60 -0
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/json.py +5 -52
- inspect_ai/_util/logger.py +30 -86
- inspect_ai/_util/retry.py +10 -61
- inspect_ai/_util/trace.py +2 -2
- inspect_ai/_view/server.py +86 -3
- inspect_ai/_view/www/dist/assets/index.js +25837 -13269
- inspect_ai/_view/www/log-schema.json +253 -186
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +8 -3
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +2 -3
- inspect_ai/_view/www/src/types/log.d.ts +122 -94
- inspect_ai/approval/_human/manager.py +6 -10
- inspect_ai/approval/_human/panel.py +2 -2
- inspect_ai/dataset/_sources/util.py +7 -6
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +35 -61
- inspect_ai/log/_log.py +18 -1
- inspect_ai/log/_recorders/eval.py +14 -23
- inspect_ai/log/_recorders/json.py +3 -18
- inspect_ai/log/_samples.py +27 -2
- inspect_ai/log/_transcript.py +8 -8
- inspect_ai/model/__init__.py +2 -1
- inspect_ai/model/_call_tools.py +60 -40
- inspect_ai/model/_chat_message.py +3 -2
- inspect_ai/model/_generate_config.py +25 -0
- inspect_ai/model/_model.py +74 -36
- inspect_ai/model/_openai.py +9 -1
- inspect_ai/model/_providers/anthropic.py +24 -26
- inspect_ai/model/_providers/azureai.py +11 -9
- inspect_ai/model/_providers/bedrock.py +33 -24
- inspect_ai/model/_providers/cloudflare.py +8 -9
- inspect_ai/model/_providers/goodfire.py +7 -3
- inspect_ai/model/_providers/google.py +47 -13
- inspect_ai/model/_providers/groq.py +15 -15
- inspect_ai/model/_providers/hf.py +24 -17
- inspect_ai/model/_providers/mistral.py +36 -20
- inspect_ai/model/_providers/openai.py +30 -25
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +3 -4
- inspect_ai/model/_providers/util/__init__.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +6 -19
- inspect_ai/model/_providers/util/hooks.py +165 -0
- inspect_ai/model/_providers/vertex.py +20 -3
- inspect_ai/model/_providers/vllm.py +16 -19
- inspect_ai/scorer/_multi.py +5 -2
- inspect_ai/solver/_bridge/patch.py +31 -1
- inspect_ai/solver/_fork.py +5 -3
- inspect_ai/solver/_human_agent/agent.py +3 -2
- inspect_ai/tool/__init__.py +8 -2
- inspect_ai/tool/_tool_info.py +4 -90
- inspect_ai/tool/_tool_params.py +4 -34
- inspect_ai/tool/_tools/_web_search.py +30 -24
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_concurrency.py +5 -6
- inspect_ai/util/_display.py +6 -0
- inspect_ai/util/_json.py +170 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +13 -9
- inspect_ai/util/_sandbox/docker/docker.py +5 -0
- inspect_ai/util/_sandbox/environment.py +56 -9
- inspect_ai/util/_sandbox/service.py +12 -5
- inspect_ai/util/_subprocess.py +94 -113
- inspect_ai/util/_subtask.py +2 -4
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/METADATA +6 -2
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/RECORD +99 -99
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/WHEEL +1 -1
- inspect_ai/_util/timeouts.py +0 -160
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/model/_providers/util/tracker.py +0 -92
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/top_level.txt +0 -0
@@ -21,11 +21,7 @@ export type Location = string | null;
|
|
21
21
|
export type Samples = number | null;
|
22
22
|
export type SampleIds = (number | string)[] | null;
|
23
23
|
export type Shuffled = boolean | null;
|
24
|
-
|
25
|
-
* @minItems 1
|
26
|
-
* @maxItems 2
|
27
|
-
*/
|
28
|
-
export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown];
|
24
|
+
export type Type = string;
|
29
25
|
export type Model = string;
|
30
26
|
export type ModelBaseUrl = string | null;
|
31
27
|
export type Limit = number | [unknown, unknown] | null;
|
@@ -49,7 +45,7 @@ export type LogSamples = boolean | null;
|
|
49
45
|
export type LogImages = boolean | null;
|
50
46
|
export type LogBuffer = number | null;
|
51
47
|
export type ScoreDisplay = boolean | null;
|
52
|
-
export type
|
48
|
+
export type Type1 = "git";
|
53
49
|
export type Origin = string;
|
54
50
|
export type Commit = string;
|
55
51
|
export type Metadata = {} | null;
|
@@ -105,12 +101,26 @@ export type CachePrompt = "auto" | boolean | null;
|
|
105
101
|
export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
106
102
|
export type ReasoningTokens = number | null;
|
107
103
|
export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
|
104
|
+
export type Name5 = string;
|
105
|
+
export type Type2 =
|
106
|
+
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
107
|
+
| null;
|
108
|
+
export type Description = string | null;
|
109
|
+
export type Enum = unknown[] | null;
|
110
|
+
export type Properties = {
|
111
|
+
[k: string]: JSONSchema;
|
112
|
+
} | null;
|
113
|
+
export type Additionalproperties = JSONSchema | boolean | null;
|
114
|
+
export type Anyof = JSONSchema[] | null;
|
115
|
+
export type Required = string[] | null;
|
116
|
+
export type Description1 = string | null;
|
117
|
+
export type Strict = boolean | null;
|
108
118
|
export type TotalSamples = number;
|
109
119
|
export type CompletedSamples = number;
|
110
|
-
export type
|
120
|
+
export type Name6 = string;
|
111
121
|
export type Scorer = string;
|
112
122
|
export type Reducer = string | null;
|
113
|
-
export type
|
123
|
+
export type Name7 = string;
|
114
124
|
export type Value = number;
|
115
125
|
export type Metadata2 = {} | null;
|
116
126
|
export type Metadata3 = {} | null;
|
@@ -138,7 +148,7 @@ export type Input =
|
|
138
148
|
| ChatMessageAssistant
|
139
149
|
| ChatMessageTool
|
140
150
|
)[];
|
141
|
-
export type
|
151
|
+
export type Id1 = string;
|
142
152
|
export type Content =
|
143
153
|
| string
|
144
154
|
| (
|
@@ -148,23 +158,24 @@ export type Content =
|
|
148
158
|
| ContentAudio
|
149
159
|
| ContentVideo
|
150
160
|
)[];
|
151
|
-
export type
|
161
|
+
export type Type3 = "text";
|
152
162
|
export type Text = string;
|
153
|
-
export type
|
163
|
+
export type Type4 = "reasoning";
|
154
164
|
export type Reasoning = string;
|
155
165
|
export type Signature = string | null;
|
156
166
|
export type Redacted = boolean;
|
157
|
-
export type
|
167
|
+
export type Type5 = "image";
|
158
168
|
export type Image = string;
|
159
169
|
export type Detail = "auto" | "low" | "high";
|
160
|
-
export type
|
170
|
+
export type Type6 = "audio";
|
161
171
|
export type Audio = string;
|
162
172
|
export type Format = "wav" | "mp3";
|
163
|
-
export type
|
173
|
+
export type Type7 = "video";
|
164
174
|
export type Video = string;
|
165
175
|
export type Format1 = "mp4" | "mpeg" | "mov";
|
166
176
|
export type Source = ("input" | "generate") | null;
|
167
|
-
export type
|
177
|
+
export type Role = "system";
|
178
|
+
export type Id2 = string;
|
168
179
|
export type Content1 =
|
169
180
|
| string
|
170
181
|
| (
|
@@ -175,8 +186,9 @@ export type Content1 =
|
|
175
186
|
| ContentVideo
|
176
187
|
)[];
|
177
188
|
export type Source1 = ("input" | "generate") | null;
|
189
|
+
export type Role1 = "user";
|
178
190
|
export type ToolCallId = string[] | null;
|
179
|
-
export type
|
191
|
+
export type Id3 = string;
|
180
192
|
export type Content2 =
|
181
193
|
| string
|
182
194
|
| (
|
@@ -187,15 +199,16 @@ export type Content2 =
|
|
187
199
|
| ContentVideo
|
188
200
|
)[];
|
189
201
|
export type Source2 = ("input" | "generate") | null;
|
202
|
+
export type Role2 = "assistant";
|
190
203
|
export type ToolCalls = ToolCall[] | null;
|
191
|
-
export type
|
204
|
+
export type Id4 = string;
|
192
205
|
export type Function = string;
|
193
|
-
export type
|
206
|
+
export type Type8 = "function";
|
194
207
|
export type ParseError = string | null;
|
195
208
|
export type Title = string | null;
|
196
209
|
export type Format2 = "text" | "markdown";
|
197
210
|
export type Content3 = string;
|
198
|
-
export type
|
211
|
+
export type Id5 = string;
|
199
212
|
export type Content4 =
|
200
213
|
| string
|
201
214
|
| (
|
@@ -206,9 +219,10 @@ export type Content4 =
|
|
206
219
|
| ContentVideo
|
207
220
|
)[];
|
208
221
|
export type Source3 = ("input" | "generate") | null;
|
222
|
+
export type Role3 = "tool";
|
209
223
|
export type ToolCallId1 = string | null;
|
210
224
|
export type Function1 = string | null;
|
211
|
-
export type
|
225
|
+
export type Type9 =
|
212
226
|
| "parsing"
|
213
227
|
| "timeout"
|
214
228
|
| "unicode_decode"
|
@@ -277,7 +291,7 @@ export type Input1 =
|
|
277
291
|
)[];
|
278
292
|
export type Choices2 = string[] | null;
|
279
293
|
export type Target1 = string | string[];
|
280
|
-
export type
|
294
|
+
export type Id6 = number | string | null;
|
281
295
|
export type Metadata8 = {} | null;
|
282
296
|
export type Files1 = {
|
283
297
|
[k: string]: string;
|
@@ -288,7 +302,7 @@ export type Timestamp1 = string;
|
|
288
302
|
export type WorkingStart1 = number;
|
289
303
|
export type Pending1 = boolean | null;
|
290
304
|
export type Event1 = "sample_limit";
|
291
|
-
export type
|
305
|
+
export type Type10 =
|
292
306
|
| "message"
|
293
307
|
| "time"
|
294
308
|
| "working"
|
@@ -335,25 +349,14 @@ export type Input3 = (
|
|
335
349
|
| ChatMessageAssistant
|
336
350
|
| ChatMessageTool
|
337
351
|
)[];
|
338
|
-
export type
|
339
|
-
export type
|
340
|
-
export type
|
341
|
-
export type Type10 =
|
342
|
-
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
343
|
-
| null;
|
344
|
-
export type Description1 = string | null;
|
345
|
-
export type Enum = unknown[] | null;
|
346
|
-
export type Properties1 = {
|
347
|
-
[k: string]: ToolParam;
|
348
|
-
} | null;
|
349
|
-
export type Additionalproperties = ToolParam | boolean | null;
|
350
|
-
export type Anyof = ToolParam[] | null;
|
351
|
-
export type Required = string[] | null;
|
352
|
+
export type Name8 = string;
|
353
|
+
export type Description2 = string;
|
354
|
+
export type Type11 = "object";
|
352
355
|
export type Required1 = string[];
|
353
356
|
export type Additionalproperties1 = boolean;
|
354
357
|
export type Tools1 = ToolInfo[];
|
355
358
|
export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
|
356
|
-
export type
|
359
|
+
export type Name9 = string;
|
357
360
|
export type Error1 = string | null;
|
358
361
|
export type Cache = ("read" | "write") | null;
|
359
362
|
export type Time1 = number | null;
|
@@ -363,8 +366,8 @@ export type Timestamp6 = string;
|
|
363
366
|
export type WorkingStart6 = number;
|
364
367
|
export type Pending6 = boolean | null;
|
365
368
|
export type Event6 = "tool";
|
366
|
-
export type
|
367
|
-
export type
|
369
|
+
export type Type12 = "function";
|
370
|
+
export type Id7 = string;
|
368
371
|
export type Function2 = string;
|
369
372
|
export type Result1 =
|
370
373
|
| string
|
@@ -416,7 +419,7 @@ export type Timestamp11 = string;
|
|
416
419
|
export type WorkingStart11 = number;
|
417
420
|
export type Pending11 = boolean | null;
|
418
421
|
export type Event11 = "logger";
|
419
|
-
export type
|
422
|
+
export type Name10 = string | null;
|
420
423
|
export type Level =
|
421
424
|
| "debug"
|
422
425
|
| "trace"
|
@@ -441,14 +444,14 @@ export type WorkingStart13 = number;
|
|
441
444
|
export type Pending13 = boolean | null;
|
442
445
|
export type Event13 = "step";
|
443
446
|
export type Action1 = "begin" | "end";
|
444
|
-
export type
|
445
|
-
export type
|
447
|
+
export type Type13 = string | null;
|
448
|
+
export type Name11 = string;
|
446
449
|
export type Timestamp14 = string;
|
447
450
|
export type WorkingStart14 = number;
|
448
451
|
export type Pending14 = boolean | null;
|
449
452
|
export type Event14 = "subtask";
|
450
|
-
export type
|
451
|
-
export type
|
453
|
+
export type Name12 = string;
|
454
|
+
export type Type14 = string | null;
|
452
455
|
export type Events2 = (
|
453
456
|
| SampleInitEvent
|
454
457
|
| SampleLimitEvent
|
@@ -507,7 +510,7 @@ export type Events = (
|
|
507
510
|
export type TotalTime = number | null;
|
508
511
|
export type WorkingTime3 = number | null;
|
509
512
|
export type Uuid = string | null;
|
510
|
-
export type
|
513
|
+
export type Type15 =
|
511
514
|
| "context"
|
512
515
|
| "time"
|
513
516
|
| "working"
|
@@ -588,6 +591,16 @@ export interface EvalDataset {
|
|
588
591
|
sample_ids: SampleIds;
|
589
592
|
shuffled: Shuffled;
|
590
593
|
}
|
594
|
+
/**
|
595
|
+
* Specification of a SandboxEnvironment.
|
596
|
+
*/
|
597
|
+
export interface SandboxEnvironmentSpec {
|
598
|
+
type: Type;
|
599
|
+
config: Config;
|
600
|
+
}
|
601
|
+
export interface Config {
|
602
|
+
[k: string]: unknown;
|
603
|
+
}
|
591
604
|
export interface ModelArgs {}
|
592
605
|
/**
|
593
606
|
* Configuration used for evaluation.
|
@@ -642,7 +655,7 @@ export interface Params {}
|
|
642
655
|
* Git revision for evaluation.
|
643
656
|
*/
|
644
657
|
export interface EvalRevision {
|
645
|
-
type:
|
658
|
+
type: Type1;
|
646
659
|
origin: Origin;
|
647
660
|
commit: Commit;
|
648
661
|
}
|
@@ -704,6 +717,33 @@ export interface GenerateConfig {
|
|
704
717
|
reasoning_effort: ReasoningEffort;
|
705
718
|
reasoning_tokens: ReasoningTokens;
|
706
719
|
reasoning_history: ReasoningHistory;
|
720
|
+
response_schema: ResponseSchema | null;
|
721
|
+
}
|
722
|
+
/**
|
723
|
+
* Schema for model response when using Structured Output.
|
724
|
+
*/
|
725
|
+
export interface ResponseSchema {
|
726
|
+
name: Name5;
|
727
|
+
json_schema: JSONSchema;
|
728
|
+
description: Description1;
|
729
|
+
strict: Strict;
|
730
|
+
}
|
731
|
+
/**
|
732
|
+
* JSON Schema for type.
|
733
|
+
*/
|
734
|
+
export interface JSONSchema {
|
735
|
+
type: Type2;
|
736
|
+
description: Description;
|
737
|
+
default: Default;
|
738
|
+
enum: Enum;
|
739
|
+
items: JSONSchema | null;
|
740
|
+
properties: Properties;
|
741
|
+
additionalProperties: Additionalproperties;
|
742
|
+
anyOf: Anyof;
|
743
|
+
required: Required;
|
744
|
+
}
|
745
|
+
export interface Default {
|
746
|
+
[k: string]: unknown;
|
707
747
|
}
|
708
748
|
/**
|
709
749
|
* Scoring results from evaluation.
|
@@ -718,7 +758,7 @@ export interface EvalResults {
|
|
718
758
|
* Score for evaluation task.
|
719
759
|
*/
|
720
760
|
export interface EvalScore {
|
721
|
-
name:
|
761
|
+
name: Name6;
|
722
762
|
scorer: Scorer;
|
723
763
|
reducer: Reducer;
|
724
764
|
params: Params2;
|
@@ -733,7 +773,7 @@ export interface Metrics2 {
|
|
733
773
|
* Metric for evaluation score.
|
734
774
|
*/
|
735
775
|
export interface EvalMetric {
|
736
|
-
name:
|
776
|
+
name: Name7;
|
737
777
|
value: Value;
|
738
778
|
params: Params3;
|
739
779
|
metadata: Metadata2;
|
@@ -799,15 +839,16 @@ export interface EvalSample {
|
|
799
839
|
* System chat message.
|
800
840
|
*/
|
801
841
|
export interface ChatMessageSystem {
|
802
|
-
|
842
|
+
id: Id1;
|
803
843
|
content: Content;
|
804
844
|
source: Source;
|
845
|
+
role: Role;
|
805
846
|
}
|
806
847
|
/**
|
807
848
|
* Text content.
|
808
849
|
*/
|
809
850
|
export interface ContentText {
|
810
|
-
type:
|
851
|
+
type: Type3;
|
811
852
|
text: Text;
|
812
853
|
}
|
813
854
|
/**
|
@@ -816,7 +857,7 @@ export interface ContentText {
|
|
816
857
|
* See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
|
817
858
|
*/
|
818
859
|
export interface ContentReasoning {
|
819
|
-
type:
|
860
|
+
type: Type4;
|
820
861
|
reasoning: Reasoning;
|
821
862
|
signature: Signature;
|
822
863
|
redacted: Redacted;
|
@@ -825,7 +866,7 @@ export interface ContentReasoning {
|
|
825
866
|
* Image content.
|
826
867
|
*/
|
827
868
|
export interface ContentImage {
|
828
|
-
type:
|
869
|
+
type: Type5;
|
829
870
|
image: Image;
|
830
871
|
detail: Detail;
|
831
872
|
}
|
@@ -833,7 +874,7 @@ export interface ContentImage {
|
|
833
874
|
* Audio content.
|
834
875
|
*/
|
835
876
|
export interface ContentAudio {
|
836
|
-
type:
|
877
|
+
type: Type6;
|
837
878
|
audio: Audio;
|
838
879
|
format: Format;
|
839
880
|
}
|
@@ -841,7 +882,7 @@ export interface ContentAudio {
|
|
841
882
|
* Video content.
|
842
883
|
*/
|
843
884
|
export interface ContentVideo {
|
844
|
-
type:
|
885
|
+
type: Type7;
|
845
886
|
video: Video;
|
846
887
|
format: Format1;
|
847
888
|
}
|
@@ -849,25 +890,27 @@ export interface ContentVideo {
|
|
849
890
|
* User chat message.
|
850
891
|
*/
|
851
892
|
export interface ChatMessageUser {
|
852
|
-
|
893
|
+
id: Id2;
|
853
894
|
content: Content1;
|
854
895
|
source: Source1;
|
896
|
+
role: Role1;
|
855
897
|
tool_call_id: ToolCallId;
|
856
898
|
}
|
857
899
|
/**
|
858
900
|
* Assistant chat message.
|
859
901
|
*/
|
860
902
|
export interface ChatMessageAssistant {
|
861
|
-
|
903
|
+
id: Id3;
|
862
904
|
content: Content2;
|
863
905
|
source: Source2;
|
906
|
+
role: Role2;
|
864
907
|
tool_calls: ToolCalls;
|
865
908
|
}
|
866
909
|
export interface ToolCall {
|
867
|
-
id:
|
910
|
+
id: Id4;
|
868
911
|
function: Function;
|
869
912
|
arguments: Arguments;
|
870
|
-
type:
|
913
|
+
type: Type8;
|
871
914
|
parse_error: ParseError;
|
872
915
|
view: ToolCallContent | null;
|
873
916
|
}
|
@@ -884,15 +927,16 @@ export interface ToolCallContent {
|
|
884
927
|
* Tool chat message.
|
885
928
|
*/
|
886
929
|
export interface ChatMessageTool {
|
887
|
-
|
930
|
+
id: Id5;
|
888
931
|
content: Content4;
|
889
932
|
source: Source3;
|
933
|
+
role: Role3;
|
890
934
|
tool_call_id: ToolCallId1;
|
891
935
|
function: Function1;
|
892
936
|
error: ToolCallError | null;
|
893
937
|
}
|
894
938
|
export interface ToolCallError {
|
895
|
-
type:
|
939
|
+
type: Type9;
|
896
940
|
message: Message1;
|
897
941
|
}
|
898
942
|
/**
|
@@ -966,7 +1010,7 @@ export interface Sample {
|
|
966
1010
|
input: Input1;
|
967
1011
|
choices: Choices2;
|
968
1012
|
target: Target1;
|
969
|
-
id:
|
1013
|
+
id: Id6;
|
970
1014
|
metadata: Metadata8;
|
971
1015
|
sandbox: SandboxEnvironmentSpec | null;
|
972
1016
|
files: Files1;
|
@@ -980,7 +1024,7 @@ export interface SampleLimitEvent {
|
|
980
1024
|
working_start: WorkingStart1;
|
981
1025
|
pending: Pending1;
|
982
1026
|
event: Event1;
|
983
|
-
type:
|
1027
|
+
type: Type10;
|
984
1028
|
message: Message2;
|
985
1029
|
limit: Limit1;
|
986
1030
|
}
|
@@ -1082,41 +1126,24 @@ export interface ModelEvent {
|
|
1082
1126
|
* ```
|
1083
1127
|
*/
|
1084
1128
|
export interface ToolInfo {
|
1085
|
-
name:
|
1086
|
-
description:
|
1129
|
+
name: Name8;
|
1130
|
+
description: Description2;
|
1087
1131
|
parameters: ToolParams;
|
1088
1132
|
}
|
1089
1133
|
/**
|
1090
1134
|
* Description of tool parameters object in JSON Schema format.
|
1091
1135
|
*/
|
1092
1136
|
export interface ToolParams {
|
1093
|
-
type:
|
1094
|
-
properties:
|
1137
|
+
type: Type11;
|
1138
|
+
properties: Properties1;
|
1095
1139
|
required: Required1;
|
1096
1140
|
additionalProperties: Additionalproperties1;
|
1097
1141
|
}
|
1098
|
-
export interface
|
1099
|
-
[k: string]:
|
1100
|
-
}
|
1101
|
-
/**
|
1102
|
-
* Description of tool parameter in JSON Schema format.
|
1103
|
-
*/
|
1104
|
-
export interface ToolParam {
|
1105
|
-
type: Type10;
|
1106
|
-
description: Description1;
|
1107
|
-
default: Default;
|
1108
|
-
enum: Enum;
|
1109
|
-
items: ToolParam | null;
|
1110
|
-
properties: Properties1;
|
1111
|
-
additionalProperties: Additionalproperties;
|
1112
|
-
anyOf: Anyof;
|
1113
|
-
required: Required;
|
1114
|
-
}
|
1115
|
-
export interface Default {
|
1116
|
-
[k: string]: unknown;
|
1142
|
+
export interface Properties1 {
|
1143
|
+
[k: string]: JSONSchema;
|
1117
1144
|
}
|
1118
1145
|
export interface ToolFunction {
|
1119
|
-
name:
|
1146
|
+
name: Name9;
|
1120
1147
|
}
|
1121
1148
|
/**
|
1122
1149
|
* Model generation options.
|
@@ -1146,6 +1173,7 @@ export interface GenerateConfig1 {
|
|
1146
1173
|
reasoning_effort: ReasoningEffort;
|
1147
1174
|
reasoning_tokens: ReasoningTokens;
|
1148
1175
|
reasoning_history: ReasoningHistory;
|
1176
|
+
response_schema: ResponseSchema | null;
|
1149
1177
|
}
|
1150
1178
|
/**
|
1151
1179
|
* Model call (raw request/response data).
|
@@ -1169,8 +1197,8 @@ export interface ToolEvent {
|
|
1169
1197
|
working_start: WorkingStart6;
|
1170
1198
|
pending: Pending6;
|
1171
1199
|
event: Event6;
|
1172
|
-
type:
|
1173
|
-
id:
|
1200
|
+
type: Type12;
|
1201
|
+
id: Id7;
|
1174
1202
|
function: Function2;
|
1175
1203
|
arguments: Arguments1;
|
1176
1204
|
view: ToolCallContent | null;
|
@@ -1260,7 +1288,7 @@ export interface LoggerEvent {
|
|
1260
1288
|
* Message written to Python log.
|
1261
1289
|
*/
|
1262
1290
|
export interface LoggingMessage {
|
1263
|
-
name:
|
1291
|
+
name: Name10;
|
1264
1292
|
level: Level;
|
1265
1293
|
message: Message4;
|
1266
1294
|
created: Created1;
|
@@ -1288,8 +1316,8 @@ export interface StepEvent {
|
|
1288
1316
|
pending: Pending13;
|
1289
1317
|
event: Event13;
|
1290
1318
|
action: Action1;
|
1291
|
-
type:
|
1292
|
-
name:
|
1319
|
+
type: Type13;
|
1320
|
+
name: Name11;
|
1293
1321
|
}
|
1294
1322
|
/**
|
1295
1323
|
* Subtask spawned.
|
@@ -1299,8 +1327,8 @@ export interface SubtaskEvent {
|
|
1299
1327
|
working_start: WorkingStart14;
|
1300
1328
|
pending: Pending14;
|
1301
1329
|
event: Event14;
|
1302
|
-
name:
|
1303
|
-
type:
|
1330
|
+
name: Name12;
|
1331
|
+
type: Type14;
|
1304
1332
|
input: Input5;
|
1305
1333
|
result: Result2;
|
1306
1334
|
events: Events2;
|
@@ -1321,7 +1349,7 @@ export interface Attachments {
|
|
1321
1349
|
* Limit encontered by sample.
|
1322
1350
|
*/
|
1323
1351
|
export interface EvalSampleLimit {
|
1324
|
-
type:
|
1352
|
+
type: Type15;
|
1325
1353
|
limit: Limit2;
|
1326
1354
|
}
|
1327
1355
|
/**
|
@@ -1,9 +1,8 @@
|
|
1
|
-
import asyncio
|
2
1
|
import uuid
|
3
|
-
from asyncio import Future
|
4
2
|
from contextvars import ContextVar
|
5
|
-
from typing import Callable, Literal, NamedTuple
|
3
|
+
from typing import Callable, Literal, NamedTuple
|
6
4
|
|
5
|
+
from inspect_ai._util.future import Future
|
7
6
|
from inspect_ai.solver._task_state import TaskState
|
8
7
|
from inspect_ai.tool._tool_call import ToolCall, ToolCallView
|
9
8
|
|
@@ -37,7 +36,6 @@ class HumanApprovalManager:
|
|
37
36
|
from inspect_ai.log._samples import sample_active
|
38
37
|
|
39
38
|
id = str(uuid.uuid4())
|
40
|
-
future = cast(Future[Approval], asyncio.get_event_loop().create_future())
|
41
39
|
sample = sample_active()
|
42
40
|
assert sample
|
43
41
|
assert sample.sample.id is not None
|
@@ -48,7 +46,7 @@ class HumanApprovalManager:
|
|
48
46
|
id=sample.sample.id,
|
49
47
|
epoch=sample.epoch,
|
50
48
|
)
|
51
|
-
self._approval_requests[id] = (pending,
|
49
|
+
self._approval_requests[id] = (pending, Future[Approval]())
|
52
50
|
self._notify_change("add")
|
53
51
|
return id
|
54
52
|
|
@@ -58,7 +56,7 @@ class HumanApprovalManager:
|
|
58
56
|
|
59
57
|
async def wait_for_approval(self, id: str) -> Approval:
|
60
58
|
_, future = self._approval_requests[id]
|
61
|
-
return await future
|
59
|
+
return await future.result()
|
62
60
|
|
63
61
|
def on_change(
|
64
62
|
self, callback: Callable[[Literal["add", "remove"]], None]
|
@@ -77,16 +75,14 @@ class HumanApprovalManager:
|
|
77
75
|
def complete_approval(self, id: str, result: Approval) -> None:
|
78
76
|
if id in self._approval_requests:
|
79
77
|
_, future = self._approval_requests[id]
|
80
|
-
|
81
|
-
future.set_result(result)
|
78
|
+
future.set_result(result)
|
82
79
|
del self._approval_requests[id]
|
83
80
|
self._notify_change("remove")
|
84
81
|
|
85
82
|
def fail_approval(self, id: str, error: Exception) -> None:
|
86
83
|
if id in self._approval_requests:
|
87
84
|
_, future = self._approval_requests[id]
|
88
|
-
|
89
|
-
future.set_exception(error)
|
85
|
+
future.set_exception(error)
|
90
86
|
del self._approval_requests[id]
|
91
87
|
self._notify_change("remove")
|
92
88
|
|
@@ -1,6 +1,6 @@
|
|
1
|
-
from asyncio import CancelledError
|
2
1
|
from typing import Callable, Literal
|
3
2
|
|
3
|
+
import anyio
|
4
4
|
from rich.console import RenderableType
|
5
5
|
from rich.text import Text
|
6
6
|
from textual.app import ComposeResult
|
@@ -44,7 +44,7 @@ async def panel_approval(
|
|
44
44
|
)
|
45
45
|
try:
|
46
46
|
return await approvals.wait_for_approval(id)
|
47
|
-
except
|
47
|
+
except anyio.get_cancelled_exc_class():
|
48
48
|
approvals.withdraw_request(id)
|
49
49
|
raise
|
50
50
|
|
@@ -63,12 +63,13 @@ def message_with_resolved_content(
|
|
63
63
|
message: ChatMessage, resolver: Callable[[str], str]
|
64
64
|
) -> ChatMessage:
|
65
65
|
if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
|
66
|
-
return
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
66
|
+
return message.model_copy(
|
67
|
+
update=dict(
|
68
|
+
content=[
|
69
|
+
chat_content_with_resolved_content(content, resolver)
|
70
|
+
for content in message.content
|
71
|
+
],
|
72
|
+
)
|
72
73
|
)
|
73
74
|
else:
|
74
75
|
return message
|
inspect_ai/log/__init__.py
CHANGED
@@ -7,9 +7,11 @@ from ._file import (
|
|
7
7
|
EvalLogInfo,
|
8
8
|
list_eval_logs,
|
9
9
|
read_eval_log,
|
10
|
+
read_eval_log_async,
|
10
11
|
read_eval_log_sample,
|
11
12
|
read_eval_log_samples,
|
12
13
|
write_eval_log,
|
14
|
+
write_eval_log_async,
|
13
15
|
write_log_dir_manifest,
|
14
16
|
)
|
15
17
|
from ._log import (
|
@@ -93,11 +95,13 @@ __all__ = [
|
|
93
95
|
"convert_eval_logs",
|
94
96
|
"list_eval_logs",
|
95
97
|
"read_eval_log",
|
98
|
+
"read_eval_log_async",
|
96
99
|
"read_eval_log_sample",
|
97
100
|
"read_eval_log_samples",
|
98
101
|
"condense_sample",
|
99
102
|
"resolve_sample_attachments",
|
100
103
|
"write_eval_log",
|
104
|
+
"write_eval_log_async",
|
101
105
|
"write_log_dir_manifest",
|
102
106
|
"retryable_eval_logs",
|
103
107
|
"bundle_log_dir",
|