inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/logger.py +12 -12
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +43 -47
- inspect_ai/_util/transcript.py +4 -0
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29622 -24424
- inspect_ai/_view/www/log-schema.json +138 -90
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +1 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/Tools.mjs +11 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +26 -12
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +5 -2
- inspect_ai/model/_call_tools.py +4 -2
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
- inspect_ai/util/__init__.py +8 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +16 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +2 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +5 -6
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/types/log.d.ts
CHANGED
@@ -29,6 +29,7 @@ export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown];
 export type Model = string;
 export type ModelBaseUrl = string | null;
 export type Limit = number | [unknown, unknown] | null;
+export type SampleId = string | number | (string | number)[] | null;
 export type Epochs = number | null;
 export type EpochsReducer = string[] | null;
 export type Trace = boolean | null;
@@ -42,10 +43,12 @@ export type TimeLimit = number | null;
 export type MaxSamples = number | null;
 export type MaxTasks = number | null;
 export type MaxSubprocesses = number | null;
+export type MaxSandboxes = number | null;
 export type SandboxCleanup = boolean | null;
 export type LogSamples = boolean | null;
 export type LogImages = boolean | null;
 export type LogBuffer = number | null;
+export type ScoreDisplay = boolean | null;
 export type Type = "git";
 export type Origin = string;
 export type Commit = string;
@@ -76,6 +79,7 @@ export type TopLogprobs = number | null;
 export type ParallelToolCalls = boolean | null;
 export type MaxToolOutput = number | null;
 export type CachePrompt = "auto" | boolean | null;
+export type ReasoningEffort = ("low" | "medium" | "high") | null;
 export type TotalSamples = number;
 export type CompletedSamples = number;
 export type Name3 = string;
@@ -119,6 +123,7 @@ export type Role = "system";
 export type Content1 = string | (ContentText | ContentImage)[];
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
+export type ToolCallId = string | null;
 export type Content2 = string | (ContentText | ContentImage)[];
 export type Source2 = ("input" | "generate") | null;
 export type Role2 = "assistant";
@@ -133,7 +138,7 @@ export type Content3 = string;
 export type Content4 = string | (ContentText | ContentImage)[];
 export type Source3 = ("input" | "generate") | null;
 export type Role3 = "tool";
-export type
+export type ToolCallId1 = string | null;
 export type Function1 = string | null;
 export type Type4 =
   | "parsing"
@@ -241,14 +246,10 @@ export type Name5 = string;
 export type Description = string;
 export type Type6 = "object";
 export type Type7 =
-  | "string"
-  |
-  | "number"
-  | "boolean"
-  | "array"
-  | "object"
-  | "null";
+  | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
+  | null;
 export type Description1 = string | null;
+export type Enum = unknown[] | null;
 export type Properties1 = {
   [k: string]: ToolParam;
 } | null;
@@ -267,7 +268,13 @@ export type Event5 = "tool";
 export type Type8 = "function";
 export type Id3 = string;
 export type Function2 = string;
-export type Result =
+export type Result =
+  | string
+  | number
+  | boolean
+  | ContentText
+  | ContentImage
+  | (ContentText | ContentImage)[];
 export type Truncated = [unknown, unknown] | null;
 export type Timestamp6 = string;
 export type Pending6 = boolean | null;
@@ -388,7 +395,7 @@ export type Value2 =
 export type Answer1 = string | null;
 export type Explanation2 = string | null;
 export type Metadata8 = {} | null;
-export type
+export type SampleId1 = string | number | null;
 export type Samples2 = SampleScore[];
 export type Location1 = string;

@@ -438,6 +445,7 @@ export interface EvalDataset {
 export interface ModelArgs {}
 export interface EvalConfig {
   limit: Limit;
+  sample_id: SampleId;
   epochs: Epochs;
   epochs_reducer: EpochsReducer;
   trace: Trace;
@@ -449,10 +457,12 @@ export interface EvalConfig {
   max_samples: MaxSamples;
   max_tasks: MaxTasks;
   max_subprocesses: MaxSubprocesses;
+  max_sandboxes: MaxSandboxes;
   sandbox_cleanup: SandboxCleanup;
   log_samples: LogSamples;
   log_images: LogImages;
   log_buffer: LogBuffer;
+  score_display: ScoreDisplay;
 }
 export interface ApprovalPolicyConfig {
   approvers: Approvers;
@@ -523,6 +533,7 @@ export interface GenerateConfig {
   parallel_tool_calls: ParallelToolCalls;
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
+  reasoning_effort: ReasoningEffort;
 }
 export interface EvalResults {
   total_samples: TotalSamples;
@@ -607,6 +618,7 @@ export interface ChatMessageUser {
   content: Content1;
   source: Source1;
   role: Role1;
+  tool_call_id: ToolCallId;
 }
 export interface ChatMessageAssistant {
   content: Content2;
@@ -635,7 +647,7 @@ export interface ChatMessageTool {
   content: Content4;
   source: Source3;
   role: Role3;
-  tool_call_id:
+  tool_call_id: ToolCallId1;
   function: Function1;
   error: ToolCallError | null;
 }
@@ -825,6 +837,7 @@ export interface ToolParam {
   type: Type7;
   description: Description1;
   default: Default;
+  enum: Enum;
   items: ToolParam | null;
   properties: Properties1;
   additionalProperties: Additionalproperties;
@@ -862,6 +875,7 @@ export interface GenerateConfig1 {
   parallel_tool_calls: ParallelToolCalls;
   max_tool_output: MaxToolOutput;
   cache_prompt: CachePrompt;
+  reasoning_effort: ReasoningEffort;
 }
 /**
  * Model call (raw request/response data).
@@ -1031,5 +1045,5 @@ export interface SampleScore {
   answer: Answer1;
   explanation: Explanation2;
   metadata: Metadata8;
-  sample_id:
+  sample_id: SampleId1;
 }
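The log-schema additions above (SampleId, MaxSandboxes, ScoreDisplay, ReasoningEffort, the user-message ToolCallId, the tool-param Enum, and the widened tool Result) track new options introduced in this release. A minimal usage sketch follows; it assumes the eval-level fields map onto same-named keyword arguments of eval() and that reasoning_effort is passed through to GenerateConfig (the Python-side argument names are not shown in this diff and are an assumption):

# Hypothetical sketch exercising the new config fields reflected in log.d.ts,
# assuming eval() grew matching keyword arguments in 0.3.56.
from inspect_ai import Task, eval
from inspect_ai.dataset import Sample

task = Task(dataset=[Sample(input="What is 2 + 2?", target="4", id="arith-1")])

eval(
    task,
    model="openai/gpt-4o",
    sample_id="arith-1",        # EvalConfig.sample_id: run selected sample ids only
    max_sandboxes=4,            # EvalConfig.max_sandboxes: cap concurrent sandboxes
    score_display=True,         # EvalConfig.score_display: toggle score display
    reasoning_effort="medium",  # GenerateConfig.reasoning_effort ("low" | "medium" | "high")
)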
inspect_ai/_view/www/yarn.lock
CHANGED
@@ -131,6 +131,13 @@
   dependencies:
     "@babel/types" "^7.25.2"

+"@babel/runtime@^7.21.0":
+  version "7.26.0"
+  resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.0.tgz#8600c2f595f277c60815256418b85356a65173c1"
+  integrity sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==
+  dependencies:
+    regenerator-runtime "^0.14.0"
+
 "@babel/template@^7.25.0":
   version "7.25.0"
   resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.25.0.tgz#e733dc3134b4fede528c15bc95e89cb98c52592a"
@@ -525,6 +532,14 @@ argparse@^2.0.1:
   resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
   integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==

+asciinema-player@^3.8.1:
+  version "3.8.1"
+  resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.8.1.tgz#d56ccc04a85570559900b2297cf44c2a7453d118"
+  integrity sha512-NkpbFg81Y6iJFpDRndakLCQ0G26XSpvuT3vJTFjMRgHb26lqHgRNY9gun54e5MehZ4fEDNYkMZv+z6MfZ8c2aA==
+  dependencies:
+    "@babel/runtime" "^7.21.0"
+    solid-js "^1.3.0"
+
 babel-plugin-prismjs@^2.1.0:
   version "2.1.0"
   resolved "https://registry.yarnpkg.com/babel-plugin-prismjs/-/babel-plugin-prismjs-2.1.0.tgz#ade627896106326ad04d6d77fba92877618de571"
@@ -647,6 +662,11 @@ cross-spawn@^7.0.2:
     shebang-command "^2.0.0"
     which "^2.0.1"

+csstype@^3.1.0:
+  version "3.1.3"
+  resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81"
+  integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==
+
 cuint@^0.2.2:
   version "0.2.2"
   resolved "https://registry.yarnpkg.com/cuint/-/cuint-0.2.2.tgz#408086d409550c2631155619e9fa7bcadc3b991b"
@@ -1242,6 +1262,11 @@ queue-microtask@^1.2.2:
   resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
   integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==

+regenerator-runtime@^0.14.0:
+  version "0.14.1"
+  resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
+  integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==
+
 resolve-from@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-4.0.0.tgz#4abcd852ad32dd7baabfe9b40e00a36db5f392e6"
@@ -1294,6 +1319,16 @@ semver@^6.0.0, semver@^6.3.1:
   resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4"
   integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==

+seroval-plugins@^1.1.0:
+  version "1.1.1"
+  resolved "https://registry.yarnpkg.com/seroval-plugins/-/seroval-plugins-1.1.1.tgz#1e0c175e13bb4c620d4ce5916fbbb63de70c31f9"
+  integrity sha512-qNSy1+nUj7hsCOon7AO4wdAIo9P0jrzAMp18XhiOzA6/uO5TKtP7ScozVJ8T293oRIvi5wyCHSM4TrJo/c/GJA==
+
+seroval@^1.1.0:
+  version "1.1.1"
+  resolved "https://registry.yarnpkg.com/seroval/-/seroval-1.1.1.tgz#7630e0c17a3efa6be43f17ad6bcf9f966a61b443"
+  integrity sha512-rqEO6FZk8mv7Hyv4UCj3FD3b6Waqft605TLfsCe/BiaylRpyyMC0b+uA5TJKawX3KzMrdi3wsLbCaLplrQmBvQ==
+
 shebang-command@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea"
@@ -1306,6 +1341,15 @@ shebang-regex@^3.0.0:
   resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172"
   integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==

+solid-js@^1.3.0:
+  version "1.9.3"
+  resolved "https://registry.yarnpkg.com/solid-js/-/solid-js-1.9.3.tgz#078f026fe32f6b9b48e8e0557be150f0c2d610a9"
+  integrity sha512-5ba3taPoZGt9GY3YlsCB24kCg0Lv/rie/HTD4kG6h4daZZz7+yK02xn8Vx8dLYBc9i6Ps5JwAbEiqjmKaLB3Ag==
+  dependencies:
+    csstype "^3.1.0"
+    seroval "^1.1.0"
+    seroval-plugins "^1.1.0"
+
 source-map-js@^1.2.0:
   version "1.2.0"
   resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.2.0.tgz#16b809c162517b5b8c3e7dcd315a2a5c2612b2af"
inspect_ai/approval/_apply.py
CHANGED
@@ -75,4 +75,8 @@ def init_tool_approval(approval: list[ApprovalPolicy] | None) -> None:
         _tool_approver.set(None)


+def have_tool_approval() -> bool:
+    return _tool_approver.get(None) is not None
+
+
 _tool_approver: ContextVar[Approver | None] = ContextVar("tool_approver", default=None)
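A small sketch of how the new have_tool_approval() helper can be used to report whether an approver is registered before routing a tool call through approval; note that _apply is a private module, so the import path is an assumption:

# Sketch only: inspect_ai.approval._apply is private, so this import is an assumption.
from inspect_ai.approval._apply import have_tool_approval

def approval_banner() -> str:
    # have_tool_approval() reports whether an approver is set in the current context
    return "tool approval active" if have_tool_approval() else "no tool approver registered"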
inspect_ai/approval/_human/panel.py
CHANGED
@@ -24,8 +24,6 @@ from .util import (
     render_tool_approval,
 )

-PANEL_TITLE = "Approvals"
-

 async def panel_approval(
     message: str,
@@ -35,7 +33,7 @@ async def panel_approval(
     choices: list[ApprovalDecision],
 ) -> Approval:
     # ensure the approvals panel is shown
-    await input_panel(
+    await input_panel(ApprovalInputPanel)

     # submit to human approval manager (will be picked up by panel)
     approvals = human_approval_manager()
@@ -52,11 +50,10 @@


 class ApprovalInputPanel(InputPanel):
+    DEFAULT_TITLE = "Approval"
+
     DEFAULT_CSS = """
     ApprovalInputPanel {
-        width: 1fr;
-        height: 1fr;
-        padding: 0 1 1 1;
         layout: grid;
         grid-size: 1 3;
         grid-rows: auto 1fr auto;
@@ -88,7 +85,7 @@ class ApprovalInputPanel(InputPanel):
         self._approvals = human_approval_manager().approval_requests()
         if len(self._approvals) > 0:
             approval_id, approval_request = self._approvals[0]
-            self.title = f"{
+            self.title = f"{self.DEFAULT_TITLE} ({len(self._approvals):,})"
             heading.request = approval_request
             content.approval = approval_request.request
             actions.approval_request = approval_id, approval_request
@@ -97,7 +94,7 @@ class ApprovalInputPanel(InputPanel):
             actions.activate()
             self.visible = True
         else:
-            self.title =
+            self.title = self.DEFAULT_TITLE
             heading.request = None
             content.approval = None
             actions.approval_request = None
inspect_ai/dataset/_dataset.py
CHANGED
@@ -1,16 +1,19 @@
 import abc
 import random
+from dataclasses import dataclass, field
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Iterator,
     Sequence,
+    Type,
+    TypeVar,
     Union,
     overload,
 )

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 from typing_extensions import override

 from inspect_ai.model import ChatMessage
@@ -20,6 +23,8 @@ from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
 if TYPE_CHECKING:
     from _typeshed import SupportsRichComparison

+MT = TypeVar("MT", bound=BaseModel)
+

 class Sample(BaseModel):
     def __init__(
@@ -76,6 +81,20 @@ class Sample(BaseModel):
     metadata: dict[str, Any] | None = Field(default=None)
     """Arbitrary metadata associated with the sample."""

+    def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+        """Metadata as a Pydantic model.
+
+        Args:
+          metadata_cls: BaseModel derived class.
+
+        Returns:
+          BaseModel: Instance of metadata_cls.
+        """
+        if self.metadata is None:
+            raise ValueError("Sample does not have metadata")
+
+        return metadata_as(self.metadata, metadata_cls)
+
     sandbox: SandboxEnvironmentSpec | None = Field(default=None)
     """Sandbox environment type and optional config file."""

@@ -177,7 +196,8 @@ class Dataset(Sequence[Sample], abc.ABC):
     """


-class FieldSpec(BaseModel):
+@dataclass
+class FieldSpec:
     r"""Specification for mapping data source fields to sample fields.

     Args:
@@ -191,28 +211,28 @@ class FieldSpec(BaseModel):
         setup (str): Optional. Setup script to run for sample .
     """

-    input: str =
+    input: str = field(default="input")
     """Name of the field containing the sample input."""

-    target: str =
+    target: str = field(default="target")
     """Name of the field containing the sample target."""

-    choices: str =
+    choices: str = field(default="choices")
     """Name of field containing the list of answer choices."""

-    id: str =
+    id: str = field(default="id")
     """ Unique identifier for the sample."""

-    metadata: list[str] | None =
+    metadata: list[str] | Type[BaseModel] | None = field(default=None)
     """List of additional field names that should be read as metadata."""

-    sandbox: str =
+    sandbox: str = field(default="sandbox")
     """Sandbox type along with optional config file."""

-    files: str =
+    files: str = field(default="files")
     """Files that go along wtih the sample."""

-    setup: str =
+    setup: str = field(default="setup")
     """Setup script to run for sample (run within default SandboxEnvironment)."""


@@ -313,3 +333,24 @@ class MemoryDataset(Dataset):
             samples=[sample for sample in self if predicate(sample)],
             shuffled=self.shuffled,
         )
+
+
+def metadata_as(metadata: dict[str, Any], metadata_cls: Type[MT]) -> MT:
+    # validate that metadata_cls is frozen
+    if not metadata_cls.model_config.get("frozen", False):
+        raise ValueError(
+            f"Metadata model {metadata_cls.__name__} must have frozen=True"
+        )
+
+    # filter to only fields in the model
+    model_fields = {
+        k: v
+        for k, v in metadata.items()
+        if k in metadata_cls.__pydantic_fields__.keys()
+    }
+
+    # parse and return model instance
+    try:
+        return metadata_cls(**model_fields)
+    except ValidationError as ex:
+        raise ValueError(f"Could not parse metadata into {metadata_cls.__name__}: {ex}")
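Together, these _dataset.py changes add typed sample metadata: FieldSpec.metadata may now be a frozen Pydantic model rather than a list of field names, and Sample.metadata_as() parses a sample's metadata dict into that model. A minimal sketch of the intended usage; the PuzzleMeta model and its fields are invented for illustration, while the calls are those shown in the diff above:

# Illustrative sketch of the new typed-metadata support; PuzzleMeta and its
# fields are hypothetical, the API calls are those shown in the diff.
from pydantic import BaseModel

from inspect_ai.dataset import FieldSpec, Sample


class PuzzleMeta(BaseModel, frozen=True):  # frozen=True is required (see metadata_as)
    difficulty: str
    max_moves: int


# declare the metadata model when mapping records to samples ...
spec = FieldSpec(input="question", target="answer", metadata=PuzzleMeta)

# ... and read it back as a typed, immutable object on a Sample
sample = Sample(
    input="Solve the puzzle.",
    target="e4",
    metadata={"difficulty": "hard", "max_moves": 12},
)
meta = sample.metadata_as(PuzzleMeta)
print(meta.difficulty, meta.max_moves)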
inspect_ai/dataset/_util.py
CHANGED
@@ -1,6 +1,8 @@
 import json
 from typing import Any, Iterable, cast

+from pydantic import ValidationError
+
 from inspect_ai.model import (
     ChatMessage,
     ChatMessageAssistant,
@@ -33,9 +35,35 @@ def record_to_sample_fn(
         # collect metadata if specified
         metadata: dict[str, Any] | None = None
         if sample_fields.metadata:
-            metadata
-
-
+            if isinstance(sample_fields.metadata, list):
+                metadata = {}
+                for name in sample_fields.metadata:
+                    metadata[name] = record.get(name)
+            else:
+                # must be frozen
+                if not sample_fields.metadata.model_config.get("frozen", False):
+                    raise ValueError(
+                        f"Metadata model {sample_fields.metadata.__name__} must have frozen=True"
+                    )
+
+                # filter to only fields in the model
+                model_fields = record.get("metadata", None)
+                if isinstance(model_fields, str):
+                    model_fields = json.loads(model_fields)
+                elif model_fields is None:
+                    model_fields = {
+                        k: v
+                        for k, v in record.items()
+                        if k in sample_fields.metadata.__pydantic_fields__.keys()
+                    }
+
+                # parse and return metadata
+                try:
+                    metadata = sample_fields.metadata(**model_fields).model_dump()
+                except ValidationError as ex:
+                    raise ValueError(
+                        f"Could not parse metadata into {sample_fields.metadata.__name__}: {ex}"
+                    )
         elif "metadata" in record:
             metadata_field = record.get("metadata")
             if isinstance(metadata_field, str):
inspect_ai/log/__init__.py
CHANGED
@@ -23,6 +23,7 @@ from ._log import (
     EvalRevision,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleScore,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -60,6 +61,7 @@ __all__ = [
     "EvalResults",
     "EvalRevision",
     "EvalSample",
+    "EvalSampleScore",
     "EvalSampleReductions",
     "EvalScore",
     "EvalSpec",
inspect_ai/log/_log.py
CHANGED
@@ -23,7 +23,6 @@ from inspect_ai.model import (
     ModelUsage,
 )
 from inspect_ai.scorer import Score
-from inspect_ai.scorer._metric import SampleScore
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec

 from ._transcript import Event
@@ -301,6 +300,10 @@ class EvalScore(BaseModel):
     """Additional scorer metadata."""


+class EvalSampleScore(Score):
+    sample_id: str | int | None = Field(default=None)
+
+
 class EvalSampleReductions(BaseModel):
     scorer: str
     """Name the of scorer"""
@@ -308,7 +311,7 @@ class EvalSampleReductions(BaseModel):
     reducer: str | None = Field(default=None)
     """Name the of reducer"""

-    samples: list[
+    samples: list[EvalSampleScore]
     """List of reduced scores"""

inspect_ai/model/_call_tools.py
CHANGED
@@ -118,10 +118,12 @@ async def call_tools(
         # massage result, leave list[Content] alone, convert all other
         # types to string as that is what the model APIs accept
         truncated: tuple[int, int] | None = None
-        if isinstance(result,
+        if isinstance(result, ContentText | ContentImage):
+            content: str | list[Content] = [result]
+        elif isinstance(result, list) and (
             isinstance(result[0], ContentText | ContentImage)
         ):
-            content
+            content = result
         else:
             content = str(result)

inspect_ai/model/_chat_message.py
CHANGED
@@ -74,6 +74,9 @@ class ChatMessageUser(ChatMessageBase):
     role: Literal["user"] = Field(default="user")
     """Conversation role."""

+    tool_call_id: str | None = Field(default=None)
+    """ID of tool call this message has the content payload for."""
+

 class ChatMessageAssistant(ChatMessageBase):
     role: Literal["assistant"] = Field(default="assistant")
inspect_ai/model/_model.py
CHANGED
@@ -19,7 +19,7 @@ from tenacity import (
 )

 from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
-from inspect_ai._util.content import ContentText
+from inspect_ai._util.content import Content, ContentImage, ContentText
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import (
@@ -40,6 +40,7 @@ from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
     ChatMessageSystem,
+    ChatMessageTool,
     ChatMessageUser,
 )
 from ._generate_config import (
@@ -163,6 +164,10 @@ class ModelAPI(abc.ABC):
         """Any tool use in a message stream means that tools must be passed."""
         return False

+    def tool_result_images(self) -> bool:
+        """Tool results can containe images"""
+        return False
+

 class Model:
     """Model interface."""
@@ -291,6 +296,11 @@ class Model:
             tools = []
             tool_choice = "none"

+        # break tool image content out into user messages if the model doesn't
+        # support tools returning images
+        if not self.api.tool_result_images():
+            input = tool_result_images_as_user_message(input)
+
         # optionally collapse *consecutive* messages into one -
         # (some apis e.g. anthropic require this)
         if self.api.collapse_user_messages():
@@ -693,6 +703,37 @@ def simple_input_messages(
     return messages


+def tool_result_images_as_user_message(
+    messages: list[ChatMessage],
+) -> list[ChatMessage]:
+    return functools.reduce(tool_result_images_reducer, messages, [])
+
+
+def tool_result_images_reducer(
+    messages: list[ChatMessage],
+    message: ChatMessage,
+) -> list[ChatMessage]:
+    # append the message
+    messages.append(message)
+
+    # if there are tool result images, pull them out into a ChatUserMessage
+    if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
+        user_content: list[Content] = []
+        for i in range(0, len(message.content)):
+            if isinstance(message.content[i], ContentImage):
+                user_content.append(message.content[i])
+                message.content[i] = ContentText(
+                    text="Image content is in the message below."
+                )
+        if len(user_content) > 0:
+            messages.append(
+                ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
+            )
+
+    # return messages
+    return messages
+
+
 # Functions to reduce consecutive user messages to a single user message -> required for some models
 def collapse_consecutive_user_messages(
     messages: list[ChatMessage],

inspect_ai/model/_providers/anthropic.py
CHANGED
@@ -229,6 +229,10 @@ class AnthropicAPI(ModelAPI):
     def tools_required(self) -> bool:
         return True

+    @override
+    def tool_result_images(self) -> bool:
+        return True
+
     # convert some common BadRequestError states into 'refusal' model output
     def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | None:
         error = exception_message(ex).lower()
inspect_ai/model/_render.py
CHANGED
@@ -3,13 +3,20 @@ from rich.console import RenderableType
 from inspect_ai.tool._tool_call import ToolCall
 from inspect_ai.tool._tool_transcript import transcript_tool_call

-from ._chat_message import
+from ._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageTool,
+    ChatMessageUser,
+)


 def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessage]:
     preceding: list[ChatMessage] = []
     for m in reversed(messages):
-        if not isinstance(m, ChatMessageTool | ChatMessageAssistant)
+        if not isinstance(m, ChatMessageTool | ChatMessageAssistant) and not (
+            isinstance(m, ChatMessageUser) and m.tool_call_id
+        ):
             preceding.append(m)
         else:
             break
inspect_ai/scorer/_metric.py
CHANGED
@@ -90,6 +90,13 @@ class Score(BaseModel):
         """Read the score as a boolean."""
         return bool(self._as_scalar())

+    def as_list(self) -> list[str | int | float | bool]:
+        """Read the score as a list."""
+        if isinstance(self.value, list):
+            return self.value
+        else:
+            raise ValueError("This score is not a list")
+
     def as_dict(self) -> dict[str, str | int | float | bool | None]:
         """Read the score as a dictionary."""
         if isinstance(self.value, dict):
@@ -104,13 +111,17 @@ class Score(BaseModel):
             raise ValueError("This score is not a scalar")


-class SampleScore(
+class SampleScore(BaseModel):
     """Score for a Sample

     Args:
+        score: Score
         sample_id: (str | int | None) Unique id of a sample
     """

+    score: Score
+    """A score"""
+
     sample_id: str | int | None = Field(default=None)
     """A sample id"""

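SampleScore is now a standalone BaseModel that pairs a Score with the id of the sample it was computed for (mirrored by EvalSampleScore in _log.py above). A minimal construction sketch, assuming SampleScore remains importable from the private inspect_ai.scorer._metric module shown in this diff:

# Sketch of the reshaped SampleScore; the private import path is an assumption.
from inspect_ai.scorer import Score
from inspect_ai.scorer._metric import SampleScore

sample_score = SampleScore(
    score=Score(value=1, answer="4", explanation="correct arithmetic"),
    sample_id="arith-1",
)
assert sample_score.score.as_int() == 1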