inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
@@ -148,7 +148,7 @@ export type Input =
|
|
148
148
|
| ChatMessageAssistant
|
149
149
|
| ChatMessageTool
|
150
150
|
)[];
|
151
|
-
export type Id1 = string;
|
151
|
+
export type Id1 = string | null;
|
152
152
|
export type Content =
|
153
153
|
| string
|
154
154
|
| (
|
@@ -175,7 +175,7 @@ export type Video = string;
|
|
175
175
|
export type Format1 = "mp4" | "mpeg" | "mov";
|
176
176
|
export type Source = ("input" | "generate") | null;
|
177
177
|
export type Role = "system";
|
178
|
-
export type Id2 = string;
|
178
|
+
export type Id2 = string | null;
|
179
179
|
export type Content1 =
|
180
180
|
| string
|
181
181
|
| (
|
@@ -188,7 +188,7 @@ export type Content1 =
|
|
188
188
|
export type Source1 = ("input" | "generate") | null;
|
189
189
|
export type Role1 = "user";
|
190
190
|
export type ToolCallId = string[] | null;
|
191
|
-
export type Id3 = string;
|
191
|
+
export type Id3 = string | null;
|
192
192
|
export type Content2 =
|
193
193
|
| string
|
194
194
|
| (
|
@@ -203,12 +203,13 @@ export type Role2 = "assistant";
|
|
203
203
|
export type ToolCalls = ToolCall[] | null;
|
204
204
|
export type Id4 = string;
|
205
205
|
export type Function = string;
|
206
|
-
export type Type8 =
|
206
|
+
export type Type8 = string;
|
207
|
+
export type InternalName = string | null;
|
207
208
|
export type ParseError = string | null;
|
208
209
|
export type Title = string | null;
|
209
210
|
export type Format2 = "text" | "markdown";
|
210
211
|
export type Content3 = string;
|
211
|
-
export type Id5 = string;
|
212
|
+
export type Id5 = string | null;
|
212
213
|
export type Content4 =
|
213
214
|
| string
|
214
215
|
| (
|
@@ -222,6 +223,7 @@ export type Source3 = ("input" | "generate") | null;
|
|
222
223
|
export type Role3 = "tool";
|
223
224
|
export type ToolCallId1 = string | null;
|
224
225
|
export type Function1 = string | null;
|
226
|
+
export type InternalName1 = string | null;
|
225
227
|
export type Type9 =
|
226
228
|
| "parsing"
|
227
229
|
| "timeout"
|
@@ -369,6 +371,7 @@ export type Event6 = "tool";
|
|
369
371
|
export type Type12 = "function";
|
370
372
|
export type Id7 = string;
|
371
373
|
export type Function2 = string;
|
374
|
+
export type InternalName2 = string | null;
|
372
375
|
export type Result1 =
|
373
376
|
| string
|
374
377
|
| number
|
@@ -911,6 +914,7 @@ export interface ToolCall {
|
|
911
914
|
function: Function;
|
912
915
|
arguments: Arguments;
|
913
916
|
type: Type8;
|
917
|
+
internal_name: InternalName;
|
914
918
|
parse_error: ParseError;
|
915
919
|
view: ToolCallContent | null;
|
916
920
|
}
|
@@ -933,6 +937,7 @@ export interface ChatMessageTool {
|
|
933
937
|
role: Role3;
|
934
938
|
tool_call_id: ToolCallId1;
|
935
939
|
function: Function1;
|
940
|
+
internal_name: InternalName1;
|
936
941
|
error: ToolCallError | null;
|
937
942
|
}
|
938
943
|
export interface ToolCallError {
|
@@ -1201,6 +1206,7 @@ export interface ToolEvent {
|
|
1201
1206
|
id: Id7;
|
1202
1207
|
function: Function2;
|
1203
1208
|
arguments: Arguments1;
|
1209
|
+
internal_name: InternalName2;
|
1204
1210
|
view: ToolCallContent | null;
|
1205
1211
|
result: Result1;
|
1206
1212
|
truncated: Truncated;
|
@@ -3,6 +3,7 @@ import { FC, useCallback } from "react";
|
|
3
3
|
import { SampleSummary } from "../../api/types";
|
4
4
|
import { ApplicationIcons } from "../../appearance/icons";
|
5
5
|
import { CopyButton } from "../../components/CopyButton";
|
6
|
+
import { kModelNone } from "../../constants";
|
6
7
|
import { EvalResults, EvalSpec, Status } from "../../types/log";
|
7
8
|
import { filename } from "../../utils/path";
|
8
9
|
import styles from "./PrimaryBar.module.css";
|
@@ -71,18 +72,22 @@ export const PrimaryBar: FC<PrimaryBarProps> = ({
|
|
71
72
|
>
|
72
73
|
{evalSpec?.task}
|
73
74
|
</div>
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
75
|
+
{evalSpec?.model && evalSpec.model !== kModelNone ? (
|
76
|
+
<div
|
77
|
+
id="task-model"
|
78
|
+
className={clsx(
|
79
|
+
"task-model",
|
80
|
+
"text-truncate",
|
81
|
+
styles.taskModel,
|
82
|
+
"text-size-base",
|
83
|
+
)}
|
84
|
+
title={evalSpec?.model}
|
85
|
+
>
|
86
|
+
{evalSpec?.model}
|
87
|
+
</div>
|
88
|
+
) : (
|
89
|
+
""
|
90
|
+
)}
|
86
91
|
</div>
|
87
92
|
<div className={clsx("text-size-small", styles.secondaryContainer)}>
|
88
93
|
<div className={clsx("navbar-secondary-text", "text-truncate")}>
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import clsx from "clsx";
|
2
2
|
import { FC, Fragment } from "react";
|
3
3
|
import { EvalLogHeader } from "../../api/types";
|
4
|
+
import { kModelNone } from "../../constants";
|
4
5
|
import { EvalStatus } from "./EvalStatus";
|
5
6
|
import styles from "./SidebarLogEntry.module.css";
|
6
7
|
|
@@ -51,7 +52,7 @@ export const SidebarLogEntry: FC<SidebarLogEntryProps> = ({
|
|
51
52
|
</div>
|
52
53
|
<small className={clsx("mb-1", "text-size-small")}>{timeStr}</small>
|
53
54
|
|
54
|
-
{model ? (
|
55
|
+
{model && model !== kModelNone ? (
|
55
56
|
<div>
|
56
57
|
<small className={clsx("mb-1", "text-size-small")}>{model}</small>
|
57
58
|
</div>
|
inspect_ai/_view/www/yarn.lock
CHANGED
@@ -220,13 +220,20 @@
|
|
220
220
|
"@codemirror/view" "^6.0.0"
|
221
221
|
crelt "^1.0.5"
|
222
222
|
|
223
|
-
"@codemirror/state@^6.0.0", "@codemirror/state@^6.4.0", "@codemirror/state@^6.5.0"
|
223
|
+
"@codemirror/state@^6.0.0", "@codemirror/state@^6.4.0", "@codemirror/state@^6.5.0":
|
224
224
|
version "6.5.1"
|
225
225
|
resolved "https://registry.yarnpkg.com/@codemirror/state/-/state-6.5.1.tgz#e5c0599f7b43cf03f19e05861317df5425c07904"
|
226
226
|
integrity sha512-3rA9lcwciEB47ZevqvD8qgbzhM9qMb8vCcQCNmDfVRPQG4JT9mSb0Jg8H7YjKGGQcFnLN323fj9jdnG59Kx6bg==
|
227
227
|
dependencies:
|
228
228
|
"@marijn/find-cluster-break" "^1.0.0"
|
229
229
|
|
230
|
+
"@codemirror/state@^6.5.2":
|
231
|
+
version "6.5.2"
|
232
|
+
resolved "https://registry.yarnpkg.com/@codemirror/state/-/state-6.5.2.tgz#8eca3a64212a83367dc85475b7d78d5c9b7076c6"
|
233
|
+
integrity sha512-FVqsPqtPWKVVL3dPSxy8wEF/ymIEuVzF1PK3VbUgrxXpJUSHQWWZz4JMToquRxnkw+36LTamCZG2iua2Ptq0fA==
|
234
|
+
dependencies:
|
235
|
+
"@marijn/find-cluster-break" "^1.0.0"
|
236
|
+
|
230
237
|
"@codemirror/view@^6.0.0", "@codemirror/view@^6.17.0", "@codemirror/view@^6.23.0", "@codemirror/view@^6.27.0", "@codemirror/view@^6.35.0":
|
231
238
|
version "6.36.2"
|
232
239
|
resolved "https://registry.yarnpkg.com/@codemirror/view/-/view-6.36.2.tgz#aeb644e161440734ac5a153bf6e5b4a4355047be"
|
@@ -862,10 +869,10 @@ argparse@^2.0.1:
|
|
862
869
|
resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
|
863
870
|
integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==
|
864
871
|
|
865
|
-
asciinema-player@^3.
|
866
|
-
version "3.
|
867
|
-
resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.
|
868
|
-
integrity sha512-
|
872
|
+
asciinema-player@^3.9.0:
|
873
|
+
version "3.9.0"
|
874
|
+
resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.9.0.tgz#c60742f85978e861b878fc7eb6289a5622c298af"
|
875
|
+
integrity sha512-SXVFImVzeNr8ZUdNIHABGuzlbnGWTKy245AquAjODsAnv+Lp6vxjYGN0LfA8ns30tnx/ag/bMrTbLq13TpHE6w==
|
869
876
|
dependencies:
|
870
877
|
"@babel/runtime" "^7.21.0"
|
871
878
|
solid-js "^1.3.0"
|
inspect_ai/log/_log.py
CHANGED
@@ -215,7 +215,16 @@ class EvalSample(BaseModel):
|
|
215
215
|
Returns:
|
216
216
|
StoreModel: Instance of model_cls bound to sample store data.
|
217
217
|
"""
|
218
|
-
|
218
|
+
# un-namespace names for creation
|
219
|
+
data = {
|
220
|
+
k.replace(f"{model_cls.__name__}:", "", 1): v for k, v in self.store.items()
|
221
|
+
}
|
222
|
+
|
223
|
+
# since we are reading from the log provide a fully detached store
|
224
|
+
data["store"] = Store()
|
225
|
+
|
226
|
+
# create the model
|
227
|
+
return model_cls.model_validate(data)
|
219
228
|
|
220
229
|
events: list[Event] = Field(default_factory=list)
|
221
230
|
"""Events that occurred during sample execution."""
|
@@ -10,7 +10,7 @@ from pydantic import BaseModel, Field
|
|
10
10
|
from pydantic_core import to_json
|
11
11
|
from typing_extensions import override
|
12
12
|
|
13
|
-
from inspect_ai._util.constants import LOG_SCHEMA_VERSION
|
13
|
+
from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
|
14
14
|
from inspect_ai._util.content import (
|
15
15
|
ContentAudio,
|
16
16
|
ContentImage,
|
@@ -224,7 +224,9 @@ class EvalRecorder(FileRecorder):
|
|
224
224
|
with ZipFile(z, mode="r") as zip:
|
225
225
|
try:
|
226
226
|
with zip.open(_sample_filename(id, epoch), "r") as f:
|
227
|
-
return EvalSample
|
227
|
+
return EvalSample.model_validate(
|
228
|
+
json.load(f), context=DESERIALIZING_CONTEXT
|
229
|
+
)
|
228
230
|
except KeyError:
|
229
231
|
raise IndexError(
|
230
232
|
f"Sample id {id} for epoch {epoch} not found in log {location}"
|
@@ -414,7 +416,10 @@ def _read_log(log: BinaryIO, location: str, header_only: bool = False) -> EvalLo
|
|
414
416
|
if REDUCTIONS_JSON in zip.namelist():
|
415
417
|
with zip.open(REDUCTIONS_JSON, "r") as f:
|
416
418
|
reductions = [
|
417
|
-
EvalSampleReductions
|
419
|
+
EvalSampleReductions.model_validate(
|
420
|
+
reduction, context=DESERIALIZING_CONTEXT
|
421
|
+
)
|
422
|
+
for reduction in json.load(f)
|
418
423
|
]
|
419
424
|
if evalLog.results is not None:
|
420
425
|
evalLog.reductions = reductions
|
@@ -425,7 +430,11 @@ def _read_log(log: BinaryIO, location: str, header_only: bool = False) -> EvalLo
|
|
425
430
|
for name in zip.namelist():
|
426
431
|
if name.startswith(f"{SAMPLES_DIR}/") and name.endswith(".json"):
|
427
432
|
with zip.open(name, "r") as f:
|
428
|
-
samples.append(
|
433
|
+
samples.append(
|
434
|
+
EvalSample.model_validate(
|
435
|
+
json.load(f), context=DESERIALIZING_CONTEXT
|
436
|
+
),
|
437
|
+
)
|
429
438
|
sort_samples(samples)
|
430
439
|
evalLog.samples = samples
|
431
440
|
return evalLog
|
@@ -452,7 +461,10 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
|
|
452
461
|
if SUMMARIES_JSON in zip.namelist():
|
453
462
|
summaries_raw = _read_json(zip, SUMMARIES_JSON)
|
454
463
|
if isinstance(summaries_raw, list):
|
455
|
-
return [
|
464
|
+
return [
|
465
|
+
SampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
|
466
|
+
for value in summaries_raw
|
467
|
+
]
|
456
468
|
else:
|
457
469
|
raise ValueError(
|
458
470
|
f"Expected a list of summaries when reading {SUMMARIES_JSON}"
|
@@ -464,7 +476,14 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
|
|
464
476
|
summary_path = _journal_summary_path(summary_file)
|
465
477
|
summary = _read_json(zip, summary_path)
|
466
478
|
if isinstance(summary, list):
|
467
|
-
summaries.extend(
|
479
|
+
summaries.extend(
|
480
|
+
[
|
481
|
+
SampleSummary.model_validate(
|
482
|
+
value, context=DESERIALIZING_CONTEXT
|
483
|
+
)
|
484
|
+
for value in summary
|
485
|
+
]
|
486
|
+
)
|
468
487
|
else:
|
469
488
|
raise ValueError(
|
470
489
|
f"Expected a list of summaries when reading {summary_file}"
|
@@ -476,12 +495,12 @@ def _read_header(zip: ZipFile, location: str) -> EvalLog:
|
|
476
495
|
# first see if the header is here
|
477
496
|
if HEADER_JSON in zip.namelist():
|
478
497
|
with zip.open(HEADER_JSON, "r") as f:
|
479
|
-
log = EvalLog(
|
498
|
+
log = EvalLog.model_validate(json.load(f), context=DESERIALIZING_CONTEXT)
|
480
499
|
log.location = location
|
481
500
|
return log
|
482
501
|
else:
|
483
502
|
with zip.open(_journal_path(START_JSON), "r") as f:
|
484
|
-
start = LogStart(
|
503
|
+
start = LogStart.model_validate(json.load(f), context=DESERIALIZING_CONTEXT)
|
485
504
|
return EvalLog(
|
486
505
|
version=start.version, eval=start.eval, plan=start.plan, location=location
|
487
506
|
)
|
@@ -7,7 +7,7 @@ from pydantic import BaseModel
|
|
7
7
|
from pydantic_core import from_json
|
8
8
|
from typing_extensions import override
|
9
9
|
|
10
|
-
from inspect_ai._util.constants import LOG_SCHEMA_VERSION
|
10
|
+
from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
|
11
11
|
from inspect_ai._util.error import EvalError
|
12
12
|
from inspect_ai._util.file import absolute_file_path, file
|
13
13
|
from inspect_ai._util.trace import trace_action
|
@@ -143,7 +143,7 @@ class JSONRecorder(FileRecorder):
|
|
143
143
|
with file(location, "r") as f:
|
144
144
|
# parse w/ pydantic
|
145
145
|
raw_data = from_json(f.read())
|
146
|
-
log = EvalLog(
|
146
|
+
log = EvalLog.model_validate(raw_data, context=DESERIALIZING_CONTEXT)
|
147
147
|
log.location = location
|
148
148
|
|
149
149
|
# fail for unknown version
|
@@ -217,6 +217,11 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
217
217
|
|
218
218
|
# Parse the log file, stopping before parsing samples
|
219
219
|
status: Literal["started", "success", "cancelled", "error"] | None = None
|
220
|
+
eval: EvalSpec | None = None
|
221
|
+
plan: EvalPlan | None = None
|
222
|
+
results: EvalResults | None = None
|
223
|
+
stats: EvalStats | None = None
|
224
|
+
error: EvalError | None = None
|
220
225
|
for k, v in ijson.kvitems(f, ""):
|
221
226
|
if k == "status":
|
222
227
|
assert v in get_args(
|
@@ -239,6 +244,9 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
239
244
|
break
|
240
245
|
|
241
246
|
assert status, "Must encounter a 'status'"
|
247
|
+
assert eval, "Must encounter a 'eval'"
|
248
|
+
assert plan, "Must encounter a 'plan'"
|
249
|
+
assert stats, "Must encounter a 'stats'"
|
242
250
|
|
243
251
|
return EvalLog(
|
244
252
|
eval=eval,
|
inspect_ai/log/_transcript.py
CHANGED
@@ -146,7 +146,7 @@ class ModelEvent(BaseEvent):
|
|
146
146
|
"""working time for model call that succeeded (i.e. was not retried)."""
|
147
147
|
|
148
148
|
@field_serializer("completed")
|
149
|
-
def serialize_completed(self, dt: datetime) -> str:
|
149
|
+
def serialize_completed(self, dt: datetime | None) -> str | None:
|
150
150
|
if dt is None:
|
151
151
|
return None
|
152
152
|
return dt.astimezone().isoformat()
|
@@ -170,6 +170,9 @@ class ToolEvent(BaseEvent):
|
|
170
170
|
arguments: dict[str, JsonValue]
|
171
171
|
"""Arguments to function."""
|
172
172
|
|
173
|
+
internal_name: str | None = Field(default=None)
|
174
|
+
"""Internal name for tool (if any)."""
|
175
|
+
|
173
176
|
view: ToolCallContent | None = Field(default=None)
|
174
177
|
"""Custom view of tool call input."""
|
175
178
|
|
@@ -235,7 +238,9 @@ class ToolEvent(BaseEvent):
|
|
235
238
|
"""Required so that we can include '_cancel_fn' as a member."""
|
236
239
|
|
237
240
|
@field_serializer("completed")
|
238
|
-
def serialize_completed(self, dt: datetime) -> str:
|
241
|
+
def serialize_completed(self, dt: datetime | None) -> str | None:
|
242
|
+
if dt is None:
|
243
|
+
return None
|
239
244
|
return dt.astimezone().isoformat()
|
240
245
|
|
241
246
|
|
@@ -270,7 +275,9 @@ class SandboxEvent(BaseEvent):
|
|
270
275
|
"""Time that sandbox action completed (see `timestamp` for started)"""
|
271
276
|
|
272
277
|
@field_serializer("completed")
|
273
|
-
def serialize_completed(self, dt: datetime) -> str:
|
278
|
+
def serialize_completed(self, dt: datetime | None) -> str | None:
|
279
|
+
if dt is None:
|
280
|
+
return None
|
274
281
|
return dt.astimezone().isoformat()
|
275
282
|
|
276
283
|
|
@@ -412,7 +419,9 @@ class SubtaskEvent(BaseEvent):
|
|
412
419
|
"""Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""
|
413
420
|
|
414
421
|
@field_serializer("completed")
|
415
|
-
def serialize_completed(self, dt: datetime) -> str:
|
422
|
+
def serialize_completed(self, dt: datetime | None) -> str | None:
|
423
|
+
if dt is None:
|
424
|
+
return None
|
416
425
|
return dt.astimezone().isoformat()
|
417
426
|
|
418
427
|
|
inspect_ai/model/_call_tools.py
CHANGED
@@ -25,7 +25,6 @@ from typing import (
|
|
25
25
|
if sys.version_info < (3, 11):
|
26
26
|
from exceptiongroup import ExceptionGroup
|
27
27
|
|
28
|
-
|
29
28
|
import anyio
|
30
29
|
import yaml
|
31
30
|
from anyio.streams.memory import MemoryObjectSendStream
|
@@ -168,6 +167,7 @@ async def call_tools(
|
|
168
167
|
id=call.id,
|
169
168
|
function=call.function,
|
170
169
|
arguments=call.arguments,
|
170
|
+
internal_name=call.internal_name,
|
171
171
|
result=content,
|
172
172
|
truncated=truncated,
|
173
173
|
view=call.view,
|
@@ -183,6 +183,7 @@ async def call_tools(
|
|
183
183
|
content=content,
|
184
184
|
tool_call_id=call.id,
|
185
185
|
function=call.function,
|
186
|
+
internal_name=call.internal_name,
|
186
187
|
error=tool_error,
|
187
188
|
),
|
188
189
|
event,
|
@@ -201,6 +202,7 @@ async def call_tools(
|
|
201
202
|
id=call.id,
|
202
203
|
function=call.function,
|
203
204
|
arguments=call.arguments,
|
205
|
+
internal_name=call.internal_name,
|
204
206
|
view=call.view,
|
205
207
|
pending=True,
|
206
208
|
)
|
@@ -216,9 +218,7 @@ async def call_tools(
|
|
216
218
|
tg.start_soon(call_tool_task, call, send_stream)
|
217
219
|
event._set_cancel_fn(tg.cancel_scope.cancel)
|
218
220
|
async with receive_stream:
|
219
|
-
|
220
|
-
tool_message, result_event = result
|
221
|
-
break
|
221
|
+
tool_message, result_event = await receive_stream.receive()
|
222
222
|
except ExceptionGroup as ex:
|
223
223
|
raise ex.exceptions[0]
|
224
224
|
|
@@ -226,6 +226,7 @@ async def call_tools(
|
|
226
226
|
tool_message = ChatMessageTool(
|
227
227
|
content="",
|
228
228
|
function=call.function,
|
229
|
+
internal_name=call.internal_name,
|
229
230
|
tool_call_id=call.id,
|
230
231
|
error=ToolCallError(
|
231
232
|
"timeout", "Command timed out before completing."
|
@@ -235,6 +236,7 @@ async def call_tools(
|
|
235
236
|
id=call.id,
|
236
237
|
function=call.function,
|
237
238
|
arguments=call.arguments,
|
239
|
+
internal_name=call.internal_name,
|
238
240
|
result=tool_message.content,
|
239
241
|
truncated=None,
|
240
242
|
view=call.view,
|
@@ -508,6 +510,13 @@ def tool_parse_error_message(arguments: str, ex: Exception) -> str:
|
|
508
510
|
def parse_tool_call(
|
509
511
|
id: str, function: str, arguments: str, tools: list[ToolInfo] | None = None
|
510
512
|
) -> ToolCall:
|
513
|
+
"""Parse a tool call from a JSON payload.
|
514
|
+
|
515
|
+
Note that this function doesn't know about internal tool names so the caller
|
516
|
+
should amend the returned `ToolCall` by mapping the parsed `function` field
|
517
|
+
from an internal name to an inspect tool name and fixing up the `ToolCall` object
|
518
|
+
as required to reflect this change.
|
519
|
+
"""
|
511
520
|
error: str | None = None
|
512
521
|
arguments_dict: dict[str, Any] = {}
|
513
522
|
|
@@ -4,6 +4,7 @@ from typing import Any, Literal, Type, Union
|
|
4
4
|
from pydantic import BaseModel, Field, model_validator
|
5
5
|
from shortuuid import uuid
|
6
6
|
|
7
|
+
from inspect_ai._util.constants import DESERIALIZING
|
7
8
|
from inspect_ai._util.content import Content, ContentReasoning, ContentText
|
8
9
|
from inspect_ai.tool import ToolCall
|
9
10
|
from inspect_ai.tool._tool_call import ToolCallError
|
@@ -16,7 +17,7 @@ logger = getLogger(__name__)
|
|
16
17
|
class ChatMessageBase(BaseModel):
|
17
18
|
"""Base class for chat messages."""
|
18
19
|
|
19
|
-
id: str = Field(
|
20
|
+
id: str | None = Field(default=None)
|
20
21
|
"""Unique identifier for message."""
|
21
22
|
|
22
23
|
content: str | list[Content]
|
@@ -25,6 +26,16 @@ class ChatMessageBase(BaseModel):
|
|
25
26
|
source: Literal["input", "generate"] | None = Field(default=None)
|
26
27
|
"""Source of message."""
|
27
28
|
|
29
|
+
def model_post_init(self, __context: Any) -> None:
|
30
|
+
# check if deserializing
|
31
|
+
is_deserializing = isinstance(__context, dict) and __context.get(
|
32
|
+
DESERIALIZING, False
|
33
|
+
)
|
34
|
+
|
35
|
+
# Generate ID if needed and not deserializing
|
36
|
+
if self.id is None and not is_deserializing:
|
37
|
+
self.id = uuid()
|
38
|
+
|
28
39
|
@property
|
29
40
|
def text(self) -> str:
|
30
41
|
"""Get the text content of this message.
|
@@ -147,6 +158,9 @@ class ChatMessageTool(ChatMessageBase):
|
|
147
158
|
function: str | None = Field(default=None)
|
148
159
|
"""Name of function called."""
|
149
160
|
|
161
|
+
internal_name: str | None = Field(default=None)
|
162
|
+
"""Internal name for tool (if any)."""
|
163
|
+
|
150
164
|
error: ToolCallError | None = Field(default=None)
|
151
165
|
"""Error which occurred during tool call."""
|
152
166
|
|
inspect_ai/model/_model.py
CHANGED
@@ -33,6 +33,7 @@ from inspect_ai._util.content import (
|
|
33
33
|
from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
|
34
34
|
from inspect_ai._util.interrupt import check_sample_interrupt
|
35
35
|
from inspect_ai._util.logger import warn_once
|
36
|
+
from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
|
36
37
|
from inspect_ai._util.platform import platform_init
|
37
38
|
from inspect_ai._util.registry import (
|
38
39
|
RegistryInfo,
|
@@ -77,7 +78,7 @@ class ModelAPI(abc.ABC):
|
|
77
78
|
by the user. You can then pass these on to the appropriate place in
|
78
79
|
your model initialisation code (for example, here is what many
|
79
80
|
of the built-in providers do with the `model_args` passed to them:
|
80
|
-
https://inspect.
|
81
|
+
https://inspect.aisi.org.uk/models.html#model-args)
|
81
82
|
"""
|
82
83
|
|
83
84
|
def __init__(
|
@@ -232,15 +233,19 @@ class Model:
|
|
232
233
|
config: GenerateConfig
|
233
234
|
"""Generation config."""
|
234
235
|
|
235
|
-
def __init__(
|
236
|
+
def __init__(
|
237
|
+
self, api: ModelAPI, config: GenerateConfig, model_args: dict[str, Any] = {}
|
238
|
+
) -> None:
|
236
239
|
"""Create a model.
|
237
240
|
|
238
241
|
Args:
|
239
242
|
api: Model API provider.
|
240
243
|
config: Model configuration.
|
244
|
+
model_args: Optional model args
|
241
245
|
"""
|
242
246
|
self.api = api
|
243
247
|
self.config = config
|
248
|
+
self.model_args = model_args
|
244
249
|
|
245
250
|
# state indicating whether our lifetime is bound by a context manager
|
246
251
|
self._context_bound = False
|
@@ -449,6 +454,7 @@ class Model:
|
|
449
454
|
async def generate() -> ModelOutput:
|
450
455
|
check_sample_interrupt()
|
451
456
|
|
457
|
+
cache_entry: CacheEntry | None
|
452
458
|
if cache:
|
453
459
|
if isinstance(cache, CachePolicy):
|
454
460
|
policy = cache
|
@@ -476,6 +482,8 @@ class Model:
|
|
476
482
|
call=None,
|
477
483
|
)
|
478
484
|
return existing
|
485
|
+
else:
|
486
|
+
cache_entry = None
|
479
487
|
|
480
488
|
# verify that model apis are allowed
|
481
489
|
self.verify_model_apis()
|
@@ -545,7 +553,7 @@ class Model:
|
|
545
553
|
json.dumps(dict(model=str(self), usage=output.usage.model_dump())),
|
546
554
|
)
|
547
555
|
|
548
|
-
if cache:
|
556
|
+
if cache and cache_entry:
|
549
557
|
cache_store(entry=cache_entry, output=output)
|
550
558
|
|
551
559
|
return output
|
@@ -773,6 +781,10 @@ def get_model(
|
|
773
781
|
if isinstance(model, Model):
|
774
782
|
return model
|
775
783
|
|
784
|
+
# next see if this is the special "none" model
|
785
|
+
if model == "none":
|
786
|
+
model = "none/none"
|
787
|
+
|
776
788
|
# now try finding an 'ambient' model (active or env var)
|
777
789
|
if model is None:
|
778
790
|
# return active_model if there is one
|
@@ -835,7 +847,7 @@ def get_model(
|
|
835
847
|
config=config,
|
836
848
|
**model_args,
|
837
849
|
)
|
838
|
-
m = Model(modelapi_instance, config)
|
850
|
+
m = Model(modelapi_instance, config, model_args)
|
839
851
|
if memoize:
|
840
852
|
_models[model_cache_key] = m
|
841
853
|
return m
|
@@ -860,17 +872,25 @@ def cached_model(key: str) -> Model | None:
|
|
860
872
|
|
861
873
|
|
862
874
|
def resolve_models(
|
863
|
-
model: str | Model | list[str] | list[Model] | None,
|
875
|
+
model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
|
864
876
|
model_base_url: str | None = None,
|
865
877
|
model_args: dict[str, Any] = dict(),
|
866
878
|
config: GenerateConfig = GenerateConfig(),
|
867
879
|
) -> list[Model]:
|
880
|
+
# resolve NotGiven to current INSPECT_EVAL_MODEL
|
881
|
+
if isinstance(model, NotGiven):
|
882
|
+
model = os.getenv("INSPECT_EVAL_MODEL", None)
|
883
|
+
|
884
|
+
# resolve None to NoModel
|
885
|
+
if model is None:
|
886
|
+
return [get_model("none")]
|
887
|
+
|
868
888
|
# reflect back a plain model
|
869
889
|
if isinstance(model, Model):
|
870
890
|
return [model]
|
871
891
|
|
872
892
|
# helper to resolve model of various types
|
873
|
-
def resolve_model(m: str | Model
|
893
|
+
def resolve_model(m: str | Model) -> Model:
|
874
894
|
return get_model(
|
875
895
|
model=m,
|
876
896
|
base_url=model_base_url,
|
@@ -878,11 +898,8 @@ def resolve_models(
|
|
878
898
|
**model_args,
|
879
899
|
)
|
880
900
|
|
881
|
-
#
|
882
|
-
if
|
883
|
-
model = model or os.getenv("INSPECT_EVAL_MODEL", None)
|
884
|
-
if model is None:
|
885
|
-
raise ValueError("No model specified (and no INSPECT_EVAL_MODEL defined)")
|
901
|
+
# str to list
|
902
|
+
if isinstance(model, str):
|
886
903
|
model = [m.strip() for m in model.split(",")]
|
887
904
|
|
888
905
|
# resolve models
|
@@ -1098,6 +1115,7 @@ def tool_result_images_reducer(
|
|
1098
1115
|
content=edited_tool_message_content,
|
1099
1116
|
tool_call_id=message.tool_call_id,
|
1100
1117
|
function=message.function,
|
1118
|
+
internal_name=message.internal_name,
|
1101
1119
|
)
|
1102
1120
|
],
|
1103
1121
|
pending_content + new_user_message_content,
|
@@ -1236,7 +1254,7 @@ def active_model() -> Model | None:
|
|
1236
1254
|
|
1237
1255
|
|
1238
1256
|
# shared contexts for asyncio tasks
|
1239
|
-
active_model_context_var: ContextVar[Model] = ContextVar("active_model")
|
1257
|
+
active_model_context_var: ContextVar[Model | None] = ContextVar("active_model")
|
1240
1258
|
|
1241
1259
|
|
1242
1260
|
def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
|
@@ -188,8 +188,10 @@ class ModelOutput(BaseModel):
|
|
188
188
|
model: str,
|
189
189
|
tool_name: str,
|
190
190
|
tool_arguments: dict[str, Any],
|
191
|
+
internal_tool_name: str | None = None,
|
191
192
|
tool_call_id: str | None = None,
|
192
193
|
content: str | None = None,
|
194
|
+
type: str = "function",
|
193
195
|
) -> "ModelOutput":
|
194
196
|
"""
|
195
197
|
Returns a ModelOutput for requesting a tool call.
|
@@ -197,6 +199,8 @@ class ModelOutput(BaseModel):
|
|
197
199
|
Args:
|
198
200
|
model: model name
|
199
201
|
tool_name: The name of the tool.
|
202
|
+
internal_tool_name: The model's internal name for the tool (if any).
|
203
|
+
type: The model's type for the tool. e.g. "function", "computer_use_preview"
|
200
204
|
tool_arguments: The arguments passed to the tool.
|
201
205
|
tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
|
202
206
|
content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
|
@@ -221,8 +225,9 @@ class ModelOutput(BaseModel):
|
|
221
225
|
ToolCall(
|
222
226
|
id=tool_call_id,
|
223
227
|
function=tool_name,
|
228
|
+
internal_name=internal_tool_name,
|
224
229
|
arguments=tool_arguments,
|
225
|
-
type=
|
230
|
+
type=type,
|
226
231
|
)
|
227
232
|
],
|
228
233
|
),
|