inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/logger.py +14 -13
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +79 -47
- inspect_ai/_util/transcript.py +37 -4
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29752 -24492
- inspect_ai/_view/www/log-schema.json +262 -215
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +19 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +28 -5
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +28 -20
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +30 -2
- inspect_ai/log/_recorders/eval.py +2 -0
- inspect_ai/model/_call_tools.py +31 -7
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_providers/google.py +24 -6
- inspect_ai/model/_providers/openai.py +17 -3
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
- inspect_ai/util/__init__.py +12 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +17 -13
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +3 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +7 -8
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_subprocess.py +3 -3
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
@@ -60,10 +60,11 @@ export const ModelEventView = ({ id, event, style }) => {
|
|
60
60
|
};
|
61
61
|
|
62
62
|
// For any user messages which immediately preceded this model call, including a
|
63
|
-
// panel and display those user messages
|
63
|
+
// panel and display those user messages (exclude tool_call messages as they
|
64
|
+
// are already shown in the tool call above)
|
64
65
|
const userMessages = [];
|
65
66
|
for (const msg of event.input.slice().reverse()) {
|
66
|
-
if (msg.role === "user") {
|
67
|
+
if (msg.role === "user" && !msg.tool_call_id) {
|
67
68
|
userMessages.push(msg);
|
68
69
|
} else {
|
69
70
|
break;
|
@@ -37,7 +37,7 @@ export const ToolEventView = ({ id, event, style, depth }) => {
|
|
37
37
|
functionCall=${functionCall}
|
38
38
|
input=${input}
|
39
39
|
inputType=${inputType}
|
40
|
-
output=${event.result}
|
40
|
+
output=${event.error?.message || event.result}
|
41
41
|
mode="compact"
|
42
42
|
view=${event.view}
|
43
43
|
/>
|
@@ -2,6 +2,7 @@
|
|
2
2
|
import { html } from "htm/preact";
|
3
3
|
import { ChatView } from "../../../components/ChatView.mjs";
|
4
4
|
import { FontSize, TextStyle } from "../../../appearance/Fonts.mjs";
|
5
|
+
import { HumanBaselineView } from "../../../components/HumanBaselineView.mjs";
|
5
6
|
|
6
7
|
/**
|
7
8
|
* @typedef {Object} Signature
|
@@ -62,6 +63,58 @@ const add_tools = {
|
|
62
63
|
},
|
63
64
|
};
|
64
65
|
|
66
|
+
const humanAgentKey = (key) => {
|
67
|
+
return `HumanAgentState:${key}`;
|
68
|
+
};
|
69
|
+
const human_baseline_session = {
|
70
|
+
type: "human_baseline_session",
|
71
|
+
signature: {
|
72
|
+
add: ["HumanAgentState:logs"],
|
73
|
+
replace: [],
|
74
|
+
remove: [],
|
75
|
+
},
|
76
|
+
render: (changes, resolvedState) => {
|
77
|
+
// Read the session values
|
78
|
+
const started = resolvedState[humanAgentKey("started_running")];
|
79
|
+
const runtime = resolvedState[humanAgentKey("accumulated_time")];
|
80
|
+
const answer = resolvedState[humanAgentKey("answer")];
|
81
|
+
const completed = !!answer;
|
82
|
+
const running = resolvedState[humanAgentKey("running_state")];
|
83
|
+
const rawSessions = resolvedState[humanAgentKey("logs")];
|
84
|
+
|
85
|
+
// Tweak the date value
|
86
|
+
const startedDate = started ? new Date(started * 1000) : undefined;
|
87
|
+
|
88
|
+
// Convert raw sessions into session logs
|
89
|
+
const sessions = {};
|
90
|
+
if (rawSessions) {
|
91
|
+
for (const key of Object.keys(rawSessions)) {
|
92
|
+
const value = rawSessions[key];
|
93
|
+
// this pulls the key apart into
|
94
|
+
// <user>_<timestamp>.<type>
|
95
|
+
const match = key.match(/(.*)_(\d+_\d+)\.(.*)/);
|
96
|
+
if (match) {
|
97
|
+
const user = match[1];
|
98
|
+
const timestamp = match[2];
|
99
|
+
const type = match[3];
|
100
|
+
sessions[timestamp] = sessions[timestamp] || {};
|
101
|
+
sessions[timestamp][type] = value;
|
102
|
+
sessions[timestamp]["user"] = user;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
return html`<${HumanBaselineView}
|
108
|
+
started=${startedDate}
|
109
|
+
running=${running}
|
110
|
+
completed=${completed}
|
111
|
+
answer=${answer}
|
112
|
+
runtime=${runtime}
|
113
|
+
sessionLogs=${Object.values(sessions)}
|
114
|
+
/>`;
|
115
|
+
},
|
116
|
+
};
|
117
|
+
|
65
118
|
const renderTools = (changes, resolvedState) => {
|
66
119
|
// Find which tools were added in this change
|
67
120
|
const toolIndexes = [];
|
@@ -136,6 +189,9 @@ export const RenderableChangeTypes = [
|
|
136
189
|
add_tools,
|
137
190
|
];
|
138
191
|
|
192
|
+
/** @type {ChangeType[]} */
|
193
|
+
export const StoreSpecificRenderableTypes = [human_baseline_session];
|
194
|
+
|
139
195
|
/**
|
140
196
|
* @typedef {Object} ToolParameters
|
141
197
|
* @property {string} type - The type of the parameters object, typically "object".
|
@@ -2,7 +2,10 @@
|
|
2
2
|
import { html } from "htm/preact";
|
3
3
|
|
4
4
|
import { EventPanel } from "../EventPanel.mjs";
|
5
|
-
import {
|
5
|
+
import {
|
6
|
+
RenderableChangeTypes,
|
7
|
+
StoreSpecificRenderableTypes,
|
8
|
+
} from "./StateEventRenderers.mjs";
|
6
9
|
import { StateDiffView } from "./StateDiffView.mjs";
|
7
10
|
import { formatDateTime } from "../../../utils/Format.mjs";
|
8
11
|
|
@@ -12,10 +15,11 @@ import { formatDateTime } from "../../../utils/Format.mjs";
|
|
12
15
|
* @param {Object} props - The properties passed to the component.
|
13
16
|
* @param { string } props.id - The id of this event.
|
14
17
|
* @param {import("../../../types/log").StateEvent } props.event - The event object to display.
|
18
|
+
* @param { boolean } props.isStore - Whether this event view is rendering a storage (rather than a state)
|
15
19
|
* @param { Object } props.style - The style of this event.
|
16
20
|
* @returns {import("preact").JSX.Element} The component.
|
17
21
|
*/
|
18
|
-
export const StateEventView = ({ id, event, style }) => {
|
22
|
+
export const StateEventView = ({ id, event, isStore, style }) => {
|
19
23
|
const summary = summarizeChanges(event.changes);
|
20
24
|
|
21
25
|
// Synthesize objects for comparison
|
@@ -32,7 +36,11 @@ export const StateEventView = ({ id, event, style }) => {
|
|
32
36
|
// This clone is important since the state is used by preact as potential values that are rendered
|
33
37
|
// and as a result may be decorated with additional properties, etc..., resulting in DOM elements
|
34
38
|
// appearing attached to state.
|
35
|
-
const changePreview = generatePreview(
|
39
|
+
const changePreview = generatePreview(
|
40
|
+
event.changes,
|
41
|
+
structuredClone(after),
|
42
|
+
isStore,
|
43
|
+
);
|
36
44
|
if (changePreview) {
|
37
45
|
tabs.unshift(
|
38
46
|
html`<div name="Summary" style=${{ margin: "1em 0em", width: "100%" }}>
|
@@ -55,11 +63,15 @@ export const StateEventView = ({ id, event, style }) => {
|
|
55
63
|
*
|
56
64
|
* @param {import("../../../types/log").JsonChange[]} changes - The change object containing the value.
|
57
65
|
* @param {Object} resolvedState - The state object with the changes applied, from which renderers read values.
|
66
|
+
* @param {boolean} isStore - Is this rendering a store event
|
58
67
|
* @returns {import("preact").JSX.Element|Object|string|undefined} - The rendered HTML template if the value is an object with content and source, otherwise the value itself.
|
59
68
|
*/
|
60
|
-
const generatePreview = (changes, resolvedState) => {
|
69
|
+
const generatePreview = (changes, resolvedState, isStore) => {
|
61
70
|
const results = [];
|
62
|
-
for (const changeType of
|
71
|
+
for (const changeType of [
|
72
|
+
...RenderableChangeTypes,
|
73
|
+
...(isStore ? StoreSpecificRenderableTypes : []),
|
74
|
+
]) {
|
63
75
|
// Note that we currently only have renderers that depend upon
|
64
76
|
// add, remove, replace, but we should likely add
|
65
77
|
// move, copy, test
|
@@ -0,0 +1,26 @@
|
|
1
|
+
declare module "asciinema-player" {
|
2
|
+
export const create: (
|
3
|
+
src: string | Object,
|
4
|
+
el: HTMLElement,
|
5
|
+
opts: {
|
6
|
+
cols?: number;
|
7
|
+
rows?: number;
|
8
|
+
autoPlay?: boolean;
|
9
|
+
preload?: boolean;
|
10
|
+
loop?: boolean;
|
11
|
+
theme?: string;
|
12
|
+
startAt?: number | string;
|
13
|
+
speed?: number;
|
14
|
+
idleTimeLimit?: number;
|
15
|
+
poster?: string;
|
16
|
+
fit?: string;
|
17
|
+
controls?: boolean;
|
18
|
+
markers?: Array<number> | Array<[number, string]>;
|
19
|
+
pauseOnMarkers?: boolean;
|
20
|
+
terminalFontSize?: string;
|
21
|
+
terminalFontFamily?: string;
|
22
|
+
terminalLineHeight?: string;
|
23
|
+
logger?: Object;
|
24
|
+
},
|
25
|
+
) => any;
|
26
|
+
}
|
@@ -29,6 +29,7 @@ export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown];
|
|
29
29
|
export type Model = string;
|
30
30
|
export type ModelBaseUrl = string | null;
|
31
31
|
export type Limit = number | [unknown, unknown] | null;
|
32
|
+
export type SampleId = string | number | (string | number)[] | null;
|
32
33
|
export type Epochs = number | null;
|
33
34
|
export type EpochsReducer = string[] | null;
|
34
35
|
export type Trace = boolean | null;
|
@@ -42,10 +43,12 @@ export type TimeLimit = number | null;
|
|
42
43
|
export type MaxSamples = number | null;
|
43
44
|
export type MaxTasks = number | null;
|
44
45
|
export type MaxSubprocesses = number | null;
|
46
|
+
export type MaxSandboxes = number | null;
|
45
47
|
export type SandboxCleanup = boolean | null;
|
46
48
|
export type LogSamples = boolean | null;
|
47
49
|
export type LogImages = boolean | null;
|
48
50
|
export type LogBuffer = number | null;
|
51
|
+
export type ScoreDisplay = boolean | null;
|
49
52
|
export type Type = "git";
|
50
53
|
export type Origin = string;
|
51
54
|
export type Commit = string;
|
@@ -76,6 +79,7 @@ export type TopLogprobs = number | null;
|
|
76
79
|
export type ParallelToolCalls = boolean | null;
|
77
80
|
export type MaxToolOutput = number | null;
|
78
81
|
export type CachePrompt = "auto" | boolean | null;
|
82
|
+
export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
79
83
|
export type TotalSamples = number;
|
80
84
|
export type CompletedSamples = number;
|
81
85
|
export type Name3 = string;
|
@@ -119,6 +123,7 @@ export type Role = "system";
|
|
119
123
|
export type Content1 = string | (ContentText | ContentImage)[];
|
120
124
|
export type Source1 = ("input" | "generate") | null;
|
121
125
|
export type Role1 = "user";
|
126
|
+
export type ToolCallId = string | null;
|
122
127
|
export type Content2 = string | (ContentText | ContentImage)[];
|
123
128
|
export type Source2 = ("input" | "generate") | null;
|
124
129
|
export type Role2 = "assistant";
|
@@ -133,7 +138,7 @@ export type Content3 = string;
|
|
133
138
|
export type Content4 = string | (ContentText | ContentImage)[];
|
134
139
|
export type Source3 = ("input" | "generate") | null;
|
135
140
|
export type Role3 = "tool";
|
136
|
-
export type
|
141
|
+
export type ToolCallId1 = string | null;
|
137
142
|
export type Function1 = string | null;
|
138
143
|
export type Type4 =
|
139
144
|
| "parsing"
|
@@ -241,14 +246,10 @@ export type Name5 = string;
|
|
241
246
|
export type Description = string;
|
242
247
|
export type Type6 = "object";
|
243
248
|
export type Type7 =
|
244
|
-
| "string"
|
245
|
-
|
|
246
|
-
| "number"
|
247
|
-
| "boolean"
|
248
|
-
| "array"
|
249
|
-
| "object"
|
250
|
-
| "null";
|
249
|
+
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
250
|
+
| null;
|
251
251
|
export type Description1 = string | null;
|
252
|
+
export type Enum = unknown[] | null;
|
252
253
|
export type Properties1 = {
|
253
254
|
[k: string]: ToolParam;
|
254
255
|
} | null;
|
@@ -267,7 +268,13 @@ export type Event5 = "tool";
|
|
267
268
|
export type Type8 = "function";
|
268
269
|
export type Id3 = string;
|
269
270
|
export type Function2 = string;
|
270
|
-
export type Result =
|
271
|
+
export type Result =
|
272
|
+
| string
|
273
|
+
| number
|
274
|
+
| boolean
|
275
|
+
| ContentText
|
276
|
+
| ContentImage
|
277
|
+
| (ContentText | ContentImage)[];
|
271
278
|
export type Truncated = [unknown, unknown] | null;
|
272
279
|
export type Timestamp6 = string;
|
273
280
|
export type Pending6 = boolean | null;
|
@@ -388,8 +395,8 @@ export type Value2 =
|
|
388
395
|
export type Answer1 = string | null;
|
389
396
|
export type Explanation2 = string | null;
|
390
397
|
export type Metadata8 = {} | null;
|
391
|
-
export type
|
392
|
-
export type Samples2 =
|
398
|
+
export type SampleId1 = string | number | null;
|
399
|
+
export type Samples2 = EvalSampleScore[];
|
393
400
|
export type Location1 = string;
|
394
401
|
|
395
402
|
export interface EvalLog {
|
@@ -438,6 +445,7 @@ export interface EvalDataset {
|
|
438
445
|
export interface ModelArgs {}
|
439
446
|
export interface EvalConfig {
|
440
447
|
limit: Limit;
|
448
|
+
sample_id: SampleId;
|
441
449
|
epochs: Epochs;
|
442
450
|
epochs_reducer: EpochsReducer;
|
443
451
|
trace: Trace;
|
@@ -449,10 +457,12 @@ export interface EvalConfig {
|
|
449
457
|
max_samples: MaxSamples;
|
450
458
|
max_tasks: MaxTasks;
|
451
459
|
max_subprocesses: MaxSubprocesses;
|
460
|
+
max_sandboxes: MaxSandboxes;
|
452
461
|
sandbox_cleanup: SandboxCleanup;
|
453
462
|
log_samples: LogSamples;
|
454
463
|
log_images: LogImages;
|
455
464
|
log_buffer: LogBuffer;
|
465
|
+
score_display: ScoreDisplay;
|
456
466
|
}
|
457
467
|
export interface ApprovalPolicyConfig {
|
458
468
|
approvers: Approvers;
|
@@ -523,6 +533,7 @@ export interface GenerateConfig {
|
|
523
533
|
parallel_tool_calls: ParallelToolCalls;
|
524
534
|
max_tool_output: MaxToolOutput;
|
525
535
|
cache_prompt: CachePrompt;
|
536
|
+
reasoning_effort: ReasoningEffort;
|
526
537
|
}
|
527
538
|
export interface EvalResults {
|
528
539
|
total_samples: TotalSamples;
|
@@ -607,6 +618,7 @@ export interface ChatMessageUser {
|
|
607
618
|
content: Content1;
|
608
619
|
source: Source1;
|
609
620
|
role: Role1;
|
621
|
+
tool_call_id: ToolCallId;
|
610
622
|
}
|
611
623
|
export interface ChatMessageAssistant {
|
612
624
|
content: Content2;
|
@@ -635,7 +647,7 @@ export interface ChatMessageTool {
|
|
635
647
|
content: Content4;
|
636
648
|
source: Source3;
|
637
649
|
role: Role3;
|
638
|
-
tool_call_id:
|
650
|
+
tool_call_id: ToolCallId1;
|
639
651
|
function: Function1;
|
640
652
|
error: ToolCallError | null;
|
641
653
|
}
|
@@ -825,6 +837,7 @@ export interface ToolParam {
|
|
825
837
|
type: Type7;
|
826
838
|
description: Description1;
|
827
839
|
default: Default;
|
840
|
+
enum: Enum;
|
828
841
|
items: ToolParam | null;
|
829
842
|
properties: Properties1;
|
830
843
|
additionalProperties: Additionalproperties;
|
@@ -862,6 +875,7 @@ export interface GenerateConfig1 {
|
|
862
875
|
parallel_tool_calls: ParallelToolCalls;
|
863
876
|
max_tool_output: MaxToolOutput;
|
864
877
|
cache_prompt: CachePrompt;
|
878
|
+
reasoning_effort: ReasoningEffort;
|
865
879
|
}
|
866
880
|
/**
|
867
881
|
* Model call (raw request/response data).
|
@@ -1020,16 +1034,10 @@ export interface EvalSampleReductions {
|
|
1020
1034
|
reducer: Reducer1;
|
1021
1035
|
samples: Samples2;
|
1022
1036
|
}
|
1023
|
-
|
1024
|
-
* Score for a Sample
|
1025
|
-
*
|
1026
|
-
* Args:
|
1027
|
-
* sample_id: (str | int | None) Unique id of a sample
|
1028
|
-
*/
|
1029
|
-
export interface SampleScore {
|
1037
|
+
export interface EvalSampleScore {
|
1030
1038
|
value: Value2;
|
1031
1039
|
answer: Answer1;
|
1032
1040
|
explanation: Explanation2;
|
1033
1041
|
metadata: Metadata8;
|
1034
|
-
sample_id:
|
1042
|
+
sample_id: SampleId1;
|
1035
1043
|
}
|
@@ -150,7 +150,7 @@ export const WorkSpace = ({
|
|
150
150
|
|
151
151
|
// The samples tab
|
152
152
|
// Currently only appears when the result is successful
|
153
|
-
if (
|
153
|
+
if (sampleMode !== "none") {
|
154
154
|
resolvedTabs.samples = {
|
155
155
|
id: kEvalWorkspaceTabId,
|
156
156
|
scrollable: samples.length === 1,
|
inspect_ai/_view/www/yarn.lock
CHANGED
@@ -131,6 +131,13 @@
|
|
131
131
|
dependencies:
|
132
132
|
"@babel/types" "^7.25.2"
|
133
133
|
|
134
|
+
"@babel/runtime@^7.21.0":
|
135
|
+
version "7.26.0"
|
136
|
+
resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.0.tgz#8600c2f595f277c60815256418b85356a65173c1"
|
137
|
+
integrity sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==
|
138
|
+
dependencies:
|
139
|
+
regenerator-runtime "^0.14.0"
|
140
|
+
|
134
141
|
"@babel/template@^7.25.0":
|
135
142
|
version "7.25.0"
|
136
143
|
resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.25.0.tgz#e733dc3134b4fede528c15bc95e89cb98c52592a"
|
@@ -525,6 +532,14 @@ argparse@^2.0.1:
|
|
525
532
|
resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
|
526
533
|
integrity sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==
|
527
534
|
|
535
|
+
asciinema-player@^3.8.1:
|
536
|
+
version "3.8.1"
|
537
|
+
resolved "https://registry.yarnpkg.com/asciinema-player/-/asciinema-player-3.8.1.tgz#d56ccc04a85570559900b2297cf44c2a7453d118"
|
538
|
+
integrity sha512-NkpbFg81Y6iJFpDRndakLCQ0G26XSpvuT3vJTFjMRgHb26lqHgRNY9gun54e5MehZ4fEDNYkMZv+z6MfZ8c2aA==
|
539
|
+
dependencies:
|
540
|
+
"@babel/runtime" "^7.21.0"
|
541
|
+
solid-js "^1.3.0"
|
542
|
+
|
528
543
|
babel-plugin-prismjs@^2.1.0:
|
529
544
|
version "2.1.0"
|
530
545
|
resolved "https://registry.yarnpkg.com/babel-plugin-prismjs/-/babel-plugin-prismjs-2.1.0.tgz#ade627896106326ad04d6d77fba92877618de571"
|
@@ -647,6 +662,11 @@ cross-spawn@^7.0.2:
|
|
647
662
|
shebang-command "^2.0.0"
|
648
663
|
which "^2.0.1"
|
649
664
|
|
665
|
+
csstype@^3.1.0:
|
666
|
+
version "3.1.3"
|
667
|
+
resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81"
|
668
|
+
integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==
|
669
|
+
|
650
670
|
cuint@^0.2.2:
|
651
671
|
version "0.2.2"
|
652
672
|
resolved "https://registry.yarnpkg.com/cuint/-/cuint-0.2.2.tgz#408086d409550c2631155619e9fa7bcadc3b991b"
|
@@ -1242,6 +1262,11 @@ queue-microtask@^1.2.2:
|
|
1242
1262
|
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
|
1243
1263
|
integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==
|
1244
1264
|
|
1265
|
+
regenerator-runtime@^0.14.0:
|
1266
|
+
version "0.14.1"
|
1267
|
+
resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
|
1268
|
+
integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==
|
1269
|
+
|
1245
1270
|
resolve-from@^4.0.0:
|
1246
1271
|
version "4.0.0"
|
1247
1272
|
resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-4.0.0.tgz#4abcd852ad32dd7baabfe9b40e00a36db5f392e6"
|
@@ -1294,6 +1319,16 @@ semver@^6.0.0, semver@^6.3.1:
|
|
1294
1319
|
resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4"
|
1295
1320
|
integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==
|
1296
1321
|
|
1322
|
+
seroval-plugins@^1.1.0:
|
1323
|
+
version "1.1.1"
|
1324
|
+
resolved "https://registry.yarnpkg.com/seroval-plugins/-/seroval-plugins-1.1.1.tgz#1e0c175e13bb4c620d4ce5916fbbb63de70c31f9"
|
1325
|
+
integrity sha512-qNSy1+nUj7hsCOon7AO4wdAIo9P0jrzAMp18XhiOzA6/uO5TKtP7ScozVJ8T293oRIvi5wyCHSM4TrJo/c/GJA==
|
1326
|
+
|
1327
|
+
seroval@^1.1.0:
|
1328
|
+
version "1.1.1"
|
1329
|
+
resolved "https://registry.yarnpkg.com/seroval/-/seroval-1.1.1.tgz#7630e0c17a3efa6be43f17ad6bcf9f966a61b443"
|
1330
|
+
integrity sha512-rqEO6FZk8mv7Hyv4UCj3FD3b6Waqft605TLfsCe/BiaylRpyyMC0b+uA5TJKawX3KzMrdi3wsLbCaLplrQmBvQ==
|
1331
|
+
|
1297
1332
|
shebang-command@^2.0.0:
|
1298
1333
|
version "2.0.0"
|
1299
1334
|
resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea"
|
@@ -1306,6 +1341,15 @@ shebang-regex@^3.0.0:
|
|
1306
1341
|
resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172"
|
1307
1342
|
integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==
|
1308
1343
|
|
1344
|
+
solid-js@^1.3.0:
|
1345
|
+
version "1.9.3"
|
1346
|
+
resolved "https://registry.yarnpkg.com/solid-js/-/solid-js-1.9.3.tgz#078f026fe32f6b9b48e8e0557be150f0c2d610a9"
|
1347
|
+
integrity sha512-5ba3taPoZGt9GY3YlsCB24kCg0Lv/rie/HTD4kG6h4daZZz7+yK02xn8Vx8dLYBc9i6Ps5JwAbEiqjmKaLB3Ag==
|
1348
|
+
dependencies:
|
1349
|
+
csstype "^3.1.0"
|
1350
|
+
seroval "^1.1.0"
|
1351
|
+
seroval-plugins "^1.1.0"
|
1352
|
+
|
1309
1353
|
source-map-js@^1.2.0:
|
1310
1354
|
version "1.2.0"
|
1311
1355
|
resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.2.0.tgz#16b809c162517b5b8c3e7dcd315a2a5c2612b2af"
|
inspect_ai/approval/_apply.py
CHANGED
@@ -75,4 +75,8 @@ def init_tool_approval(approval: list[ApprovalPolicy] | None) -> None:
|
|
75
75
|
_tool_approver.set(None)
|
76
76
|
|
77
77
|
|
78
|
+
def have_tool_approval() -> bool:
|
79
|
+
return _tool_approver.get(None) is not None
|
80
|
+
|
81
|
+
|
78
82
|
_tool_approver: ContextVar[Approver | None] = ContextVar("tool_approver", default=None)
|
@@ -24,8 +24,6 @@ from .util import (
|
|
24
24
|
render_tool_approval,
|
25
25
|
)
|
26
26
|
|
27
|
-
PANEL_TITLE = "Approvals"
|
28
|
-
|
29
27
|
|
30
28
|
async def panel_approval(
|
31
29
|
message: str,
|
@@ -35,7 +33,7 @@ async def panel_approval(
|
|
35
33
|
choices: list[ApprovalDecision],
|
36
34
|
) -> Approval:
|
37
35
|
# ensure the approvals panel is shown
|
38
|
-
await input_panel(
|
36
|
+
await input_panel(ApprovalInputPanel)
|
39
37
|
|
40
38
|
# submit to human approval manager (will be picked up by panel)
|
41
39
|
approvals = human_approval_manager()
|
@@ -52,11 +50,10 @@ async def panel_approval(
|
|
52
50
|
|
53
51
|
|
54
52
|
class ApprovalInputPanel(InputPanel):
|
53
|
+
DEFAULT_TITLE = "Approval"
|
54
|
+
|
55
55
|
DEFAULT_CSS = """
|
56
56
|
ApprovalInputPanel {
|
57
|
-
width: 1fr;
|
58
|
-
height: 1fr;
|
59
|
-
padding: 0 1 1 1;
|
60
57
|
layout: grid;
|
61
58
|
grid-size: 1 3;
|
62
59
|
grid-rows: auto 1fr auto;
|
@@ -88,7 +85,7 @@ class ApprovalInputPanel(InputPanel):
|
|
88
85
|
self._approvals = human_approval_manager().approval_requests()
|
89
86
|
if len(self._approvals) > 0:
|
90
87
|
approval_id, approval_request = self._approvals[0]
|
91
|
-
self.title = f"{
|
88
|
+
self.title = f"{self.DEFAULT_TITLE} ({len(self._approvals):,})"
|
92
89
|
heading.request = approval_request
|
93
90
|
content.approval = approval_request.request
|
94
91
|
actions.approval_request = approval_id, approval_request
|
@@ -97,7 +94,7 @@ class ApprovalInputPanel(InputPanel):
|
|
97
94
|
actions.activate()
|
98
95
|
self.visible = True
|
99
96
|
else:
|
100
|
-
self.title =
|
97
|
+
self.title = self.DEFAULT_TITLE
|
101
98
|
heading.request = None
|
102
99
|
content.approval = None
|
103
100
|
actions.approval_request = None
|
inspect_ai/dataset/_dataset.py
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
import abc
|
2
2
|
import random
|
3
|
+
from dataclasses import dataclass, field
|
3
4
|
from typing import (
|
4
5
|
TYPE_CHECKING,
|
5
6
|
Any,
|
6
7
|
Callable,
|
7
8
|
Iterator,
|
8
9
|
Sequence,
|
10
|
+
Type,
|
11
|
+
TypeVar,
|
9
12
|
Union,
|
10
13
|
overload,
|
11
14
|
)
|
12
15
|
|
13
|
-
from pydantic import BaseModel, Field
|
16
|
+
from pydantic import BaseModel, Field, ValidationError
|
14
17
|
from typing_extensions import override
|
15
18
|
|
16
19
|
from inspect_ai.model import ChatMessage
|
@@ -20,6 +23,8 @@ from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
|
|
20
23
|
if TYPE_CHECKING:
|
21
24
|
from _typeshed import SupportsRichComparison
|
22
25
|
|
26
|
+
MT = TypeVar("MT", bound=BaseModel)
|
27
|
+
|
23
28
|
|
24
29
|
class Sample(BaseModel):
|
25
30
|
def __init__(
|
@@ -76,6 +81,20 @@ class Sample(BaseModel):
|
|
76
81
|
metadata: dict[str, Any] | None = Field(default=None)
|
77
82
|
"""Arbitrary metadata associated with the sample."""
|
78
83
|
|
84
|
+
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
|
85
|
+
"""Metadata as a Pydantic model.
|
86
|
+
|
87
|
+
Args:
|
88
|
+
metadata_cls: BaseModel derived class.
|
89
|
+
|
90
|
+
Returns:
|
91
|
+
BaseModel: Instance of metadata_cls.
|
92
|
+
"""
|
93
|
+
if self.metadata is None:
|
94
|
+
raise ValueError("Sample does not have metadata")
|
95
|
+
|
96
|
+
return metadata_as(self.metadata, metadata_cls)
|
97
|
+
|
79
98
|
sandbox: SandboxEnvironmentSpec | None = Field(default=None)
|
80
99
|
"""Sandbox environment type and optional config file."""
|
81
100
|
|
@@ -177,7 +196,8 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
177
196
|
"""
|
178
197
|
|
179
198
|
|
180
|
-
|
199
|
+
@dataclass
|
200
|
+
class FieldSpec:
|
181
201
|
r"""Specification for mapping data source fields to sample fields.
|
182
202
|
|
183
203
|
Args:
|
@@ -191,28 +211,28 @@ class FieldSpec(BaseModel):
|
|
191
211
|
setup (str): Optional. Setup script to run for sample.
|
192
212
|
"""
|
193
213
|
|
194
|
-
input: str =
|
214
|
+
input: str = field(default="input")
|
195
215
|
"""Name of the field containing the sample input."""
|
196
216
|
|
197
|
-
target: str =
|
217
|
+
target: str = field(default="target")
|
198
218
|
"""Name of the field containing the sample target."""
|
199
219
|
|
200
|
-
choices: str =
|
220
|
+
choices: str = field(default="choices")
|
201
221
|
"""Name of field containing the list of answer choices."""
|
202
222
|
|
203
|
-
id: str =
|
223
|
+
id: str = field(default="id")
|
204
224
|
""" Unique identifier for the sample."""
|
205
225
|
|
206
|
-
metadata: list[str] | None =
|
226
|
+
metadata: list[str] | Type[BaseModel] | None = field(default=None)
|
207
227
|
"""List of additional field names that should be read as metadata."""
|
208
228
|
|
209
|
-
sandbox: str =
|
229
|
+
sandbox: str = field(default="sandbox")
|
210
230
|
"""Sandbox type along with optional config file."""
|
211
231
|
|
212
|
-
files: str =
|
232
|
+
files: str = field(default="files")
|
213
233
|
"""Files that go along with the sample."""
|
214
234
|
|
215
|
-
setup: str =
|
235
|
+
setup: str = field(default="setup")
|
216
236
|
"""Setup script to run for sample (run within default SandboxEnvironment)."""
|
217
237
|
|
218
238
|
|
@@ -313,3 +333,24 @@ class MemoryDataset(Dataset):
|
|
313
333
|
samples=[sample for sample in self if predicate(sample)],
|
314
334
|
shuffled=self.shuffled,
|
315
335
|
)
|
336
|
+
|
337
|
+
|
338
|
+
def metadata_as(metadata: dict[str, Any], metadata_cls: Type[MT]) -> MT:
|
339
|
+
# validate that metadata_cls is frozen
|
340
|
+
if not metadata_cls.model_config.get("frozen", False):
|
341
|
+
raise ValueError(
|
342
|
+
f"Metadata model {metadata_cls.__name__} must have frozen=True"
|
343
|
+
)
|
344
|
+
|
345
|
+
# filter to only fields in the model
|
346
|
+
model_fields = {
|
347
|
+
k: v
|
348
|
+
for k, v in metadata.items()
|
349
|
+
if k in metadata_cls.__pydantic_fields__.keys()
|
350
|
+
}
|
351
|
+
|
352
|
+
# parse and return model instance
|
353
|
+
try:
|
354
|
+
return metadata_cls(**model_fields)
|
355
|
+
except ValidationError as ex:
|
356
|
+
raise ValueError(f"Could not parse metadata into {metadata_cls.__name__}: {ex}")
|
inspect_ai/dataset/_util.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
import json
|
2
2
|
from typing import Any, Iterable, cast
|
3
3
|
|
4
|
+
from pydantic import ValidationError
|
5
|
+
|
4
6
|
from inspect_ai.model import (
|
5
7
|
ChatMessage,
|
6
8
|
ChatMessageAssistant,
|
@@ -33,9 +35,35 @@ def record_to_sample_fn(
|
|
33
35
|
# collect metadata if specified
|
34
36
|
metadata: dict[str, Any] | None = None
|
35
37
|
if sample_fields.metadata:
|
36
|
-
metadata
|
37
|
-
|
38
|
-
|
38
|
+
if isinstance(sample_fields.metadata, list):
|
39
|
+
metadata = {}
|
40
|
+
for name in sample_fields.metadata:
|
41
|
+
metadata[name] = record.get(name)
|
42
|
+
else:
|
43
|
+
# must be frozen
|
44
|
+
if not sample_fields.metadata.model_config.get("frozen", False):
|
45
|
+
raise ValueError(
|
46
|
+
f"Metadata model {sample_fields.metadata.__name__} must have frozen=True"
|
47
|
+
)
|
48
|
+
|
49
|
+
# filter to only fields in the model
|
50
|
+
model_fields = record.get("metadata", None)
|
51
|
+
if isinstance(model_fields, str):
|
52
|
+
model_fields = json.loads(model_fields)
|
53
|
+
elif model_fields is None:
|
54
|
+
model_fields = {
|
55
|
+
k: v
|
56
|
+
for k, v in record.items()
|
57
|
+
if k in sample_fields.metadata.__pydantic_fields__.keys()
|
58
|
+
}
|
59
|
+
|
60
|
+
# parse and return metadata
|
61
|
+
try:
|
62
|
+
metadata = sample_fields.metadata(**model_fields).model_dump()
|
63
|
+
except ValidationError as ex:
|
64
|
+
raise ValueError(
|
65
|
+
f"Could not parse metadata into {sample_fields.metadata.__name__}: {ex}"
|
66
|
+
)
|
39
67
|
elif "metadata" in record:
|
40
68
|
metadata_field = record.get("metadata")
|
41
69
|
if isinstance(metadata_field, str):
|