inspect-ai 0.3.59__py3-none-any.whl → 0.3.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +0 -7
- inspect_ai/_display/textual/widgets/samples.py +1 -1
- inspect_ai/_eval/eval.py +10 -1
- inspect_ai/_eval/loader.py +79 -19
- inspect_ai/_eval/registry.py +6 -0
- inspect_ai/_eval/score.py +2 -1
- inspect_ai/_eval/task/results.py +6 -5
- inspect_ai/_eval/task/run.py +11 -11
- inspect_ai/_view/www/dist/assets/index.js +262 -303
- inspect_ai/_view/www/src/App.mjs +6 -6
- inspect_ai/_view/www/src/Types.mjs +1 -1
- inspect_ai/_view/www/src/api/Types.ts +133 -0
- inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
- inspect_ai/_view/www/src/api/api-http.ts +219 -0
- inspect_ai/_view/www/src/api/api-shared.ts +47 -0
- inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
- inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
- inspect_ai/_view/www/src/api/index.ts +51 -0
- inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
- inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
- inspect_ai/_view/www/src/index.js +2 -2
- inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
- inspect_ai/_view/www/src/navbar/Navbar.mjs +1 -1
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleList.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +14 -14
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +10 -10
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
- inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +1 -3
- inspect_ai/_view/www/src/utils/vscode.ts +36 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/manager.py +1 -1
- inspect_ai/model/_call_tools.py +55 -0
- inspect_ai/model/_conversation.py +1 -4
- inspect_ai/model/_generate_config.py +2 -8
- inspect_ai/model/_model_output.py +15 -0
- inspect_ai/model/_openai.py +383 -0
- inspect_ai/model/_providers/anthropic.py +52 -11
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/goodfire.py +248 -0
- inspect_ai/model/_providers/groq.py +7 -3
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +2 -1
- inspect_ai/model/_providers/openai.py +36 -202
- inspect_ai/model/_providers/openai_o1.py +2 -4
- inspect_ai/model/_providers/providers.py +22 -0
- inspect_ai/model/_providers/together.py +4 -4
- inspect_ai/model/_providers/util/__init__.py +2 -3
- inspect_ai/model/_providers/util/hf_handler.py +1 -1
- inspect_ai/model/_providers/util/llama31.py +1 -1
- inspect_ai/model/_providers/util/util.py +0 -76
- inspect_ai/scorer/_metric.py +3 -0
- inspect_ai/scorer/_scorer.py +2 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +1 -1
- inspect_ai/solver/_bridge/__init__.py +3 -0
- inspect_ai/solver/_bridge/bridge.py +100 -0
- inspect_ai/solver/_bridge/patch.py +170 -0
- inspect_ai/solver/_solver.py +6 -0
- inspect_ai/util/_display.py +5 -0
- inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +68 -63
- inspect_ai/_view/www/src/api/Types.mjs +0 -117
- inspect_ai/_view/www/src/api/api-http.mjs +0 -300
- inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
- inspect_ai/_view/www/src/api/index.mjs +0 -49
- inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
- inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,12 @@
|
|
1
1
|
//@ts-check
|
2
|
-
import {
|
2
|
+
import {
|
3
|
+
EvalHeader,
|
4
|
+
EvalSummary,
|
5
|
+
LogViewAPI,
|
6
|
+
SampleSummary,
|
7
|
+
} from "../api/Types";
|
8
|
+
import { EvalLog, EvalPlan, EvalSample, EvalSpec } from "../types/log";
|
9
|
+
import { asyncJsonParse } from "../utils/json-worker";
|
3
10
|
import { AsyncQueue } from "../utils/queue.mjs";
|
4
11
|
import {
|
5
12
|
FileSizeLimitError,
|
@@ -9,42 +16,46 @@ import {
|
|
9
16
|
// don't try to load samples greater than 50mb
|
10
17
|
const MAX_BYTES = 50 * 1024 * 1024;
|
11
18
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
*/
|
19
|
+
interface SampleEntry {
|
20
|
+
sampleId: string;
|
21
|
+
epoch: number;
|
22
|
+
}
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
24
|
+
export interface RemoteLogFile {
|
25
|
+
readHeader: () => Promise<EvalHeader>;
|
26
|
+
readLogSummary: () => Promise<EvalSummary>;
|
27
|
+
readSample: (sampleId: string, epoch: number) => Promise<EvalSample>;
|
28
|
+
readCompleteLog: () => Promise<EvalLog>;
|
29
|
+
}
|
30
|
+
|
31
|
+
interface LogStart {
|
32
|
+
version: number;
|
33
|
+
eval: EvalSpec;
|
34
|
+
plan: EvalPlan;
|
35
|
+
}
|
25
36
|
|
26
37
|
/**
|
27
38
|
* Opens a remote log file and provides methods to read its contents.
|
28
|
-
* @param {import("../api/Types.mjs").LogViewAPI} api - The api
|
29
|
-
* @param {string} url - The URL of the remote zip file.
|
30
|
-
* @param {number} concurrency - The number of concurrent operations allowed.
|
31
|
-
* @returns {Promise<RemoteLogFile>} An object with methods to read the log file.
|
32
39
|
*/
|
33
|
-
export const openRemoteLogFile = async (api, url, concurrency) => {
|
40
|
+
export const openRemoteLogFile = async (
|
41
|
+
api: LogViewAPI,
|
42
|
+
url: string,
|
43
|
+
concurrency: number,
|
44
|
+
): Promise<RemoteLogFile> => {
|
34
45
|
const queue = new AsyncQueue(concurrency);
|
35
46
|
const remoteZipFile = await openRemoteZipFile(
|
36
|
-
|
47
|
+
url,
|
37
48
|
api.eval_log_size,
|
38
49
|
api.eval_log_bytes,
|
39
50
|
);
|
40
51
|
|
41
52
|
/**
|
42
53
|
* Reads and parses a JSON file from the zip.
|
43
|
-
* @param {string} file - The name of the file to read.
|
44
|
-
* @param {number} [maxBytes] - the max bytes
|
45
|
-
* @returns {Promise<Object>} The parsed JSON content.
|
46
54
|
*/
|
47
|
-
const readJSONFile = async (
|
55
|
+
const readJSONFile = async (
|
56
|
+
file: string,
|
57
|
+
maxBytes?: number,
|
58
|
+
): Promise<Object> => {
|
48
59
|
try {
|
49
60
|
const data = await remoteZipFile.readFile(file, maxBytes);
|
50
61
|
const textDecoder = new TextDecoder("utf-8");
|
@@ -53,19 +64,22 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
53
64
|
} catch (error) {
|
54
65
|
if (error instanceof FileSizeLimitError) {
|
55
66
|
throw error;
|
56
|
-
} else {
|
67
|
+
} else if (error instanceof Error) {
|
57
68
|
throw new Error(
|
58
69
|
`Failed to read or parse file ${file}: ${error.message}`,
|
59
70
|
);
|
71
|
+
} else {
|
72
|
+
throw new Error(
|
73
|
+
`Failed to read or parse file ${file} - an unknown error occurred`,
|
74
|
+
);
|
60
75
|
}
|
61
76
|
}
|
62
77
|
};
|
63
78
|
|
64
79
|
/**
|
65
80
|
* Lists all samples in the zip file.
|
66
|
-
* @returns {Promise<SampleEntry[]>} An array of sample objects.
|
67
81
|
*/
|
68
|
-
const listSamples = async () => {
|
82
|
+
const listSamples = async (): Promise<SampleEntry[]> => {
|
69
83
|
return Array.from(remoteZipFile.centralDirectory.keys())
|
70
84
|
.filter(
|
71
85
|
(filename) =>
|
@@ -82,14 +96,14 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
82
96
|
|
83
97
|
/**
|
84
98
|
* Reads a specific sample file.
|
85
|
-
* @param {string} sampleId - The ID of the sample.
|
86
|
-
* @param {number} epoch - The epoch of the sample.
|
87
|
-
* @returns {Promise<Object>} The content of the sample file.
|
88
99
|
*/
|
89
|
-
const readSample = async (
|
100
|
+
const readSample = async (
|
101
|
+
sampleId: string,
|
102
|
+
epoch: number,
|
103
|
+
): Promise<EvalSample> => {
|
90
104
|
const sampleFile = `samples/${sampleId}_epoch_${epoch}.json`;
|
91
105
|
if (remoteZipFile.centralDirectory.has(sampleFile)) {
|
92
|
-
return readJSONFile(sampleFile, MAX_BYTES);
|
106
|
+
return (await readJSONFile(sampleFile, MAX_BYTES)) as EvalSample;
|
93
107
|
} else {
|
94
108
|
console.log({ dir: remoteZipFile.centralDirectory });
|
95
109
|
throw new Error(
|
@@ -100,13 +114,12 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
100
114
|
|
101
115
|
/**
|
102
116
|
* Reads the results.json file.
|
103
|
-
* @returns {Promise<Object>} The content of results.json.
|
104
117
|
*/
|
105
|
-
const readHeader = async () => {
|
118
|
+
const readHeader = async (): Promise<EvalHeader> => {
|
106
119
|
if (remoteZipFile.centralDirectory.has("header.json")) {
|
107
|
-
return readJSONFile("header.json");
|
120
|
+
return (await readJSONFile("header.json")) as EvalHeader;
|
108
121
|
} else {
|
109
|
-
const evalSpec = await readJSONFile("_journal/start.json");
|
122
|
+
const evalSpec = (await readJSONFile("_journal/start.json")) as LogStart;
|
110
123
|
return {
|
111
124
|
status: "started",
|
112
125
|
eval: evalSpec.eval,
|
@@ -117,9 +130,8 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
117
130
|
|
118
131
|
/**
|
119
132
|
* Reads individual summary files when summaries.json is not available.
|
120
|
-
* @returns {Promise<Object>} Combined summaries from individual files.
|
121
133
|
*/
|
122
|
-
const readFallbackSummaries = async () => {
|
134
|
+
const readFallbackSummaries = async (): Promise<SampleSummary[]> => {
|
123
135
|
const summaryFiles = Array.from(
|
124
136
|
remoteZipFile.centralDirectory.keys(),
|
125
137
|
).filter(
|
@@ -128,14 +140,16 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
128
140
|
filename.endsWith(".json"),
|
129
141
|
);
|
130
142
|
|
131
|
-
const summaries = [];
|
132
|
-
const errors = [];
|
143
|
+
const summaries: SampleSummary[] = [];
|
144
|
+
const errors: unknown[] = [];
|
133
145
|
|
134
146
|
await Promise.all(
|
135
147
|
summaryFiles.map((filename) =>
|
136
148
|
queue.enqueue(async () => {
|
137
149
|
try {
|
138
|
-
const partialSummary = await readJSONFile(
|
150
|
+
const partialSummary = (await readJSONFile(
|
151
|
+
filename,
|
152
|
+
)) as SampleSummary[];
|
139
153
|
summaries.push(...partialSummary);
|
140
154
|
} catch (error) {
|
141
155
|
errors.push(error);
|
@@ -156,11 +170,10 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
156
170
|
|
157
171
|
/**
|
158
172
|
* Reads all summaries, falling back to individual files if necessary.
|
159
|
-
* @returns {Promise<Object>} All summaries.
|
160
173
|
*/
|
161
|
-
const readSampleSummaries = async () => {
|
174
|
+
const readSampleSummaries = async (): Promise<SampleSummary[]> => {
|
162
175
|
if (remoteZipFile.centralDirectory.has("summaries.json")) {
|
163
|
-
return await readJSONFile("summaries.json");
|
176
|
+
return (await readJSONFile("summaries.json")) as SampleSummary[];
|
164
177
|
} else {
|
165
178
|
return readFallbackSummaries();
|
166
179
|
}
|
@@ -187,14 +200,17 @@ export const openRemoteLogFile = async (api, url, concurrency) => {
|
|
187
200
|
readSample,
|
188
201
|
/**
|
189
202
|
* Reads the complete log file.
|
190
|
-
* @returns {Promise<import("../types/log").EvalLog>} The complete log data.
|
191
203
|
*/
|
192
|
-
readCompleteLog: async () => {
|
204
|
+
readCompleteLog: async (): Promise<EvalLog> => {
|
193
205
|
const [evalLog, samples] = await Promise.all([
|
194
206
|
readHeader(),
|
195
207
|
listSamples().then((sampleIds) =>
|
196
208
|
Promise.all(
|
197
|
-
sampleIds.map(({ sampleId, epoch }) =>
|
209
|
+
sampleIds.map(({ sampleId, epoch }) =>
|
210
|
+
readSample(sampleId, epoch).then(
|
211
|
+
(sample) => sample as EvalSample,
|
212
|
+
),
|
213
|
+
),
|
198
214
|
),
|
199
215
|
),
|
200
216
|
]);
|
@@ -18,8 +18,8 @@ import { SecondaryBar } from "./SecondaryBar.mjs";
|
|
18
18
|
* @param {import("../types/log").EvalResults} [props.evalResults] - The EvalResults
|
19
19
|
* @param {import("../types/log").EvalPlan} [props.evalPlan] - The EvalSpec
|
20
20
|
* @param {import("../types/log").EvalStats} [props.evalStats] - The EvalStats
|
21
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} [props.samples] - the samples
|
22
21
|
* @param {import("../samples/SamplesDescriptor.mjs").EvalDescriptor} [props.evalDescriptor] - The EvalDescriptor
|
22
|
+
* @param {import("../api/Types.ts").SampleSummary[]} [props.samples] - the samples
|
23
23
|
* @param {string} [props.status] - the status
|
24
24
|
* @param {boolean} props.offcanvas - Are we in offcanvas mode?
|
25
25
|
* @param {boolean} props.showToggle - Should we show the toggle?
|
@@ -13,8 +13,8 @@ import { scoreFilterItems } from "../samples/tools/filters.mjs";
|
|
13
13
|
* @param {import("../types/log").EvalPlan} [props.evalPlan] - The EvalSpec
|
14
14
|
* @param {import("../types/log").EvalResults} [props.evalResults] - The EvalResults
|
15
15
|
* @param {import("../types/log").EvalStats} [props.evalStats] - The EvalStats
|
16
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} [props.samples] - the samples
|
17
16
|
* @param {import("../samples/SamplesDescriptor.mjs").EvalDescriptor} [props.evalDescriptor] - The EvalDescriptor
|
17
|
+
* @param {import("../api/Types.ts").SampleSummary[]} [props.samples] - the samples
|
18
18
|
* @param {string} [props.status] - the status
|
19
19
|
* @param {Map<string, string>} [props.style] - is this off canvas
|
20
20
|
*
|
@@ -252,7 +252,7 @@ const SeparatorRow = ({ id, title, height }) => {
|
|
252
252
|
* @param {Object} props - The parameters for the component.
|
253
253
|
* @param {string} props.id - The unique identifier for the sample.
|
254
254
|
* @param {number} props.index - The index of the sample.
|
255
|
-
* @param {import("../api/Types.mjs").SampleSummary} props.sample - The sample.
|
255
|
+
* @param {import("../api/Types.ts").SampleSummary} props.sample - The sample.
|
256
256
|
* @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} props.sampleDescriptor - The sample descriptor.
|
257
257
|
* @param {number} props.height - The height of the sample row.
|
258
258
|
* @param {boolean} props.selected - Whether the sample is selected.
|
@@ -2,7 +2,7 @@ import { html } from "htm/preact";
|
|
2
2
|
|
3
3
|
/**
|
4
4
|
* @param {Object} props
|
5
|
-
* @param {import("../api/Types.mjs").SampleSummary} props.sample
|
5
|
+
* @param {import("../api/Types.ts").SampleSummary} props.sample
|
6
6
|
* @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} props.sampleDescriptor
|
7
7
|
* @param {string} props.scorer
|
8
8
|
* @returns {import("preact").JSX.Element}
|
@@ -21,12 +21,12 @@ import {
|
|
21
21
|
* Represents a utility summary of the samples that doesn't change with the selected score.
|
22
22
|
* @typedef {Object} EvalDescriptor
|
23
23
|
* @property {number} epochs - The number of epochs.
|
24
|
-
* @property {import("../api/Types.mjs").SampleSummary[]} samples - The list of sample summaries.
|
24
|
+
* @property {import("../api/Types.ts").SampleSummary[]} samples - The list of sample summaries.
|
25
25
|
* @property {import("../Types.mjs").ScoreLabel[]} scores - the list of available scores
|
26
|
-
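* @property {(sample: import("../api/Types.mjs").BasicSampleData, scoreLabel: import("../Types.mjs").ScoreLabel) => ScorerDescriptor} scorerDescriptor - Returns the scorer descriptor for a sample and a specified scorer.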
|
26
|
+
* @property {(sample: import("../api/Types.ts").BasicSampleData, scoreLabel: import("../Types.mjs").ScoreLabel) => ScorerDescriptor} scorerDescriptor - Returns the scorer descriptor for a sample and a specified scorer.
|
27
27
|
* @property {(scoreLabel: import("../Types.mjs").ScoreLabel) => ScoreDescriptor} scoreDescriptor - Provides information about the score types and how to render them.
|
28
|
-
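* @property {(sample: import("../api/Types.mjs").BasicSampleData, scoreLabel: import("../Types.mjs").ScoreLabel) => SelectedScore} score - Returns information about a score for a sample.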
|
29
|
-
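* @property {(sample: import("../api/Types.mjs").BasicSampleData, scorer: string) => string} scoreAnswer - Returns the answer for a sample and a specified scorer.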
|
28
|
+
* @property {(sample: import("../api/Types.ts").BasicSampleData, scoreLabel: import("../Types.mjs").ScoreLabel) => SelectedScore} score - Returns information about a score for a sample.
|
29
|
+
* @property {(sample: import("../api/Types.ts").BasicSampleData, scorer: string) => string} scoreAnswer - Returns the answer for a sample and a specified scorer.
|
30
30
|
*/
|
31
31
|
|
32
32
|
/**
|
@@ -35,8 +35,8 @@ import {
|
|
35
35
|
* @property {EvalDescriptor} evalDescriptor - The EvalDescriptor.
|
36
36
|
* @property {MessageShape} messageShape - The normalized sizes of input, target, and answer messages.
|
37
37
|
* @property {ScoreDescriptor} selectedScoreDescriptor - Provides information about the score types and how to render them.
|
38
|
-
* @property {(sample: import("../api/Types.mjs").BasicSampleData) => SelectedScore} selectedScore - Returns the selected score for a sample.
|
39
|
-
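* @property {(sample: import("../api/Types.mjs").BasicSampleData) => ScorerDescriptor} selectedScorerDescriptor - Returns the scorer descriptor for a sample using the selected scorer.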
|
38
|
+
* @property {(sample: import("../api/Types.ts").BasicSampleData) => SelectedScore} selectedScore - Returns the selected score for a sample.
|
39
|
+
* @property {(sample: import("../api/Types.ts").BasicSampleData) => ScorerDescriptor} selectedScorerDescriptor - Returns the scorer descriptor for a sample using the selected scorer.
|
40
40
|
*/
|
41
41
|
|
42
42
|
/**
|
@@ -108,7 +108,7 @@ export const parseScoreLabelKey = (key) => {
|
|
108
108
|
|
109
109
|
/**
|
110
110
|
* @param {import("../Types.mjs").ScoreLabel[]} scores - the list of available scores
|
111
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} samples - the list of sample summaries
|
111
|
+
* @param {import("../api/Types.ts").SampleSummary[]} samples - the list of sample summaries
|
112
112
|
* @param {number} epochs - The number of epochs
|
113
113
|
* @returns {EvalDescriptor} The EvalDescriptor
|
114
114
|
*/
|
@@ -118,7 +118,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
118
118
|
}
|
119
119
|
|
120
120
|
/**
|
121
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample - the currently selected score
|
121
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample - the currently selected score
|
122
122
|
* @param {import("../Types.mjs").ScoreLabel} scoreLabel - the score label
|
123
123
|
* @returns {import("../types/log").Value2} The Score
|
124
124
|
*/
|
@@ -142,7 +142,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
142
142
|
};
|
143
143
|
|
144
144
|
/**
|
145
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample - the currently selected score
|
145
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample - the currently selected score
|
146
146
|
* @param {string} scorer - the scorer name
|
147
147
|
* @returns {string} The answer
|
148
148
|
*/
|
@@ -158,7 +158,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
158
158
|
};
|
159
159
|
|
160
160
|
/**
|
161
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample - the currently selected score
|
161
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample - the currently selected score
|
162
162
|
* @param {string} scorer - the scorer name
|
163
163
|
* @returns {string} The explanation
|
164
164
|
*/
|
@@ -174,7 +174,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
174
174
|
|
175
175
|
// Retrieve the metadata for a sample
|
176
176
|
/**
|
177
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample - the currently selected score
|
177
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample - the currently selected score
|
178
178
|
* @param {string} scorer - the scorer name
|
179
179
|
* @returns {Object} The explanation
|
180
180
|
*/
|
@@ -248,7 +248,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
248
248
|
};
|
249
249
|
|
250
250
|
/**
|
251
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample
|
251
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample
|
252
252
|
* @param {import("../Types.mjs").ScoreLabel} scoreLabel
|
253
253
|
* @returns {any}
|
254
254
|
*/
|
@@ -265,7 +265,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
265
265
|
};
|
266
266
|
|
267
267
|
/**
|
268
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample
|
268
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample
|
269
269
|
* @param {import("../Types.mjs").ScoreLabel} scoreLabel
|
270
270
|
* @returns {ScorerDescriptor}
|
271
271
|
*/
|
@@ -348,7 +348,7 @@ export const createEvalDescriptor = (scores, samples, epochs) => {
|
|
348
348
|
};
|
349
349
|
|
350
350
|
/**
|
351
|
-
* @param {import("../api/Types.mjs").BasicSampleData} sample
|
351
|
+
* @param {import("../api/Types.ts").BasicSampleData} sample
|
352
352
|
* @param {import("../Types.mjs").ScoreLabel} scoreLabel
|
353
353
|
* @returns {SelectedScore}
|
354
354
|
*/
|
@@ -13,7 +13,7 @@ import { EmptyPanel } from "../components/EmptyPanel.mjs";
|
|
13
13
|
* @param {Object} props - The parameters for the component.
|
14
14
|
* @param {import("../types/log").Sample} [props.sample] - The sample
|
15
15
|
* @param {string} [props.task_id] - The task id
|
16
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} [props.samples] - the samples
|
16
|
+
* @param {import("../api/Types.ts").SampleSummary[]} [props.samples] - the samples
|
17
17
|
* @param {import("../Types.mjs").SampleMode} props.sampleMode - the mode for displaying samples
|
18
18
|
* @param {"epoch" | "sample" | "none" } props.groupBy - how to group items
|
19
19
|
* @param {"asc" | "desc" } props.groupByOrder - whether grouping is ascending or descending
|
@@ -213,19 +213,19 @@ export const SamplesTab = ({
|
|
213
213
|
* @property {string} label - The label for the sample, formatted as "Sample {group} (Epoch {item})".
|
214
214
|
* @property {number} number - The current counter item value.
|
215
215
|
* @property {number} index - The index of the sample.
|
216
|
-
* @property {import("../api/Types.mjs").SampleSummary | string} data - The items data payload.
|
216
|
+
* @property {import("../api/Types.ts").SampleSummary | string} data - The items data payload.
|
217
217
|
* @property {string} type - The type of the result, in this case, "sample". (or "separator")
|
218
218
|
*/
|
219
219
|
|
220
220
|
/**
|
221
221
|
* Perform any grouping of the samples
|
222
222
|
*
|
223
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} samples - the list of sample summaries
|
223
|
+
* @param {import("../api/Types.ts").SampleSummary[]} samples - the list of sample summaries
|
224
224
|
* @param {"sample" | "epoch" | "none"} groupBy - how to group samples
|
225
225
|
* @param {"asc" | "desc"} groupByOrder - how to order grouped samples
|
226
226
|
* @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} sampleDescriptor - the sample descriptor
|
227
227
|
|
228
|
-
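* @returns {(sample: import("../api/Types.mjs").SampleSummary, index: number, previousSample: import("../api/Types.mjs").SampleSummary) => ListItem[]} The list items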
|
228
|
+
* @returns {(sample: import("../api/Types.ts").SampleSummary, index: number, previousSample: import("../api/Types.ts").SampleSummary) => ListItem[]} The list items
|
229
229
|
*/
|
230
230
|
const getSampleProcessor = (
|
231
231
|
samples,
|
@@ -246,9 +246,9 @@ const getSampleProcessor = (
|
|
246
246
|
/**
|
247
247
|
* Performs no grouping
|
248
248
|
*
|
249
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} samples - the list of sample summaries
|
249
|
+
* @param {import("../api/Types.ts").SampleSummary[]} samples - the list of sample summaries
|
250
250
|
* @param {string} order - the selected order
|
251
|
-
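* @returns {(sample: import("../api/Types.mjs").SampleSummary, index: number, previousSample: import("../api/Types.mjs").SampleSummary) => ListItem[]} The list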
|
251
|
+
* @returns {(sample: import("../api/Types.ts").SampleSummary, index: number, previousSample: import("../api/Types.ts").SampleSummary) => ListItem[]} The list
|
252
252
|
*/
|
253
253
|
const noGrouping = (samples, order) => {
|
254
254
|
const counter = getCounter(samples.length, 1, order);
|
@@ -270,10 +270,10 @@ const noGrouping = (samples, order) => {
|
|
270
270
|
/**
|
271
271
|
* Groups by sample (showing separators for Epochs)
|
272
272
|
*
|
273
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} samples - the list of sample summaries
|
273
|
+
* @param {import("../api/Types.ts").SampleSummary[]} samples - the list of sample summaries
|
274
274
|
* @param {string} order - the selected order
|
275
275
|
* @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} sampleDescriptor - the sample descriptor
|
276
|
-
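* @returns {(sample: import("../api/Types.mjs").SampleSummary, index: number, previousSample: import("../api/Types.mjs").SampleSummary) => ListItem[]} The list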
|
276
|
+
* @returns {(sample: import("../api/Types.ts").SampleSummary, index: number, previousSample: import("../api/Types.ts").SampleSummary) => ListItem[]} The list
|
277
277
|
*/
|
278
278
|
const groupBySample = (samples, sampleDescriptor, order) => {
|
279
279
|
// ensure that we are sorted by id
|
@@ -327,10 +327,10 @@ const groupBySample = (samples, sampleDescriptor, order) => {
|
|
327
327
|
/**
|
328
328
|
* Groups by epoch (showing a separator for each sample)
|
329
329
|
*
|
330
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} samples - the list of sample summaries
|
330
|
+
* @param {import("../api/Types.ts").SampleSummary[]} samples - the list of sample summaries
|
331
331
|
* @param {string} order - the selected order
|
332
332
|
* @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} sampleDescriptor - the sample descriptor
|
333
|
-
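* @returns {(sample: import("../api/Types.mjs").SampleSummary, index: number, previousSample: import("../api/Types.mjs").SampleSummary) => ListItem[]} The list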
|
333
|
+
* @returns {(sample: import("../api/Types.ts").SampleSummary, index: number, previousSample: import("../api/Types.ts").SampleSummary) => ListItem[]} The list
|
334
334
|
*/
|
335
335
|
const groupByEpoch = (samples, sampleDescriptor, order) => {
|
336
336
|
const groupCount = sampleDescriptor.evalDescriptor.epochs;
|
@@ -89,9 +89,9 @@ const sortId = (a, b) => {
|
|
89
89
|
* Sorts a list of samples
|
90
90
|
*
|
91
91
|
* @param {string} sort - The sort direction
|
92
|
-
* @param {import("../../api/Types.mjs").SampleSummary[]} samples - The samples
|
92
|
+
* @param {import("../../api/Types.ts").SampleSummary[]} samples - The samples
|
93
93
|
* @param {import("../SamplesDescriptor.mjs").SamplesDescriptor} samplesDescriptor - The samples descriptor
|
94
|
-
* @returns {{ sorted: import("../../api/Types.mjs").SampleSummary[], order: 'asc' | 'desc' }} An object with sorted samples and the sort order.
|
94
|
+
* @returns {{ sorted: import("../../api/Types.ts").SampleSummary[], order: 'asc' | 'desc' }} An object with sorted samples and the sort order.
|
95
95
|
*/
|
96
96
|
export const sortSamples = (sort, samples, samplesDescriptor) => {
|
97
97
|
const sortedSamples = samples.sort((a, b) => {
|
@@ -1,6 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
export const asyncJsonParse = async (text) => {
|
1
|
+
export const asyncJsonParse = async (text: string): Promise<any> => {
|
4
2
|
const encoder = new TextEncoder();
|
5
3
|
const encodedText = encoder.encode(text);
|
6
4
|
const blob = new Blob([kWorkerCode], { type: "application/javascript" });
|
@@ -0,0 +1,36 @@
|
|
1
|
+
/**
|
2
|
+
* Type definition for the VS Code API object
|
3
|
+
* Note: This is a minimal definition - expand based on your needs
|
4
|
+
*/
|
5
|
+
interface VSCodeApi {
|
6
|
+
postMessage(message: unknown): void;
|
7
|
+
getState(): unknown;
|
8
|
+
setState(state: unknown): void;
|
9
|
+
}
|
10
|
+
|
11
|
+
/**
|
12
|
+
* The cached instance of the VS Code API
|
13
|
+
*/
|
14
|
+
let vscodeApi: VSCodeApi | undefined;
|
15
|
+
|
16
|
+
// Declare the acquireVsCodeApi function on the window object
|
17
|
+
declare global {
|
18
|
+
interface Window {
|
19
|
+
acquireVsCodeApi?: () => VSCodeApi;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
/**
|
24
|
+
* Gets or initializes the VS Code API instance
|
25
|
+
* @returns {VSCodeApi | undefined} The VS Code API instance if in VS Code environment, undefined otherwise
|
26
|
+
*/
|
27
|
+
export const getVscodeApi = (): VSCodeApi | undefined => {
|
28
|
+
if (window.acquireVsCodeApi) {
|
29
|
+
if (vscodeApi === undefined) {
|
30
|
+
vscodeApi = window.acquireVsCodeApi();
|
31
|
+
}
|
32
|
+
return vscodeApi;
|
33
|
+
} else {
|
34
|
+
return undefined;
|
35
|
+
}
|
36
|
+
};
|
@@ -43,7 +43,7 @@ import { debounce } from "../utils/sync.mjs";
|
|
43
43
|
* @param {import("../types/log").EvalStats} [props.evalStats] - The EvalStats for this eval
|
44
44
|
* @param {import("../types/log").EvalResults} [props.evalResults] - The EvalResults for this eval
|
45
45
|
* @param {import("../Types.mjs").CurrentLog} [props.log] - the current log
|
46
|
-
* @param {import("../api/Types.mjs").SampleSummary[]} [props.samples] - the samples
|
46
|
+
* @param {import("../api/Types.ts").SampleSummary[]} [props.samples] - the samples
|
47
47
|
* @param {import("../Types.mjs").SampleMode} props.sampleMode - the mode for displaying samples
|
48
48
|
* @param {string} props.groupBy - what to group by
|
49
49
|
* @param {string} props.groupByOrder - the grouping order
|
@@ -40,7 +40,7 @@ class HumanApprovalManager:
|
|
40
40
|
future = cast(Future[Approval], asyncio.get_event_loop().create_future())
|
41
41
|
sample = sample_active()
|
42
42
|
assert sample
|
43
|
-
assert sample.sample.id
|
43
|
+
assert sample.sample.id is not None
|
44
44
|
pending = PendingApprovalRequest(
|
45
45
|
request=request,
|
46
46
|
task=sample.task,
|
inspect_ai/model/_call_tools.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import inspect
|
3
|
+
import json
|
3
4
|
import types
|
4
5
|
from dataclasses import is_dataclass
|
5
6
|
from logging import getLogger
|
@@ -21,6 +22,7 @@ from typing import (
|
|
21
22
|
is_typeddict,
|
22
23
|
)
|
23
24
|
|
25
|
+
import yaml
|
24
26
|
from jsonschema import Draft7Validator
|
25
27
|
from pydantic import BaseModel
|
26
28
|
|
@@ -469,3 +471,56 @@ def truncate_tool_output(
|
|
469
471
|
)
|
470
472
|
else:
|
471
473
|
return None
|
474
|
+
|
475
|
+
|
476
|
+
def tool_parse_error_message(arguments: str, ex: Exception) -> str:
|
477
|
+
return f"Error parsing the following tool call arguments:\n\n{arguments}\n\nError details: {ex}"
|
478
|
+
|
479
|
+
|
480
|
+
def parse_tool_call(
|
481
|
+
id: str, function: str, arguments: str, tools: list[ToolInfo] | None = None
|
482
|
+
) -> ToolCall:
|
483
|
+
error: str | None = None
|
484
|
+
arguments_dict: dict[str, Any] = {}
|
485
|
+
|
486
|
+
def report_parse_error(ex: Exception) -> None:
|
487
|
+
nonlocal error
|
488
|
+
error = tool_parse_error_message(arguments, ex)
|
489
|
+
logger.info(error)
|
490
|
+
|
491
|
+
# if the arguments is a dict, then handle it with a plain json.loads
|
492
|
+
arguments = arguments.strip()
|
493
|
+
if arguments.startswith("{"):
|
494
|
+
try:
|
495
|
+
arguments_dict = json.loads(arguments)
|
496
|
+
except json.JSONDecodeError as ex:
|
497
|
+
report_parse_error(ex)
|
498
|
+
|
499
|
+
# otherwise parse it as yaml (which will pickup unquoted strings, numbers, and true/false)
|
500
|
+
# and then create a dict that maps it to the first function argument
|
501
|
+
elif function and tools:
|
502
|
+
tool_info = next(
|
503
|
+
(
|
504
|
+
tool
|
505
|
+
for tool in tools
|
506
|
+
if tool.name == function and len(tool.parameters.properties) > 0
|
507
|
+
),
|
508
|
+
None,
|
509
|
+
)
|
510
|
+
if tool_info:
|
511
|
+
param_names = list(tool_info.parameters.properties.keys())
|
512
|
+
try:
|
513
|
+
value = yaml.safe_load(arguments)
|
514
|
+
arguments_dict[param_names[0]] = value
|
515
|
+
except yaml.error.YAMLError:
|
516
|
+
# If the yaml parser fails, we treat it as a string argument.
|
517
|
+
arguments_dict[param_names[0]] = arguments
|
518
|
+
|
519
|
+
# return ToolCall with error payload
|
520
|
+
return ToolCall(
|
521
|
+
id=id,
|
522
|
+
function=function,
|
523
|
+
arguments=arguments_dict,
|
524
|
+
type="function",
|
525
|
+
parse_error=error,
|
526
|
+
)
|
@@ -1,7 +1,6 @@
|
|
1
1
|
from rich.console import RenderableType
|
2
2
|
from rich.text import Text
|
3
3
|
|
4
|
-
from inspect_ai._util.constants import NO_CONTENT
|
5
4
|
from inspect_ai._util.rich import lines_display
|
6
5
|
from inspect_ai._util.transcript import transcript_markdown
|
7
6
|
from inspect_ai.util._conversation import conversation_panel
|
@@ -41,9 +40,7 @@ def conversation_assistant_message(
|
|
41
40
|
|
42
41
|
# start with assistant content
|
43
42
|
content: list[RenderableType] = (
|
44
|
-
[transcript_markdown(message.text, escape=True)]
|
45
|
-
if message.text and message.text != NO_CONTENT
|
46
|
-
else []
|
43
|
+
[transcript_markdown(message.text, escape=True)] if message.text else []
|
47
44
|
)
|
48
45
|
|
49
46
|
# print tool calls
|
@@ -34,7 +34,7 @@ class GenerateConfigArgs(TypedDict, total=False):
|
|
34
34
|
"""Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""
|
35
35
|
|
36
36
|
best_of: int | None
|
37
|
-
"""Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token).
|
37
|
+
"""Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
|
38
38
|
|
39
39
|
frequency_penalty: float | None
|
40
40
|
"""Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, and vLLM only."""
|
@@ -48,9 +48,6 @@ class GenerateConfigArgs(TypedDict, total=False):
|
|
48
48
|
seed: int | None
|
49
49
|
"""Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
|
50
50
|
|
51
|
-
suffix: str | None
|
52
|
-
"""The suffix that comes after a completion of inserted text. OpenAI only."""
|
53
|
-
|
54
51
|
top_k: int | None
|
55
52
|
"""Randomly sample the next word from the top_k most likely next words. Anthropic, Google, and HuggingFace only."""
|
56
53
|
|
@@ -107,7 +104,7 @@ class GenerateConfig(BaseModel):
|
|
107
104
|
"""Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""
|
108
105
|
|
109
106
|
best_of: int | None = Field(default=None)
|
110
|
-
"""Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token).
|
107
|
+
"""Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
|
111
108
|
|
112
109
|
frequency_penalty: float | None = Field(default=None)
|
113
110
|
"""Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, and vLLM only."""
|
@@ -121,9 +118,6 @@ class GenerateConfig(BaseModel):
|
|
121
118
|
seed: int | None = Field(default=None)
|
122
119
|
"""Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
|
123
120
|
|
124
|
-
suffix: str | None = Field(default=None)
|
125
|
-
"""The suffix that comes after a completion of inserted text. OpenAI only."""
|
126
|
-
|
127
121
|
top_k: int | None = Field(default=None)
|
128
122
|
"""Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, and vLLM only."""
|
129
123
|
|
@@ -214,3 +214,18 @@ class ModelOutput(BaseModel):
|
|
214
214
|
)
|
215
215
|
],
|
216
216
|
)
|
217
|
+
|
218
|
+
|
219
|
+
def as_stop_reason(reason: str | None) -> StopReason:
|
220
|
+
"""Encode common reason strings into standard StopReason."""
|
221
|
+
match reason:
|
222
|
+
case "stop" | "eos":
|
223
|
+
return "stop"
|
224
|
+
case "length":
|
225
|
+
return "max_tokens"
|
226
|
+
case "tool_calls" | "function_call":
|
227
|
+
return "tool_calls"
|
228
|
+
case "content_filter" | "model_length" | "max_tokens":
|
229
|
+
return reason
|
230
|
+
case _:
|
231
|
+
return "unknown"
|