inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +2 -1
- inspect_ai/_cli/eval.py +2 -2
- inspect_ai/_display/core/active.py +3 -0
- inspect_ai/_display/core/config.py +1 -0
- inspect_ai/_display/core/panel.py +21 -13
- inspect_ai/_display/core/results.py +3 -7
- inspect_ai/_display/core/rich.py +3 -5
- inspect_ai/_display/log/__init__.py +0 -0
- inspect_ai/_display/log/display.py +173 -0
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_display/rich/display.py +2 -4
- inspect_ai/_display/textual/app.py +1 -6
- inspect_ai/_display/textual/widgets/task_detail.py +3 -14
- inspect_ai/_display/textual/widgets/tasks.py +1 -1
- inspect_ai/_eval/eval.py +1 -1
- inspect_ai/_eval/evalset.py +3 -3
- inspect_ai/_eval/registry.py +6 -1
- inspect_ai/_eval/run.py +5 -1
- inspect_ai/_eval/task/constants.py +1 -0
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/run.py +65 -39
- inspect_ai/_util/citation.py +88 -0
- inspect_ai/_util/content.py +24 -2
- inspect_ai/_util/json.py +17 -2
- inspect_ai/_util/registry.py +19 -4
- inspect_ai/_view/schema.py +0 -6
- inspect_ai/_view/server.py +17 -0
- inspect_ai/_view/www/dist/assets/index.css +93 -31
- inspect_ai/_view/www/dist/assets/index.js +10639 -10011
- inspect_ai/_view/www/log-schema.json +418 -1
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/@types/log.d.ts +140 -39
- inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
- inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
- inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
- inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
- inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
- inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
- inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
- inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
- inspect_ai/_view/www/src/client/api/types.ts +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
- inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
- inspect_ai/_view/www/src/tests/README.md +2 -2
- inspect_ai/_view/www/src/utils/git.ts +3 -1
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/agent/_handoff.py +8 -5
- inspect_ai/agent/_react.py +5 -5
- inspect_ai/dataset/_dataset.py +1 -1
- inspect_ai/log/_condense.py +5 -0
- inspect_ai/log/_file.py +4 -1
- inspect_ai/log/_log.py +9 -4
- inspect_ai/log/_recorders/json.py +4 -2
- inspect_ai/log/_samples.py +5 -0
- inspect_ai/log/_util.py +2 -0
- inspect_ai/model/__init__.py +14 -0
- inspect_ai/model/_call_tools.py +17 -8
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_openai_responses.py +80 -34
- inspect_ai/model/_providers/_anthropic_citations.py +158 -0
- inspect_ai/model/_providers/_google_citations.py +100 -0
- inspect_ai/model/_providers/anthropic.py +219 -36
- inspect_ai/model/_providers/google.py +98 -22
- inspect_ai/model/_providers/mistral.py +20 -7
- inspect_ai/model/_providers/openai.py +11 -10
- inspect_ai/model/_providers/openai_compatible.py +3 -2
- inspect_ai/model/_providers/openai_responses.py +2 -5
- inspect_ai/model/_providers/perplexity.py +123 -0
- inspect_ai/model/_providers/providers.py +13 -2
- inspect_ai/model/_providers/vertex.py +3 -0
- inspect_ai/model/_trim.py +5 -0
- inspect_ai/tool/__init__.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +5 -2
- inspect_ai/tool/_mcp/sampling.py +19 -3
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tool.py +10 -1
- inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
- inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
- inspect_ai/tool/_tools/_web_search/_google.py +22 -25
- inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
- inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
- inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
- inspect_ai/util/__init__.py +8 -0
- inspect_ai/util/_background.py +64 -0
- inspect_ai/util/_display.py +11 -2
- inspect_ai/util/_limit.py +72 -5
- inspect_ai/util/_sandbox/__init__.py +2 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/service.py +28 -7
- inspect_ai/util/_span.py +12 -1
- inspect_ai/util/_subprocess.py +51 -38
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +134 -109
- /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
- /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/components/MarkdownDiv.tsx
CHANGED
@@ -51,8 +51,11 @@ export const MarkdownDiv = forwardRef<HTMLDivElement, MarkdownDivProps>(
     // For `code` tags, reverse the escaping if we can
     const withCode = unescapeCodeHtmlEntities(unescaped);
 
+    // For `sup` tags, reverse the escaping if we can
+    const withSup = unescapeSupHtmlEntities(withCode);
+
     // Return the rendered markdown
-    const markup = { __html: withCode };
+    const markup = { __html: withSup };
 
     return (
       <div
@@ -65,7 +68,7 @@ export const MarkdownDiv = forwardRef<HTMLDivElement, MarkdownDivProps>(
   },
 );
 
-const kLetterListPattern = /^([a-zA-
+const kLetterListPattern = /^([a-zA-Z][).]\s.*?)$/gm;
 const kCommonmarkReferenceLinkPattern = /\[([^\]]*)\]: (?!http)(.*)/g;
 
 const protectBackslashesInLatex = (content: string): string => {
@@ -193,6 +196,16 @@ const unprotectMarkdown = (txt: string): string => {
   return txt;
 };
 
+function unescapeSupHtmlEntities(str: string): string {
+  // replace &lt;sup&gt; with <sup>
+  if (!str) {
+    return str;
+  }
+  return str
+    .replace(/&lt;sup&gt;/g, "<sup>")
+    .replace(/&lt;\/sup&gt;/g, "</sup>");
+}
+
 function unescapeCodeHtmlEntities(str: string): string {
   if (!str) return str;
 
inspect_ai/_view/www/src/state/samplePolling.ts
CHANGED
@@ -1,6 +1,7 @@
 import { Event } from "../app/types";
 import {
   AttachmentData,
+  ClientAPI,
   EventData,
   SampleData,
   SampleSummary,
@@ -183,6 +184,8 @@ export function createSamplePolling(
         const processedEvents = processEvents(
           sampleDataResponse.sampleData,
           pollingState,
+          api,
+          logFile,
         );
 
         // update max attachment id
@@ -268,7 +271,12 @@ function processAttachments(
   });
 }
 
-function processEvents(sampleData: SampleData, pollingState: PollingState) {
+function processEvents(
+  sampleData: SampleData,
+  pollingState: PollingState,
+  api: ClientAPI,
+  log_file: string,
+) {
   // Go through each event and resolve it, either appending or replacing
   log.debug(`Processing ${sampleData.events.length} events`);
   if (sampleData.events.length === 0) {
@@ -289,6 +297,14 @@ function processEvents(sampleData: SampleData, pollingState: PollingState)
         attachmentId,
         available_attachments: Object.keys(pollingState.attachments),
       };
+
+      if (api.log_message) {
+        api.log_message(
+          log_file,
+          `Unable to resolve attachment ${attachmentId}\n` +
+            JSON.stringify(snapshot),
+        );
+      }
       console.warn(`Unable to resolve attachment ${attachmentId}`, snapshot);
     },
   );
inspect_ai/_view/www/src/tests/README.md
CHANGED
@@ -5,8 +5,8 @@ This directory contains the test files for the application. The test framework i
 ## Directory Structure
 
 - `tests/`: Root directory for all tests
-
-
+  - `__mocks__/`: Mock files for CSS modules and other assets
+  - `setupTests.mjs`: Setup file for Jest tests
 
 ## Running Tests
 
inspect_ai/_view/www/src/utils/git.ts
CHANGED
@@ -2,6 +2,8 @@
  * Generates a GitHub commit URL based on the repository origin URL and the commit hash.
  */
 export const ghCommitUrl = (origin: string, commit: string): string => {
-  const baseUrl = origin
+  const baseUrl = origin
+    .replace(/\.git$/, "")
+    .replace(/^git@github.com:/, "https://github.com/");
   return `${baseUrl}/commit/${commit}`;
 };
inspect_ai/_view/www/src/utils/html.ts
CHANGED
@@ -4,3 +4,9 @@
 export function escapeSelector(id: string): string {
   return id.replace(/([ #.;,?!+*~'":^$[\]()=>|/\\])/g, "\\$1");
 }
+
+export const decodeHtmlEntities = (text: string): string => {
+  const parser = new DOMParser();
+  const doc = parser.parseFromString(text, "text/html");
+  return doc.documentElement.textContent || text;
+};
inspect_ai/agent/_handoff.py
CHANGED
@@ -6,7 +6,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
     set_registry_info,
 )
-from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
+from inspect_ai.tool._tool import TOOL_PARALLEL, Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
 from inspect_ai.util._limit import Limit
@@ -37,9 +37,9 @@ def handoff(
            Use the built-in `last_message` filter to return only the last message
            or alternatively specify a custom `MessageFilter` function.
         tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
-        limits: List of limits to apply to the agent.
-            the agent
-            exceeded.
+        limits: List of limits to apply to the agent. Limits are scoped to each
+            handoff to the agent. Should a limit be exceeded, the agent stops and a user
+            message is appended explaining that a limit was exceeded.
         **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
            will not be presented to the model as part of the tool interface).
 
@@ -61,7 +61,10 @@ def handoff(
         agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
     )
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
-    set_registry_info(
+    set_registry_info(
+        agent_tool,
+        RegistryInfo(type="tool", name=tool_name, metadata={TOOL_PARALLEL: False}),
+    )
     set_tool_description(
        agent_tool,
        ToolDescription(
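The hunks above mark handoff tools as non-parallel and clarify that limits are scoped to each individual handoff. A minimal usage sketch (hypothetical agent and limit value; assumes the public `inspect_ai.agent` / `inspect_ai.util` APIs):

# Sketch: per-handoff limits (hypothetical agent; assumes public APIs).
from inspect_ai.agent import AgentState, agent, handoff
from inspect_ai.util import token_limit

@agent
def researcher():
    async def execute(state: AgentState) -> AgentState:
        # ... call models/tools and return the updated state
        return state
    return execute

# Each handoff applies a fresh copy of the limit (see the deepcopy added in
# inspect_ai/model/_call_tools.py below), so one exhausted handoff does not
# poison later ones.
research = handoff(researcher(), limits=[token_limit(50_000)])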
inspect_ai/agent/_react.py
CHANGED
@@ -361,13 +361,13 @@ def _prompt_to_system_message(
             and ("{submit}" not in prompt.assistant_prompt)
             and prompt.submit_prompt
         ):
-            assistant_prompt = f"{prompt.assistant_prompt}\n{prompt.submit_prompt}"
+            assistant_prompt = f"{prompt.assistant_prompt}\n{prompt.submit_prompt.format(submit=submit_tool)}"
         else:
-            assistant_prompt = prompt.assistant_prompt
+            assistant_prompt = prompt.assistant_prompt.format(
+                submit=submit_tool or "submit"
+            )
         prompt_lines.append(assistant_prompt)
-        prompt_content = "\n\n".join(prompt_lines).format(
-            submit=submit_tool or "submit"
-        )
+        prompt_content = "\n\n".join(prompt_lines)
         system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
     else:
         system_message = None
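The rewrite above moves `{submit}` substitution onto the individual prompt pieces rather than the joined prompt. A tiny sketch of the resulting behavior (illustrative strings, not the library's actual defaults):

# Sketch of the {submit} placeholder handling fixed above.
assistant_prompt = "You are a helpful assistant."
submit_prompt = "When you are done, call the {submit}() tool."
submit_tool = "submit"

# assistant_prompt has no "{submit}", so submit_prompt is appended with its
# placeholder formatted to the actual tool name:
print(f"{assistant_prompt}\n{submit_prompt.format(submit=submit_tool)}")
# You are a helpful assistant.
# When you are done, call the submit() tool.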
inspect_ai/dataset/_dataset.py
CHANGED
inspect_ai/log/_condense.py
CHANGED
@@ -9,6 +9,7 @@ from inspect_ai._util.constants import BASE_64_DATA_REMOVED
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
+    ContentData,
     ContentImage,
     ContentReasoning,
     ContentText,
@@ -344,3 +345,7 @@ def walk_content(content: Content, content_fn: Callable[[str], str]) -> Content:
         return content.model_copy(update=dict(video=content_fn(content.video)))
     elif isinstance(content, ContentReasoning):
         return content.model_copy(update=dict(reasoning=content_fn(content.reasoning)))
+    elif isinstance(content, ContentData):
+        return content.model_copy(
+            update=dict(data=walk_json_value(content.data, content_fn))
+        )
inspect_ai/log/_file.py
CHANGED
@@ -198,7 +198,10 @@ def write_log_dir_manifest(
     fs = filesystem(output_dir)
     manifest = f"{output_dir}{fs.sep}{filename}"
     manifest_json = to_json(
-        value=manifest_logs,
+        value=jsonable_python(manifest_logs),
+        indent=2,
+        exclude_none=True,
+        fallback=lambda _x: None,
     )
     with file(manifest, mode="wb", fs_options=fs_options) as f:
         f.write(manifest_json)
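The manifest is now converted to a JSON-able Python structure first, with a `fallback` that nulls out anything unserializable and `exclude_none` dropping those keys. A standalone sketch of the same pydantic-core pattern (illustrative data, not the real manifest):

# Sketch of the serialization pattern above using pydantic-core directly.
from pydantic_core import to_json, to_jsonable_python

value = {"name": "eval-1", "opaque": object(), "empty": None}

# fallback turns the unserializable object() into None instead of raising;
# exclude_none then drops None-valued keys from the output.
print(to_json(
    to_jsonable_python(value, fallback=lambda _x: None),
    indent=2,
    exclude_none=True,
).decode())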
inspect_ai/log/_log.py
CHANGED
@@ -422,7 +422,7 @@ class EvalSample(BaseModel):
             # warning will handle this)
             del values["transcript"]
 
-        return
+        return migrate_values(values)
 
     # allow field model_usage
     model_config = ConfigDict(protected_namespaces=())
@@ -707,7 +707,10 @@ class EvalSpec(BaseModel):
     """Attributes of the @task decorator."""
 
     task_args: dict[str, Any] = Field(default_factory=dict)
-    """Arguments used for invoking the task."""
+    """Arguments used for invoking the task (including defaults)."""
+
+    task_args_passed: dict[str, Any] = Field(default_factory=dict)
+    """Arguments explicitly passed by caller for invoking the task."""
 
     solver: str | None = Field(default=None)
     """Solver name."""
@@ -782,16 +785,18 @@ class EvalSpec(BaseModel):
     def read_sandbox_spec(
         cls: Type["EvalSpec"], values: dict[str, Any]
     ) -> dict[str, Any]:
-        return
+        return migrate_values(values)
 
 
-def
+def migrate_values(values: dict[str, Any]) -> dict[str, Any]:
     if "sandbox" in values:
         sandbox = values.get("sandbox")
         if isinstance(sandbox, list):
             values["sandbox"] = SandboxEnvironmentSpec(
                 type=sandbox[0], config=sandbox[1]
             )
+    if "task_args_passed" not in values:
+        values["task_args_passed"] = values.get("task_args", {})
     return values
 
 
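`migrate_values` above backfills `task_args_passed` when reading logs written before this field existed. A minimal sketch of what that means for an old log record:

# Sketch: older logs gain task_args_passed, backfilled from task_args.
def migrate_values(values: dict) -> dict:
    if "task_args_passed" not in values:
        values["task_args_passed"] = values.get("task_args", {})
    return values

old_log = {"task": "math_eval", "task_args": {"difficulty": "hard"}}
print(migrate_values(old_log)["task_args_passed"])  # {'difficulty': 'hard'}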
inspect_ai/log/_recorders/json.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Literal, get_args
 
 import ijson  # type: ignore
 from ijson import IncompleteJSONError
+from ijson.backends.python import UnexpectedSymbol  # type: ignore
 from pydantic import BaseModel
 from pydantic_core import from_json
 from typing_extensions import override
@@ -129,12 +130,13 @@ class JSONRecorder(FileRecorder):
         # The Python JSON serializer supports NaN and Inf, however
         # this isn't technically part of the JSON spec. The json-stream
         # library shares this limitation, so if we fail with an
-        # invalid character then we move on and and parse w/ pydantic
+        # invalid character (or Unexpected symbol) then we move on and and parse w/ pydantic
         # (which does support NaN and Inf by default)
-        except (ValueError, IncompleteJSONError) as ex:
+        except (ValueError, IncompleteJSONError, UnexpectedSymbol) as ex:
             if (
                 str(ex).find("Invalid JSON character") != -1
                 or str(ex).find("invalid char in json text") != -1
+                or str(ex).find("Unexpected symbol") != -1
             ):
                 pass
             else:
inspect_ai/log/_samples.py
CHANGED
@@ -3,6 +3,7 @@ from contextvars import ContextVar
 from datetime import datetime
 from typing import AsyncGenerator, Iterator, Literal
 
+from anyio.abc import TaskGroup
 from shortuuid import uuid
 
 from inspect_ai.dataset._dataset import Sample
@@ -28,6 +29,7 @@ class ActiveSample:
         fails_on_error: bool,
         transcript: Transcript,
         sandboxes: dict[str, SandboxConnection],
+        tg: TaskGroup,
     ) -> None:
         self.id = uuid()
         self.started: float | None = None
@@ -47,6 +49,7 @@ class ActiveSample:
         self.transcript = transcript
         self.sandboxes = sandboxes
         self._interrupt_action: Literal["score", "error"] | None = None
+        self.tg = tg
 
     @property
     def running_time(self) -> float:
@@ -86,6 +89,7 @@ async def active_sample(
     working_limit: int | None,
     fails_on_error: bool,
     transcript: Transcript,
+    tg: TaskGroup,
 ) -> AsyncGenerator[ActiveSample, None]:
     # create the sample
     active = ActiveSample(
@@ -101,6 +105,7 @@ async def active_sample(
         sandboxes=await sandbox_connections(),
         fails_on_error=fails_on_error,
         transcript=transcript,
+        tg=tg,
     )
 
     _active_samples.append(active)
inspect_ai/log/_util.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Any
 
 from inspect_ai._util.content import (
     ContentAudio,
+    ContentData,
     ContentImage,
     ContentReasoning,
     ContentText,
@@ -24,6 +25,7 @@ def text_input_only(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
         | ContentImage
         | ContentAudio
         | ContentVideo
+        | ContentData
     ] = []
     for content in message.content:
         if content.type == "text":
inspect_ai/model/__init__.py
CHANGED
@@ -1,8 +1,16 @@
 # ruff: noqa: F401 F403 F405
 
+from inspect_ai._util.citation import (
+    Citation,
+    CitationBase,
+    ContentCitation,
+    DocumentCitation,
+    UrlCitation,
+)
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
+    ContentData,
     ContentImage,
     ContentReasoning,
     ContentText,
@@ -59,6 +67,7 @@ __all__ = [
     "ResponseSchema",
     "CachePolicy",
     "ContentAudio",
+    "ContentData",
     "ContentImage",
     "ContentReasoning",
     "ContentText",
@@ -93,6 +102,11 @@ __all__ = [
     "cache_size",
     "get_model",
     "modelapi",
+    "Citation",
+    "CitationBase",
+    "DocumentCitation",
+    "ContentCitation",
+    "UrlCitation",
 ]
 
 _TOOL_MODULE_VERSION = "0.3.18"
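With the citation types now exported from `inspect_ai.model`, cited content can be constructed directly. A sketch (assumes `ContentText` gained a `citations` field in this release, as the provider hunks below suggest):

# Sketch: attaching a URL citation to text content (assumed citations field).
from inspect_ai.model import ContentText, UrlCitation

content = ContentText(
    text="Attention is all you need.",
    citations=[
        UrlCitation(
            url="https://arxiv.org/abs/1706.03762",
            title="Attention Is All You Need",
        )
    ],
)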
inspect_ai/model/_call_tools.py
CHANGED
@@ -1,7 +1,7 @@
 import inspect
 import json
 import types
-from copy import copy
+from copy import copy, deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
@@ -36,6 +36,7 @@ from pydantic import BaseModel
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
+    ContentData,
     ContentImage,
     ContentText,
     ContentVideo,
@@ -188,13 +189,19 @@ async def execute_tools(
         # types to string as that is what the model APIs accept
         truncated: tuple[int, int] | None = None
         if isinstance(
-            result,
+            result,
+            ContentText | ContentImage | ContentAudio | ContentVideo | ContentData,
         ):
             content: str | list[Content] = [result]
         elif isinstance(result, list) and (
             len(result) == 0
             or isinstance(
-                result[0],
+                result[0],
+                ContentText
+                | ContentImage
+                | ContentAudio
+                | ContentVideo
+                | ContentData,
             )
         ):
             content = result
@@ -471,7 +478,9 @@ async def agent_handoff(
     limit_error: LimitExceededError | None = None
     agent_state = AgentState(messages=copy(agent_conversation))
     try:
-        with apply_limits(agent_tool.limits):
+        # The agent_tool's limits will be applied multiple times if the agent is handed
+        # off to multiple times which is not supported, so create a copy of each limit.
+        with apply_limits(deepcopy(agent_tool.limits)):
             async with span(name=agent_name, type="agent"):
                 agent_state = await agent_tool.agent(agent_state, **arguments)
     except LimitExceededError as ex:
@@ -525,11 +534,11 @@ def prepend_agent_name(
         content = copy(message.content)
         for i in range(0, len(content)):
             if isinstance(content[i], ContentText):
-
-
-
+                text = cast(ContentText, content[i]).text
+                if text:
+                    content[i] = content[i].model_copy(
+                        update=dict(text=f"[{agent_name}] {text}")
                     )
-                )
                 break
     return message.model_copy(update=dict(content=content))
 
inspect_ai/model/_chat_message.py
CHANGED
@@ -26,6 +26,9 @@ class ChatMessageBase(BaseModel):
     source: Literal["input", "generate"] | None = Field(default=None)
     """Source of message."""
 
+    metadata: dict[str, Any] | None = Field(default=None)
+    """Additional message metadata."""
+
     internal: JsonValue | None = Field(default=None)
     """Model provider specific payload - typically used to aid transformation back to model types."""
 
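The new `metadata` field lands on `ChatMessageBase`, so it applies to all message types. A sketch with hypothetical metadata keys:

# Sketch: arbitrary metadata on a chat message (hypothetical keys).
from inspect_ai.model import ChatMessageUser

message = ChatMessageUser(
    content="What is the capital of France?",
    metadata={"source_doc": "geography.md", "priority": "low"},
)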
inspect_ai/model/_openai_responses.py
CHANGED
@@ -31,9 +31,16 @@ from openai.types.responses.response_create_params import (
     ToolChoice as ResponsesToolChoice,
 )
 from openai.types.responses.response_input_item_param import FunctionCallOutput, Message
+from openai.types.responses.response_output_text import (
+    Annotation,
+    AnnotationFileCitation,
+    AnnotationFilePath,
+    AnnotationURLCitation,
+)
 from openai.types.responses.response_reasoning_item_param import Summary
 from pydantic import JsonValue
 
+from inspect_ai._util.citation import Citation, DocumentCitation, UrlCitation
 from inspect_ai._util.content import (
     Content,
     ContentImage,
@@ -47,29 +54,30 @@ from inspect_ai.model._chat_message import ChatMessage, ChatMessageAssistant
 from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.model._model_output import ChatCompletionChoice, StopReason
 from inspect_ai.model._openai import is_o_series
-from inspect_ai.
+from inspect_ai.tool._tool_call import ToolCall
+from inspect_ai.tool._tool_choice import ToolChoice
+from inspect_ai.tool._tool_info import ToolInfo
+
+from ._providers._openai_computer_use import (
     computer_call_output,
     maybe_computer_use_preview_tool,
     tool_call_from_openai_computer_tool_call,
 )
-from
-from inspect_ai.tool._tool_call import ToolCall
-from inspect_ai.tool._tool_choice import ToolChoice
-from inspect_ai.tool._tool_info import ToolInfo
+from ._providers._openai_web_search import maybe_web_search_tool
 
 
 async def openai_responses_inputs(
-    messages: list[ChatMessage], model: str
+    messages: list[ChatMessage], model: str
 ) -> list[ResponseInputItemParam]:
     return [
         item
         for message in messages
-        for item in await _openai_input_item_from_chat_message(message, model
+        for item in await _openai_input_item_from_chat_message(message, model)
     ]
 
 
 async def _openai_input_item_from_chat_message(
-    message: ChatMessage, model: str
+    message: ChatMessage, model: str
 ) -> list[ResponseInputItemParam]:
     if message.role == "system":
         content = await _openai_responses_content_list_param(message.content)
@@ -87,7 +95,7 @@ async def _openai_input_item_from_chat_message(
             )
         ]
     elif message.role == "assistant":
-        return _openai_input_items_from_chat_message_assistant(message
+        return _openai_input_items_from_chat_message_assistant(message)
     elif message.role == "tool":
         if message.internal:
             internal = _model_tool_call_for_internal(message.internal)
@@ -252,7 +260,18 @@ def _chat_message_assistant_from_openai_response(
         case ResponseOutputMessage(content=content, id=id):
             message_content.extend(
                 [
-                    ContentText(
+                    ContentText(
+                        text=c.text,
+                        internal={"id": id},
+                        citations=(
+                            [
+                                _to_inspect_citation(annotation)
+                                for annotation in c.annotations
+                            ]
+                            if c.annotations
+                            else None
+                        ),
+                    )
                     if isinstance(c, ResponseOutputText)
                     else ContentText(
                         text=c.refusal, refusal=True, internal={"id": id}
@@ -310,7 +329,7 @@ def _chat_message_assistant_from_openai_response(
 
 
 def _openai_input_items_from_chat_message_assistant(
-    message: ChatMessageAssistant,
+    message: ChatMessageAssistant,
 ) -> list[ResponseInputItemParam]:
     """
     Transform a `ChatMessageAssistant` into OpenAI `ResponseInputItem`'s for playback to the model.
@@ -343,10 +362,6 @@ def _openai_input_items_from_chat_message_assistant(
     )
     suppress_output_message = message.internal is not None and not has_content_with_ids
 
-    # if we are not storing messages on the server then blank these out
-    if not store:
-        tool_message_ids = {}
-
     # items to return
     items: list[ResponseInputItemParam] = []
     # group content by message ID
@@ -354,30 +369,21 @@ def _openai_input_items_from_chat_message_assistant(
         str | None, list[ResponseOutputTextParam | ResponseOutputRefusalParam]
     ] = {}
 
-    for content in (
-        list[ContentText | ContentReasoning]([ContentText(text=message.content)])
-        if isinstance(message.content, str)
-        else [
-            c for c in message.content if isinstance(c, ContentText | ContentReasoning)
-        ]
-    ):
+    for content in _filter_consecutive_reasoning_blocks(content_items):
         match content:
             case ContentReasoning(reasoning=reasoning):
                 assert content.signature is not None, (
                     "reasoning_id must be saved in signature"
                 )
-
-
-
-
-
-
-
-                    summary=[Summary(type="summary_text", text=reasoning)]
-                    if reasoning
-                    else [],
-                )
+                items.append(
+                    ResponseReasoningItemParam(
+                        type="reasoning",
+                        id=content.signature,
+                        summary=[Summary(type="summary_text", text=reasoning)]
+                        if reasoning
+                        else [],
                     )
+                )
             case ContentText(text=text, refusal=refusal):
                 if suppress_output_message:
                     continue
@@ -409,7 +415,7 @@ def _openai_input_items_from_chat_message_assistant(
             role="assistant",
             # this actually can be `None`, and it will in fact be `None` when the
             # assistant message is synthesized by the scaffold as opposed to being
-            # replayed from the model
+            # replayed from the model
             id=msg_id,  # type: ignore[typeddict-item]
             content=content_list,
             status="completed",
@@ -531,3 +537,43 @@ def _responses_tool_alias(name: str) -> str:
 
 def _from_responses_tool_alias(name: str) -> str:
     return next((k for k, v in _responses_tool_aliases.items() if v == name), name)
+
+
+def _to_inspect_citation(input: Annotation) -> Citation:
+    match input:
+        case AnnotationURLCitation(
+            end_index=end_index, start_index=start_index, title=title, url=url
+        ):
+            return UrlCitation(
+                cited_text=(start_index, end_index), title=title, url=url
+            )
+
+        case (
+            AnnotationFileCitation(file_id=file_id, index=index)
+            | AnnotationFilePath(file_id=file_id, index=index)
+        ):
+            return DocumentCitation(internal={"file_id": file_id, "index": index})
+    assert False, f"Unexpected citation type: {input.type}"
+
+
+def _filter_consecutive_reasoning_blocks(
+    content_list: list[ContentText | ContentReasoning],
+) -> list[ContentText | ContentReasoning]:
+    return [
+        content
+        for i, content in enumerate(content_list)
+        if _should_keep_content(i, content, content_list)
+    ]
+
+
+def _should_keep_content(
+    i: int,
+    content: ContentText | ContentReasoning,
+    content_list: list[ContentText | ContentReasoning],
+) -> bool:
+    return (
+        True
+        if not isinstance(content, ContentReasoning)
+        else i == len(content_list) - 1
+        or not isinstance(content_list[i + 1], ContentReasoning)
+    )