inspect-ai 0.3.99__py3-none-any.whl → 0.3.101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_display/core/config.py +11 -5
- inspect_ai/_display/core/panel.py +66 -2
- inspect_ai/_display/core/textual.py +5 -2
- inspect_ai/_display/plain/display.py +1 -0
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/widgets/transcript.py +37 -9
- inspect_ai/_eval/eval.py +13 -1
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/run.py +2 -0
- inspect_ai/_eval/score.py +2 -4
- inspect_ai/_eval/task/log.py +3 -1
- inspect_ai/_eval/task/run.py +59 -81
- inspect_ai/_util/content.py +11 -6
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/text.py +7 -0
- inspect_ai/_util/working.py +8 -37
- inspect_ai/_view/__init__.py +0 -0
- inspect_ai/_view/schema.py +2 -1
- inspect_ai/_view/www/CLAUDE.md +15 -0
- inspect_ai/_view/www/dist/assets/index.css +307 -171
- inspect_ai/_view/www/dist/assets/index.js +24733 -21641
- inspect_ai/_view/www/log-schema.json +77 -3
- inspect_ai/_view/www/package.json +9 -5
- inspect_ai/_view/www/src/@types/log.d.ts +9 -0
- inspect_ai/_view/www/src/app/App.tsx +1 -15
- inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +220 -205
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
- inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
- inspect_ai/_view/www/src/app/routing/url.ts +84 -4
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
- inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +24 -17
- inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
- inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
- inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
- inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
- inspect_ai/_view/www/src/app/types.ts +5 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
- inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
- inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
- inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
- inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
- inspect_ai/_view/www/src/state/hooks.ts +52 -2
- inspect_ai/_view/www/src/state/logSlice.ts +4 -3
- inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
- inspect_ai/_view/www/src/state/scrolling.ts +152 -0
- inspect_ai/_view/www/src/utils/attachments.ts +7 -0
- inspect_ai/_view/www/src/utils/python.ts +18 -0
- inspect_ai/_view/www/yarn.lock +290 -33
- inspect_ai/agent/_react.py +12 -7
- inspect_ai/agent/_run.py +2 -3
- inspect_ai/analysis/beta/__init__.py +2 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
- inspect_ai/dataset/_sources/csv.py +2 -6
- inspect_ai/dataset/_sources/hf.py +2 -6
- inspect_ai/dataset/_sources/json.py +2 -6
- inspect_ai/dataset/_util.py +23 -0
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/file.py +2 -9
- inspect_ai/log/_recorders/json.py +1 -0
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/log/_transcript.py +1 -1
- inspect_ai/model/_call_tools.py +6 -2
- inspect_ai/model/_openai.py +1 -1
- inspect_ai/model/_openai_responses.py +85 -41
- inspect_ai/model/_openai_web_search.py +38 -0
- inspect_ai/model/_providers/azureai.py +72 -3
- inspect_ai/model/_providers/openai.py +4 -1
- inspect_ai/model/_providers/openai_responses.py +5 -1
- inspect_ai/scorer/_metric.py +1 -2
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/solver/_task_state.py +2 -2
- inspect_ai/tool/_tool.py +6 -2
- inspect_ai/tool/_tool_def.py +27 -4
- inspect_ai/tool/_tool_info.py +2 -0
- inspect_ai/tool/_tools/_web_search/_google.py +43 -15
- inspect_ai/tool/_tools/_web_search/_tavily.py +46 -13
- inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_json.py +3 -0
- inspect_ai/util/_limit.py +230 -20
- inspect_ai/util/_sandbox/docker/compose.py +20 -11
- inspect_ai/util/_span.py +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +138 -124
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/dataset/_util.py
CHANGED
@@ -13,6 +13,7 @@ from inspect_ai.model import (
|
|
13
13
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
14
14
|
|
15
15
|
from ._dataset import (
|
16
|
+
Dataset,
|
16
17
|
DatasetRecord,
|
17
18
|
FieldSpec,
|
18
19
|
RecordToSample,
|
@@ -225,3 +226,25 @@ def read_files(files: Any | None) -> dict[str, str] | None:
|
|
225
226
|
raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
|
226
227
|
else:
|
227
228
|
return None
|
229
|
+
|
230
|
+
|
231
|
+
def shuffle_choices_if_requested(
    dataset: Dataset, shuffle_choices: bool | int | None
) -> None:
    """Apply the caller's `shuffle_choices` setting to `dataset`.

    `shuffle_choices` is the value handed to `json_dataset`, `csv_dataset`
    or `hf_dataset`:

    * `True`  -- call `dataset.shuffle_choices()` with no seed.
    * `False` -- leave the dataset untouched.
    * an `int` -- call `dataset.shuffle_choices(seed=<that int>)`.
    * anything else (including the default `None`) -- leave the dataset
      untouched.
    """
    # `bool` is a subclass of `int`, so booleans have to be dealt with
    # before the integer (seed) case is considered.
    if isinstance(shuffle_choices, bool):
        if shuffle_choices:
            dataset.shuffle_choices()
        return

    if isinstance(shuffle_choices, int):
        dataset.shuffle_choices(seed=shuffle_choices)
|
inspect_ai/log/_log.py
CHANGED
@@ -133,6 +133,7 @@ class EvalRecorder(FileRecorder):
|
|
133
133
|
results: EvalResults | None,
|
134
134
|
reductions: list[EvalSampleReductions] | None,
|
135
135
|
error: EvalError | None = None,
|
136
|
+
header_only: bool = False,
|
136
137
|
) -> EvalLog:
|
137
138
|
# get the key and log
|
138
139
|
key = self._log_file_key(eval)
|
@@ -174,7 +175,7 @@ class EvalRecorder(FileRecorder):
|
|
174
175
|
|
175
176
|
# flush and write the results
|
176
177
|
await log.flush()
|
177
|
-
return await log.close()
|
178
|
+
return await log.close(header_only)
|
178
179
|
|
179
180
|
@classmethod
|
180
181
|
@override
|
@@ -321,12 +322,12 @@ class ZipLogFile:
|
|
321
322
|
# re-open zip file w/ self.temp_file pointer at end
|
322
323
|
self._open()
|
323
324
|
|
324
|
-
async def close(self) -> EvalLog:
|
325
|
+
async def close(self, header_only: bool) -> EvalLog:
|
325
326
|
async with self._lock:
|
326
327
|
# read the log from the temp file then close it
|
327
328
|
try:
|
328
329
|
self._temp_file.seek(0)
|
329
|
-
return _read_log(self._temp_file, self._file)
|
330
|
+
return _read_log(self._temp_file, self._file, header_only=header_only)
|
330
331
|
finally:
|
331
332
|
self._temp_file.close()
|
332
333
|
if self._zip:
|
@@ -67,16 +67,9 @@ class FileRecorder(Recorder):
|
|
67
67
|
async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
|
68
68
|
# establish the log to read from (might be cached)
|
69
69
|
eval_log = await cls._log_file_maybe_cached(location)
|
70
|
-
|
71
|
-
# throw if no samples
|
72
70
|
if not eval_log.samples:
|
73
|
-
|
74
|
-
|
75
|
-
summaries: list[EvalSampleSummary] = []
|
76
|
-
for sample in eval_log.samples:
|
77
|
-
summaries.append(sample.summary())
|
78
|
-
|
79
|
-
return summaries
|
71
|
+
return []
|
72
|
+
return [sample.summary() for sample in eval_log.samples]
|
80
73
|
|
81
74
|
@classmethod
|
82
75
|
async def _log_file_maybe_cached(cls, location: str) -> EvalLog:
|
@@ -96,6 +96,7 @@ class JSONRecorder(FileRecorder):
|
|
96
96
|
results: EvalResults | None,
|
97
97
|
reductions: list[EvalSampleReductions] | None,
|
98
98
|
error: EvalError | None = None,
|
99
|
+
header_only: bool = False,
|
99
100
|
) -> EvalLog:
|
100
101
|
log = self.data[self._log_file_key(spec)]
|
101
102
|
log.data.status = status
|
inspect_ai/log/_transcript.py
CHANGED
inspect_ai/model/_call_tools.py
CHANGED
@@ -4,6 +4,7 @@ import types
|
|
4
4
|
from copy import copy
|
5
5
|
from dataclasses import is_dataclass
|
6
6
|
from datetime import date, datetime, time
|
7
|
+
from enum import EnumMeta
|
7
8
|
from logging import getLogger
|
8
9
|
from textwrap import dedent
|
9
10
|
from types import UnionType
|
@@ -172,7 +173,7 @@ async def execute_tools(
|
|
172
173
|
except LimitExceededError as ex:
|
173
174
|
tool_error = ToolCallError(
|
174
175
|
"limit",
|
175
|
-
f"The tool exceeded its {ex.type} limit of {ex.
|
176
|
+
f"The tool exceeded its {ex.type} limit of {ex.limit_str}.",
|
176
177
|
)
|
177
178
|
except ToolParsingError as ex:
|
178
179
|
tool_error = ToolCallError("parsing", ex.message)
|
@@ -497,7 +498,7 @@ async def agent_handoff(
|
|
497
498
|
ChatMessageUser(
|
498
499
|
content=(
|
499
500
|
f"The {agent_name} exceeded its {limit_error.type} limit of "
|
500
|
-
f"{limit_error.
|
501
|
+
f"{limit_error.limit_str}."
|
501
502
|
)
|
502
503
|
)
|
503
504
|
)
|
@@ -548,6 +549,7 @@ def tools_info(
|
|
548
549
|
name=tool.name,
|
549
550
|
description=tool.description,
|
550
551
|
parameters=tool.parameters,
|
552
|
+
options=tool.options,
|
551
553
|
)
|
552
554
|
)
|
553
555
|
return tools_info
|
@@ -652,6 +654,8 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
|
|
652
654
|
return type_hint(**dataclass_data)
|
653
655
|
elif issubclass(type_hint, BaseModel):
|
654
656
|
return type_hint(**input)
|
657
|
+
elif isinstance(type_hint, EnumMeta):
|
658
|
+
return type_hint(input)
|
655
659
|
else:
|
656
660
|
return input
|
657
661
|
elif origin is list or origin is List:
|
inspect_ai/model/_openai.py
CHANGED
@@ -594,7 +594,7 @@ def chat_choices_from_openai(
|
|
594
594
|
stop_reason=as_stop_reason(choice.finish_reason),
|
595
595
|
logprobs=(
|
596
596
|
Logprobs(**choice.logprobs.model_dump())
|
597
|
-
if choice.logprobs is not None
|
597
|
+
if choice.logprobs and choice.logprobs.content is not None
|
598
598
|
else None
|
599
599
|
),
|
600
600
|
)
|
@@ -1,6 +1,5 @@
|
|
1
1
|
import json
|
2
|
-
from
|
3
|
-
from typing import TypedDict, cast
|
2
|
+
from typing import Sequence, TypedDict, cast
|
4
3
|
|
5
4
|
from openai.types.responses import (
|
6
5
|
FunctionToolParam,
|
@@ -8,6 +7,8 @@ from openai.types.responses import (
|
|
8
7
|
ResponseComputerToolCallParam,
|
9
8
|
ResponseFunctionToolCall,
|
10
9
|
ResponseFunctionToolCallParam,
|
10
|
+
ResponseFunctionWebSearch,
|
11
|
+
ResponseFunctionWebSearchParam,
|
11
12
|
ResponseInputContentParam,
|
12
13
|
ResponseInputImageParam,
|
13
14
|
ResponseInputItemParam,
|
@@ -51,6 +52,7 @@ from inspect_ai.model._openai_computer_use import (
|
|
51
52
|
maybe_computer_use_preview_tool,
|
52
53
|
tool_call_from_openai_computer_tool_call,
|
53
54
|
)
|
55
|
+
from inspect_ai.model._openai_web_search import maybe_web_search_tool
|
54
56
|
from inspect_ai.tool._tool_call import ToolCall
|
55
57
|
from inspect_ai.tool._tool_choice import ToolChoice
|
56
58
|
from inspect_ai.tool._tool_info import ToolInfo
|
@@ -160,9 +162,9 @@ def openai_responses_tool_choice(
|
|
160
162
|
|
161
163
|
|
162
164
|
def openai_responses_tools(
    tools: list[ToolInfo], model_name: str, config: GenerateConfig
) -> list[ToolParam]:
    """Translate each `ToolInfo` into a tool param, preserving input order.

    Every entry is produced by `_tool_param_for_tool_info`, which receives
    the tool together with `model_name` and `config`.
    """
    params: list[ToolParam] = []
    for info in tools:
        params.append(_tool_param_for_tool_info(info, model_name, config))
    return params
|
166
168
|
|
167
169
|
|
168
170
|
def openai_responses_chat_choices(
|
@@ -174,6 +176,14 @@ def openai_responses_chat_choices(
|
|
174
176
|
return [ChatCompletionChoice(message=message, stop_reason=stop_reason)]
|
175
177
|
|
176
178
|
|
179
|
+
def is_native_tool_configured(
    tools: Sequence[ToolInfo], model_name: str, config: GenerateConfig
) -> bool:
    """Report whether any tool in `tools` resolves to a native tool param.

    A tool counts when `_maybe_native_tool_param(tool, model_name, config)`
    returns something other than `None`. Evaluation stops at the first such
    tool; an empty `tools` sequence yields `False`.
    """
    for info in tools:
        if _maybe_native_tool_param(info, model_name, config) is not None:
            return True
    return False
|
185
|
+
|
186
|
+
|
177
187
|
# The next two functions perform transformations between OpenAI types and Inspect
|
178
188
|
# ChatMessageAssistant. Here is a diagram that helps visualize the transforms.
|
179
189
|
# ┌───────────────────────────┐ ┌───────────────────────────┐ ┌───────────────────────────┐
|
@@ -207,7 +217,6 @@ def openai_responses_chat_choices(
|
|
207
217
|
|
208
218
|
|
209
219
|
class _AssistantInternal(TypedDict):
|
210
|
-
output_message_id: str | None
|
211
220
|
tool_message_ids: dict[str, str]
|
212
221
|
|
213
222
|
|
@@ -237,17 +246,17 @@ def _chat_message_assistant_from_openai_response(
|
|
237
246
|
# collect output and tool calls
|
238
247
|
message_content: list[Content] = []
|
239
248
|
tool_calls: list[ToolCall] = []
|
240
|
-
internal = _AssistantInternal(
|
249
|
+
internal = _AssistantInternal(tool_message_ids={})
|
241
250
|
for output in response.output:
|
242
251
|
match output:
|
243
252
|
case ResponseOutputMessage(content=content, id=id):
|
244
|
-
assert internal["output_message_id"] is None, "Multiple message outputs"
|
245
|
-
internal["output_message_id"] = id
|
246
253
|
message_content.extend(
|
247
254
|
[
|
248
|
-
ContentText(text=c.text)
|
255
|
+
ContentText(text=c.text, internal={"id": id})
|
249
256
|
if isinstance(c, ResponseOutputText)
|
250
|
-
else ContentText(
|
257
|
+
else ContentText(
|
258
|
+
text=c.refusal, refusal=True, internal={"id": id}
|
259
|
+
)
|
251
260
|
for c in content
|
252
261
|
]
|
253
262
|
)
|
@@ -277,6 +286,13 @@ def _chat_message_assistant_from_openai_response(
|
|
277
286
|
tool_calls.append(
|
278
287
|
tool_call_from_openai_computer_tool_call(output)
|
279
288
|
)
|
289
|
+
case ResponseFunctionWebSearch():
|
290
|
+
# We don't currently capture this since the model did the
|
291
|
+
# "tool call" internally. It's conceivable that could be
|
292
|
+
# forced to include it in `.internal` in the future, but
|
293
|
+
# for now we just ignore it.
|
294
|
+
# {"id":"ws_682cdcec3fa88198bc10b38fafefbd5e077e89e31fd4a3d5","status":"completed","type":"web_search_call"}
|
295
|
+
pass
|
280
296
|
case _:
|
281
297
|
raise ValueError(f"Unexpected output type: {output.__class__}")
|
282
298
|
|
@@ -304,25 +320,39 @@ def _openai_input_items_from_chat_message_assistant(
|
|
304
320
|
field of the `ChatMessageAssistant` to help it provide the proper ids for the
|
305
321
|
items in the returned list.
|
306
322
|
"""
|
307
|
-
|
323
|
+
tool_message_ids = _ids_from_assistant_internal(message)
|
308
324
|
|
309
325
|
# we want to prevent yielding output messages in the case where we have an
|
310
326
|
# 'internal' field (so the message came from the model API as opposed to
|
311
|
-
# being user synthesized) AND there
|
312
|
-
# when reading the message from the server we didn't find output).
|
313
|
-
# happen e.g. when a react() agent sets the output.completion in response
|
327
|
+
# being user synthesized) AND there are no ContentText items with message IDs
|
328
|
+
# (indicating that when reading the message from the server we didn't find output).
|
329
|
+
# this could happen e.g. when a react() agent sets the output.completion in response
|
314
330
|
# to a submit() tool call
|
315
|
-
|
331
|
+
content_items: list[ContentText | ContentReasoning] = (
|
332
|
+
[ContentText(text=message.content)]
|
333
|
+
if isinstance(message.content, str)
|
334
|
+
else [
|
335
|
+
c for c in message.content if isinstance(c, ContentText | ContentReasoning)
|
336
|
+
]
|
337
|
+
)
|
338
|
+
has_content_with_ids = any(
|
339
|
+
isinstance(c, ContentText)
|
340
|
+
and isinstance(c.internal, dict)
|
341
|
+
and "id" in c.internal
|
342
|
+
for c in content_items
|
343
|
+
)
|
344
|
+
suppress_output_message = message.internal is not None and not has_content_with_ids
|
316
345
|
|
317
346
|
# if we are not storing messages on the server then blank these out
|
318
347
|
if not store:
|
319
|
-
output_message_id = None
|
320
348
|
tool_message_ids = {}
|
321
349
|
|
322
|
-
# items to return
|
323
|
-
# additional content on to it)
|
350
|
+
# items to return
|
324
351
|
items: list[ResponseInputItemParam] = []
|
325
|
-
|
352
|
+
# group content by message ID
|
353
|
+
messages_by_id: dict[
|
354
|
+
str | None, list[ResponseOutputTextParam | ResponseOutputRefusalParam]
|
355
|
+
] = {}
|
326
356
|
|
327
357
|
for content in (
|
328
358
|
list[ContentText | ContentReasoning]([ContentText(text=message.content)])
|
@@ -352,6 +382,14 @@ def _openai_input_items_from_chat_message_assistant(
|
|
352
382
|
if suppress_output_message:
|
353
383
|
continue
|
354
384
|
|
385
|
+
# get the message ID from ContentText.internal
|
386
|
+
content_message_id: str | None = None
|
387
|
+
if isinstance(content.internal, dict) and "id" in content.internal:
|
388
|
+
id_value = content.internal["id"]
|
389
|
+
content_message_id = id_value if isinstance(id_value, str) else None
|
390
|
+
else:
|
391
|
+
content_message_id = None
|
392
|
+
|
355
393
|
new_content = (
|
356
394
|
ResponseOutputRefusalParam(type="refusal", refusal=text)
|
357
395
|
if refusal
|
@@ -359,22 +397,24 @@ def _openai_input_items_from_chat_message_assistant(
|
|
359
397
|
type="output_text", text=text, annotations=[]
|
360
398
|
)
|
361
399
|
)
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
400
|
+
|
401
|
+
if content_message_id not in messages_by_id:
|
402
|
+
messages_by_id[content_message_id] = []
|
403
|
+
messages_by_id[content_message_id].append(new_content)
|
404
|
+
|
405
|
+
# create ResponseOutputMessage for each unique ID
|
406
|
+
for msg_id, content_list in messages_by_id.items():
|
407
|
+
output_message = ResponseOutputMessageParam(
|
408
|
+
type="message",
|
409
|
+
role="assistant",
|
410
|
+
# this actually can be `None`, and it will in fact be `None` when the
|
411
|
+
# assistant message is synthesized by the scaffold as opposed to being
|
412
|
+
# replayed from the model (or when store=False)
|
413
|
+
id=msg_id, # type: ignore[typeddict-item]
|
414
|
+
content=content_list,
|
415
|
+
status="completed",
|
416
|
+
)
|
417
|
+
items.append(output_message)
|
378
418
|
|
379
419
|
return items + _tool_call_items_from_assistant_message(message, tool_message_ids)
|
380
420
|
|
@@ -395,11 +435,13 @@ def _model_tool_call_for_internal(
|
|
395
435
|
|
396
436
|
def _maybe_native_tool_param(
|
397
437
|
tool: ToolInfo,
|
438
|
+
model_name: str,
|
398
439
|
config: GenerateConfig,
|
399
440
|
) -> ToolParam | None:
|
400
441
|
return (
|
401
442
|
(
|
402
443
|
maybe_computer_use_preview_tool(tool)
|
444
|
+
or maybe_web_search_tool(model_name, tool)
|
403
445
|
# or self.text_editor_tool_param(tool)
|
404
446
|
# or self.bash_tool_param(tool)
|
405
447
|
)
|
@@ -442,32 +484,34 @@ def _tool_call_items_from_assistant_message(
|
|
442
484
|
|
443
485
|
def _ids_from_assistant_internal(
    message: ChatMessageAssistant,
) -> dict[str, str]:
    """Return the `tool_message_ids` mapping stored in `message.internal`.

    When `message.internal` is `None` an empty dict is returned. Otherwise
    the internal payload must be a dict (this is asserted) and its
    `"tool_message_ids"` entry is returned.
    """
    payload = message.internal
    if payload is None:
        return {}

    assert isinstance(payload, dict), (
        "OpenAI ChatMessageAssistant internal must be an _AssistantInternal"
    )
    return cast(_AssistantInternal, payload)["tool_message_ids"]
|
454
496
|
|
455
497
|
|
456
498
|
_ResponseToolCallParam = (
|
457
|
-
ResponseFunctionToolCallParam
|
499
|
+
ResponseFunctionToolCallParam
|
500
|
+
| ResponseComputerToolCallParam
|
501
|
+
| ResponseFunctionWebSearchParam
|
458
502
|
# | ResponseFileSearchToolCallParam
|
459
503
|
# | ResponseFunctionToolCallParam
|
460
|
-
# | ResponseFunctionWebSearchParam
|
461
504
|
)
|
462
505
|
|
463
506
|
|
464
507
|
def _tool_param_for_tool_info(
|
465
508
|
tool: ToolInfo,
|
509
|
+
model_name: str,
|
466
510
|
config: GenerateConfig,
|
467
511
|
) -> ToolParam:
|
468
512
|
# Use a native tool implementation when available. Otherwise, use the
|
469
513
|
# standard tool implementation
|
470
|
-
return _maybe_native_tool_param(tool, config) or FunctionToolParam(
|
514
|
+
return _maybe_native_tool_param(tool, model_name, config) or FunctionToolParam(
|
471
515
|
type="function",
|
472
516
|
name=_responses_tool_alias(tool.name),
|
473
517
|
description=tool.description,
|
@@ -0,0 +1,38 @@
|
|
1
|
+
from typing import cast
|
2
|
+
|
3
|
+
from openai.types.responses import WebSearchTool, WebSearchToolParam
|
4
|
+
|
5
|
+
from inspect_ai.tool._tool_info import ToolInfo
|
6
|
+
|
7
|
+
COMPATIBLE_MODELS = ["gpt-4o", "gpt-4o-mini", "gpt-4.1"]
|
8
|
+
|
9
|
+
|
10
|
+
def maybe_web_search_tool(model_name: str, tool: ToolInfo) -> WebSearchToolParam | None:
    """Return an OpenAI web search tool param for `tool`, or `None`.

    A param is built (via `_web_search_tool`) only when all of these hold:
    the tool is named `"web_search"`, its `options` are non-empty and
    contain an `"openai"` key, and `model_name` is listed in
    `COMPATIBLE_MODELS`. The value under `"openai"` is what gets passed on.
    """
    if tool.name != "web_search":
        return None

    options = tool.options
    if not options or "openai" not in options:
        return None

    if model_name not in COMPATIBLE_MODELS:
        return None

    return _web_search_tool(options["openai"])
|
21
|
+
|
22
|
+
|
23
|
+
def _web_search_tool(maybe_openai_options: object) -> WebSearchToolParam:
    """Build a `web_search_preview` tool param from optional user options.

    Args:
        maybe_openai_options: `None` or a dict of extra fields for the tool.

    Returns:
        The dumped `WebSearchTool` (fields that are `None` are excluded).

    Raises:
        TypeError: If `maybe_openai_options` is neither `None` nor a dict.
    """
    if maybe_openai_options is not None and not isinstance(
        maybe_openai_options, dict
    ):
        raise TypeError(
            f"Expected a dictionary for openai_options, got {type(maybe_openai_options)}"
        )

    # `None` and an empty dict both mean "no extra options"
    extra_fields = maybe_openai_options or {}

    if extra_fields:
        # user-supplied keys are spread after `type`, so they are validated
        # together with it by the model
        tool_model = WebSearchTool.model_validate(
            {"type": "web_search_preview", **extra_fields}
        )
    else:
        tool_model = WebSearchTool(type="web_search_preview")

    return cast(WebSearchToolParam, tool_model.model_dump(exclude_none=True))
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import functools
|
1
2
|
import json
|
2
3
|
import os
|
3
4
|
from copy import copy
|
@@ -151,7 +152,7 @@ class AzureAIAPI(ModelAPI):
|
|
151
152
|
|
152
153
|
# prepare request
|
153
154
|
request = dict(
|
154
|
-
messages=await chat_request_messages(input, handler),
|
155
|
+
messages=await chat_request_messages(input, handler, self.is_mistral()),
|
155
156
|
**self.completion_params(config),
|
156
157
|
)
|
157
158
|
# newer versions of vllm reject requests with tools or tool_choice if the
|
@@ -280,9 +281,77 @@ class AzureAIAPI(ModelAPI):
|
|
280
281
|
|
281
282
|
|
282
283
|
async def chat_request_messages(
    messages: list[ChatMessage],
    handler: ChatAPIHandler | None,
    is_mistral: bool = False,
) -> list[ChatRequestMessage]:
    """Convert `messages` into request messages, one at a time and in order.

    Each message is converted with `chat_request_message(message, handler)`.
    When `is_mistral` is true the converted list is additionally folded
    through `mistral_message_reducer` (starting from an empty list) and the
    folded list is returned instead.
    """
    converted: list[ChatRequestMessage] = []
    for msg in messages:
        converted.append(await chat_request_message(msg, handler))

    if not is_mistral:
        return converted

    return functools.reduce(mistral_message_reducer, converted, [])
|
295
|
+
|
296
|
+
|
297
|
+
def mistral_message_reducer(
    messages: list[ChatRequestMessage],
    message: ChatRequestMessage,
) -> list[ChatRequestMessage]:
    """Reducer step that merges a user message into a preceding tool message.

    If the accumulator's last entry is a `ToolMessage` and `message` is a
    `UserMessage`, that last entry is replaced with the result of
    `fold_user_message_into_tool_message`. In every other case `message` is
    appended. The accumulator `messages` is modified in place and returned.
    """
    follows_tool_message = (
        bool(messages)
        and isinstance(messages[-1], ToolMessage)
        and isinstance(message, UserMessage)
    )

    if not follows_tool_message:
        messages.append(message)
        return messages

    messages[-1] = fold_user_message_into_tool_message(messages[-1], message)
    return messages
|
312
|
+
|
313
|
+
|
314
|
+
def fold_user_message_into_tool_message(
    tool_message: ToolMessage,
    user_message: UserMessage,
) -> ToolMessage:
    """Return a new `ToolMessage` with the user message's content appended.

    Both contents are first reduced to a string: `None` is treated as empty,
    a plain string is used as is, and a list of content items is flattened
    (text items contribute their text, image items contribute
    `[Image: <url>]`, all joined without a separator). The result keeps the
    original tool message's `tool_call_id`.

    Raises:
        TypeError: If a content list holds anything other than
            `TextContentItem` or `ImageContentItem`.
    """

    def _flatten(items: list[ContentItem]) -> str:
        # reject the whole list up front, before rendering any item
        for entry in items:
            if not isinstance(entry, (TextContentItem, ImageContentItem)):
                raise TypeError(
                    "Expected all items to be TextContentItem or ImageContentItem"
                )

        # after validation every entry is either text or an image
        return "".join(
            entry.text
            if isinstance(entry, TextContentItem)
            else f"[Image: {entry.image_url.url}]"
            for entry in items
        )

    def _as_text(content: str | list[ContentItem] | None) -> str | None:
        if content is None:
            return None
        if isinstance(content, list):
            return _flatten(content)
        return content

    tool_text = _as_text(tool_message.content)
    user_text = _as_text(user_message.content)
    merged = (tool_text or "") + (user_text or "")

    return ToolMessage(content=merged, tool_call_id=tool_message.tool_call_id)
|
286
355
|
|
287
356
|
|
288
357
|
async def chat_request_message(
|
@@ -42,6 +42,7 @@ from .._openai import (
|
|
42
42
|
openai_media_filter,
|
43
43
|
openai_should_retry,
|
44
44
|
)
|
45
|
+
from .._openai_responses import is_native_tool_configured
|
45
46
|
from .openai_o1 import generate_o1
|
46
47
|
from .util import environment_prerequisite_error, model_base_url
|
47
48
|
|
@@ -241,7 +242,9 @@ class OpenAIAPI(ModelAPI):
|
|
241
242
|
tools=tools,
|
242
243
|
**self.completion_params(config, False),
|
243
244
|
)
|
244
|
-
elif self.responses_api
|
245
|
+
elif self.responses_api or is_native_tool_configured(
|
246
|
+
tools, self.model_name, config
|
247
|
+
):
|
245
248
|
return await generate_responses(
|
246
249
|
client=self.client,
|
247
250
|
http_hooks=self._http_hooks,
|
@@ -59,7 +59,11 @@ async def generate_responses(
|
|
59
59
|
)
|
60
60
|
|
61
61
|
# prepare request (we do this so we can log the ModelCall)
|
62
|
-
tool_params =
|
62
|
+
tool_params = (
|
63
|
+
openai_responses_tools(tools, model_name, config)
|
64
|
+
if len(tools) > 0
|
65
|
+
else NOT_GIVEN
|
66
|
+
)
|
63
67
|
request = dict(
|
64
68
|
input=await openai_responses_inputs(input, model_name, store),
|
65
69
|
tools=tool_params,
|
inspect_ai/scorer/_metric.py
CHANGED
@@ -7,7 +7,6 @@ from typing import (
|
|
7
7
|
Protocol,
|
8
8
|
Type,
|
9
9
|
Union,
|
10
|
-
cast,
|
11
10
|
overload,
|
12
11
|
runtime_checkable,
|
13
12
|
)
|
@@ -356,7 +355,7 @@ def metric(
|
|
356
355
|
)
|
357
356
|
return metric
|
358
357
|
|
359
|
-
return metric_register(
|
358
|
+
return metric_register(metric_wrapper, metric_name)
|
360
359
|
|
361
360
|
# for decorators with an explicit name, one more wrapper for the name
|
362
361
|
if isinstance(name, str):
|
@@ -121,7 +121,7 @@ def pass_at(
|
|
121
121
|
def reduce(scores: list[Score]) -> Score:
|
122
122
|
def pass_at_k(values: list[float]) -> float:
|
123
123
|
total = len(scores)
|
124
|
-
correct = sum(1 for v in values if v
|
124
|
+
correct = sum(1 for v in values if v >= value)
|
125
125
|
if total - correct < k:
|
126
126
|
return 1.0
|
127
127
|
else:
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -290,7 +290,7 @@ class TaskState:
|
|
290
290
|
return self._tools
|
291
291
|
|
292
292
|
@tools.setter
|
293
|
-
def tools(self, tools:
|
293
|
+
def tools(self, tools: Sequence[Tool | ToolDef]) -> None:
|
294
294
|
self._tools.clear()
|
295
295
|
for tool in tools:
|
296
296
|
self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
|
@@ -353,7 +353,7 @@ class TaskState:
|
|
353
353
|
def completed(self) -> bool:
|
354
354
|
"""Is the task completed.
|
355
355
|
|
356
|
-
Additionally, checks
|
356
|
+
Additionally, checks for an operator interrupt of the sample.
|
357
357
|
"""
|
358
358
|
from inspect_ai.log._samples import set_active_sample_total_messages
|
359
359
|
|