inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/model/_openai.py
CHANGED
@@ -52,19 +52,22 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
|
|
52
52
|
|
53
53
|
|
54
54
|
def is_o_series(name: str) -> bool:
|
55
|
-
|
55
|
+
if bool(re.match(r"^o\d+", name)):
|
56
|
+
return True
|
57
|
+
else:
|
58
|
+
return not is_gpt(name) and bool(re.search(r"o\d+", name))
|
56
59
|
|
57
60
|
|
58
61
|
def is_o1_mini(name: str) -> bool:
|
59
|
-
return
|
62
|
+
return "o1-mini" in name
|
60
63
|
|
61
64
|
|
62
65
|
def is_o1_preview(name: str) -> bool:
|
63
|
-
return
|
66
|
+
return "o1-preview" in name
|
64
67
|
|
65
68
|
|
66
69
|
def is_gpt(name: str) -> bool:
|
67
|
-
return
|
70
|
+
return "gpt" in name
|
68
71
|
|
69
72
|
|
70
73
|
def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
|
@@ -80,12 +83,13 @@ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
|
|
80
83
|
def openai_chat_tool_call_param(
|
81
84
|
tool_call: ToolCall,
|
82
85
|
) -> ChatCompletionMessageToolCallParam:
|
86
|
+
assert tool_call.type == "function", f"Unexpected tool call type {tool_call.type}"
|
83
87
|
return ChatCompletionMessageToolCallParam(
|
84
88
|
id=tool_call.id,
|
85
89
|
function=dict(
|
86
90
|
name=tool_call.function, arguments=json.dumps(tool_call.arguments)
|
87
91
|
),
|
88
|
-
type=
|
92
|
+
type="function", # Type narrowing couldn't figure it out
|
89
93
|
)
|
90
94
|
|
91
95
|
|
@@ -108,7 +112,8 @@ async def openai_chat_completion_part(
|
|
108
112
|
image_url=dict(url=image_url, detail=detail),
|
109
113
|
)
|
110
114
|
elif content.type == "audio":
|
111
|
-
|
115
|
+
audio_data_uri = await file_as_data_uri(content.audio)
|
116
|
+
audio_data = audio_data_uri.split("base64,")[1]
|
112
117
|
|
113
118
|
return ChatCompletionContentPartInputAudioParam(
|
114
119
|
type="input_audio", input_audio=dict(data=audio_data, format=content.format)
|
@@ -1,23 +1,12 @@
|
|
1
1
|
import functools
|
2
2
|
import os
|
3
3
|
import re
|
4
|
-
import sys
|
5
4
|
from copy import copy
|
6
5
|
from logging import getLogger
|
7
|
-
from typing import Any, Literal, Optional, Tuple,
|
6
|
+
from typing import Any, Literal, NamedTuple, Optional, Tuple, cast
|
8
7
|
|
9
8
|
import httpcore
|
10
9
|
import httpx
|
11
|
-
|
12
|
-
from inspect_ai._util.http import is_retryable_http_status
|
13
|
-
|
14
|
-
from .util.hooks import HttpxHooks
|
15
|
-
|
16
|
-
if sys.version_info >= (3, 11):
|
17
|
-
from typing import NotRequired
|
18
|
-
else:
|
19
|
-
from typing_extensions import NotRequired
|
20
|
-
|
21
10
|
from anthropic import (
|
22
11
|
APIConnectionError,
|
23
12
|
APIStatusError,
|
@@ -39,19 +28,19 @@ from anthropic.types import (
|
|
39
28
|
TextBlockParam,
|
40
29
|
ThinkingBlock,
|
41
30
|
ThinkingBlockParam,
|
31
|
+
ToolBash20250124Param,
|
42
32
|
ToolParam,
|
43
33
|
ToolResultBlockParam,
|
34
|
+
ToolTextEditor20250124Param,
|
44
35
|
ToolUseBlock,
|
45
36
|
ToolUseBlockParam,
|
46
37
|
message_create_params,
|
47
38
|
)
|
39
|
+
from anthropic.types.beta import BetaToolComputerUse20250124Param
|
48
40
|
from pydantic import JsonValue
|
49
41
|
from typing_extensions import override
|
50
42
|
|
51
|
-
from inspect_ai._util.constants import
|
52
|
-
BASE_64_DATA_REMOVED,
|
53
|
-
NO_CONTENT,
|
54
|
-
)
|
43
|
+
from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
|
55
44
|
from inspect_ai._util.content import (
|
56
45
|
Content,
|
57
46
|
ContentImage,
|
@@ -59,6 +48,7 @@ from inspect_ai._util.content import (
|
|
59
48
|
ContentText,
|
60
49
|
)
|
61
50
|
from inspect_ai._util.error import exception_message
|
51
|
+
from inspect_ai._util.http import is_retryable_http_status
|
62
52
|
from inspect_ai._util.images import file_as_data_uri
|
63
53
|
from inspect_ai._util.logger import warn_once
|
64
54
|
from inspect_ai._util.url import data_uri_mime_type, data_uri_to_base64
|
@@ -70,11 +60,14 @@ from .._model import ModelAPI
|
|
70
60
|
from .._model_call import ModelCall
|
71
61
|
from .._model_output import ChatCompletionChoice, ModelOutput, ModelUsage, StopReason
|
72
62
|
from .util import environment_prerequisite_error, model_base_url
|
63
|
+
from .util.hooks import HttpxHooks
|
73
64
|
|
74
65
|
logger = getLogger(__name__)
|
75
66
|
|
76
67
|
ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
|
77
68
|
|
69
|
+
INTERNAL_COMPUTER_TOOL_NAME = "computer"
|
70
|
+
|
78
71
|
|
79
72
|
class AnthropicAPI(ModelAPI):
|
80
73
|
def __init__(
|
@@ -93,7 +86,7 @@ class AnthropicAPI(ModelAPI):
|
|
93
86
|
else:
|
94
87
|
self.service = None
|
95
88
|
|
96
|
-
# collect
|
89
|
+
# collect generate model_args (then delete them so we can pass the rest on)
|
97
90
|
def collect_model_arg(name: str) -> Any | None:
|
98
91
|
nonlocal model_args
|
99
92
|
value = model_args.get(name, None)
|
@@ -193,14 +186,11 @@ class AnthropicAPI(ModelAPI):
|
|
193
186
|
|
194
187
|
# generate
|
195
188
|
try:
|
196
|
-
(
|
197
|
-
|
198
|
-
|
199
|
-
messages,
|
200
|
-
computer_use,
|
201
|
-
) = await self.resolve_chat_input(input, tools, config)
|
189
|
+
system_param, tools_param, messages = await self.resolve_chat_input(
|
190
|
+
input, tools, config
|
191
|
+
)
|
202
192
|
|
203
|
-
# prepare request params (
|
193
|
+
# prepare request params (assembled this way so we can log the raw model call)
|
204
194
|
request = dict(messages=messages)
|
205
195
|
|
206
196
|
# system messages and tools
|
@@ -218,7 +208,13 @@ class AnthropicAPI(ModelAPI):
|
|
218
208
|
|
219
209
|
# extra headers (for time tracker and computer use)
|
220
210
|
extra_headers = headers | {HttpxHooks.REQUEST_ID_HEADER: request_id}
|
221
|
-
if
|
211
|
+
if any(
|
212
|
+
tool.get("type", None) == "computer_20250124" for tool in tools_param
|
213
|
+
):
|
214
|
+
# From: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#claude-3-7-sonnet-beta-flag
|
215
|
+
# Note: The Bash (bash_20250124) and Text Editor (text_editor_20250124)
|
216
|
+
# tools are generally available for Claude 3.5 Sonnet (new) as well and
|
217
|
+
# can be used without the computer use beta header.
|
222
218
|
betas.append("computer-use-2025-01-24")
|
223
219
|
if len(betas) > 0:
|
224
220
|
extra_headers["anthropic-beta"] = ",".join(betas)
|
@@ -240,7 +236,9 @@ class AnthropicAPI(ModelAPI):
|
|
240
236
|
response = message.model_dump()
|
241
237
|
|
242
238
|
# extract output
|
243
|
-
output = model_output_from_message(
|
239
|
+
output = await model_output_from_message(
|
240
|
+
self.client, self.model_name, message, tools
|
241
|
+
)
|
244
242
|
|
245
243
|
# return output and call
|
246
244
|
return output, model_call()
|
@@ -403,9 +401,7 @@ class AnthropicAPI(ModelAPI):
|
|
403
401
|
input: list[ChatMessage],
|
404
402
|
tools: list[ToolInfo],
|
405
403
|
config: GenerateConfig,
|
406
|
-
) -> Tuple[
|
407
|
-
list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
|
408
|
-
]:
|
404
|
+
) -> Tuple[list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam]]:
|
409
405
|
# extract system message
|
410
406
|
system_messages, messages = split_system_messages(input, config)
|
411
407
|
|
@@ -418,7 +414,7 @@ class AnthropicAPI(ModelAPI):
|
|
418
414
|
)
|
419
415
|
|
420
416
|
# tools
|
421
|
-
tools_params
|
417
|
+
tools_params = [self.tool_param_for_tool_info(tool, config) for tool in tools]
|
422
418
|
|
423
419
|
# system messages
|
424
420
|
if len(system_messages) > 0:
|
@@ -468,40 +464,35 @@ class AnthropicAPI(ModelAPI):
|
|
468
464
|
add_cache_control(cast(dict[str, Any], content[-1]))
|
469
465
|
|
470
466
|
# return chat input
|
471
|
-
return system_param, tools_params, message_params
|
472
|
-
|
473
|
-
def
|
474
|
-
self,
|
475
|
-
) ->
|
476
|
-
# tool
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
467
|
+
return system_param, tools_params, message_params
|
468
|
+
|
469
|
+
def tool_param_for_tool_info(
|
470
|
+
self, tool: ToolInfo, config: GenerateConfig
|
471
|
+
) -> "ToolParamDef":
|
472
|
+
# Use a native tool implementation when available. Otherwise, use the
|
473
|
+
# standard tool implementation
|
474
|
+
return self.maybe_native_tool_param(tool, config) or ToolParam(
|
475
|
+
name=tool.name,
|
476
|
+
description=tool.description,
|
477
|
+
input_schema=tool.parameters.model_dump(exclude_none=True),
|
478
|
+
)
|
479
|
+
|
480
|
+
def maybe_native_tool_param(
|
481
|
+
self, tool: ToolInfo, config: GenerateConfig
|
482
|
+
) -> Optional["ToolParamDef"]:
|
483
|
+
return (
|
484
|
+
(
|
484
485
|
self.computer_use_tool_param(tool)
|
485
|
-
|
486
|
-
|
486
|
+
or self.text_editor_tool_param(tool)
|
487
|
+
or self.bash_tool_param(tool)
|
487
488
|
)
|
488
|
-
if
|
489
|
-
|
490
|
-
|
491
|
-
else:
|
492
|
-
tool_params.append(
|
493
|
-
ToolParam(
|
494
|
-
name=tool.name,
|
495
|
-
description=tool.description,
|
496
|
-
input_schema=tool.parameters.model_dump(exclude_none=True),
|
497
|
-
)
|
498
|
-
)
|
499
|
-
|
500
|
-
return tool_params, computer_use
|
489
|
+
if config.internal_tools is not False
|
490
|
+
else None
|
491
|
+
)
|
501
492
|
|
502
493
|
def computer_use_tool_param(
|
503
494
|
self, tool: ToolInfo
|
504
|
-
) -> Optional[
|
495
|
+
) -> Optional[BetaToolComputerUse20250124Param]:
|
505
496
|
# check for compatible 'computer' tool
|
506
497
|
if tool.name == "computer" and (
|
507
498
|
sorted(tool.parameters.properties.keys())
|
@@ -523,7 +514,7 @@ class AnthropicAPI(ModelAPI):
|
|
523
514
|
"Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
|
524
515
|
)
|
525
516
|
return None
|
526
|
-
return
|
517
|
+
return BetaToolComputerUse20250124Param(
|
527
518
|
type="computer_20250124",
|
528
519
|
name="computer",
|
529
520
|
# Note: The dimensions passed here for display_width_px and display_height_px should
|
@@ -540,23 +531,58 @@ class AnthropicAPI(ModelAPI):
|
|
540
531
|
else:
|
541
532
|
return None
|
542
533
|
|
534
|
+
def text_editor_tool_param(
|
535
|
+
self, tool: ToolInfo
|
536
|
+
) -> Optional[ToolTextEditor20250124Param]:
|
537
|
+
# check for compatible 'text editor' tool
|
538
|
+
if tool.name == "text_editor" and (
|
539
|
+
sorted(tool.parameters.properties.keys())
|
540
|
+
== sorted(
|
541
|
+
[
|
542
|
+
"command",
|
543
|
+
"file_text",
|
544
|
+
"insert_line",
|
545
|
+
"new_str",
|
546
|
+
"old_str",
|
547
|
+
"path",
|
548
|
+
"view_range",
|
549
|
+
]
|
550
|
+
)
|
551
|
+
):
|
552
|
+
return ToolTextEditor20250124Param(
|
553
|
+
type="text_editor_20250124", name="str_replace_editor"
|
554
|
+
)
|
555
|
+
# not a text_editor tool
|
556
|
+
else:
|
557
|
+
return None
|
543
558
|
|
544
|
-
|
545
|
-
#
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
559
|
+
def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
|
560
|
+
# check for compatible 'bash' tool
|
561
|
+
if tool.name == "bash_session" and (
|
562
|
+
sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
|
563
|
+
):
|
564
|
+
return ToolBash20250124Param(type="bash_20250124", name="bash")
|
565
|
+
# not a bash tool
|
566
|
+
else:
|
567
|
+
return None
|
552
568
|
|
553
569
|
|
554
|
-
# tools can be either a stock tool param or a special
|
555
|
-
ToolParamDef =
|
570
|
+
# tools can be either a stock tool param or a special Anthropic native use tool param
|
571
|
+
ToolParamDef = (
|
572
|
+
ToolParam
|
573
|
+
| BetaToolComputerUse20250124Param
|
574
|
+
| ToolTextEditor20250124Param
|
575
|
+
| ToolBash20250124Param
|
576
|
+
)
|
556
577
|
|
557
578
|
|
558
579
|
def add_cache_control(
|
559
|
-
param: TextBlockParam
|
580
|
+
param: TextBlockParam
|
581
|
+
| ToolParam
|
582
|
+
| BetaToolComputerUse20250124Param
|
583
|
+
| ToolTextEditor20250124Param
|
584
|
+
| ToolBash20250124Param
|
585
|
+
| dict[str, Any],
|
560
586
|
) -> None:
|
561
587
|
cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
|
562
588
|
|
@@ -565,10 +591,10 @@ def consecutive_user_message_reducer(
|
|
565
591
|
messages: list[MessageParam],
|
566
592
|
message: MessageParam,
|
567
593
|
) -> list[MessageParam]:
|
568
|
-
return
|
594
|
+
return consecutive_message_reducer(messages, message, "user")
|
569
595
|
|
570
596
|
|
571
|
-
def
|
597
|
+
def consecutive_message_reducer(
|
572
598
|
messages: list[MessageParam],
|
573
599
|
message: MessageParam,
|
574
600
|
role: Literal["user", "assistant"],
|
@@ -581,6 +607,7 @@ def consective_message_reducer(
|
|
581
607
|
|
582
608
|
|
583
609
|
def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
|
610
|
+
# TODO: Fix this code as it currently drops interesting properties when combining
|
584
611
|
role = a["role"]
|
585
612
|
a_content = a["content"]
|
586
613
|
b_content = b["content"]
|
@@ -700,7 +727,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
|
|
700
727
|
ToolUseBlockParam(
|
701
728
|
type="tool_use",
|
702
729
|
id=tool_call.id,
|
703
|
-
name=tool_call.function,
|
730
|
+
name=tool_call.internal_name or tool_call.function,
|
704
731
|
input=tool_call.arguments,
|
705
732
|
)
|
706
733
|
)
|
@@ -724,9 +751,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
|
|
724
751
|
)
|
725
752
|
|
726
753
|
|
727
|
-
def model_output_from_message(
|
754
|
+
async def model_output_from_message(
|
755
|
+
client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
|
756
|
+
model: str,
|
757
|
+
message: Message,
|
758
|
+
tools: list[ToolInfo],
|
759
|
+
) -> ModelOutput:
|
728
760
|
# extract content and tool calls
|
729
761
|
content: list[Content] = []
|
762
|
+
reasoning_tokens = 0
|
730
763
|
tool_calls: list[ToolCall] | None = None
|
731
764
|
|
732
765
|
for content_block in message.content:
|
@@ -741,11 +774,13 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
|
|
741
774
|
content.append(ContentText(type="text", text=content_text))
|
742
775
|
elif isinstance(content_block, ToolUseBlock):
|
743
776
|
tool_calls = tool_calls or []
|
777
|
+
info = maybe_mapped_call_info(content_block.name, tools)
|
744
778
|
tool_calls.append(
|
745
779
|
ToolCall(
|
746
|
-
type=
|
780
|
+
type=info.internal_type,
|
747
781
|
id=content_block.id,
|
748
|
-
function=
|
782
|
+
function=info.inspect_name,
|
783
|
+
internal_name=info.internal_name,
|
749
784
|
arguments=content_block.model_dump().get("input", {}),
|
750
785
|
)
|
751
786
|
)
|
@@ -754,6 +789,9 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
|
|
754
789
|
ContentReasoning(reasoning=content_block.data, redacted=True)
|
755
790
|
)
|
756
791
|
elif isinstance(content_block, ThinkingBlock):
|
792
|
+
reasoning_tokens += await count_tokens(
|
793
|
+
client, model, content_block.thinking
|
794
|
+
)
|
757
795
|
content.append(
|
758
796
|
ContentReasoning(
|
759
797
|
reasoning=content_block.thinking, signature=content_block.signature
|
@@ -787,7 +825,39 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
|
|
787
825
|
total_tokens=total_tokens,
|
788
826
|
input_tokens_cache_write=input_tokens_cache_write,
|
789
827
|
input_tokens_cache_read=input_tokens_cache_read,
|
828
|
+
reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
|
829
|
+
),
|
830
|
+
)
|
831
|
+
|
832
|
+
|
833
|
+
class CallInfo(NamedTuple):
|
834
|
+
internal_name: str | None
|
835
|
+
internal_type: str
|
836
|
+
inspect_name: str
|
837
|
+
|
838
|
+
|
839
|
+
def maybe_mapped_call_info(tool_called: str, tools: list[ToolInfo]) -> CallInfo:
|
840
|
+
"""
|
841
|
+
Return call info - potentially transformed by native tool mappings.
|
842
|
+
|
843
|
+
Anthropic prescribes names for their native tools - `computer`, `bash`, and
|
844
|
+
`str_replace_editor`. For a variety of reasons, Inspect's tool names do not
|
845
|
+
necessarily conform to internal names. Anthropic also provides specific tool
|
846
|
+
types for these built-in tools.
|
847
|
+
"""
|
848
|
+
mappings = (
|
849
|
+
(INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
|
850
|
+
("str_replace_editor", "text_editor_20250124", "text_editor"),
|
851
|
+
("bash", "bash_20250124", "bash_session"),
|
852
|
+
)
|
853
|
+
|
854
|
+
return next(
|
855
|
+
(
|
856
|
+
CallInfo(entry[0], entry[1], entry[2])
|
857
|
+
for entry in mappings
|
858
|
+
if entry[0] == tool_called and any(tool.name == entry[2] for tool in tools)
|
790
859
|
),
|
860
|
+
CallInfo(None, "function", tool_called),
|
791
861
|
)
|
792
862
|
|
793
863
|
|
@@ -852,6 +922,26 @@ async def message_param_content(
|
|
852
922
|
)
|
853
923
|
|
854
924
|
|
925
|
+
async def count_tokens(
|
926
|
+
client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
|
927
|
+
model: str,
|
928
|
+
text: str,
|
929
|
+
) -> int:
|
930
|
+
try:
|
931
|
+
response = await client.messages.count_tokens(
|
932
|
+
model=model,
|
933
|
+
messages=[{"role": "user", "content": text}],
|
934
|
+
)
|
935
|
+
return response.input_tokens
|
936
|
+
except Exception as e:
|
937
|
+
logger.warning(
|
938
|
+
f"Error counting tokens (falling back to estimated tokens): {str(e)}"
|
939
|
+
)
|
940
|
+
words = text.split()
|
941
|
+
estimated_tokens = int(len(words) * 1.3)
|
942
|
+
return estimated_tokens
|
943
|
+
|
944
|
+
|
855
945
|
def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
|
856
946
|
# remove base64 encoded images
|
857
947
|
if (
|
@@ -267,8 +267,12 @@ class GoogleGenAIAPI(ModelAPI):
|
|
267
267
|
import requests # type: ignore
|
268
268
|
|
269
269
|
# standard http errors
|
270
|
-
if
|
271
|
-
|
270
|
+
if (
|
271
|
+
isinstance(ex, APIError)
|
272
|
+
and isinstance(ex.status, str)
|
273
|
+
and ex.status.isdigit()
|
274
|
+
):
|
275
|
+
return is_retryable_http_status(int(ex.status))
|
272
276
|
|
273
277
|
# low-level requests exceptions
|
274
278
|
elif isinstance(ex, requests.exceptions.RequestException):
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from inspect_ai._util.error import PrerequisiteError
|
2
|
+
from inspect_ai.tool import ToolChoice, ToolInfo
|
3
|
+
|
4
|
+
from .._chat_message import ChatMessage
|
5
|
+
from .._generate_config import GenerateConfig
|
6
|
+
from .._model import ModelAPI
|
7
|
+
from .._model_output import ModelOutput
|
8
|
+
|
9
|
+
|
10
|
+
class NoModel(ModelAPI):
|
11
|
+
"""A sentinel model type indicating there is no model specified."""
|
12
|
+
|
13
|
+
def __init__(
|
14
|
+
self,
|
15
|
+
model_name: str = "none",
|
16
|
+
base_url: str | None = None,
|
17
|
+
api_key: str | None = None,
|
18
|
+
config: GenerateConfig = GenerateConfig(),
|
19
|
+
) -> None:
|
20
|
+
super().__init__(model_name, base_url, api_key, [], config)
|
21
|
+
|
22
|
+
async def generate(
|
23
|
+
self,
|
24
|
+
input: list[ChatMessage],
|
25
|
+
tools: list[ToolInfo],
|
26
|
+
tool_choice: ToolChoice,
|
27
|
+
config: GenerateConfig,
|
28
|
+
) -> ModelOutput:
|
29
|
+
raise PrerequisiteError(
|
30
|
+
"No model specified (and no INSPECT_EVAL_MODEL defined)"
|
31
|
+
)
|
@@ -67,6 +67,16 @@ class OpenAIAPI(ModelAPI):
|
|
67
67
|
config: GenerateConfig = GenerateConfig(),
|
68
68
|
**model_args: Any,
|
69
69
|
) -> None:
|
70
|
+
# extract azure service prefix from model name (other providers
|
71
|
+
# that subclass from us like together expect to have the qualifier
|
72
|
+
# in the model name e.g. google/gemma-2b-it)
|
73
|
+
parts = model_name.split("/")
|
74
|
+
if parts[0] == "azure" and len(parts) > 1:
|
75
|
+
self.service: str | None = parts[0]
|
76
|
+
model_name = "/".join(parts[1:])
|
77
|
+
else:
|
78
|
+
self.service = None
|
79
|
+
|
70
80
|
# call super
|
71
81
|
super().__init__(
|
72
82
|
model_name=model_name,
|
@@ -76,14 +86,6 @@ class OpenAIAPI(ModelAPI):
|
|
76
86
|
config=config,
|
77
87
|
)
|
78
88
|
|
79
|
-
# extract any service prefix from model name
|
80
|
-
parts = model_name.split("/")
|
81
|
-
if len(parts) > 1:
|
82
|
-
self.service: str | None = parts[0]
|
83
|
-
model_name = "/".join(parts[1:])
|
84
|
-
else:
|
85
|
-
self.service = None
|
86
|
-
|
87
89
|
# resolve api_key
|
88
90
|
if not self.api_key:
|
89
91
|
self.api_key = os.environ.get(
|
@@ -322,6 +324,7 @@ class OpenAIAPI(ModelAPI):
|
|
322
324
|
config.reasoning_effort is not None
|
323
325
|
and not self.is_gpt()
|
324
326
|
and not self.is_o1_mini()
|
327
|
+
and not self.is_o1_preview()
|
325
328
|
):
|
326
329
|
params["reasoning_effort"] = config.reasoning_effort
|
327
330
|
if config.response_schema is not None:
|
@@ -250,6 +250,13 @@ def mockllm() -> type[ModelAPI]:
|
|
250
250
|
return MockLLM
|
251
251
|
|
252
252
|
|
253
|
+
@modelapi(name="none")
|
254
|
+
def none() -> type[ModelAPI]:
|
255
|
+
from .none import NoModel
|
256
|
+
|
257
|
+
return NoModel
|
258
|
+
|
259
|
+
|
253
260
|
@modelapi("goodfire")
|
254
261
|
def goodfire() -> type[ModelAPI]:
|
255
262
|
"""Get the Goodfire API provider."""
|
@@ -34,8 +34,8 @@ from inspect_ai._util.content import (
|
|
34
34
|
Content,
|
35
35
|
ContentAudio,
|
36
36
|
ContentImage,
|
37
|
+
ContentReasoning,
|
37
38
|
ContentText,
|
38
|
-
ContentVideo,
|
39
39
|
)
|
40
40
|
from inspect_ai._util.http import is_retryable_http_status
|
41
41
|
from inspect_ai._util.images import file_as_data
|
@@ -336,10 +336,13 @@ async def content_part(content: Content | str) -> Part:
|
|
336
336
|
elif isinstance(content, ContentImage):
|
337
337
|
image_bytes, mime_type = await file_as_data(content.image)
|
338
338
|
return Part.from_image(image=Image.from_bytes(data=image_bytes))
|
339
|
+
elif isinstance(content, ContentReasoning):
|
340
|
+
return Part.from_text(content.reasoning or NO_CONTENT)
|
339
341
|
else:
|
340
342
|
if isinstance(content, ContentAudio):
|
341
343
|
file = content.audio
|
342
|
-
|
344
|
+
else:
|
345
|
+
# it's ContentVideo
|
343
346
|
file = content.video
|
344
347
|
file_bytes, mime_type = await file_as_data(file)
|
345
348
|
return Part.from_data(file_bytes, mime_type)
|
@@ -17,7 +17,7 @@ from .._task_state import TaskState
|
|
17
17
|
def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
|
18
18
|
"""Bridge an external agent into an Inspect Solver.
|
19
19
|
|
20
|
-
See documentation at <https://inspect.
|
20
|
+
See documentation at <https://inspect.aisi.org.uk/agent-bridge.html>
|
21
21
|
|
22
22
|
Args:
|
23
23
|
agent: Callable which takes a sample `dict` and returns a result `dict`.
|
inspect_ai/solver/_chain.py
CHANGED
@@ -2,10 +2,11 @@ from typing import Sequence, overload
|
|
2
2
|
|
3
3
|
from typing_extensions import override
|
4
4
|
|
5
|
-
from ._solver import Generate, Solver
|
5
|
+
from ._solver import Generate, Solver, solver
|
6
6
|
from ._task_state import TaskState
|
7
7
|
|
8
8
|
|
9
|
+
@solver
|
9
10
|
def chain(*solvers: Solver | list[Solver]) -> Solver:
|
10
11
|
"""Compose a solver from multiple other solvers.
|
11
12
|
|
@@ -22,8 +23,8 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
|
|
22
23
|
"""
|
23
24
|
# flatten lists and chains
|
24
25
|
all_solvers: list[Solver] = []
|
25
|
-
for
|
26
|
-
all_solvers.extend(unroll(
|
26
|
+
for s in solvers:
|
27
|
+
all_solvers.extend(unroll(s))
|
27
28
|
|
28
29
|
return Chain(all_solvers)
|
29
30
|
|
@@ -72,9 +73,9 @@ class Chain(Sequence[Solver], Solver):
|
|
72
73
|
) -> TaskState:
|
73
74
|
from ._transcript import solver_transcript
|
74
75
|
|
75
|
-
for
|
76
|
-
with solver_transcript(
|
77
|
-
state = await
|
76
|
+
for slv in self._solvers:
|
77
|
+
with solver_transcript(slv, state) as st:
|
78
|
+
state = await slv(state, generate)
|
78
79
|
st.complete(state)
|
79
80
|
if state.completed:
|
80
81
|
break
|
inspect_ai/tool/__init__.py
CHANGED
@@ -22,17 +22,21 @@ from ._tool_def import ToolDef
|
|
22
22
|
from ._tool_info import ToolInfo
|
23
23
|
from ._tool_params import ToolParam, ToolParams
|
24
24
|
from ._tool_with import tool_with
|
25
|
+
from ._tools._bash_session import bash_session
|
25
26
|
from ._tools._computer import computer
|
26
27
|
from ._tools._execute import bash, python
|
28
|
+
from ._tools._text_editor import text_editor
|
27
29
|
from ._tools._web_browser import web_browser
|
28
30
|
from ._tools._web_search import web_search
|
29
31
|
|
30
32
|
__all__ = [
|
31
33
|
"bash",
|
34
|
+
"bash_session",
|
32
35
|
"computer",
|
33
36
|
"python",
|
34
37
|
"web_browser",
|
35
38
|
"web_search",
|
39
|
+
"text_editor",
|
36
40
|
"tool",
|
37
41
|
"tool_with",
|
38
42
|
"Tool",
|