inspect-ai 0.3.94__py3-none-any.whl → 0.3.95__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +12 -6
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +9 -9
- inspect_ai/_view/www/dist/assets/index.js +117 -120
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +57 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
- inspect_ai/analysis/beta/_dataframe/util.py +157 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/log/_file.py +1 -1
- inspect_ai/log/_log.py +21 -1
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_model.py +6 -4
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/solver/_multiple_choice.py +4 -1
- inspect_ai/solver/_task_state.py +7 -3
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +70 -43
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,171 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from logging import getLogger
|
4
|
+
from typing import Any, Iterator, Mapping, Type
|
5
|
+
|
6
|
+
import jsonref # type: ignore
|
7
|
+
from jsonpath_ng import Fields, Index, JSONPath, Slice, Where, WhereNot # type: ignore
|
8
|
+
from jsonpath_ng.ext.filter import Filter # type: ignore
|
9
|
+
from pydantic import BaseModel
|
10
|
+
|
11
|
+
logger = getLogger(__name__)
|
12
|
+
|
13
|
+
Schema = Mapping[str, Any]
|
14
|
+
|
15
|
+
|
16
|
+
def resolved_schema(model: Type[BaseModel]) -> Schema:
    """Return the JSON schema for `model` with all `$ref` pointers inlined.

    Uses `jsonref.replace_refs` so downstream path validation can walk a
    self-contained schema without chasing references.

    Args:
        model: Pydantic model class to generate the schema for.

    Returns:
        Mapping representing the fully de-referenced JSON schema.
    """
    raw_schema = model.model_json_schema()
    # synthetic base URI; only needed so relative $refs resolve consistently
    base_uri = "file:///memory/inspect_schema.json"
    resolved: Schema = jsonref.replace_refs(
        raw_schema, base_uri=base_uri, jsonschema=True, proxies=False
    )
    return resolved
|
23
|
+
|
24
|
+
|
25
|
+
def jsonpath_in_schema(expr: JSONPath, schema: Schema) -> bool:
    """Check whether `expr` can match at least one location in `schema`.

    Linearises the path into tokens and walks them through the schema,
    tracking the set of schema nodes still reachable after each segment.
    Expressions using constructs we do not validate (filters, slices, ...)
    are optimistically accepted.
    """
    # don't validate unsupported constructs
    if find_unsupported(expr):
        return True

    def declared_types(node: Schema) -> set[str]:
        # normalize the "type" keyword (absent / string / list) to a set
        declared = node.get("type")
        if isinstance(declared, list):
            return set(declared)
        return {declared} if declared else set()

    def items_schemas(items: Any, index: int | None = None) -> list[Schema]:
        # normalize "items" (tuple-form list or single mapping) to a list
        if isinstance(items, list):
            if index is None:  # wildcard/slice
                return items
            return [items[index]] if 0 <= index < len(items) else []
        return [items] if isinstance(items, Mapping) else []

    def step_concrete(node: Schema, token: str | int | None) -> list[Schema]:
        # a totally open object accepts any child
        if node == {}:
            return [{}]  # stay alive, accept any key

        reached: list[Schema] = []

        def admit_extra_keys(obj: Schema) -> None:
            """Append the schema that governs unknown keys.

            - None / missing -> open object -> {}
            - True -> open object -> {}
            - Mapping -> that mapping (could be {} or a real subschema)
            - False -> closed object -> (do nothing)
            """
            if "additionalProperties" not in obj:
                if not obj.get("properties"):
                    reached.append({})
            else:
                extra = obj["additionalProperties"]
                if extra is True:
                    reached.append({})
                elif isinstance(extra, Mapping):  # {} or {...}
                    reached.append(extra)
                # extra is False -> closed dict -> ignore

        # Wildcard -----------------------------------------------------------
        if token is None:
            if "properties" in node:
                reached.extend(node["properties"].values())
            if "object" in declared_types(node):
                admit_extra_keys(node)
            if "array" in declared_types(node) and "items" in node:
                reached.extend(items_schemas(node["items"]))
            return reached

        # Property access ----------------------------------------------------
        if isinstance(token, str):
            if "properties" in node and token in node["properties"]:
                reached.append(node["properties"][token])
            elif "additionalProperties" in node:  # PRESENCE, not truthiness
                admit_extra_keys(node)
            elif "object" in declared_types(node):
                admit_extra_keys(node)

        # Array index --------------------------------------------------------
        else:  # token is an int from an Index node
            if "array" in declared_types(node) and "items" in node:
                reached.extend(items_schemas(node["items"], index=token))

        return reached

    def step(node: Schema, token: str | int | None) -> list[Schema]:
        # branch through anyOf/oneOf/allOf, then descend each branch
        reached: list[Schema] = []
        for candidate in _expand_union(node):
            reached.extend(step_concrete(candidate, token))
        return reached

    alive = [schema]
    for token in iter_tokens(expr):
        survivors: list[Schema] = []
        for state in alive:
            survivors.extend(step(state, token))
        if not survivors:  # nothing matched this segment
            return False
        alive = survivors
    return True  # every segment found at least one schema
|
113
|
+
|
114
|
+
|
115
|
+
def iter_tokens(node: JSONPath) -> Iterator[str | int | None]:
    """Linearise a jsonpath-ng AST into a stream of tokens we care about."""
    # binary nodes (Child, Descendants, etc.) expose .left/.right — recurse
    if hasattr(node, "left"):
        yield from iter_tokens(node.left)
        yield from iter_tokens(node.right)
        return
    if isinstance(node, Fields):
        # field names, e.g. ["foo"]
        yield from node.fields
    elif isinstance(node, Index):
        # 0 / -1 / None for wildcard
        yield node.index
    elif isinstance(node, Slice):
        # treat any slice as a wildcard
        yield None
|
126
|
+
|
127
|
+
|
128
|
+
COMBINATORS = ("anyOf", "oneOf", "allOf")
|
129
|
+
|
130
|
+
|
131
|
+
def _expand_union(sch: Schema) -> list[Schema]:
|
132
|
+
"""Return sch itself or the list of subschemas if it is a combinator."""
|
133
|
+
for key in COMBINATORS:
|
134
|
+
if key in sch:
|
135
|
+
subs: list[Schema] = []
|
136
|
+
for sub in sch[key]:
|
137
|
+
# a sub-schema might itself be an anyOf/oneOf/allOf
|
138
|
+
subs.extend(_expand_union(sub))
|
139
|
+
return subs
|
140
|
+
return [sch]
|
141
|
+
|
142
|
+
|
143
|
+
UNSUPPORTED: tuple[type[JSONPath], ...] = (
    Filter,  # [?foo > 0]
    Where,  # .foo[(@.bar < 42)]
    WhereNot,
    Slice,  # [1:5] (wildcard “[*]” is Index/None, not Slice)
)


def find_unsupported(node: JSONPath) -> list[type[JSONPath]]:
    """Return a list of node types present in `node` that we do not validate."""
    found: list[type[JSONPath]] = []
    pending: list[JSONPath] = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, UNSUPPORTED):
            found.append(type(current))
        # drill into structural children (jsonpath-ng uses .left / .right / .child attributes)
        for attr in ("left", "right", "child", "expression"):
            child = getattr(current, attr, None)
            if isinstance(child, JSONPath):
                pending.append(child)
        # handle list-valued containers like Fields(fields=[...]) and Index(index=[...])
        if hasattr(current, "__dict__"):
            for value in vars(current).values():
                if isinstance(value, list):
                    pending.extend(item for item in value if isinstance(item, JSONPath))
    return found
|
inspect_ai/log/_file.py
CHANGED
@@ -524,7 +524,7 @@ def manifest_eval_log_name(info: EvalLogInfo, log_dir: str, sep: str) -> str:
|
|
524
524
|
|
525
525
|
def log_files_from_ls(
|
526
526
|
ls: list[FileInfo],
|
527
|
-
formats: list[Literal["eval", "json"]] | None,
|
527
|
+
formats: list[Literal["eval", "json"]] | None = None,
|
528
528
|
descending: bool = True,
|
529
529
|
) -> list[EvalLogInfo]:
|
530
530
|
extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
|
inspect_ai/log/_log.py
CHANGED
@@ -17,9 +17,11 @@ from pydantic import (
|
|
17
17
|
)
|
18
18
|
from rich.console import Console, RenderableType
|
19
19
|
from rich.traceback import Traceback
|
20
|
+
from shortuuid import uuid
|
20
21
|
|
21
|
-
from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
|
22
|
+
from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, DESERIALIZING, PKG_NAME
|
22
23
|
from inspect_ai._util.error import EvalError, exception_message
|
24
|
+
from inspect_ai._util.hash import base57_id_hash
|
23
25
|
from inspect_ai._util.logger import warn_once
|
24
26
|
from inspect_ai.approval._policy import ApprovalPolicyConfig
|
25
27
|
from inspect_ai.dataset._dataset import MT, metadata_as
|
@@ -677,6 +679,9 @@ class EvalModelConfig(BaseModel):
|
|
677
679
|
class EvalSpec(BaseModel):
|
678
680
|
"""Eval target and configuration."""
|
679
681
|
|
682
|
+
eval_id: str = Field(default_factory=str)
|
683
|
+
"""Globally unique id for eval."""
|
684
|
+
|
680
685
|
run_id: str = Field(default_factory=str)
|
681
686
|
"""Unique run id"""
|
682
687
|
|
@@ -757,6 +762,21 @@ class EvalSpec(BaseModel):
|
|
757
762
|
# allow field model_args
|
758
763
|
model_config = ConfigDict(protected_namespaces=())
|
759
764
|
|
765
|
+
def model_post_init(self, __context: Any) -> None:
|
766
|
+
# check if deserializing
|
767
|
+
is_deserializing = isinstance(__context, dict) and __context.get(
|
768
|
+
DESERIALIZING, False
|
769
|
+
)
|
770
|
+
|
771
|
+
# Generate eval_id if needed
|
772
|
+
if self.eval_id == "":
|
773
|
+
if is_deserializing:
|
774
|
+
# we want the eval_id to be stable across reads of the eval log so we compose it
|
775
|
+
# as a hash that matches the size/apperance of shortuuid-based uuids
|
776
|
+
self.eval_id = base57_id_hash(self.run_id + self.task_id + self.created)
|
777
|
+
else:
|
778
|
+
self.eval_id = uuid()
|
779
|
+
|
760
780
|
@model_validator(mode="before")
|
761
781
|
@classmethod
|
762
782
|
def read_sandbox_spec(
|
inspect_ai/model/_call_tools.py
CHANGED
@@ -39,6 +39,7 @@ from inspect_ai._util.content import (
|
|
39
39
|
ContentText,
|
40
40
|
ContentVideo,
|
41
41
|
)
|
42
|
+
from inspect_ai._util.exception import TerminateSampleError
|
42
43
|
from inspect_ai._util.format import format_function_call
|
43
44
|
from inspect_ai._util.logger import warn_once
|
44
45
|
from inspect_ai._util.registry import registry_unqualified_name
|
@@ -376,7 +377,7 @@ async def call_tool(
|
|
376
377
|
transcript()._event(
|
377
378
|
SampleLimitEvent(type="operator", limit=1, message=message)
|
378
379
|
)
|
379
|
-
raise
|
380
|
+
raise TerminateSampleError(message)
|
380
381
|
else:
|
381
382
|
raise ToolApprovalError(approval.explanation if approval else None)
|
382
383
|
if approval and approval.modified:
|
inspect_ai/model/_model.py
CHANGED
@@ -1237,9 +1237,10 @@ def tool_result_images_as_user_message(
|
|
1237
1237
|
|
1238
1238
|
Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
|
1239
1239
|
"""
|
1240
|
-
init_accum: ImagesAccumulator = ([], [], [])
|
1241
1240
|
chat_messages, user_message_content, tool_call_ids = functools.reduce(
|
1242
|
-
tool_result_images_reducer,
|
1241
|
+
tool_result_images_reducer,
|
1242
|
+
messages,
|
1243
|
+
(list[ChatMessage](), list[Content](), list[str]()),
|
1243
1244
|
)
|
1244
1245
|
# if the last message was a tool result, we may need to flush the pending stuff here
|
1245
1246
|
return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
|
@@ -1265,9 +1266,10 @@ def tool_result_images_reducer(
|
|
1265
1266
|
and isinstance(message.content, list)
|
1266
1267
|
and any([isinstance(c, ContentImage) for c in message.content])
|
1267
1268
|
):
|
1268
|
-
init_accum: ImageContentAccumulator = ([], [])
|
1269
1269
|
new_user_message_content, edited_tool_message_content = functools.reduce(
|
1270
|
-
tool_result_image_content_reducer,
|
1270
|
+
tool_result_image_content_reducer,
|
1271
|
+
message.content,
|
1272
|
+
(list[Content](), list[Content]()),
|
1271
1273
|
)
|
1272
1274
|
|
1273
1275
|
return (
|
@@ -184,24 +184,23 @@ def openai_responses_chat_choices(
|
|
184
184
|
# │ │ ┌───────────────────┐ │ │ │ │ ┌───────────────────┐ │ │ │ │ ┌───────────────────┐ │ │
|
185
185
|
# │ │ │ type: "reasoning" │ │ │ │ │ │ ContentText │ │ │ │ │ │ type: "reasoning" │ │ │
|
186
186
|
# │ │ │ id: "rs_bbbbbb" │ │ │ │ │ │ text: "" │ │ │ │ │ │ id: "rs_bbbbbb" │ │ │
|
187
|
-
# │ │ │ summary: [] │ │ │ │ │
|
188
|
-
# │ │
|
189
|
-
# │ │
|
190
|
-
# │ │ │
|
191
|
-
# │ │ │
|
192
|
-
# │ │ │
|
193
|
-
# │ │ │
|
194
|
-
# │ │ │ │
|
195
|
-
# │ │ │ │
|
196
|
-
# │ │ │ │
|
197
|
-
# │ │ │ │
|
198
|
-
# │ │ │ │
|
199
|
-
# │ │ │
|
200
|
-
# │ │
|
201
|
-
# │
|
202
|
-
# │ │
|
203
|
-
#
|
204
|
-
# └───────────────────────────┘ │ │ │ "msg_ccccccc" │ │ │
|
187
|
+
# │ │ │ summary: [] │ │ │ │ │ ├───────────────────┤ │ │ │ │ │ summary: [] │ │ │
|
188
|
+
# │ │ ├───────────────────┤ │ │ │ │ │ ContentText │ │ │ │ │ ├───────────────────┤ │ │
|
189
|
+
# │ │ │ type: "message" │ │ │ │ │ │ text: "text1" │ │ │ │ │ │ type: "message" │ │ │
|
190
|
+
# │ │ │ id: "msg_ccccccc" │ │ │ │ │ ├───────────────────┤ │ │ │ │ │ id: "msg_ccccccc" │ │ │
|
191
|
+
# │ │ │ role: "assistant" │ │ │ │ │ │ ContentText │ │ │ │ │ │ role: "assistant" │ │ │
|
192
|
+
# │ │ │ ┌───────────────┐ │ │ │ -> │ │ │ text: "text2" │ │ │ -> │ │ │ ┌───────────────┐ │ │ │
|
193
|
+
# │ │ │ │ Content │ │ │ │ │ │ └───────────────────┘ │ │ │ │ │ │ Content │ │ │ │
|
194
|
+
# │ │ │ │ ┌───────────┐ │ │ │ │ │ └───────────────────────┘ │ │ │ │ │ ┌───────────┐ │ │ │ │
|
195
|
+
# │ │ │ │ │"text1" │ │ │ │ │ │ ┌───────────────────────┐ │ │ │ │ │ │"text1" │ │ │ │ │
|
196
|
+
# │ │ │ │ ├───────────┤ │ │ │ │ │ │ internal │ │ │ │ │ │ ├───────────┤ │ │ │ │
|
197
|
+
# │ │ │ │ │"text2" │ │ │ │ │ │ │ ┌───────────────────┐ │ │ │ │ │ │ │"text2" │ │ │ │ │
|
198
|
+
# │ │ │ │ └───────────┘ │ │ │ │ │ │ │ reasoning_id: │ │ │ │ │ │ │ └───────────┘ │ │ │ │
|
199
|
+
# │ │ │ └───────────────┘ │ │ │ │ │ │ "rs_bbbbbb" │ │ │ │ │ │ └───────────────┘ │ │ │
|
200
|
+
# │ │ └───────────────────┘ │ │ │ │ └───────────────────┘ │ │ │ │ └───────────────────┘ │ │
|
201
|
+
# │ └───────────────────────┘ │ │ │ ┌───────────────────┐ │ │ │ └───────────────────────┘ │
|
202
|
+
# └───────────────────────────┘ │ │ │ output_msg_id: │ │ │ └───────────────────────────┘
|
203
|
+
# │ │ │ "msg_ccccccc" │ │ │
|
205
204
|
# │ │ └───────────────────┘ │ │
|
206
205
|
# │ └───────────────────────┘ │
|
207
206
|
# └───────────────────────────┘
|
@@ -33,7 +33,10 @@ from anthropic.types import (
|
|
33
33
|
ToolUseBlockParam,
|
34
34
|
message_create_params,
|
35
35
|
)
|
36
|
-
from anthropic.types.beta import
|
36
|
+
from anthropic.types.beta import (
|
37
|
+
BetaToolComputerUse20250124Param,
|
38
|
+
BetaToolTextEditor20241022Param,
|
39
|
+
)
|
37
40
|
from pydantic import JsonValue
|
38
41
|
from typing_extensions import override
|
39
42
|
|
@@ -218,6 +221,8 @@ class AnthropicAPI(ModelAPI):
|
|
218
221
|
# tools are generally available for Claude 3.5 Sonnet (new) as well and
|
219
222
|
# can be used without the computer use beta header.
|
220
223
|
betas.append("computer-use-2025-01-24")
|
224
|
+
if any("20241022" in str(tool.get("type", "")) for tool in tools_param):
|
225
|
+
betas.append("computer-use-2024-10-22")
|
221
226
|
if len(betas) > 0:
|
222
227
|
extra_headers["anthropic-beta"] = ",".join(betas)
|
223
228
|
|
@@ -337,6 +342,15 @@ class AnthropicAPI(ModelAPI):
|
|
337
342
|
@override
|
338
343
|
def should_retry(self, ex: Exception) -> bool:
|
339
344
|
if isinstance(ex, APIStatusError):
|
345
|
+
# for unknown reasons, anthropic does not always set status_code == 529
|
346
|
+
# for "overloaded_error" so we check for it explicitly
|
347
|
+
if (
|
348
|
+
isinstance(ex.body, dict)
|
349
|
+
and ex.body.get("error", {}).get("type", "") == "overloaded_error"
|
350
|
+
):
|
351
|
+
return True
|
352
|
+
|
353
|
+
# standard http status code checking
|
340
354
|
return is_retryable_http_status(ex.status_code)
|
341
355
|
elif httpx_should_retry(ex):
|
342
356
|
return True
|
@@ -545,7 +559,7 @@ class AnthropicAPI(ModelAPI):
|
|
545
559
|
|
546
560
|
def text_editor_tool_param(
|
547
561
|
self, tool: ToolInfo
|
548
|
-
) ->
|
562
|
+
) -> ToolTextEditor20250124Param | BetaToolTextEditor20241022Param | None:
|
549
563
|
# check for compatible 'text editor' tool
|
550
564
|
if tool.name == "text_editor" and (
|
551
565
|
sorted(tool.parameters.properties.keys())
|
@@ -561,8 +575,14 @@ class AnthropicAPI(ModelAPI):
|
|
561
575
|
]
|
562
576
|
)
|
563
577
|
):
|
564
|
-
return
|
565
|
-
|
578
|
+
return (
|
579
|
+
BetaToolTextEditor20241022Param(
|
580
|
+
type="text_editor_20241022", name="str_replace_editor"
|
581
|
+
)
|
582
|
+
if self.is_claude_3_5()
|
583
|
+
else ToolTextEditor20250124Param(
|
584
|
+
type="text_editor_20250124", name="str_replace_editor"
|
585
|
+
)
|
566
586
|
)
|
567
587
|
# not a text_editor tool
|
568
588
|
else:
|
@@ -571,7 +591,10 @@ class AnthropicAPI(ModelAPI):
|
|
571
591
|
|
572
592
|
# tools can be either a stock tool param or a special Anthropic native use tool param
|
573
593
|
ToolParamDef = (
|
574
|
-
ToolParam
|
594
|
+
ToolParam
|
595
|
+
| BetaToolComputerUse20250124Param
|
596
|
+
| ToolTextEditor20250124Param
|
597
|
+
| BetaToolTextEditor20241022Param
|
575
598
|
)
|
576
599
|
|
577
600
|
|
@@ -580,6 +603,7 @@ def add_cache_control(
|
|
580
603
|
| ToolParam
|
581
604
|
| BetaToolComputerUse20250124Param
|
582
605
|
| ToolTextEditor20250124Param
|
606
|
+
| BetaToolTextEditor20241022Param
|
583
607
|
| dict[str, Any],
|
584
608
|
) -> None:
|
585
609
|
cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
|
@@ -844,6 +868,7 @@ def _names_for_tool_call(
|
|
844
868
|
"""
|
845
869
|
mappings = (
|
846
870
|
(INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
|
871
|
+
("str_replace_editor", "text_editor_20241022", "text_editor"),
|
847
872
|
("str_replace_editor", "text_editor_20250124", "text_editor"),
|
848
873
|
("bash", "bash_20250124", "bash_session"),
|
849
874
|
)
|
@@ -200,6 +200,7 @@ def multiple_choice(
|
|
200
200
|
template: str | None = None,
|
201
201
|
cot: bool = False,
|
202
202
|
multiple_correct: bool = False,
|
203
|
+
max_tokens: int | None = None,
|
203
204
|
**kwargs: Unpack[DeprecatedArgs],
|
204
205
|
) -> Solver:
|
205
206
|
"""Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
|
@@ -226,6 +227,8 @@ def multiple_choice(
|
|
226
227
|
squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
|
227
228
|
as `False` if there's exactly one correct answer from the choices
|
228
229
|
available. NOTE: this has no effect if you provide a custom template.
|
230
|
+
max_tokens: Default `None`. Controls the number of tokens generated through the call
|
231
|
+
to generate().
|
229
232
|
**kwargs (Any): Deprecated arguments for backward compatibility.
|
230
233
|
|
231
234
|
#### Shuffling
|
@@ -282,7 +285,7 @@ def multiple_choice(
|
|
282
285
|
template=str(template),
|
283
286
|
)
|
284
287
|
|
285
|
-
state = await generate(state)
|
288
|
+
state = await generate(state, max_tokens=max_tokens)
|
286
289
|
|
287
290
|
answers = parse_answers(state)
|
288
291
|
if answers and answers.group(1):
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -204,13 +204,17 @@ class TaskState:
|
|
204
204
|
Convenience function for accessing the initial input from the `Sample` as a string.
|
205
205
|
|
206
206
|
If the `input` is a `list[ChatMessage]`, this will return the text from
|
207
|
-
the
|
207
|
+
the last chat message
|
208
208
|
"""
|
209
209
|
if isinstance(self._input, str):
|
210
210
|
return self._input
|
211
211
|
else:
|
212
212
|
input = next(
|
213
|
-
(
|
213
|
+
(
|
214
|
+
message.text
|
215
|
+
for message in reversed(self._input)
|
216
|
+
if message.role == "user"
|
217
|
+
),
|
214
218
|
None,
|
215
219
|
)
|
216
220
|
if input:
|
@@ -231,7 +235,7 @@ class TaskState:
|
|
231
235
|
write access to the user chat prompt. Raises an
|
232
236
|
exception if there is no user prompt
|
233
237
|
"""
|
234
|
-
prompt = next((m for m in self.messages if m.role == "user"), None)
|
238
|
+
prompt = next((m for m in reversed(self.messages) if m.role == "user"), None)
|
235
239
|
if prompt:
|
236
240
|
return prompt
|
237
241
|
else:
|
inspect_ai/tool/_mcp/_context.py
CHANGED
@@ -2,13 +2,11 @@ from contextlib import _AsyncGeneratorContextManager
|
|
2
2
|
from typing import TypeAlias
|
3
3
|
|
4
4
|
from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
|
5
|
-
from mcp.
|
6
|
-
JSONRPCMessage,
|
7
|
-
)
|
5
|
+
from mcp.shared.message import SessionMessage
|
8
6
|
|
9
7
|
MCPServerContext: TypeAlias = _AsyncGeneratorContextManager[
|
10
8
|
tuple[
|
11
|
-
MemoryObjectReceiveStream[
|
12
|
-
MemoryObjectSendStream[
|
9
|
+
MemoryObjectReceiveStream[SessionMessage | Exception],
|
10
|
+
MemoryObjectSendStream[SessionMessage],
|
13
11
|
],
|
14
12
|
]
|
inspect_ai/tool/_mcp/server.py
CHANGED
inspect_ai/tool/_tools/_think.py
CHANGED
@@ -41,7 +41,7 @@ def think(
|
|
41
41
|
def think_tool_viewer() -> ToolCallViewer:
|
42
42
|
def viewer(tool_call: ToolCall) -> ToolCallView:
|
43
43
|
call = ToolCallContent(
|
44
|
-
format="markdown", content=tool_call.arguments
|
44
|
+
format="markdown", content=tool_call.arguments.get("thought", "")
|
45
45
|
)
|
46
46
|
return ToolCallView(call=call)
|
47
47
|
|