inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +3 -4
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +36 -24
- inspect_ai/_eval/evalset.py +17 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +8 -13
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/file.py +13 -0
- inspect_ai/_util/json.py +2 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +304 -128
- inspect_ai/_view/www/dist/assets/index.js +47495 -27519
- inspect_ai/_view/www/log-schema.json +124 -31
- inspect_ai/_view/www/package.json +3 -0
- inspect_ai/_view/www/src/App.tsx +12 -0
- inspect_ai/_view/www/src/appearance/icons.ts +1 -0
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
- inspect_ai/_view/www/src/state/hooks.ts +5 -3
- inspect_ai/_view/www/src/state/logPolling.ts +5 -1
- inspect_ai/_view/www/src/state/logSlice.ts +10 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
- inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
- inspect_ai/_view/www/src/types/log.d.ts +34 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
- inspect_ai/_view/www/yarn.lock +94 -1
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_log.py +11 -2
- inspect_ai/log/_transcript.py +13 -9
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +256 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_model.py +113 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +2 -2
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +179 -153
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
from logging import getLogger
|
2
|
+
|
3
|
+
from inspect_ai._util.logger import warn_once
|
4
|
+
from inspect_ai.agent._as_solver import as_solver
|
5
|
+
|
6
|
+
from ._solver import Solver, solver
|
7
|
+
|
8
|
+
logger = getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
@solver
def human_agent(
    answer: bool | str = True,
    intermediate_scoring: bool = False,
    record_session: bool = True,
) -> Solver:
    """Deprecated solver form of the `human_cli` agent.

    Emits a one-time deprecation warning, then builds the `human_cli`
    agent with the same arguments and adapts it to a solver via
    `as_solver()`.

    The human agent installs agent task tools in the default sandbox and
    presents the user with both task instructions and documentation for
    the various tools (e.g. `task submit`, `task start`, `task stop`,
    `task instructions`, etc.). A human agent panel is displayed with
    instructions for logging in to the sandbox.

    If the user is running in VS Code with the Inspect extension, they
    will also be presented with links to login to the sandbox using a
    VS Code Window or Terminal.

    Args:
       answer: Is an explicit answer required for this task or is it scored
          based on files in the container? Pass a `str` with a regex to
          validate that the answer matches the expected format.
       intermediate_scoring: Allow the human agent to check their score
          while working.
       record_session: Record all user commands and outputs in the sandbox
          bash session.

    Returns:
       Solver: Human agent solver.
    """
    # function-scope import, kept local exactly as in the original
    from inspect_ai.agent._human.agent import human_cli

    deprecation_notice = "The human_agent solver is deprecated. Please use the human_cli agent from the agents module instead."
    warn_once(logger, deprecation_notice)

    # build the agent first, then adapt it to the Solver interface
    cli_agent = human_cli(
        answer=answer,
        intermediate_scoring=intermediate_scoring,
        record_session=record_session,
    )
    return as_solver(cli_agent)
|
inspect_ai/solver/_prompt.py
CHANGED
@@ -123,7 +123,9 @@ def assistant_message(template: str, **params: Any) -> Solver:
|
|
123
123
|
async def solve(state: TaskState, generate: Generate) -> TaskState:
|
124
124
|
kwargs = state.metadata | state.store._data | params
|
125
125
|
state.messages.append(
|
126
|
-
ChatMessageAssistant(
|
126
|
+
ChatMessageAssistant(
|
127
|
+
content=format_template(content, kwargs), model=state.model.name
|
128
|
+
)
|
127
129
|
)
|
128
130
|
return state
|
129
131
|
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from copy import copy
|
2
|
+
|
3
|
+
from inspect_ai.model import ChatMessage, ChatMessageUser, ModelName, ModelOutput
|
4
|
+
|
5
|
+
from ._fork import task_generate
|
6
|
+
from ._solver import Solver
|
7
|
+
from ._task_state import TaskState
|
8
|
+
|
9
|
+
|
10
|
+
async def run(
    solver: Solver, input: str | list[ChatMessage]
) -> tuple[list[ChatMessage], ModelOutput | None]:
    """Run a solver over chat message input.

    A fresh `TaskState` is built for the currently active sample (model,
    sample id and epoch are taken from it), the solver is awaited on that
    state, and the messages it produced are returned.

    Args:
        solver: Solver to run.
        input: Chat message input. A plain `str` is wrapped in a single
           `ChatMessageUser`.

    Returns:
        Tuple of `list[ChatMessage], ModelOutput | None`. The list holds
        every message in the final state that is not part of the initial
        input prefix (compared by position and message id). The output is
        `None` when the final state's output has no choices.

    Raises:
        RuntimeError: If called outside of a running task, when there is
           no active sample, or when the active sample has no id.
    """
    from inspect_ai.log._samples import sample_active

    # get the generate function for the current task
    generate = task_generate()
    if generate is None:
        raise RuntimeError("Called run() outside of a running task.")

    # get the active sample
    active = sample_active()
    if active is None:
        raise RuntimeError("Called run() outside of a running task")

    # Explicit None check instead of `assert active.sample.id`: a bare
    # assert is stripped under `python -O` and, being a truthiness test,
    # would also reject a legitimate falsy id such as 0.
    sample_id = active.sample.id
    if sample_id is None:
        raise RuntimeError("Called run() for a sample that has no id.")

    # build messages list
    messages: list[ChatMessage] = (
        [ChatMessageUser(content=input)] if isinstance(input, str) else input
    )

    # build state (the solver gets its own shallow copy of the list so the
    # original prefix stays available for comparison below)
    state = TaskState(
        model=ModelName(active.model),
        sample_id=sample_id,
        epoch=active.epoch,
        input=input,
        messages=copy(messages),
    )

    # run solver
    state = await solver(state, generate)

    # return any messages that don't match our initial prefix
    prefix_length = len(messages)
    new_messages: list[ChatMessage] = [
        message
        for index, message in enumerate(state.messages)
        if index >= prefix_length or message.id != messages[index].id
    ]

    output = state.output if len(state.output.choices) > 0 else None
    return new_messages, output
|
inspect_ai/solver/_solver.py
CHANGED
@@ -7,6 +7,7 @@ from typing import (
|
|
7
7
|
Literal,
|
8
8
|
ParamSpec,
|
9
9
|
Protocol,
|
10
|
+
TypeAlias,
|
10
11
|
cast,
|
11
12
|
overload,
|
12
13
|
runtime_checkable,
|
@@ -23,6 +24,8 @@ from inspect_ai._util.registry import (
|
|
23
24
|
registry_name,
|
24
25
|
registry_tag,
|
25
26
|
)
|
27
|
+
from inspect_ai.agent._agent import Agent, is_agent
|
28
|
+
from inspect_ai.agent._as_solver import as_solver
|
26
29
|
from inspect_ai.model import CachePolicy, GenerateConfigArgs
|
27
30
|
|
28
31
|
from ._task_state import TaskState, set_sample_state
|
@@ -136,23 +139,27 @@ def solver_create(name: str, **kwargs: Any) -> Solver:
|
|
136
139
|
return cast(Solver, registry_create("solver", name, **kwargs))
|
137
140
|
|
138
141
|
|
142
|
+
SolverType: TypeAlias = Solver | Agent
|
143
|
+
"""Return type for @solver decorated functions. """
|
144
|
+
|
145
|
+
|
139
146
|
@overload
|
140
147
|
def solver(name: str) -> Callable[[Callable[P, Solver]], Callable[P, Solver]]: ...
|
141
148
|
|
142
149
|
|
143
150
|
@overload
|
144
|
-
def solver(name: Callable[P,
|
151
|
+
def solver(name: Callable[P, SolverType]) -> Callable[P, Solver]: ...
|
145
152
|
|
146
153
|
|
147
154
|
def solver(
|
148
|
-
name: str | Callable[P,
|
155
|
+
name: str | Callable[P, SolverType],
|
149
156
|
) -> Callable[[Callable[P, Solver]], Callable[P, Solver]] | Callable[P, Solver]:
|
150
157
|
r"""Decorator for registering solvers.
|
151
158
|
|
152
159
|
Args:
|
153
160
|
name:
|
154
161
|
Optional name for solver. If the decorator has no name
|
155
|
-
argument then the name of the underlying Callable[P,
|
162
|
+
argument then the name of the underlying Callable[P, SolverType]
|
156
163
|
object will be used to automatically assign a name.
|
157
164
|
|
158
165
|
Returns:
|
@@ -176,7 +183,7 @@ def solver(
|
|
176
183
|
# (b) Ensure that instances of Solver created by SolverType also
|
177
184
|
# carry registry info.
|
178
185
|
def create_solver_wrapper(
|
179
|
-
solver_type: Callable[P,
|
186
|
+
solver_type: Callable[P, SolverType], name: str | None = None
|
180
187
|
) -> Callable[P, Solver]:
|
181
188
|
solver_name = registry_name(
|
182
189
|
solver_type, name if name else getattr(solver_type, "__name__")
|
@@ -185,6 +192,9 @@ def solver(
|
|
185
192
|
@wraps(solver_type)
|
186
193
|
def solver_wrapper(*args: P.args, **kwargs: P.kwargs) -> Solver:
|
187
194
|
solver = solver_type(*args, **kwargs)
|
195
|
+
if is_agent(solver):
|
196
|
+
solver = as_solver(solver)
|
197
|
+
solver = cast(Solver, solver)
|
188
198
|
|
189
199
|
if not is_callable_coroutine(solver):
|
190
200
|
raise TypeError(f"'{solver}' is not declared as an async callable.")
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -394,16 +394,18 @@ class TaskState:
|
|
394
394
|
|
395
395
|
return metadata_as(self.metadata, metadata_cls)
|
396
396
|
|
397
|
-
def store_as(self, model_cls: Type[SMT]) -> SMT:
|
397
|
+
def store_as(self, model_cls: Type[SMT], instance: str | None = None) -> SMT:
|
398
398
|
"""Pydantic model interface to the store.
|
399
399
|
|
400
400
|
Args:
|
401
401
|
model_cls: Pydantic model type (must derive from StoreModel)
|
402
|
+
instance: Optional instances name for store (enables multiple instances
|
403
|
+
of a given StoreModel type within a single sample)
|
402
404
|
|
403
405
|
Returns:
|
404
|
-
StoreModel:
|
406
|
+
StoreModel: model_cls bound to sample store data.
|
405
407
|
"""
|
406
|
-
return model_cls(store=self.store)
|
408
|
+
return model_cls(store=self.store, instance=instance)
|
407
409
|
|
408
410
|
|
409
411
|
def sample_state() -> TaskState | None:
|
inspect_ai/tool/_tool_call.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from dataclasses import dataclass, field
|
2
|
-
from typing import Any, Callable, Literal
|
2
|
+
from typing import Any, Callable, Literal, TypedDict
|
3
3
|
|
4
|
-
from pydantic import BaseModel, Field
|
4
|
+
from pydantic import BaseModel, Field, JsonValue
|
5
5
|
|
6
6
|
from inspect_ai._util.content import Content
|
7
7
|
|
@@ -44,11 +44,8 @@ class ToolCall:
|
|
44
44
|
arguments: dict[str, Any]
|
45
45
|
"""Arguments to function."""
|
46
46
|
|
47
|
-
|
48
|
-
"""
|
49
|
-
|
50
|
-
internal_name: str | None = field(default=None)
|
51
|
-
"""Model's internal name for the tool - if any."""
|
47
|
+
internal: JsonValue | None = field(default=None)
|
48
|
+
"""Model provider specific payload - typically used to aid transformation back to model types."""
|
52
49
|
|
53
50
|
parse_error: str | None = field(default=None)
|
54
51
|
"""Error which occurred parsing tool call."""
|
@@ -82,7 +79,17 @@ ToolCallViewer = Callable[[ToolCall], ToolCallView]
|
|
82
79
|
"""Custom view renderer for tool calls."""
|
83
80
|
|
84
81
|
|
85
|
-
|
82
|
+
class ToolCallModelInputHints(TypedDict):
    """Hints handed to a tool's model-input hook (the last argument of `ToolCallModelInput`)."""

    # This type is a little sketchy but it allows tools to customize their
    # input hook behavior based on model limitations without creating a tight
    # coupling to the model provider.
    #
    # NOTE(review): the computer tool's model_input hook looks this hint up as
    # hints.get("forbids_computer_screenshot_truncation", False), a key that is
    # not declared here. One of the two spellings is presumably wrong, which
    # would leave the hint silently ignored. Confirm which name is intended.
    disable_computer_screenshot_truncation: bool
    """The model does not support the truncation/redaction of computer screenshots."""
|
88
|
+
|
89
|
+
|
90
|
+
ToolCallModelInput = Callable[
|
91
|
+
[int, int, str | list[Content], ToolCallModelInputHints], str | list[Content]
|
92
|
+
]
|
86
93
|
"""Determine how tool call results are played back as model input.
|
87
94
|
|
88
95
|
The first argument is an index into the total number of tool results
|
inspect_ai/tool/_tool_def.py
CHANGED
@@ -21,7 +21,7 @@ from ._tool_description import (
|
|
21
21
|
tool_description,
|
22
22
|
)
|
23
23
|
from ._tool_info import parse_tool_info
|
24
|
-
from ._tool_params import ToolParams
|
24
|
+
from ._tool_params import ToolParam, ToolParams
|
25
25
|
|
26
26
|
|
27
27
|
class ToolDef:
|
@@ -194,17 +194,7 @@ def tool_def_fields(tool: Tool) -> ToolDefFields:
|
|
194
194
|
raise ValueError(f"Description not provided for tool function '{name}'")
|
195
195
|
|
196
196
|
# validate that we have types/descriptions for paramters
|
197
|
-
|
198
|
-
|
199
|
-
def raise_not_provided_error(context: str) -> None:
|
200
|
-
raise ValueError(
|
201
|
-
f"{context} not provided for parameter '{param_name}' of tool function '{name}'."
|
202
|
-
)
|
203
|
-
|
204
|
-
if param.type is None and not param.anyOf and not param.enum:
|
205
|
-
raise_not_provided_error("Unsupported type or type annotation")
|
206
|
-
elif not param.description:
|
207
|
-
raise_not_provided_error("Description")
|
197
|
+
validate_tool_parameters(name, tool_info.parameters.properties)
|
208
198
|
|
209
199
|
# see if the user has overriden any of the tool's descriptions
|
210
200
|
desc = tool_description(tool)
|
@@ -238,3 +228,18 @@ def tool_registry_info(
|
|
238
228
|
viewer = info.metadata.get(TOOL_VIEWER, None)
|
239
229
|
model_input = info.metadata.get(TOOL_MODEL_INPUT, None)
|
240
230
|
return name, prompt, parallel, viewer, model_input
|
231
|
+
|
232
|
+
|
233
|
+
def validate_tool_parameters(tool_name: str, parameters: dict[str, ToolParam]) -> None:
    """Check that every tool parameter carries type information and a description.

    Args:
        tool_name: Name of the tool function (used in the error message).
        parameters: Mapping of parameter name to its `ToolParam` definition.

    Raises:
        ValueError: For the first parameter (in mapping order) that has no
           `type`, `anyOf` or `enum`, or that has an empty description.
    """
    for param_name, param in parameters.items():
        # a parameter is typed if any one of type / anyOf / enum is present
        has_type_info = param.type is not None or param.anyOf or param.enum

        if not has_type_info:
            context = "Unsupported type or type annotation"
        elif not param.description:
            context = "Description not"
        else:
            # parameter is fully specified
            continue

        raise ValueError(
            f"{context} provided for parameter '{param_name}' of function '{tool_name}'."
        )
|
@@ -129,8 +129,8 @@ async def tool_container_sandbox(tool_name: str) -> SandboxEnvironment:
|
|
129
129
|
Alternatively, you can include the service into your own Dockerfile:
|
130
130
|
|
131
131
|
ENV PATH="$PATH:/opt/inspect_tool_support/bin"
|
132
|
-
RUN python -m venv /opt/inspect_tool_support &&
|
133
|
-
/opt/inspect_tool_support/bin/pip install inspect-tool-support &&
|
132
|
+
RUN python -m venv /opt/inspect_tool_support && \\
|
133
|
+
/opt/inspect_tool_support/bin/pip install inspect-tool-support && \\
|
134
134
|
/opt/inspect_tool_support/bin/inspect-tool-support post-install
|
135
135
|
""").strip()
|
136
136
|
raise PrerequisiteError(msg)
|
inspect_ai/tool/_tool_with.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
from copy import deepcopy
|
2
|
-
|
3
1
|
from inspect_ai._util.registry import (
|
4
2
|
registry_info,
|
5
3
|
registry_params,
|
@@ -22,10 +20,15 @@ def tool_with(
|
|
22
20
|
viewer: ToolCallViewer | None = None,
|
23
21
|
model_input: ToolCallModelInput | None = None,
|
24
22
|
) -> Tool:
|
25
|
-
"""Tool with modifications to
|
23
|
+
"""Tool with modifications to various attributes.
|
24
|
+
|
25
|
+
This function modifies the passed tool in place and
|
26
|
+
returns it. If you want to create multiple variations
|
27
|
+
of a single tool using `tool_with()` you should create
|
28
|
+
the underlying tool multiple times.
|
26
29
|
|
27
30
|
Args:
|
28
|
-
tool: Tool instance to
|
31
|
+
tool: Tool instance to modify.
|
29
32
|
name: Tool name (optional).
|
30
33
|
description: Tool description (optional).
|
31
34
|
parameters: Parameter descriptions (optional)
|
@@ -36,7 +39,7 @@ def tool_with(
|
|
36
39
|
tool call results are played back as model input.
|
37
40
|
|
38
41
|
Returns:
|
39
|
-
|
42
|
+
The passed tool with the requested modifications.
|
40
43
|
"""
|
41
44
|
# get the existing tool info
|
42
45
|
tool_info = parse_tool_info(tool)
|
@@ -54,8 +57,7 @@ def tool_with(
|
|
54
57
|
param_name
|
55
58
|
]
|
56
59
|
|
57
|
-
#
|
58
|
-
tool_copy = deepcopy(tool)
|
60
|
+
# resolve attributes
|
59
61
|
info = registry_info(tool).model_copy()
|
60
62
|
if parallel is not None:
|
61
63
|
info.metadata[TOOL_PARALLEL] = parallel
|
@@ -64,12 +66,13 @@ def tool_with(
|
|
64
66
|
elif model_input is not None:
|
65
67
|
info.metadata[TOOL_MODEL_INPUT] = model_input
|
66
68
|
|
67
|
-
|
68
|
-
|
69
|
+
# set attributes
|
70
|
+
set_registry_info(tool, info)
|
71
|
+
set_registry_params(tool, registry_params(tool))
|
69
72
|
set_tool_description(
|
70
|
-
|
73
|
+
tool,
|
71
74
|
ToolDescription(
|
72
75
|
name=name, description=description, parameters=tool_info.parameters
|
73
76
|
),
|
74
77
|
)
|
75
|
-
return
|
78
|
+
return tool
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from pydantic import BaseModel, Field, RootModel
|
2
|
+
from shortuuid import uuid
|
2
3
|
|
3
4
|
from inspect_ai.tool import ToolResult
|
4
5
|
from inspect_ai.tool._tool_support_helpers import (
|
@@ -52,13 +53,21 @@ def code_viewer(language: str, code_param: str) -> ToolCallViewer:
|
|
52
53
|
|
53
54
|
|
54
55
|
@tool(viewer=code_viewer("bash", "command"))
|
55
|
-
def bash_session(timeout: int | None = None) -> Tool:
|
56
|
+
def bash_session(*, timeout: int | None = None, instance: str | None = uuid()) -> Tool:
|
56
57
|
"""Bash shell session command execution tool.
|
57
58
|
|
58
59
|
Execute bash shell commands in a long running session using a sandbox environment (e.g. "docker").
|
59
60
|
|
61
|
+
By default, a separate bash process is created within the sandbox for each
|
62
|
+
call to `bash_session()`. You can modify this behavior by passing `instance=None`
|
63
|
+
(which will result in a single bash process for the entire sample) or use other
|
64
|
+
`instance` values that implement another scheme).
|
65
|
+
|
66
|
+
See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.
|
67
|
+
|
60
68
|
Args:
|
61
69
|
timeout: Timeout (in seconds) for command.
|
70
|
+
instance: Instance id (each unique instance id has its own bash process)
|
62
71
|
|
63
72
|
Returns:
|
64
73
|
String with command output (stdout) or command error (stderr).
|
@@ -85,7 +94,7 @@ def bash_session(timeout: int | None = None) -> Tool:
|
|
85
94
|
params: dict[str, object] = {"command": command, "restart": restart}
|
86
95
|
|
87
96
|
sandbox = await tool_container_sandbox("bash session")
|
88
|
-
store = store_as(BashSessionStore)
|
97
|
+
store = store_as(BashSessionStore, instance=instance)
|
89
98
|
|
90
99
|
if not store.session_id:
|
91
100
|
store.session_id = (
|
@@ -83,6 +83,22 @@ async def middle_click(coordinate: list[int], timeout: int | None = None) -> Too
|
|
83
83
|
)
|
84
84
|
|
85
85
|
|
86
|
+
async def back_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
    """Send the `back_click` command for the given coordinate.

    Args:
        coordinate: Target position; elements 0 and 1 are passed as the
           two `--coordinate` values.
        timeout: Timeout forwarded to `_send_cmd`.

    Returns:
        The result of `_send_cmd`.
    """
    x, y = coordinate[0], coordinate[1]
    command = ["back_click", "--coordinate", f"{x}", f"{y}"]
    return await _send_cmd(command, timeout=timeout)
|
91
|
+
|
92
|
+
|
93
|
+
async def forward_click(
    coordinate: list[int], timeout: int | None = None
) -> ToolResult:
    """Send the `forward_click` command for the given coordinate.

    Args:
        coordinate: Target position; elements 0 and 1 are passed as the
           two `--coordinate` values.
        timeout: Timeout forwarded to `_send_cmd`.

    Returns:
        The result of `_send_cmd`.
    """
    x, y = coordinate[0], coordinate[1]
    command = ["forward_click", "--coordinate", f"{x}", f"{y}"]
    return await _send_cmd(command, timeout=timeout)
|
100
|
+
|
101
|
+
|
86
102
|
async def double_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
|
87
103
|
return await _send_cmd(
|
88
104
|
["double_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
|
@@ -182,11 +198,11 @@ async def computer_sandbox() -> SandboxEnvironment:
|
|
182
198
|
else:
|
183
199
|
raise PrerequisiteError(
|
184
200
|
dedent("""
|
185
|
-
The computer tool service was not found in any of the sandboxes for this sample. Please add the computer tool service to your configuration. For example, the following Docker compose file uses the aisiuk/inspect-computer-tool
|
201
|
+
The computer tool service was not found in any of the sandboxes for this sample. Please add the computer tool service to your configuration. For example, the following Docker compose file uses the aisiuk/inspect-computer-tool image as its default sandbox:
|
186
202
|
|
187
203
|
services:
|
188
204
|
default:
|
189
|
-
image: "aisiuk/inspect-computer-tool
|
205
|
+
image: "aisiuk/inspect-computer-tool"
|
190
206
|
init: true
|
191
207
|
""").strip()
|
192
208
|
)
|
@@ -3,7 +3,7 @@ from typing import Awaitable, Callable, Literal, TypeVar
|
|
3
3
|
from inspect_ai._util.content import Content, ContentImage, ContentText
|
4
4
|
from inspect_ai.tool import Tool, ToolResult, tool
|
5
5
|
from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
|
6
|
-
from inspect_ai.tool._tool_call import ToolCallModelInput
|
6
|
+
from inspect_ai.tool._tool_call import ToolCallModelInput, ToolCallModelInputHints
|
7
7
|
|
8
8
|
from . import _common as common
|
9
9
|
from ._resources.tool._constants import Action
|
@@ -64,6 +64,8 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
|
|
64
64
|
- Example: execute(action="left_click_drag", coordinate=(150, 250))
|
65
65
|
- `right_click`: Click the right mouse button.
|
66
66
|
- `middle_click`: Click the middle mouse button.
|
67
|
+
- `back_click`: Click the 'back' mouse button.
|
68
|
+
- `forward_click`: Click the 'forward' mouse button.
|
67
69
|
- `double_click`: Double-click the left mouse button.
|
68
70
|
- `triple_click`: Double-click the left mouse button.
|
69
71
|
- `wait`: Wait for a specified duration (in seconds).
|
@@ -117,6 +119,14 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
|
|
117
119
|
return await common.middle_click(
|
118
120
|
not_none(coordinate, "coordinate"), timeout=timeout
|
119
121
|
)
|
122
|
+
case "back_click":
|
123
|
+
return await common.back_click(
|
124
|
+
not_none(coordinate, "coordinate"), timeout=timeout
|
125
|
+
)
|
126
|
+
case "forward_click":
|
127
|
+
return await common.forward_click(
|
128
|
+
not_none(coordinate, "coordinate"), timeout=timeout
|
129
|
+
)
|
120
130
|
case "double_click":
|
121
131
|
return await common.double_click(
|
122
132
|
not_none(coordinate, "coordinate"), timeout=timeout
|
@@ -150,8 +160,14 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
|
|
150
160
|
|
151
161
|
def _computer_model_input(max_screenshots: int) -> ToolCallModelInput:
|
152
162
|
def model_input(
|
153
|
-
message_index: int,
|
163
|
+
message_index: int,
|
164
|
+
message_total: int,
|
165
|
+
content: str | list[Content],
|
166
|
+
hints: ToolCallModelInputHints,
|
154
167
|
) -> str | list[Content]:
|
168
|
+
if hints.get("forbids_computer_screenshot_truncation", False):
|
169
|
+
return content
|
170
|
+
|
155
171
|
# nothing to do for scalars
|
156
172
|
if isinstance(content, str):
|
157
173
|
return content
|
@@ -153,6 +153,19 @@ class X11Client:
|
|
153
153
|
) -> ToolResult:
|
154
154
|
return await self._mouse_move_and("middle_click", coordinate, text)
|
155
155
|
|
156
|
+
# https://wiki.archlinux.org/title/Mouse_buttons#Thumb_buttons_-_forward_and_back
|
157
|
+
# suggests that, although not in any spec, the de facto standard is 8 for
|
158
|
+
# back and 9 for forward.
|
159
|
+
async def back_click(
    self, coordinate: tuple[int, int] | None, text: str | None
) -> ToolResult:
    """Perform the "back_click" action by delegating to `_mouse_move_and`.

    Args:
        coordinate: Optional target position, forwarded unchanged.
        text: Optional text argument, forwarded unchanged.

    Returns:
        Whatever `_mouse_move_and` returns for the "back_click" action.
    """
    result = await self._mouse_move_and("back_click", coordinate, text)
    return result
|
163
|
+
|
164
|
+
async def forward_click(
    self, coordinate: tuple[int, int] | None, text: str | None
) -> ToolResult:
    """Perform the "forward_click" action by delegating to `_mouse_move_and`.

    Args:
        coordinate: Optional target position, forwarded unchanged.
        text: Optional text argument, forwarded unchanged.

    Returns:
        Whatever `_mouse_move_and` returns for the "forward_click" action.
    """
    result = await self._mouse_move_and("forward_click", coordinate, text)
    return result
|
168
|
+
|
156
169
|
async def double_click(
|
157
170
|
self, coordinate: tuple[int, int] | None, text: str | None
|
158
171
|
) -> ToolResult:
|
@@ -215,6 +228,8 @@ class X11Client:
|
|
215
228
|
"left_click",
|
216
229
|
"right_click",
|
217
230
|
"middle_click",
|
231
|
+
"back_click",
|
232
|
+
"forward_click",
|
218
233
|
"double_click",
|
219
234
|
"triple_click",
|
220
235
|
],
|
@@ -233,6 +248,8 @@ class X11Client:
|
|
233
248
|
"left_click": "1",
|
234
249
|
"right_click": "3",
|
235
250
|
"middle_click": "2",
|
251
|
+
"back_click": "8",
|
252
|
+
"forward_click": "9",
|
236
253
|
"double_click": "--repeat 2 --delay 300 1",
|
237
254
|
"triple_click": "--repeat 3 --delay 300 1",
|
238
255
|
}[action]
|
inspect_ai/tool/_tools/_think.py
CHANGED
@@ -22,7 +22,7 @@ def think(
|
|
22
22
|
async def execute(thought: str) -> str:
|
23
23
|
"""Use the tool to think about something.
|
24
24
|
|
25
|
-
The will not obtain new information or change the environment, but just append the thought to the log. Use it when complex reasoning or some cache memory is needed.
|
25
|
+
The will not obtain new information or change the environment, but just append the thought to the log. Use it when complex reasoning or some cache memory is needed.
|
26
26
|
|
27
27
|
Args:
|
28
28
|
thought: A thought to think about.
|