inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/eval.py +35 -2
- inspect_ai/_cli/util.py +44 -1
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +13 -4
- inspect_ai/_display/core/results.py +1 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +8 -8
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +74 -25
- inspect_ai/_eval/evalset.py +22 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +13 -15
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +55 -6
- inspect_ai/_eval/task/run.py +51 -10
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/file.py +30 -1
- inspect_ai/_util/json.py +37 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/server.py +113 -1
- inspect_ai/_view/www/App.css +7 -1
- inspect_ai/_view/www/dist/assets/index.css +813 -415
- inspect_ai/_view/www/dist/assets/index.js +54475 -32003
- inspect_ai/_view/www/eslint.config.mjs +1 -1
- inspect_ai/_view/www/log-schema.json +137 -31
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/package.json +11 -2
- inspect_ai/_view/www/src/App.tsx +161 -853
- inspect_ai/_view/www/src/api/api-browser.ts +176 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
- inspect_ai/_view/www/src/api/client-api.ts +66 -10
- inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
- inspect_ai/_view/www/src/api/types.ts +107 -2
- inspect_ai/_view/www/src/appearance/icons.ts +2 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
- inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
- inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
- inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
- inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
- inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
- inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
- inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
- inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
- inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
- inspect_ai/_view/www/src/index.tsx +26 -94
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
- inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
- inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
- inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
- inspect_ai/_view/www/src/scoring/utils.ts +87 -0
- inspect_ai/_view/www/src/state/appSlice.ts +244 -0
- inspect_ai/_view/www/src/state/hooks.ts +399 -0
- inspect_ai/_view/www/src/state/logPolling.ts +200 -0
- inspect_ai/_view/www/src/state/logSlice.ts +224 -0
- inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
- inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
- inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
- inspect_ai/_view/www/src/state/scrolling.ts +206 -0
- inspect_ai/_view/www/src/state/store.ts +168 -0
- inspect_ai/_view/www/src/state/store_filter.ts +84 -0
- inspect_ai/_view/www/src/state/utils.ts +23 -0
- inspect_ai/_view/www/src/storage/index.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +36 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/types.ts +94 -32
- inspect_ai/_view/www/src/utils/attachments.ts +58 -23
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/utils/logger.ts +52 -0
- inspect_ai/_view/www/src/utils/polling.ts +100 -0
- inspect_ai/_view/www/src/utils/react.ts +30 -0
- inspect_ai/_view/www/src/utils/vscode.ts +1 -1
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
- inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
- inspect_ai/_view/www/src/workspace/types.ts +4 -3
- inspect_ai/_view/www/src/workspace/utils.ts +4 -4
- inspect_ai/_view/www/vite.config.js +6 -0
- inspect_ai/_view/www/yarn.lock +464 -355
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_condense.py +26 -0
- inspect_ai/log/_log.py +17 -5
- inspect_ai/log/_recorders/buffer/__init__.py +14 -0
- inspect_ai/log/_recorders/buffer/buffer.py +30 -0
- inspect_ai/log/_recorders/buffer/database.py +685 -0
- inspect_ai/log/_recorders/buffer/filestore.py +259 -0
- inspect_ai/log/_recorders/buffer/types.py +84 -0
- inspect_ai/log/_recorders/eval.py +2 -11
- inspect_ai/log/_recorders/types.py +30 -0
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +257 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +114 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +4 -4
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- inspect_ai/util/_subtask.py +1 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py
CHANGED
```diff
@@ -2,9 +2,11 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast
 
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent, is_agent
+from inspect_ai.agent._as_solver import as_solver
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -15,7 +17,11 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_args
 from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
-from inspect_ai._util.constants import DEFAULT_LOG_FORMAT, JSON_LOG_FORMAT
+from inspect_ai._util.constants import (
+    DEFAULT_LOG_FORMAT,
+    DEFAULT_LOG_SHARED,
+    JSON_LOG_FORMAT,
+)
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
 from inspect_ai._util.logger import warn_once
@@ -31,6 +37,7 @@ from inspect_ai.approval._policy import (
 from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
 from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
+from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
 from inspect_ai.model import (
     GenerateConfig,
     GenerateConfigArgs,
@@ -66,7 +73,7 @@ def eval(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +99,7 @@ def eval(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -161,6 +169,9 @@ def eval(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -210,6 +221,7 @@ def eval(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -236,7 +248,7 @@ async def eval_async(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
@@ -260,6 +272,7 @@ async def eval_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -312,6 +325,7 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -341,13 +355,10 @@ async def eval_async(
 
     try:
         # intialise eval
-        model, approval, resolved_tasks = eval_init(
-            tasks=tasks,
+        model, approval = eval_init(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
-            task_args=task_args,
-            sandbox=sandbox,
             approval=approval,
             max_subprocesses=max_subprocesses,
             log_level=log_level,
@@ -355,6 +366,11 @@ async def eval_async(
             **kwargs,
         )
 
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, model, GenerateConfig(**kwargs), sandbox
+        )
+
         # warn and return empty string if we resolved no tasks
         if len(resolved_tasks) == 0:
             log.warning("No inspect tasks were found at the specified paths.")
@@ -390,8 +406,22 @@ async def eval_async(
                 f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
             )
 
+        # resolve log_shared
+        log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
+
+        # validate that --log-shared can't use used with 'json' format
+        if log_shared and log_format == JSON_LOG_FORMAT:
+            raise PrerequisiteError(
+                "ERROR: --log-shared is not compatible with the json log format."
+            )
+
         # resolve solver
-        solver = chain(solver) if isinstance(solver, list) else solver
+        if isinstance(solver, list):
+            solver = chain(solver)
+        elif is_agent(solver):
+            solver = as_solver(solver)
+        else:
+            solver = cast(Solver | SolverSpec | None, solver)
 
         # ensure consistency of limit and sample_id
         if sample_id is not None and limit is not None:
@@ -426,6 +456,7 @@ async def eval_async(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score_display=score_display,
         )
 
@@ -485,6 +516,9 @@ async def eval_async(
         )
         logs = EvalLogs(results)
 
+        # cleanup sample buffers if required
+        cleanup_sample_buffers(log_dir)
+
     finally:
         _eval_async_running = False
 
@@ -510,6 +544,7 @@ def eval_retry(
     log_samples: bool | None = None,
     log_images: bool | None = None,
    log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -551,6 +586,9 @@ def eval_retry(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         max_retries:
@@ -586,6 +624,7 @@ def eval_retry(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,
@@ -612,6 +651,7 @@ async def eval_retry_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -651,6 +691,8 @@ async def eval_retry_async(
         log_buffer: (int | None): Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in
+           additional syncing of realtime log data for Inspect View.
         score (bool): Score output (defaults to True)
         score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
@@ -691,7 +733,7 @@ async def eval_retry_async(
     # context to reconstruct ephemeral Task instances)
     task: str | None
     task_id = eval_log.eval.task_id
-    task_name = eval_log.eval.task
+    task_name = eval_log.eval.task_registry_name or eval_log.eval.task
     task_file = eval_log.eval.task_file
     if task_file:
         if not Path(task_file).exists():
@@ -750,6 +792,9 @@ async def eval_retry_async(
     log_buffer = (
         log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
     )
+    log_shared = (
+        log_shared if log_shared is not None else eval_log.eval.config.log_shared
+    )
     score_display = (
         score_display
         if score_display is not None
@@ -796,6 +841,7 @@ async def eval_retry_async(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **dict(config),
@@ -809,24 +855,20 @@ async def eval_retry_async(
 
 
 def eval_init(
-    tasks: Tasks,
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
-    task_args: dict[str, Any] | str = dict(),
-    sandbox: SandboxEnvironmentType | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     max_subprocesses: int | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
-) -> tuple[list[Model], list[ApprovalPolicy] | None, list[ResolvedTask]]:
+) -> tuple[list[Model], list[ApprovalPolicy] | None]:
     # init eval context
     init_eval_context(log_level, log_level_transcript, max_subprocesses)
 
     # resolve model and task args
     model_args = resolve_args(model_args)
-    task_args = resolve_args(task_args)
 
     # resolve model args from environment if not specified
     if len(model_args) == 0:
@@ -839,21 +881,28 @@ def eval_init(
     generate_config = GenerateConfig(**kwargs)
     models = resolve_models(model, model_base_url, model_args, generate_config)
 
-    # resolve tasks (set active model to resolve uses of the
-    # 'default' model in tools, solvers, and scorers)
-
-    with task_display().suspend_task_app():
-        resolved_tasks: list[ResolvedTask] = []
-        for m in models:
-            init_active_model(m, generate_config)
-            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
-
     # resolve approval
     if isinstance(approval, str | ApprovalPolicyConfig):
         approval = approval_policies_from_config(approval)
     init_tool_approval(approval)
 
-    return models, approval, resolved_tasks
+    return models, approval
+
+
+def eval_resolve_tasks(
+    tasks: Tasks,
+    task_args: dict[str, Any] | str,
+    models: list[Model],
+    config: GenerateConfig,
+    sandbox: SandboxEnvironmentType | None,
+) -> list[ResolvedTask]:
+    task_args = resolve_args(task_args)
+    with task_display().suspend_task_app():
+        resolved_tasks: list[ResolvedTask] = []
+        for m in models:
+            init_active_model(m, config)
+            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
+    return resolved_tasks
 
 
 def init_eval_display(
```
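Taken together, these changes let `eval()` accept an `Agent` directly as its `solver` (wrapping it with `as_solver()`) and add shared-log syncing via `log_shared`. A hypothetical usage sketch, not taken from this diff: it assumes the `react()` agent exported by the new `inspect_ai.agent` package and an inline task.

```python
# Hypothetical sketch (not from the diff): assumes the react() agent from
# the new inspect_ai.agent package and a trivial inline task.
from inspect_ai import Task, eval, task
from inspect_ai.agent import react
from inspect_ai.dataset import Sample


@task
def hello() -> Task:
    return Task(dataset=[Sample(input="Say hello.", target="hello")])


# An Agent may now be passed where a Solver was previously required;
# eval() wraps it via as_solver(). log_shared=True syncs sample events
# to the log directory every 10 seconds (DEFAULT_LOG_SHARED).
logs = eval(
    hello(),
    solver=react(),
    model="openai/gpt-4o",
    log_shared=True,  # or an integer number of seconds
)
```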
inspect_ai/_eval/evalset.py
CHANGED
```diff
@@ -1,6 +1,5 @@
 import hashlib
 import logging
-from copy import deepcopy
 from typing import Any, Literal, NamedTuple, Set, cast
 
 import rich
@@ -18,6 +17,7 @@ from typing_extensions import Unpack
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import basename, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
 from inspect_ai.log import EvalLog
 from inspect_ai.log._bundle import bundle_log_dir
@@ -37,7 +37,7 @@ from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import DisplayType, SandboxEnvironmentType
 from inspect_ai.util._display import display_type_initialized, init_display_type
 
-from .eval import eval, eval_init
+from .eval import eval, eval_init, eval_resolve_tasks
 from .loader import resolve_task_args
 from .task import Epochs
 from .task.resolved import ResolvedTask
@@ -66,7 +66,7 @@ def eval_set(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +92,7 @@ def eval_set(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     bundle_dir: str | None = None,
     bundle_overwrite: bool = False,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -171,6 +172,9 @@ def eval_set(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         bundle_dir: If specified, the log viewer and logs generated
            by this eval set will be bundled into this directory.
         bundle_overwrite: Whether to overwrite files in the bundle_dir.
@@ -219,6 +223,7 @@ def eval_set(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         **kwargs,
     )
@@ -242,29 +247,21 @@ def eval_set(
     if display == "conversation":
         raise RuntimeError("eval_set cannot be used with conversation display.")
 
-    # resolve tasks
-    models, _, resolved_tasks = eval_init(
-        tasks=tasks,
+    # initialize eval
+    models, _ = eval_init(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
-        task_args=task_args,
-        sandbox=sandbox,
         max_subprocesses=max_subprocesses,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
         **kwargs,
     )
 
-    # ensure log_dir
+    # ensure log_dir
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)
 
-    # validate that:
-    # (1) All tasks have a unique identifier
-    # (2) All logs have identifiers that map to tasks
-    validate_eval_set_prerequisites(resolved_tasks, list_all_eval_logs(log_dir))
-
     # resolve some parameters
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
@@ -305,11 +302,21 @@ def eval_set(
     # - tasks with a successful log (they'll just be returned)
     # - tasks with failed logs (they'll be retried)
     def try_eval() -> list[EvalLog]:
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, models, GenerateConfig(**kwargs), sandbox
+        )
+
         # list all logs currently in the log directory (update manifest if there are some)
         all_logs = list_all_eval_logs(log_dir)
         if len(all_logs) > 0:
             write_log_dir_manifest(log_dir)
 
+        # validate that:
+        # (1) All tasks have a unique identifier
+        # (2) All logs have identifiers that map to tasks
+        validate_eval_set_prerequisites(resolved_tasks, all_logs)
+
         # see which tasks are yet to run (to complete successfully we need
         # a successful eval for every [task_file/]task_name/model combination)
         # for those that haven't run, schedule them into models => tasks groups
@@ -414,13 +421,10 @@ def as_previous_tasks(
     # want to bring this back but we'd need to resolve the
     # directory issues.
 
-    # deepcopy so the same instance is not run twice
-    prev_task = deepcopy(task.task)
-
     previous_tasks.append(
         PreviousTask(
             id=log.header.eval.task_id,
-            task=prev_task,
+            task=task.task,
             task_args=resolve_task_args(task.task),
             model=task.model,
             log=read_eval_log(log.info),
```
inspect_ai/_eval/loader.py
CHANGED
```diff
@@ -26,6 +26,8 @@ from inspect_ai._util.registry import (
     registry_lookup,
     registry_params,
 )
+from inspect_ai.agent._agent import Agent
+from inspect_ai.agent._as_solver import as_solver
 from inspect_ai.model import Model
 from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
 from inspect_ai.solver._bridge import bridge
@@ -421,20 +423,32 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
     if solver_file is None:
         if solver_name is None:
             raise ValueError(f"Unable to resolve solver name from {spec.solver}")
-        return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("solver", solver_name) is not None:
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("agent", solver_name) is not None:
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+        else:
+            raise ValueError(
+                f"Unkonwn solver {solver_name} (not registered as a @solver or @agent)"
+            )
 
     # we do have a solver file
     else:
         # load the module and parse decorators
         solver_module = load_module(solver_file)
-        decorators = parse_decorators(solver_file, "solver")
+        solver_decorators = parse_decorators(solver_file, "solver")
+        agent_decorators = parse_decorators(solver_file, "agent")
 
         # if there is no solver_name see if we can discover it
         if solver_name is None:
-            if len(decorators) == 1:
+            if len(solver_decorators) == 1:
                 # decorator based solver
-                solver_name = decorators[0][0]
-            elif len(decorators) == 0:
+                solver_name = solver_decorators[0][0]
+            elif len(agent_decorators) == 1:
+                # decorator based agent
+                solver_name = agent_decorators[0][0]
+            elif len(solver_decorators) == 0 and len(agent_decorators) == 0:
                 # see if we can find an agent based solver
                 functions = [
                     function
@@ -454,26 +468,35 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
 
             elif len(agent_functions) == 0:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} does not contain any @solver or agent functions."
+                    f"The source file {pretty_solver_file} does not contain any @solver, @agent or bridged agent functions."
                 )
             else:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                    f"The source file {pretty_solver_file} has more than one bridged agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
                 )
-        else:
+        elif len(solver_decorators) > 1:
             raise PrerequisiteError(
                 f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
             )
+        else:
+            raise PrerequisiteError(
+                f"The source file {pretty_solver_file} has more than one @agent function (qualify which agent using e.g. '{solver_file.name}y@agent_fn')"
+            )
 
         # create decorator based solvers using the registry
-        if any(solver[0] == solver_name for solver in decorators):
+        if any(solver[0] == solver_name for solver in solver_decorators):
             return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-        # create agent based solvers by calling the agent function and wrapping it in bridge()
+        # create decorator based agents using the registry
+        elif any(agent[0] == solver_name for agent in agent_decorators):
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+
+        # create bridge based solvers by calling the function and wrapping it in bridge()
         else:
            agent_fn = getattr(solver_module, solver_name, None)
            if inspect.isfunction(agent_fn):
-                return bridge(agent_fn(**spec.args))
+                return bridge(agent_fn(**spec.args))
             elif agent_fn is not None:
                 raise PrerequisiteError(
                     f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
```
inspect_ai/_eval/run.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-import functools
 import logging
 import os
 import sys
@@ -20,7 +19,6 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
-from inspect_ai._util._async import tg_collect
 from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
@@ -195,6 +193,7 @@ async def eval_run(
         task_name=task.name,
         task_version=task.version,
         task_file=resolved_task.task_file,
+        task_registry_name=resolved_task.task.registry_name,
         task_id=resolved_task.id if resolved_task.id else uuid(),
         run_id=run_id,
         solver=eval_solver_spec,
@@ -359,17 +358,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
                 "Run Task",
                 f"task: {task_options.task.name} ({task_options.model})",
             ):
-                tg_results = await tg_collect(
-                    [functools.partial(task_run, task_options)]
-                )
-
-                # if the task was cancelled the results list will
-                # be empty (in which case we do nothing)
-                if len(tg_results) == 0:
-                    ...
-                else:
-                    result = tg_results[0]
-                    results.append(result)
+                async with anyio.create_task_group() as tg:
+
+                    async def run_task() -> None:
+                        result = await task_run(task_options)
+                        results.append(result)
+
+                    tg.start_soon(run_task)
 
         except Exception as ex:
             # errors generally don't escape from tasks (the exception being if an error
@@ -407,12 +402,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
     # Use anyio task group instead of manual task management
     try:
         async with anyio.create_task_group() as tg:
+            # computer number of workers (never more than total_tasks)
+            num_workers = min(parallel, total_tasks)
+
             # start worker tasks
-            for _ in range(parallel):
+            for _ in range(num_workers):
                 tg.start_soon(worker)
 
             # enqueue initial set of tasks
-            for _ in range(parallel):
+            for _ in range(num_workers):
                 await enque_next_task()
     except anyio.get_cancelled_exc_class():
         pass
```
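The scheduling fix caps the worker count at the number of queued tasks, so a high `--max-tasks` no longer spawns idle workers. A simplified, generic anyio sketch of the same capped worker-pool pattern (this is an illustration, not the actual `run.py` implementation):

```python
# Simplified illustration of the capped worker-pool pattern used above;
# not the actual run.py code.
import anyio


async def run_pool(items: list[str], parallel: int) -> None:
    send, recv = anyio.create_memory_object_stream(max_buffer_size=len(items))

    async def worker() -> None:
        # each worker drains items until the send side is closed
        async for item in recv:
            print(f"processing {item}")

    async with anyio.create_task_group() as tg:
        # never start more workers than there are items (the fix above)
        num_workers = min(parallel, len(items))
        for _ in range(num_workers):
            tg.start_soon(worker)
        async with send:
            for item in items:
                await send.send(item)


anyio.run(run_pool, ["a", "b"], 4)
```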
inspect_ai/_eval/score.py
CHANGED
```diff
@@ -7,8 +7,8 @@ import anyio
 
 from inspect_ai._display import display
 from inspect_ai._eval.loader import scorer_from_spec
-from inspect_ai._util._async import tg_collect
-from inspect_ai._util.platform import platform_init
+from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
+from inspect_ai._util.platform import platform_init, running_in_notebook
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
@@ -56,7 +56,17 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers
 
-    return anyio.run(score_async, log, scorers, epochs_reducer, action)
+    if running_in_notebook():
+        return run_coroutine(score_async(log, scorers, epochs_reducer, action))
+    else:
+        return anyio.run(
+            score_async,
+            log,
+            scorers,
+            epochs_reducer,
+            action,
+            backend=configured_async_backend(),
+        )
 
 
 async def score_async(
```
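This lets `score()` work when an event loop is already running (e.g. in Jupyter) by dispatching to `run_coroutine()` instead of `anyio.run()`, which raises in that situation. A standalone sketch of the general dispatch pattern; the helper bodies are stand-ins, not the library's `running_in_notebook()`/`run_coroutine()` implementations:

```python
# Illustrative stand-in for the sync/async dispatch behind score();
# the notebook branch here uses a helper thread rather than the
# library's run_coroutine() helper.
import asyncio
import threading

import anyio


async def compute() -> int:
    await anyio.sleep(0)
    return 42


def run_sync() -> int:
    """Run compute() to completion whether or not a loop is running."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # no running loop (plain script): anyio.run() is safe
        return anyio.run(compute)
    # a loop is already running (e.g. Jupyter): anyio.run() would raise,
    # so run the coroutine to completion on a helper thread instead
    result: list[int] = []
    thread = threading.Thread(target=lambda: result.append(anyio.run(compute)))
    thread.start()
    thread.join()
    return result[0]


print(run_sync())
```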
inspect_ai/_eval/task/generate.py
CHANGED
```diff
@@ -1,12 +1,8 @@
 from typing import Literal
 
-from inspect_ai.model import (
-    CachePolicy,
-    GenerateConfig,
-    Model,
-    call_tools,
-)
+from inspect_ai.model import CachePolicy, GenerateConfig, Model
 from inspect_ai.model._cache import epoch
+from inspect_ai.model._call_tools import execute_tools
 from inspect_ai.solver import TaskState
 from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
@@ -48,10 +44,13 @@ async def task_generate(
 
         # resolve tool calls if necessary
         if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
+            # call tools and update messages and output
+            messages, output = await execute_tools(
+                state.messages, state.tools, config.max_tool_output
             )
+            state.messages.extend(messages)
+            if output is not None:
+                state.output = output
 
         # check for completed or only executing a single tool call
        if state.completed or tool_calls == "single":
```
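The generate loop now calls `execute_tools()`, which takes the full message history and returns both the new messages and an optional model output to adopt, rather than `call_tools()`, which returned only tool messages. A minimal stub of that calling contract (the stub body is illustrative; only the shape of the return value mirrors the diff):

```python
# Illustrative stub of the new contract: execute_tools() per this diff
# returns (new_messages, optional_output) for the caller to apply.
def execute_tools_stub(
    messages: list[str], tools: list[str]
) -> tuple[list[str], str | None]:
    new_messages = [f"tool result from {tool}" for tool in tools]
    output = new_messages[-1] if new_messages else None
    return new_messages, output


state_messages = ["user: list files", "assistant: calling bash"]
new_messages, output = execute_tools_stub(state_messages, ["bash"])
state_messages.extend(new_messages)  # same update pattern as the diff
if output is not None:
    state_output = output
```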
|