inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/eval.py +35 -2
- inspect_ai/_cli/util.py +44 -1
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +13 -4
- inspect_ai/_display/core/results.py +1 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +8 -8
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +74 -25
- inspect_ai/_eval/evalset.py +22 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +13 -15
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +55 -6
- inspect_ai/_eval/task/run.py +51 -10
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/file.py +30 -1
- inspect_ai/_util/json.py +37 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/server.py +113 -1
- inspect_ai/_view/www/App.css +7 -1
- inspect_ai/_view/www/dist/assets/index.css +813 -415
- inspect_ai/_view/www/dist/assets/index.js +54475 -32003
- inspect_ai/_view/www/eslint.config.mjs +1 -1
- inspect_ai/_view/www/log-schema.json +137 -31
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/package.json +11 -2
- inspect_ai/_view/www/src/App.tsx +161 -853
- inspect_ai/_view/www/src/api/api-browser.ts +176 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
- inspect_ai/_view/www/src/api/client-api.ts +66 -10
- inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
- inspect_ai/_view/www/src/api/types.ts +107 -2
- inspect_ai/_view/www/src/appearance/icons.ts +2 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
- inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
- inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
- inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
- inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
- inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
- inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
- inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
- inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
- inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
- inspect_ai/_view/www/src/index.tsx +26 -94
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
- inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
- inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
- inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
- inspect_ai/_view/www/src/scoring/utils.ts +87 -0
- inspect_ai/_view/www/src/state/appSlice.ts +244 -0
- inspect_ai/_view/www/src/state/hooks.ts +399 -0
- inspect_ai/_view/www/src/state/logPolling.ts +200 -0
- inspect_ai/_view/www/src/state/logSlice.ts +224 -0
- inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
- inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
- inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
- inspect_ai/_view/www/src/state/scrolling.ts +206 -0
- inspect_ai/_view/www/src/state/store.ts +168 -0
- inspect_ai/_view/www/src/state/store_filter.ts +84 -0
- inspect_ai/_view/www/src/state/utils.ts +23 -0
- inspect_ai/_view/www/src/storage/index.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +36 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/types.ts +94 -32
- inspect_ai/_view/www/src/utils/attachments.ts +58 -23
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/utils/logger.ts +52 -0
- inspect_ai/_view/www/src/utils/polling.ts +100 -0
- inspect_ai/_view/www/src/utils/react.ts +30 -0
- inspect_ai/_view/www/src/utils/vscode.ts +1 -1
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
- inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
- inspect_ai/_view/www/src/workspace/types.ts +4 -3
- inspect_ai/_view/www/src/workspace/utils.ts +4 -4
- inspect_ai/_view/www/vite.config.js +6 -0
- inspect_ai/_view/www/yarn.lock +464 -355
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_condense.py +26 -0
- inspect_ai/log/_log.py +17 -5
- inspect_ai/log/_recorders/buffer/__init__.py +14 -0
- inspect_ai/log/_recorders/buffer/buffer.py +30 -0
- inspect_ai/log/_recorders/buffer/database.py +685 -0
- inspect_ai/log/_recorders/buffer/filestore.py +259 -0
- inspect_ai/log/_recorders/buffer/types.py +84 -0
- inspect_ai/log/_recorders/eval.py +2 -11
- inspect_ai/log/_recorders/types.py +30 -0
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +257 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +114 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +4 -4
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- inspect_ai/util/_subtask.py +1 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py
CHANGED
```diff
@@ -2,9 +2,11 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast
 
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent, is_agent
+from inspect_ai.agent._as_solver import as_solver
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -15,7 +17,11 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_args
 from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
-from inspect_ai._util.constants import DEFAULT_LOG_FORMAT, JSON_LOG_FORMAT
+from inspect_ai._util.constants import (
+    DEFAULT_LOG_FORMAT,
+    DEFAULT_LOG_SHARED,
+    JSON_LOG_FORMAT,
+)
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
 from inspect_ai._util.logger import warn_once
@@ -31,6 +37,7 @@ from inspect_ai.approval._policy import (
 from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
 from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
+from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
 from inspect_ai.model import (
     GenerateConfig,
     GenerateConfigArgs,
@@ -66,7 +73,7 @@ def eval(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +99,7 @@ def eval(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -161,6 +169,9 @@ def eval(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -210,6 +221,7 @@ def eval(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -236,7 +248,7 @@ async def eval_async(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
@@ -260,6 +272,7 @@ async def eval_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -312,6 +325,7 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -341,13 +355,10 @@ async def eval_async(
 
     try:
         # intialise eval
-        model, approval, resolved_tasks = eval_init(
-            tasks=tasks,
+        model, approval = eval_init(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
-            task_args=task_args,
-            sandbox=sandbox,
             approval=approval,
             max_subprocesses=max_subprocesses,
             log_level=log_level,
@@ -355,6 +366,11 @@ async def eval_async(
             **kwargs,
         )
 
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, model, GenerateConfig(**kwargs), sandbox
+        )
+
         # warn and return empty string if we resolved no tasks
         if len(resolved_tasks) == 0:
             log.warning("No inspect tasks were found at the specified paths.")
@@ -390,8 +406,22 @@ async def eval_async(
                 f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
             )
 
+        # resolve log_shared
+        log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
+
+        # validate that --log-shared can't use used with 'json' format
+        if log_shared and log_format == JSON_LOG_FORMAT:
+            raise PrerequisiteError(
+                "ERROR: --log-shared is not compatible with the json log format."
+            )
+
         # resolve solver
-        solver = chain(solver) if isinstance(solver, list) else solver
+        if isinstance(solver, list):
+            solver = chain(solver)
+        elif is_agent(solver):
+            solver = as_solver(solver)
+        else:
+            solver = cast(Solver | SolverSpec | None, solver)
 
         # ensure consistency of limit and sample_id
         if sample_id is not None and limit is not None:
@@ -426,6 +456,7 @@ async def eval_async(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score_display=score_display,
         )
 
@@ -485,6 +516,9 @@ async def eval_async(
         )
         logs = EvalLogs(results)
 
+        # cleanup sample buffers if required
+        cleanup_sample_buffers(log_dir)
+
     finally:
         _eval_async_running = False
 
@@ -510,6 +544,7 @@ def eval_retry(
     log_samples: bool | None = None,
     log_images: bool | None = None,
    log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -551,6 +586,9 @@ def eval_retry(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         max_retries:
@@ -586,6 +624,7 @@ def eval_retry(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,
@@ -612,6 +651,7 @@ async def eval_retry_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -651,6 +691,8 @@ async def eval_retry_async(
         log_buffer: (int | None): Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in
+           additional syncing of realtime log data for Inspect View.
         score (bool): Score output (defaults to True)
         score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
@@ -691,7 +733,7 @@ async def eval_retry_async(
     # context to reconstruct ephemeral Task instances)
     task: str | None
     task_id = eval_log.eval.task_id
-    task_name = eval_log.eval.task
+    task_name = eval_log.eval.task_registry_name or eval_log.eval.task
     task_file = eval_log.eval.task_file
     if task_file:
         if not Path(task_file).exists():
@@ -750,6 +792,9 @@ async def eval_retry_async(
     log_buffer = (
         log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
     )
+    log_shared = (
+        log_shared if log_shared is not None else eval_log.eval.config.log_shared
+    )
     score_display = (
         score_display
         if score_display is not None
@@ -796,6 +841,7 @@ async def eval_retry_async(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **dict(config),
@@ -809,24 +855,20 @@ async def eval_retry_async(
 
 
 def eval_init(
-    tasks: Tasks,
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
-    task_args: dict[str, Any] | str = dict(),
-    sandbox: SandboxEnvironmentType | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     max_subprocesses: int | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
-) -> tuple[list[Model], list[ApprovalPolicy] | None, list[ResolvedTask]]:
+) -> tuple[list[Model], list[ApprovalPolicy] | None]:
     # init eval context
     init_eval_context(log_level, log_level_transcript, max_subprocesses)
 
     # resolve model and task args
     model_args = resolve_args(model_args)
-    task_args = resolve_args(task_args)
 
     # resolve model args from environment if not specified
     if len(model_args) == 0:
@@ -839,21 +881,28 @@ def eval_init(
     generate_config = GenerateConfig(**kwargs)
     models = resolve_models(model, model_base_url, model_args, generate_config)
 
-    # resolve tasks (set active model to resolve uses of the
-    # 'default' model in tools, solvers, and scorers)
-
-    with task_display().suspend_task_app():
-        resolved_tasks: list[ResolvedTask] = []
-        for m in models:
-            init_active_model(m, generate_config)
-            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
-
     # resolve approval
     if isinstance(approval, str | ApprovalPolicyConfig):
         approval = approval_policies_from_config(approval)
     init_tool_approval(approval)
 
-    return models, approval, resolved_tasks
+    return models, approval
+
+
+def eval_resolve_tasks(
+    tasks: Tasks,
+    task_args: dict[str, Any] | str,
+    models: list[Model],
+    config: GenerateConfig,
+    sandbox: SandboxEnvironmentType | None,
+) -> list[ResolvedTask]:
+    task_args = resolve_args(task_args)
+    with task_display().suspend_task_app():
+        resolved_tasks: list[ResolvedTask] = []
+        for m in models:
+            init_active_model(m, config)
+            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
+    return resolved_tasks
 
 
 def init_eval_display(
```
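Taken together, these changes let `eval()` accept an `Agent` directly as its `solver` (wrapping it with `as_solver()`) and add shared-log syncing via `log_shared`. A hypothetical usage sketch, not taken from this diff: it assumes the `react()` agent exported by the new `inspect_ai.agent` package and an inline task.

```python
# Hypothetical sketch (not from the diff): assumes the react() agent from
# the new inspect_ai.agent package and a trivial inline task.
from inspect_ai import Task, eval, task
from inspect_ai.agent import react
from inspect_ai.dataset import Sample


@task
def hello() -> Task:
    return Task(dataset=[Sample(input="Say hello.", target="hello")])


# An Agent may now be passed where a Solver was previously required;
# eval() wraps it via as_solver(). log_shared=True syncs sample events
# to the log directory every 10 seconds (DEFAULT_LOG_SHARED).
logs = eval(
    hello(),
    solver=react(),
    model="openai/gpt-4o",
    log_shared=True,  # or an integer number of seconds
)
```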
inspect_ai/_eval/evalset.py
CHANGED
```diff
@@ -1,6 +1,5 @@
 import hashlib
 import logging
-from copy import deepcopy
 from typing import Any, Literal, NamedTuple, Set, cast
 
 import rich
@@ -18,6 +17,7 @@ from typing_extensions import Unpack
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import basename, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
 from inspect_ai.log import EvalLog
 from inspect_ai.log._bundle import bundle_log_dir
@@ -37,7 +37,7 @@ from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import DisplayType, SandboxEnvironmentType
 from inspect_ai.util._display import display_type_initialized, init_display_type
 
-from .eval import eval, eval_init
+from .eval import eval, eval_init, eval_resolve_tasks
 from .loader import resolve_task_args
 from .task import Epochs
 from .task.resolved import ResolvedTask
@@ -66,7 +66,7 @@ def eval_set(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +92,7 @@ def eval_set(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     bundle_dir: str | None = None,
     bundle_overwrite: bool = False,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -171,6 +172,9 @@ def eval_set(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         bundle_dir: If specified, the log viewer and logs generated
            by this eval set will be bundled into this directory.
         bundle_overwrite: Whether to overwrite files in the bundle_dir.
@@ -219,6 +223,7 @@ def eval_set(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         **kwargs,
     )
@@ -242,29 +247,21 @@ def eval_set(
     if display == "conversation":
         raise RuntimeError("eval_set cannot be used with conversation display.")
 
-    # resolve tasks
-    models, _, resolved_tasks = eval_init(
-        tasks=tasks,
+    # initialize eval
+    models, _ = eval_init(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
-        task_args=task_args,
-        sandbox=sandbox,
         max_subprocesses=max_subprocesses,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
         **kwargs,
     )
 
-    # ensure log_dir
+    # ensure log_dir
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)
 
-    # validate that:
-    # (1) All tasks have a unique identifier
-    # (2) All logs have identifiers that map to tasks
-    validate_eval_set_prerequisites(resolved_tasks, list_all_eval_logs(log_dir))
-
     # resolve some parameters
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
@@ -305,11 +302,21 @@ def eval_set(
     # - tasks with a successful log (they'll just be returned)
     # - tasks with failed logs (they'll be retried)
     def try_eval() -> list[EvalLog]:
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, models, GenerateConfig(**kwargs), sandbox
+        )
+
         # list all logs currently in the log directory (update manifest if there are some)
         all_logs = list_all_eval_logs(log_dir)
         if len(all_logs) > 0:
             write_log_dir_manifest(log_dir)
 
+        # validate that:
+        # (1) All tasks have a unique identifier
+        # (2) All logs have identifiers that map to tasks
+        validate_eval_set_prerequisites(resolved_tasks, all_logs)
+
         # see which tasks are yet to run (to complete successfully we need
         # a successful eval for every [task_file/]task_name/model combination)
         # for those that haven't run, schedule them into models => tasks groups
@@ -414,13 +421,10 @@ def as_previous_tasks(
     # want to bring this back but we'd need to resolve the
     # directory issues.
 
-    # deepcopy so the same instance is not run twice
-    prev_task = deepcopy(task.task)
-
     previous_tasks.append(
         PreviousTask(
             id=log.header.eval.task_id,
-            task=prev_task,
+            task=task.task,
             task_args=resolve_task_args(task.task),
             model=task.model,
             log=read_eval_log(log.info),
```
inspect_ai/_eval/loader.py
CHANGED
```diff
@@ -26,6 +26,8 @@ from inspect_ai._util.registry import (
     registry_lookup,
     registry_params,
 )
+from inspect_ai.agent._agent import Agent
+from inspect_ai.agent._as_solver import as_solver
 from inspect_ai.model import Model
 from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
 from inspect_ai.solver._bridge import bridge
@@ -421,20 +423,32 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
     if solver_file is None:
         if solver_name is None:
             raise ValueError(f"Unable to resolve solver name from {spec.solver}")
-        return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("solver", solver_name) is not None:
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("agent", solver_name) is not None:
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+        else:
+            raise ValueError(
+                f"Unkonwn solver {solver_name} (not registered as a @solver or @agent)"
+            )
 
     # we do have a solver file
     else:
         # load the module and parse decorators
         solver_module = load_module(solver_file)
-        decorators = parse_decorators(solver_file, "solver")
+        solver_decorators = parse_decorators(solver_file, "solver")
+        agent_decorators = parse_decorators(solver_file, "agent")
 
         # if there is no solver_name see if we can discover it
         if solver_name is None:
-            if len(decorators) == 1:
+            if len(solver_decorators) == 1:
                 # decorator based solver
-                solver_name = decorators[0][0]
-            elif len(decorators) == 0:
+                solver_name = solver_decorators[0][0]
+            elif len(agent_decorators) == 1:
+                # decorator based agent
+                solver_name = agent_decorators[0][0]
+            elif len(solver_decorators) == 0 and len(agent_decorators) == 0:
                 # see if we can find an agent based solver
                 functions = [
                     function
@@ -454,26 +468,35 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
 
             elif len(agent_functions) == 0:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} does not contain any @solver or agent functions."
+                    f"The source file {pretty_solver_file} does not contain any @solver, @agent or bridged agent functions."
                 )
             else:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                    f"The source file {pretty_solver_file} has more than one bridged agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
                 )
-        else:
+        elif len(solver_decorators) > 1:
             raise PrerequisiteError(
                 f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
             )
+        else:
+            raise PrerequisiteError(
+                f"The source file {pretty_solver_file} has more than one @agent function (qualify which agent using e.g. '{solver_file.name}y@agent_fn')"
+            )
 
         # create decorator based solvers using the registry
-        if any(solver[0] == solver_name for solver in decorators):
+        if any(solver[0] == solver_name for solver in solver_decorators):
             return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-        # create agent based solvers by calling the agent function and wrapping it in bridge()
+        # create decorator based agents using the registry
+        elif any(agent[0] == solver_name for agent in agent_decorators):
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+
+        # create bridge based solvers by calling the function and wrapping it in bridge()
         else:
            agent_fn = getattr(solver_module, solver_name, None)
            if inspect.isfunction(agent_fn):
-                return bridge(agent_fn(**spec.args))
+                return bridge(agent_fn(**spec.args))
             elif agent_fn is not None:
                 raise PrerequisiteError(
                     f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
```
inspect_ai/_eval/run.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-import functools
 import logging
 import os
 import sys
@@ -20,7 +19,6 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
-from inspect_ai._util._async import tg_collect
 from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
@@ -195,6 +193,7 @@ async def eval_run(
         task_name=task.name,
         task_version=task.version,
         task_file=resolved_task.task_file,
+        task_registry_name=resolved_task.task.registry_name,
         task_id=resolved_task.id if resolved_task.id else uuid(),
         run_id=run_id,
         solver=eval_solver_spec,
@@ -359,17 +358,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
                 "Run Task",
                 f"task: {task_options.task.name} ({task_options.model})",
             ):
-                tg_results = await tg_collect(
-                    [functools.partial(task_run, task_options)]
-                )
-
-                # if the task was cancelled the results list will
-                # be empty (in which case we do nothing)
-                if len(tg_results) == 0:
-                    ...
-                else:
-                    result = tg_results[0]
-                    results.append(result)
+                async with anyio.create_task_group() as tg:
+
+                    async def run_task() -> None:
+                        result = await task_run(task_options)
+                        results.append(result)
+
+                    tg.start_soon(run_task)
 
         except Exception as ex:
             # errors generally don't escape from tasks (the exception being if an error
@@ -407,12 +402,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
     # Use anyio task group instead of manual task management
     try:
         async with anyio.create_task_group() as tg:
+            # computer number of workers (never more than total_tasks)
+            num_workers = min(parallel, total_tasks)
+
             # start worker tasks
-            for _ in range(parallel):
+            for _ in range(num_workers):
                 tg.start_soon(worker)
 
             # enqueue initial set of tasks
-            for _ in range(parallel):
+            for _ in range(num_workers):
                 await enque_next_task()
     except anyio.get_cancelled_exc_class():
         pass
```
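The scheduling fix caps the worker count at the number of queued tasks, so a high `--max-tasks` no longer spawns idle workers. A simplified, generic anyio sketch of the same capped worker-pool pattern (this is an illustration, not the actual `run.py` implementation):

```python
# Simplified illustration of the capped worker-pool pattern used above;
# not the actual run.py code.
import anyio


async def run_pool(items: list[str], parallel: int) -> None:
    send, recv = anyio.create_memory_object_stream(max_buffer_size=len(items))

    async def worker() -> None:
        # each worker drains items until the send side is closed
        async for item in recv:
            print(f"processing {item}")

    async with anyio.create_task_group() as tg:
        # never start more workers than there are items (the fix above)
        num_workers = min(parallel, len(items))
        for _ in range(num_workers):
            tg.start_soon(worker)
        async with send:
            for item in items:
                await send.send(item)


anyio.run(run_pool, ["a", "b"], 4)
```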
inspect_ai/_eval/score.py
CHANGED
```diff
@@ -7,8 +7,8 @@ import anyio
 
 from inspect_ai._display import display
 from inspect_ai._eval.loader import scorer_from_spec
-from inspect_ai._util._async import tg_collect
-from inspect_ai._util.platform import platform_init
+from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
+from inspect_ai._util.platform import platform_init, running_in_notebook
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
@@ -56,7 +56,17 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers
 
-    return anyio.run(score_async, log, scorers, epochs_reducer, action)
+    if running_in_notebook():
+        return run_coroutine(score_async(log, scorers, epochs_reducer, action))
+    else:
+        return anyio.run(
+            score_async,
+            log,
+            scorers,
+            epochs_reducer,
+            action,
+            backend=configured_async_backend(),
+        )
 
 
 async def score_async(
```
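This lets `score()` work when an event loop is already running (e.g. in Jupyter) by dispatching to `run_coroutine()` instead of `anyio.run()`, which raises in that situation. A standalone sketch of the general dispatch pattern; the helper bodies are stand-ins, not the library's `running_in_notebook()`/`run_coroutine()` implementations:

```python
# Illustrative stand-in for the sync/async dispatch behind score();
# the notebook branch here uses a helper thread rather than the
# library's run_coroutine() helper.
import asyncio
import threading

import anyio


async def compute() -> int:
    await anyio.sleep(0)
    return 42


def run_sync() -> int:
    """Run compute() to completion whether or not a loop is running."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # no running loop (plain script): anyio.run() is safe
        return anyio.run(compute)
    # a loop is already running (e.g. Jupyter): anyio.run() would raise,
    # so run the coroutine to completion on a helper thread instead
    result: list[int] = []
    thread = threading.Thread(target=lambda: result.append(anyio.run(compute)))
    thread.start()
    thread.join()
    return result[0]


print(run_sync())
```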
inspect_ai/_eval/task/generate.py
CHANGED
```diff
@@ -1,12 +1,8 @@
 from typing import Literal
 
-from inspect_ai.model import (
-    CachePolicy,
-    GenerateConfig,
-    Model,
-    call_tools,
-)
+from inspect_ai.model import CachePolicy, GenerateConfig, Model
 from inspect_ai.model._cache import epoch
+from inspect_ai.model._call_tools import execute_tools
 from inspect_ai.solver import TaskState
 from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
@@ -48,10 +44,13 @@ async def task_generate(
 
         # resolve tool calls if necessary
         if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
+            # call tools and update messages and output
+            messages, output = await execute_tools(
+                state.messages, state.tools, config.max_tool_output
             )
+            state.messages.extend(messages)
+            if output is not None:
+                state.output = output
 
         # check for completed or only executing a single tool call
        if state.completed or tool_calls == "single":
```
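The generate loop now calls `execute_tools()`, which takes the full message history and returns both the new messages and an optional model output to adopt, rather than `call_tools()`, which returned only tool messages. A minimal stub of that calling contract (the stub body is illustrative; only the shape of the return value mirrors the diff):

```python
# Illustrative stub of the new contract: execute_tools() per this diff
# returns (new_messages, optional_output) for the caller to apply.
def execute_tools_stub(
    messages: list[str], tools: list[str]
) -> tuple[list[str], str | None]:
    new_messages = [f"tool result from {tool}" for tool in tools]
    output = new_messages[-1] if new_messages else None
    return new_messages, output


state_messages = ["user: list files", "assistant: calling bash"]
new_messages, output = execute_tools_stub(state_messages, ["bash"])
state_messages.extend(new_messages)  # same update pattern as the diff
if output is not None:
    state_output = output
```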
|