inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +3 -4
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +48 -0
- inspect_ai/_eval/eval.py +36 -24
- inspect_ai/_eval/evalset.py +17 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +8 -13
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/file.py +13 -0
- inspect_ai/_util/json.py +2 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +304 -128
- inspect_ai/_view/www/dist/assets/index.js +47495 -27519
- inspect_ai/_view/www/log-schema.json +124 -31
- inspect_ai/_view/www/package.json +3 -0
- inspect_ai/_view/www/src/App.tsx +12 -0
- inspect_ai/_view/www/src/appearance/icons.ts +1 -0
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
- inspect_ai/_view/www/src/state/hooks.ts +5 -3
- inspect_ai/_view/www/src/state/logPolling.ts +5 -1
- inspect_ai/_view/www/src/state/logSlice.ts +10 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
- inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
- inspect_ai/_view/www/src/types/log.d.ts +34 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
- inspect_ai/_view/www/yarn.lock +94 -1
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_log.py +11 -2
- inspect_ai/log/_transcript.py +13 -9
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +256 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_model.py +113 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +2 -2
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/RECORD +179 -153
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/top_level.txt +0 -0
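The headline change in this release is the new inspect_ai/agent package (Agent, as_solver, as_tool, handoff, react, run), with the bridge and human-agent solvers relocated under it. A minimal sketch of the decorator-based agent API these files imply — the body of execute is illustrative, not taken from the diff:

    from inspect_ai.agent import Agent, AgentState, agent
    from inspect_ai.model import get_model

    @agent
    def critic() -> Agent:
        async def execute(state: AgentState) -> AgentState:
            # append a model turn to the agent's conversation state
            state.output = await get_model().generate(state.messages)
            state.messages.append(state.output.message)
            return state

        return execute

Registering via @agent is what lets the loader and eval() changes further down resolve agents anywhere a solver is expected.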
inspect_ai/__init__.py
CHANGED
@@ -10,7 +10,8 @@ from inspect_ai._eval.score import score, score_async
 from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
 from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
-from inspect_ai.
+from inspect_ai.agent._human.agent import human_cli
+from inspect_ai.solver._human_agent import human_agent
 
 __version__ = importlib_version(PKG_NAME)
 
inspect_ai/_display/textual/app.py
CHANGED
@@ -58,10 +58,12 @@ class TaskScreenResult(Generic[TR]):
         value: TR | BaseException,
         tasks: list[TaskWithResult],
         output: list[str],
+        warnings: list[str],
     ) -> None:
         self.value = value
         self.tasks = tasks
         self.output = output
+        self.warnings = warnings
 
 
 class TaskScreenApp(App[TR]):
@@ -86,6 +88,7 @@ class TaskScreenApp(App[TR]):
         self._worker: Worker[TR] | None = None
         self._error: BaseException | None = None
         self._output: list[str] = []
+        self._warnings: list[str] = []
 
         # task screen
         self._total_tasks = 0
@@ -120,7 +123,12 @@ class TaskScreenApp(App[TR]):
             value = CancelledError()
 
         # return result w/ output
-        return TaskScreenResult(
+        return TaskScreenResult(
+            value=value,
+            tasks=self._app_tasks,
+            output=self._output,
+            warnings=self._warnings,
+        )
 
     async def on_load(self) -> None:
         # events used to synchronise loading
@@ -349,8 +357,11 @@ class TaskScreenApp(App[TR]):
         if text.endswith("\n"):
             text = text[:-1]
 
-        # track output (for printing at the end)
-
+        # track output and warnings (for printing at the end)
+        if "WARNING" in text:
+            self._warnings.append(text)
+        else:
+            self._output.append(text)
 
         # write to console view
         self.query_one(ConsoleView).write_ansi(text)
inspect_ai/_display/textual/display.py
CHANGED
@@ -42,6 +42,10 @@ class TextualDisplay(Display):
         # print tasks
         rich.print(tasks_results(result.tasks))
 
+        # print warnings
+        if result.warnings:
+            print("\n".join(result.warnings))
+
         # raise error as required
         if isinstance(result.value, BaseException):
             raise result.value
inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -17,7 +17,7 @@ from textual.widgets import (
     OptionList,
     Static,
 )
-from textual.widgets.option_list import Option
+from textual.widgets.option_list import Option, OptionDoesNotExist
 
 from inspect_ai._display.textual.widgets.port_mappings import get_url
 from inspect_ai._util.format import format_progress_time
@@ -124,7 +124,7 @@ class SamplesList(OptionList):
     def set_samples(self, samples: list[ActiveSample]) -> None:
         # check for a highlighted sample (make sure we don't remove it)
         highlighted_id = (
-            self.
+            self.get_id_at_index(self.highlighted)
             if self.highlighted is not None
             else None
         )
@@ -179,12 +179,18 @@ class SamplesList(OptionList):
         self.scroll_to_highlight()
 
     def sample_for_highlighted(self, highlighted: int) -> ActiveSample | None:
-        highlighted_id = self.
+        highlighted_id = self.get_id_at_index(highlighted)
         if highlighted_id is not None:
             return sample_for_id(self.samples, highlighted_id)
         else:
             return None
 
+    def get_id_at_index(self, index: int) -> str | None:
+        try:
+            return self.get_option_at_index(index).id
+        except OptionDoesNotExist:
+            return None
+
 
 class SampleVNC(Horizontal):
     DEFAULT_CSS = """
inspect_ai/_display/textual/widgets/task_detail.py
CHANGED
@@ -221,12 +221,11 @@ class TaskMetrics(Widget):
         self.recompute_grid()
 
     def on_mount(self) -> None:
-        self.recompute_grid()
+        self.recompute_grid(True)
 
-    def recompute_grid(self) -> None:
-        if not self.is_mounted:
+    def recompute_grid(self, force: bool = False) -> None:
+        if not self.is_mounted and not force:
             return
-
         grid = self.query_one(f"#{self.grid_id()}")
 
         grid.remove_children()
inspect_ai/_display/textual/widgets/tasks.py
CHANGED
@@ -17,6 +17,11 @@ from inspect_ai._display.core.results import task_metric
 from inspect_ai._display.textual.widgets.clock import Clock
 from inspect_ai._display.textual.widgets.task_detail import TaskDetail
 from inspect_ai._display.textual.widgets.toggle import Toggle
+from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
+from inspect_ai._util.file import to_uri
+from inspect_ai._util.vscode import (
+    VSCodeCommand,
+)
 
 from ...core.display import (
     Progress,
@@ -151,7 +156,7 @@ class TaskProgressView(Widget):
     height: auto;
     width: 1fr;
     layout: grid;
-    grid-size:
+    grid-size: 9 2;
    grid-columns: auto auto auto auto 1fr auto auto auto;
    grid-rows: auto auto;
    grid-gutter: 0 1;
@@ -200,6 +205,15 @@ class TaskProgressView(Widget):
 
         self.sample_count_width: int = sample_count_width
         self.display_metrics = display_metrics
+        self.view_log_link = conditional_vscode_link(
+            "[View Log]",
+            VSCodeCommand(
+                command="inspect.openLogViewer",
+                args=[to_uri(task.profile.log_location)]
+                if task.profile.log_location
+                else [],
+            ),
+        )
 
     metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
     metrics_width: reactive[int | None] = reactive(None)
@@ -222,6 +236,8 @@ class TaskProgressView(Widget):
         yield self.count_display
         yield self.metrics_display
         yield Clock()
+        yield self.view_log_link
+
         yield self.task_detail
 
     @on(Toggle.Toggled)
inspect_ai/_display/textual/widgets/vscode.py
ADDED
@@ -0,0 +1,48 @@
+from textual.widget import Widget
+from textual.widgets import Link, Static
+
+from inspect_ai._util.vscode import (
+    VSCodeCommand,
+    can_execute_vscode_command,
+    execute_vscode_commands,
+)
+
+
+def conditional_vscode_link(text: str, command: VSCodeCommand) -> Widget:
+    if can_execute_vscode_command(command.command):
+        vscode_link = VSCodeLink(text)
+        vscode_link.commands = [command]
+        return vscode_link
+    else:
+        return Static()
+
+
+class VSCodeLink(Link):
+    def __init__(
+        self,
+        text: str,
+        *,
+        url: str | None = None,
+        tooltip: str | None = None,
+        name: str | None = None,
+        id: str | None = None,
+        classes: str | None = None,
+        disabled: bool = False,
+    ) -> None:
+        super().__init__(
+            text,
+            url=url,
+            tooltip=tooltip,
+            name=name,
+            id=id,
+            classes=classes,
+            disabled=disabled,
+        )
+        self.commands: list[VSCodeCommand] = []
+
+    def on_click(self) -> None:
+        execute_vscode_commands(self.commands)
+
+    def action_open_link(self) -> None:
+        # Workaround to prevent the default action of opening the link in a browser
+        return None
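A usage sketch for the new widget, mirroring how tasks.py wires it up earlier in this diff (the command name comes from the diff; the URI is illustrative):

    from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
    from inspect_ai._util.vscode import VSCodeCommand

    # renders a clickable VSCodeLink when running where the VS Code command
    # can execute, and an empty Static placeholder otherwise
    link = conditional_vscode_link(
        "[View Log]",
        VSCodeCommand(
            command="inspect.openLogViewer",
            args=["file:///tmp/logs/task.eval"],  # hypothetical log URI
        ),
    )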
inspect_ai/_eval/eval.py
CHANGED
@@ -2,9 +2,11 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast
 
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent, is_agent
+from inspect_ai.agent._as_solver import as_solver
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -71,7 +73,7 @@ def eval(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] |
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -246,7 +248,7 @@ async def eval_async(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] |
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
@@ -353,13 +355,10 @@ async def eval_async(
 
     try:
         # intialise eval
-        model, approval
-            tasks=tasks,
+        model, approval = eval_init(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
-            task_args=task_args,
-            sandbox=sandbox,
             approval=approval,
             max_subprocesses=max_subprocesses,
             log_level=log_level,
@@ -367,6 +366,11 @@ async def eval_async(
             **kwargs,
         )
 
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, model, GenerateConfig(**kwargs), sandbox
+        )
+
         # warn and return empty string if we resolved no tasks
         if len(resolved_tasks) == 0:
             log.warning("No inspect tasks were found at the specified paths.")
@@ -412,7 +416,12 @@ async def eval_async(
         )
 
         # resolve solver
-
+        if isinstance(solver, list):
+            solver = chain(solver)
+        elif is_agent(solver):
+            solver = as_solver(solver)
+        else:
+            solver = cast(Solver | SolverSpec | None, solver)
 
         # ensure consistency of limit and sample_id
         if sample_id is not None and limit is not None:
@@ -724,7 +733,7 @@ async def eval_retry_async(
     # context to reconstruct ephemeral Task instances)
     task: str | None
     task_id = eval_log.eval.task_id
-    task_name = eval_log.eval.task
+    task_name = eval_log.eval.task_registry_name or eval_log.eval.task
     task_file = eval_log.eval.task_file
     if task_file:
         if not Path(task_file).exists():
@@ -846,24 +855,20 @@ async def eval_retry_async(
 
 
 def eval_init(
-    tasks: Tasks,
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
-    task_args: dict[str, Any] | str = dict(),
-    sandbox: SandboxEnvironmentType | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     max_subprocesses: int | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
-) -> tuple[list[Model], list[ApprovalPolicy] | None
+) -> tuple[list[Model], list[ApprovalPolicy] | None]:
     # init eval context
     init_eval_context(log_level, log_level_transcript, max_subprocesses)
 
     # resolve model and task args
     model_args = resolve_args(model_args)
-    task_args = resolve_args(task_args)
 
     # resolve model args from environment if not specified
     if len(model_args) == 0:
@@ -876,21 +881,28 @@ def eval_init(
     generate_config = GenerateConfig(**kwargs)
     models = resolve_models(model, model_base_url, model_args, generate_config)
 
-    # resolve tasks (set active model to resolve uses of the
-    # 'default' model in tools, solvers, and scorers)
-
-    with task_display().suspend_task_app():
-        resolved_tasks: list[ResolvedTask] = []
-        for m in models:
-            init_active_model(m, generate_config)
-            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
-
     # resolve approval
     if isinstance(approval, str | ApprovalPolicyConfig):
         approval = approval_policies_from_config(approval)
     init_tool_approval(approval)
 
-    return models, approval
+    return models, approval
+
+
+def eval_resolve_tasks(
+    tasks: Tasks,
+    task_args: dict[str, Any] | str,
+    models: list[Model],
+    config: GenerateConfig,
+    sandbox: SandboxEnvironmentType | None,
+) -> list[ResolvedTask]:
+    task_args = resolve_args(task_args)
+    with task_display().suspend_task_app():
+        resolved_tasks: list[ResolvedTask] = []
+        for m in models:
+            init_active_model(m, config)
+            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
+    return resolved_tasks
 
 
 def init_eval_display(
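Since solver now also accepts an Agent, the is_agent()/as_solver() branch makes agents directly usable with eval(). A hedged sketch (the task, prompt, and model are illustrative; react comes from the new agent/_react.py module):

    from inspect_ai import Task, eval
    from inspect_ai.agent import react
    from inspect_ai.dataset import Sample

    task = Task(dataset=[Sample(input="What is 2 + 2?", target="4")])

    # previously this required wrapping the agent yourself; eval() now
    # routes an Agent through as_solver() internally
    eval(task, solver=react(prompt="You are a careful assistant."), model="openai/gpt-4o")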
inspect_ai/_eval/evalset.py
CHANGED
@@ -1,6 +1,5 @@
 import hashlib
 import logging
-from copy import deepcopy
 from typing import Any, Literal, NamedTuple, Set, cast
 
 import rich
@@ -18,6 +17,7 @@ from typing_extensions import Unpack
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import basename, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
 from inspect_ai.log import EvalLog
 from inspect_ai.log._bundle import bundle_log_dir
@@ -37,7 +37,7 @@ from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import DisplayType, SandboxEnvironmentType
 from inspect_ai.util._display import display_type_initialized, init_display_type
 
-from .eval import eval, eval_init
+from .eval import eval, eval_init, eval_resolve_tasks
 from .loader import resolve_task_args
 from .task import Epochs
 from .task.resolved import ResolvedTask
@@ -66,7 +66,7 @@ def eval_set(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] |
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -247,29 +247,21 @@ def eval_set(
     if display == "conversation":
         raise RuntimeError("eval_set cannot be used with conversation display.")
 
-    #
-    models, _
-        tasks=tasks,
+    # initialize eval
+    models, _ = eval_init(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
-        task_args=task_args,
-        sandbox=sandbox,
         max_subprocesses=max_subprocesses,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
         **kwargs,
     )
 
-    # ensure log_dir
+    # ensure log_dir
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)
 
-    # validate that:
-    # (1) All tasks have a unique identifier
-    # (2) All logs have identifiers that map to tasks
-    validate_eval_set_prerequisites(resolved_tasks, list_all_eval_logs(log_dir))
-
     # resolve some parameters
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
@@ -310,11 +302,21 @@ def eval_set(
     # - tasks with a successful log (they'll just be returned)
     # - tasks with failed logs (they'll be retried)
     def try_eval() -> list[EvalLog]:
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, models, GenerateConfig(**kwargs), sandbox
+        )
+
         # list all logs currently in the log directory (update manifest if there are some)
         all_logs = list_all_eval_logs(log_dir)
         if len(all_logs) > 0:
             write_log_dir_manifest(log_dir)
 
+        # validate that:
+        # (1) All tasks have a unique identifier
+        # (2) All logs have identifiers that map to tasks
+        validate_eval_set_prerequisites(resolved_tasks, all_logs)
+
         # see which tasks are yet to run (to complete successfully we need
         # a successful eval for every [task_file/]task_name/model combination)
         # for those that haven't run, schedule them into models => tasks groups
@@ -419,13 +421,10 @@ def as_previous_tasks(
             # want to bring this back but we'd need to resolve the
             # directory issues.
 
-            # deepcopy so the same instance is not run twice
-            prev_task = deepcopy(task.task)
-
             previous_tasks.append(
                 PreviousTask(
                     id=log.header.eval.task_id,
-                    task=
+                    task=task.task,
                     task_args=resolve_task_args(task.task),
                     model=task.model,
                     log=read_eval_log(log.info),
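The net effect of these changes: task resolution now happens inside try_eval(), so every retry attempt re-resolves fresh Task instances instead of deep-copying them. An illustrative call (paths are hypothetical):

    from inspect_ai import eval_set

    # each retry attempt re-resolves the tasks before scheduling work
    success, logs = eval_set(
        tasks="tasks/security.py",
        log_dir="logs/security-run-1",
        retry_attempts=3,
    )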
inspect_ai/_eval/loader.py
CHANGED
@@ -26,6 +26,8 @@ from inspect_ai._util.registry import (
     registry_lookup,
     registry_params,
 )
+from inspect_ai.agent._agent import Agent
+from inspect_ai.agent._as_solver import as_solver
 from inspect_ai.model import Model
 from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
 from inspect_ai.solver._bridge import bridge
@@ -421,20 +423,32 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
     if solver_file is None:
         if solver_name is None:
             raise ValueError(f"Unable to resolve solver name from {spec.solver}")
-
+        elif registry_lookup("solver", solver_name) is not None:
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("agent", solver_name) is not None:
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+        else:
+            raise ValueError(
+                f"Unkonwn solver {solver_name} (not registered as a @solver or @agent)"
+            )
 
     # we do have a solver file
     else:
         # load the module and parse decorators
         solver_module = load_module(solver_file)
-
+        solver_decorators = parse_decorators(solver_file, "solver")
+        agent_decorators = parse_decorators(solver_file, "agent")
 
         # if there is no solver_name see if we can discover it
         if solver_name is None:
-            if len(
+            if len(solver_decorators) == 1:
                 # decorator based solver
-                solver_name =
-            elif len(
+                solver_name = solver_decorators[0][0]
+            elif len(agent_decorators) == 1:
+                # decorator based agent
+                solver_name = agent_decorators[0][0]
+            elif len(solver_decorators) == 0 and len(agent_decorators) == 0:
                 # see if we can find an agent based solver
                 functions = [
                     function
@@ -454,26 +468,35 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
 
             elif len(agent_functions) == 0:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} does not contain any @solver
+                    f"The source file {pretty_solver_file} does not contain any @solver, @agent or bridged agent functions."
                 )
             else:
                 raise PrerequisiteError(
-                    f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                    f"The source file {pretty_solver_file} has more than one bridged agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
                 )
-
+        elif len(solver_decorators) > 1:
             raise PrerequisiteError(
                 f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
             )
+        else:
+            raise PrerequisiteError(
+                f"The source file {pretty_solver_file} has more than one @agent function (qualify which agent using e.g. '{solver_file.name}y@agent_fn')"
+            )
 
         # create decorator based solvers using the registry
-        if any(solver[0] == solver_name for solver in
+        if any(solver[0] == solver_name for solver in solver_decorators):
             return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-        # create
+        # create decorator based agents using the registry
+        elif any(agent[0] == solver_name for agent in agent_decorators):
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+
+        # create bridge based solvers by calling the function and wrapping it in bridge()
         else:
            agent_fn = getattr(solver_module, solver_name, None)
            if inspect.isfunction(agent_fn):
-               return bridge
+               return bridge(agent_fn(**spec.args))
            elif agent_fn is not None:
                raise PrerequisiteError(
                    f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
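With these loader changes, a registered @agent resolves anywhere a solver spec is accepted (the registry fallback above), and a source file whose only decorated function is an @agent can be named directly. A hedged sketch of the registry path, using hypothetical names and args:

    from inspect_ai._eval.loader import solver_from_spec
    from inspect_ai.solver import SolverSpec

    # looks up "my_agent" in the solver registry first, then the agent
    # registry, wrapping a found agent via as_solver()
    solver = solver_from_spec(SolverSpec("my_agent", {"attempts": 2}))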
inspect_ai/_eval/run.py
CHANGED
@@ -1,4 +1,3 @@
-import functools
 import logging
 import os
 import sys
@@ -20,7 +19,6 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
-from inspect_ai._util._async import tg_collect
 from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
@@ -195,6 +193,7 @@ async def eval_run(
             task_name=task.name,
             task_version=task.version,
             task_file=resolved_task.task_file,
+            task_registry_name=resolved_task.task.registry_name,
             task_id=resolved_task.id if resolved_task.id else uuid(),
             run_id=run_id,
             solver=eval_solver_spec,
@@ -359,17 +358,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                 "Run Task",
                 f"task: {task_options.task.name} ({task_options.model})",
             ):
-
-
-
-
-
-
-
-                else:
-                    result = tg_results[0]
-                    results.append(result)
+                async with anyio.create_task_group() as tg:
+
+                    async def run_task() -> None:
+                        result = await task_run(task_options)
+                        results.append(result)
+
+                    tg.start_soon(run_task)
 
         except Exception as ex:
             # errors generally don't escape from tasks (the exception being if an error
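run_multiple replaces the functools/tg_collect machinery with a plain task group; the async with block does not exit until every task started in it completes. A self-contained sketch of that pattern (names are illustrative):

    import anyio

    async def run_all(items: list[str]) -> list[str]:
        results: list[str] = []

        async def run_one(item: str) -> None:
            results.append(item.upper())  # stand-in for task_run(options)

        async with anyio.create_task_group() as tg:
            for item in items:
                tg.start_soon(run_one, item)
        # all run_one() calls have completed by this point
        return results

    print(anyio.run(run_all, ["a", "b"]))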
inspect_ai/_eval/score.py
CHANGED
@@ -7,8 +7,8 @@ import anyio
 
 from inspect_ai._display import display
 from inspect_ai._eval.loader import scorer_from_spec
-from inspect_ai._util._async import tg_collect
-from inspect_ai._util.platform import platform_init
+from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
+from inspect_ai._util.platform import platform_init, running_in_notebook
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
@@ -56,7 +56,17 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers
 
-
+    if running_in_notebook():
+        return run_coroutine(score_async(log, scorers, epochs_reducer, action))
+    else:
+        return anyio.run(
+            score_async,
+            log,
+            scorers,
+            epochs_reducer,
+            action,
+            backend=configured_async_backend(),
+        )
 
 
 async def score_async(
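The effect of this change: score() now detects a running notebook event loop and uses run_coroutine() instead of anyio.run(), which raises when a loop is already running. Illustrative usage (the log path is hypothetical):

    from inspect_ai import score
    from inspect_ai.log import read_eval_log
    from inspect_ai.scorer import match

    log = read_eval_log("logs/2025-04-01T12-00-00-task.eval")
    scored = score(log, scorers=match())  # now also works inside Jupyter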
inspect_ai/_eval/task/generate.py
CHANGED
@@ -1,12 +1,8 @@
 from typing import Literal
 
-from inspect_ai.model import (
-    CachePolicy,
-    GenerateConfig,
-    Model,
-    call_tools,
-)
+from inspect_ai.model import CachePolicy, GenerateConfig, Model
 from inspect_ai.model._cache import epoch
+from inspect_ai.model._call_tools import execute_tools
 from inspect_ai.solver import TaskState
 from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
@@ -48,10 +44,13 @@ async def task_generate(
 
         # resolve tool calls if necessary
         if tool_calls != "none" and message.tool_calls:
-            # call tools and
-
-
+            # call tools and update messages and output
+            messages, output = await execute_tools(
+                state.messages, state.tools, config.max_tool_output
             )
+            state.messages.extend(messages)
+            if output is not None:
+                state.output = output
 
             # check for completed or only executing a single tool call
             if state.completed or tool_calls == "single":
inspect_ai/_eval/task/log.py
CHANGED
@@ -57,6 +57,7 @@ class TaskLogger:
         task_name: str,
         task_version: int,
         task_file: str | None,
+        task_registry_name: str | None,
         task_id: str | None,
         run_id: str,
         solver: SolverSpec | None,
@@ -131,6 +132,7 @@ class TaskLogger:
             task_id=task_id if task_id else uuid(),
             task_version=task_version,
             task_file=task_file,
+            task_registry_name=task_registry_name,
             task_attribs=task_attribs,
             task_args=task_args,
             solver=solver.solver if solver else None,
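The new task_registry_name field, threaded from eval_run() above into the log header, preserves a task's registry identity even when its display name is overridden, which is what eval_retry's task_registry_name-or-task fallback relies on. A hedged illustration (the task body and name are placeholders):

    from inspect_ai import Task, task, task_with
    from inspect_ai.dataset import Sample

    @task
    def my_task() -> Task:
        return Task(dataset=[Sample(input="hello")])

    # the display name changes, but the log can still record the registry
    # name "my_task", letting eval_retry() re-instantiate the original task
    variant = task_with(my_task(), name="my_task_variant")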
|