inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -244
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +55 -18
- inspect_ai/_view/www/dist/assets/index.js +550 -458
- inspect_ai/_view/www/log-schema.json +84 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +150 -129
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -9
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +15 -2
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +25 -14
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_call.py +3 -0
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -84,6 +84,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     envvar="INSPECT_EVAL_MODEL_CONFIG",
     help="YAML or JSON config file with model arguments.",
 )
+@click.option(
+    "--model-role",
+    multiple=True,
+    type=str,
+    envvar="INSPECT_EVAL_MODEL_ROLE",
+    help="Named model role, e.g. --model-role critic=openai/gpt-4o",
+)
 @click.option(
     "-T",
     multiple=True,
@@ -467,6 +474,7 @@ def eval_command(
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
+    model_role: tuple[str] | None,
     t: tuple[str] | None,
     task_config: str | None,
     s: tuple[str] | None,
@@ -545,6 +553,7 @@ def eval_command(
         model_base_url=model_base_url,
         m=m,
         model_config=model_config,
+        model_role=model_role,
         t=t,
         task_config=task_config,
         s=s,
@@ -638,6 +647,7 @@ def eval_set_command(
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
+    model_role: tuple[str] | None,
     t: tuple[str] | None,
     task_config: str | None,
     s: tuple[str] | None,
@@ -719,6 +729,7 @@ def eval_set_command(
         model_base_url=model_base_url,
         m=m,
         model_config=model_config,
+        model_role=model_role,
         t=t,
         task_config=task_config,
         s=s,
@@ -775,6 +786,7 @@ def eval_exec(
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
+    model_role: tuple[str] | None,
     t: tuple[str] | None,
     task_config: str | None,
     s: tuple[str] | None,
@@ -820,6 +832,9 @@ def eval_exec(
     solver_args = parse_cli_config(s, solver_config)
     model_args = parse_cli_config(m, model_config)

+    # parse model roles
+    eval_model_roles = parse_cli_args(model_role, force_str=True)
+
     # parse tags
     eval_tags = parse_comma_separated(tags)

@@ -858,6 +873,7 @@ def eval_exec(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
+        model_roles=eval_model_roles,
         task_args=task_args,
         solver=SolverSpec(solver, solver_args) if solver else None,
         tags=eval_tags,
inspect_ai/_cli/score.py
CHANGED
@@ -11,13 +11,12 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_config
 from inspect_ai._display import display
 from inspect_ai._display.core.rich import rich_theme
-from inspect_ai._eval.context import init_eval_context, init_task_context
+from inspect_ai._eval.context import init_eval_context
 from inspect_ai._eval.score import ScoreAction, task_score
 from inspect_ai._util._async import configured_async_backend
 from inspect_ai._util.file import basename, dirname, exists
 from inspect_ai.log._log import EvalLog
 from inspect_ai.log._recorders import create_recorder_for_location
-from inspect_ai.model import get_model

 from .common import CommonOptions, common_options, process_common_options

@@ -109,16 +108,6 @@ async def score(
     if eval_log.samples is None or len(eval_log.samples) == 0:
         raise ValueError(f"{log_file} does not include samples to score")

-    # get the model then initialize the async context
-    model = get_model(
-        model=eval_log.eval.model,
-        config=eval_log.plan.config,
-        **eval_log.eval.model_args,
-    )
-
-    # initialize active model
-    init_task_context(model)
-
     # re-score the task
     eval_log = await task_score(
         log=eval_log, scorer=scorer, scorer_args=scorer_args, action=action
inspect_ai/_cli/util.py
CHANGED
@@ -63,7 +63,9 @@ def parse_cli_config(
     return cli_config


-def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
+def parse_cli_args(
+    args: tuple[str] | list[str] | None, force_str: bool = False
+) -> dict[str, Any]:
     params: dict[str, Any] = dict()
     if args:
         for arg in list(args):
@@ -74,7 +76,7 @@ def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
             if isinstance(value, str):
                 value = value.split(",")
                 value = value if len(value) > 1 else value[0]
-            params[key] = value
+            params[key] = str(value) if force_str else value
     return params


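
The new `force_str` flag matters for `--model-role`, where the right-hand side of `role=model` must stay a plain string even though the generic key=value parser may otherwise coerce or split values. A rough standalone illustration of the intent (not the inspect_ai implementation; the real `parse_cli_args` also applies value parsing only partially shown in this hunk):

    from typing import Any


    def parse_kv(args: tuple[str, ...] | None, force_str: bool = False) -> dict[str, Any]:
        # split KEY=VALUE pairs; optionally force values back to str (as --model-role needs)
        params: dict[str, Any] = {}
        for arg in args or ():
            key, _, value = arg.partition("=")
            parsed: Any = value.split(",") if "," in value else value
            params[key] = str(parsed) if force_str else parsed
        return params


    print(parse_kv(("critic=openai/gpt-4o",), force_str=True))
    # {'critic': 'openai/gpt-4o'}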
inspect_ai/_display/core/footer.py
CHANGED
@@ -2,7 +2,7 @@ from rich.console import RenderableType
 from rich.text import Text

 from inspect_ai._util.retry import http_retries_count
-from inspect_ai.util._concurrency import
+from inspect_ai.util._concurrency import concurrency_status_display
 from inspect_ai.util._throttle import throttle

 from .config import task_dict
@@ -20,7 +20,7 @@ def task_footer(

 def task_resources() -> str:
     resources: dict[str, str] = {}
-    for model, resource in
+    for model, resource in concurrency_status_display().items():
         resources[model] = f"{resource[0]}/{resource[1]}"
     return task_dict(resources)

inspect_ai/_display/plain/display.py
CHANGED
@@ -10,7 +10,7 @@ from inspect_ai._util.platform import running_in_notebook
 from inspect_ai._util.text import truncate
 from inspect_ai._util.throttle import throttle

-from ...util._concurrency import
+from ...util._concurrency import concurrency_status_display
 from ..core.config import task_config
 from ..core.display import (
     TR,
@@ -179,7 +179,7 @@ class PlainTaskDisplay(TaskDisplay):
         # Very similar to ``inspect_ai._display.core.footer.task_resources``, but without
         # the rich formatting added in the ``task_dict`` call
         resources_dict: dict[str, str] = {}
-        for model, resource in
+        for model, resource in concurrency_status_display().items():
             resources_dict[model] = f"{resource[0]:2d}/{resource[1]:2d}"
         resources = ", ".join(
             [f"{key}: {value}" for key, value in resources_dict.items()]
inspect_ai/_eval/context.py
CHANGED
@@ -6,7 +6,11 @@ from inspect_ai.approval._human.manager import init_human_approval_manager
 from inspect_ai.approval._policy import ApprovalPolicy
 from inspect_ai.log._samples import init_active_samples
 from inspect_ai.model import GenerateConfig, Model
-from inspect_ai.model._model import
+from inspect_ai.model._model import (
+    init_active_model,
+    init_model_roles,
+    init_model_usage,
+)
 from inspect_ai.util._concurrency import init_concurrency
 from inspect_ai.util._subprocess import init_max_subprocesses

@@ -27,10 +31,12 @@ def init_eval_context(

 def init_task_context(
     model: Model,
+    model_roles: dict[str, Model] | None = None,
     approval: list[ApprovalPolicy] | None = None,
     config: GenerateConfig = GenerateConfig(),
 ) -> None:
     init_active_model(model, config)
+    init_model_roles(model_roles or {})
     init_model_usage()
     if not have_tool_approval():
         init_tool_approval(approval)
inspect_ai/_eval/eval.py
CHANGED
@@ -4,9 +4,11 @@ import sys
 from pathlib import Path
 from typing import Any, Literal, cast

+from inspect_ai._eval.task.task import resolve_model_roles
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai.agent._agent import Agent, is_agent
 from inspect_ai.agent._as_solver import as_solver
+from inspect_ai.log._model import model_roles_config_to_model_roles

 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -70,6 +72,7 @@ def eval(
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
+    model_roles: dict[str, str | Model] | None = None,
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
@@ -84,7 +87,7 @@ def eval(
     log_dir: str | None = None,
     log_format: Literal["eval", "json"] | None = None,
     limit: int | tuple[int, int] | None = None,
-    sample_id: str | int | list[str | int] | None = None,
+    sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -116,6 +119,7 @@ def eval(
             with the model API.
         model_args: Model creation args
             (as a dictionary or as a path to a JSON or YAML config file)
+        model_roles: Named roles for use in `get_model()`.
         task_args: Task creation arguments
             (as a dictionary or as a path to a JSON or YAML config file)
         sandbox: Sandbox environment type
@@ -194,6 +198,7 @@ def eval(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
+            model_roles=model_roles,
             task_args=task_args,
             sandbox=sandbox,
             sandbox_cleanup=sandbox_cleanup,
@@ -245,6 +250,7 @@ async def eval_async(
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
+    model_roles: dict[str, str | Model] | None = None,
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
@@ -257,7 +263,7 @@ async def eval_async(
     log_dir: str | None = None,
     log_format: Literal["eval", "json"] | None = None,
     limit: int | tuple[int, int] | None = None,
-    sample_id: str | int | list[str | int] | None = None,
+    sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -286,7 +292,8 @@ async def eval_async(
             environment variable. Specify `None` to define no default model(s), which will
             leave model usage entirely up to tasks.
         model_base_url: Base URL for communicating with the model API.
-        model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file
+        model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file
+        model_roles: Named roles for use in `get_model()`.
         task_args: Task creation arguments (as a dictionary or as a path to a JSON or YAML config file)
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
@@ -333,12 +340,11 @@ async def eval_async(
     Returns:
         List of EvalLog (one for each task)
     """
-    # only a single call to eval_async can be active at a time, this
-    #
-    #
-    #
-    #
-    # will enforce the appropriate constraints on task parallelism)
+    # only a single call to eval_async can be active at a time, this used
+    # to be due to running tasks switching to the task's directory, however
+    # that feature no longer exists so we may be able to revisit this
+    # restriction (probably just need to examine if there is *global* state
+    # that could have conflicts in the case of multiple eval_async calls)
     global _eval_async_running
     if _eval_async_running:
         raise RuntimeError("Multiple concurrent calls to eval_async are not allowed.")
@@ -355,11 +361,10 @@ async def eval_async(

     try:
         # intialise eval
-        model
+        model = eval_init(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
-            approval=approval,
             max_subprocesses=max_subprocesses,
             log_level=log_level,
             log_level_transcript=log_level_transcript,
@@ -367,8 +372,14 @@ async def eval_async(
         )

         # resolve tasks
-        resolved_tasks = eval_resolve_tasks(
-            tasks,
+        resolved_tasks, approval = eval_resolve_tasks(
+            tasks,
+            task_args,
+            model,
+            model_roles,
+            GenerateConfig(**kwargs),
+            approval,
+            sandbox,
         )

         # warn and return empty string if we resolved no tasks
@@ -759,6 +770,9 @@ async def eval_retry_async(
         **eval_log.eval.model_args,
     )

+    # resolve model roles
+    model_roles = model_roles_config_to_model_roles(eval_log.eval.model_roles)
+
     # collect the rest of the params we need for the eval
     task_args = eval_log.eval.task_args
     tags = eval_log.eval.tags
@@ -815,9 +829,15 @@ async def eval_retry_async(
         log = (
             await eval_async(
                 tasks=PreviousTask(
-                    id=task_id,
+                    id=task_id,
+                    task=task,
+                    task_args=task_args,
+                    model=None,
+                    model_roles=None,
+                    log=eval_log,
                 ),
                 model=model,
+                model_roles=cast(dict[str, str | Model], model_roles),
                 task_args=task_args,
                 sandbox=eval_log.eval.sandbox,
                 sandbox_cleanup=sandbox_cleanup,
@@ -861,12 +881,11 @@ def eval_init(
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
-    approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     max_subprocesses: int | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
-) ->
+) -> list[Model]:
     # init eval context
     init_eval_context(log_level, log_level_transcript, max_subprocesses)

@@ -880,32 +899,37 @@ def eval_init(
         args = [arg.strip() for arg in env_model_args.split(" ")]
         model_args = parse_cli_args(args)

-    # resolve models
+    # resolve and return models
     generate_config = GenerateConfig(**kwargs)
     models = resolve_models(model, model_base_url, model_args, generate_config)
-
-    # resolve approval
-    if isinstance(approval, str | ApprovalPolicyConfig):
-        approval = approval_policies_from_config(approval)
-    init_tool_approval(approval)
-
-    return models, approval
+    return models


 def eval_resolve_tasks(
     tasks: Tasks,
     task_args: dict[str, Any] | str,
     models: list[Model],
+    model_roles: dict[str, str | Model] | None,
     config: GenerateConfig,
+    approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None,
     sandbox: SandboxEnvironmentType | None,
-) -> list[ResolvedTask]:
+) -> tuple[list[ResolvedTask], list[ApprovalPolicy] | None]:
+    resolved_model_roles = resolve_model_roles(model_roles)
     task_args = resolve_args(task_args)
     with task_display().suspend_task_app():
         resolved_tasks: list[ResolvedTask] = []
         for m in models:
             init_active_model(m, config)
-            resolved_tasks.extend(
-
+            resolved_tasks.extend(
+                resolve_tasks(tasks, task_args, m, resolved_model_roles, sandbox)
+            )
+
+    if isinstance(approval, str | ApprovalPolicyConfig):
+        approval = approval_policies_from_config(approval)
+    init_tool_approval(approval)
+
+    # return tasks and approval
+    return resolved_tasks, approval


 def init_eval_display(
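
One small API note from the hunks above: the `sample_id` parameter of `eval()` and `eval_async()` (and of `eval_set()` below) now explicitly accepts homogeneous lists as well as mixed ones. For example (the task file name is hypothetical):

    from inspect_ai import eval

    # all of these satisfy the broadened sample_id signature
    eval("security_guide.py", model="openai/gpt-4o-mini", sample_id=1)
    eval("security_guide.py", model="openai/gpt-4o-mini", sample_id=[1, 2, 3])        # list[int]
    eval("security_guide.py", model="openai/gpt-4o-mini", sample_id=["dns", "smtp"])  # list[str]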
inspect_ai/_eval/evalset.py
CHANGED
@@ -28,6 +28,7 @@ from inspect_ai.log._file import (
     read_eval_log_headers,
     write_log_dir_manifest,
 )
+from inspect_ai.log._model import model_roles_to_model_roles_config
 from inspect_ai.model import (
     GenerateConfigArgs,
     Model,
@@ -63,6 +64,7 @@ def eval_set(
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
+    model_roles: dict[str, str | Model] | None = None,
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
@@ -77,7 +79,7 @@ def eval_set(
     log_level_transcript: str | None = None,
     log_format: Literal["eval", "json"] | None = None,
     limit: int | tuple[int, int] | None = None,
-    sample_id: str | int | list[str | int] | None = None,
+    sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
@@ -120,6 +122,7 @@ def eval_set(
             with the model API.
         model_args: Model creation args
             (as a dictionary or as a path to a JSON or YAML config file)
+        model_roles: Named roles for use in `get_model()`.
         task_args: Task creation arguments
             (as a dictionary or as a path to a JSON or YAML config file)
         sandbox: Sandbox environment type
@@ -194,6 +197,7 @@ def eval_set(
             model=None,  # ResolvedTask/PreviousTask already carries its model
             model_base_url=model_base_url,
             model_args=model_args,
+            model_roles=model_roles,
             task_args=task_args,
             sandbox=sandbox,
             sandbox_cleanup=sandbox_cleanup,
@@ -248,7 +252,7 @@ def eval_set(
         raise RuntimeError("eval_set cannot be used with conversation display.")

     # initialize eval
-    models
+    models = eval_init(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
@@ -303,8 +307,14 @@ def eval_set(
     # - tasks with failed logs (they'll be retried)
     def try_eval() -> list[EvalLog]:
         # resolve tasks
-        resolved_tasks = eval_resolve_tasks(
-            tasks,
+        resolved_tasks, _ = eval_resolve_tasks(
+            tasks,
+            task_args,
+            models,
+            model_roles,
+            GenerateConfig(**kwargs),
+            approval,
+            sandbox,
         )

         # list all logs currently in the log directory (update manifest if there are some)
@@ -415,18 +425,13 @@ def as_previous_tasks(

     previous_tasks: list[PreviousTask] = []
     for task, log in zip(tasks, map(task_to_failed_log, tasks)):
-        # NOTE: we used to try to recreate registry objects by
-        # by just passing the task name, but that didn't work
-        # when evals were run from another directory. we may
-        # want to bring this back but we'd need to resolve the
-        # directory issues.
-
         previous_tasks.append(
             PreviousTask(
                 id=log.header.eval.task_id,
                 task=task.task,
                 task_args=resolve_task_args(task.task),
                 model=task.model,
+                model_roles=task.model_roles,
                 log=read_eval_log(log.info),
             )
         )
@@ -561,17 +566,29 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
         task_name = task.task.name
         task_args = task.task_args
         model = str(task.model)
+        model_roles = model_roles_to_model_roles_config(task.model_roles) or {}
     else:
         task_file = task.eval.task_file or ""
         task_name = task.eval.task
         task_args = task.eval.task_args
         model = str(task.eval.model)
+        model_roles = task.eval.model_roles or {}

     # hash for task args
     task_args_hash = hashlib.sha256(
         to_json(task_args, exclude_none=True, fallback=lambda _x: None)
     ).hexdigest()

+    # hash for model roles
+    if len(model_roles):
+        model = (
+            model
+            + "/"
+            + hashlib.sha256(
+                to_json(model_roles, exclude_none=True, fallback=lambda _x: None)
+            ).hexdigest()
+        )
+
     if task_file:
         return f"{task_file}@{task_name}#{task_args_hash}/{model}"
     else:
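
The `task_identifier()` change folds the model-role configuration into the identifier's model component via a content hash, so the same task evaluated with different role assignments is tracked separately in the log directory. A standalone sketch of that hashing step (assuming `to_json` here is `pydantic_core.to_json`, which is not shown by the hunk but has a compatible signature):

    import hashlib

    from pydantic_core import to_json


    def model_with_roles_hash(model: str, model_roles: dict[str, str]) -> str:
        # append a stable hash of the role config so differing roles yield distinct identifiers
        if model_roles:
            roles_hash = hashlib.sha256(
                to_json(model_roles, exclude_none=True, fallback=lambda _x: None)
            ).hexdigest()
            model = f"{model}/{roles_hash}"
        return model


    print(model_with_roles_hash("openai/gpt-4o", {"critic": "anthropic/claude-3-5-sonnet-latest"}))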
inspect_ai/_eval/loader.py
CHANGED
@@ -13,7 +13,6 @@ from typing_extensions import overload

 from inspect_ai._eval.task.resolved import ResolvedTask
 from inspect_ai._eval.task.util import task_file, task_run_dir
-from inspect_ai._util._async import configured_async_backend
 from inspect_ai._util.decorator import parse_decorators
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
@@ -52,6 +51,7 @@ def resolve_tasks(
     tasks: Tasks,
     task_args: dict[str, Any],
     model: Model,
+    model_roles: dict[str, Model] | None,
     sandbox: SandboxEnvironmentType | None,
 ) -> list[ResolvedTask]:
     def as_resolved_tasks(tasks: list[Task]) -> list[ResolvedTask]:
@@ -61,6 +61,7 @@ def resolve_tasks(
                 task_args=resolve_task_args(task),
                 task_file=task_file(task, relative=True),
                 model=task.model or model,
+                model_roles=task.model_roles or model_roles,
                 sandbox=resolve_task_sandbox(task, sandbox),
                 sequence=sequence,
             )
@@ -109,6 +110,9 @@ def resolve_tasks(
                 task_args=loaded_task_args,
                 task_file=previous_task.log.eval.task_file,
                 model=previous_task.model or loaded_task.model or model,
+                model_roles=(
+                    previous_task.model_roles or loaded_task.model_roles or model_roles
+                ),
                 sandbox=previous_task.log.eval.sandbox,
                 sequence=sequence,
                 id=previous_task.id,
@@ -282,16 +286,11 @@ def create_file_tasks(
         setattr(task, TASK_RUN_DIR_ATTR, run_dir)
         tasks.append(task)

-        # warn that chdir
+        # warn that chdir has been removed
         if "chdir" in task.attribs:
-            if configured_async_backend() == "trio":
-                raise RuntimeError(
-                    "The task 'chdir' attribute is not compatible with the trio async backend."
-                )
-
             warn_once(
                 logger,
-                "The 'chdir' task attribute is
+                "The 'chdir' task attribute is no longer supported "
                 + "(you should write your tasks to not depend on their runtime working directory)",
             )

inspect_ai/_eval/run.py
CHANGED
@@ -49,9 +49,8 @@ from .loader import (
 from .task.log import TaskLogger
 from .task.resolved import ResolvedTask
 from .task.run import TaskRunOptions, task_run
-from .task.rundir import task_run_dir_switching
 from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
-from .task.util import slice_dataset,
+from .task.util import slice_dataset, task_run_dir

 log = logging.getLogger(__name__)

@@ -71,13 +70,10 @@ async def eval_run(
     score: bool = True,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
-    #
-    run_dir = task_run_dir(tasks[0].task)
-    multiple_run_dirs = any([task_run_dir(task.task) != run_dir for task in tasks])
-    tasks_chdir = any([task_chdir(task.task) is not None for task in tasks])
+    # are sandboxes in play?
     has_sandbox = next((task.has_sandbox for task in tasks), None)

-    # get cwd before switching
+    # get cwd before any switching
    eval_wd = os.getcwd()

     # ensure sample ids
@@ -199,6 +195,7 @@ async def eval_run(
                 solver=eval_solver_spec,
                 tags=tags,
                 model=resolved_task.model,
+                model_roles=resolved_task.model_roles,
                 dataset=task.dataset,
                 scorer=eval_scorer_specs,
                 metrics=eval_metrics,
@@ -217,6 +214,7 @@ async def eval_run(
                 TaskRunOptions(
                     task=task,
                     model=resolved_task.model,
+                    model_roles=resolved_task.model_roles,
                     sandbox=resolved_task.sandbox,
                     logger=logger,
                     eval_wd=eval_wd,
@@ -233,25 +231,10 @@ async def eval_run(
         # multiple mode is for running/displaying multiple
         # task definitions, which requires some smart scheduling
         # to ensure that we spread work among models
-        if
-
-            if multiple_run_dirs:
-                with task_run_dir_switching():
-                    return await run_multiple(task_run_options, parallel)
-            else:
-                with chdir(run_dir):
-                    return await run_multiple(task_run_options, parallel)
-
-        # single mode is for a single task definitions (which
-        # could in turn be executed for multiple models)
-        else:
-            with chdir(run_dir):
-                return await run_single(task_run_options, debug_errors)
+        if parallel > 1:
+            return await run_multiple(task_run_options, parallel)
         else:
-
-            return await run_multiple(task_run_options, parallel)
-        else:
-            return await run_single(task_run_options, debug_errors)
+            return await run_single(task_run_options, debug_errors)

     finally:
         # shutdown sandbox environments
@@ -359,12 +342,21 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
                     f"task: {task_options.task.name} ({task_options.model})",
                 ):
                     async with anyio.create_task_group() as tg:
-
-
-
-
-
-
+                        # Create a factory function that captures the current
+                        # task_options. Otherwise, we suffer from Python's
+                        # late/by reference binding behavior.
+                        # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
+                        def create_task_runner(
+                            options: TaskRunOptions = task_options,
+                        ) -> Callable[[], Awaitable[None]]:
+                            async def run_task() -> None:
+                                nonlocal result
+                                result = await task_run(options)
+                                results.append(result)
+
+                            return run_task
+
+                        tg.start_soon(create_task_runner())

         except Exception as ex:
             # errors generally don't escape from tasks (the exception being if an error