inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
@@ -7,7 +7,8 @@ from inspect_ai._eval.evalset import eval_set
 from inspect_ai._eval.list import list_tasks
 from inspect_ai._eval.registry import task
 from inspect_ai._eval.score import score, score_async
-from inspect_ai._eval.task import Epochs, Task, TaskInfo,
+from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
+from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai.solver._human_agent.agent import human_agent
 
@@ -26,8 +27,8 @@ __all__ = [
     "score_async",
     "Epochs",
     "Task",
-    "TaskInfo",
     "Tasks",
+    "TaskInfo",
     "task",
     "task_with",
 ]
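Both `Tasks` and `task_with` remain exported from the package root after this import reshuffle. A minimal sketch of how they fit together (the toy task below is hypothetical, not taken from this diff):

from inspect_ai import Task, Tasks, task, task_with
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def addition() -> Task:
    # hypothetical one-sample task, used only to exercise the imports above
    return Task(
        dataset=[Sample(input="What is 1 + 1?", target="2")],
        solver=generate(),
        scorer=match(),
    )

def addition_variants() -> Tasks:
    # Tasks is the type accepted by eval()/eval_set(); task_with() adapts an
    # existing task with alternate options (here, a different epoch count)
    base = addition()
    return [base, task_with(base, epochs=3)]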
inspect_ai/_cli/cache.py
CHANGED
@@ -44,7 +44,7 @@ def _print_table(title: str, paths: list[tuple[str, int]]) -> None:
 def cache_command() -> None:
     """Manage the inspect model output cache.
 
-    Learn more about model output caching at https://inspect.
+    Learn more about model output caching at https://inspect.aisi.org.uk/caching.html.
     """
     return None
 
inspect_ai/_cli/common.py
CHANGED
@@ -10,14 +10,18 @@ from inspect_ai._util.constants import (
     DEFAULT_DISPLAY,
     DEFAULT_LOG_LEVEL,
 )
+from inspect_ai._util.dotenv import init_cli_env
 from inspect_ai.util._display import init_display_type
 
+from .util import parse_cli_args
+
 
 class CommonOptions(TypedDict):
     log_level: str
     log_dir: str
     display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
+    env: tuple[str] | None
     debug: bool
     debug_port: int
     debug_errors: bool
@@ -68,6 +72,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Do not print ANSI control characters.",
         envvar="INSPECT_NO_ANSI",
     )
+    @click.option(
+        "--env",
+        multiple=True,
+        type=str,
+        envvar="INSPECT_EVAL_ENV",
+        help="Define an environment variable e.g. --env NAME=value (--env can be specified multiple times)",
+    )
     @click.option(
         "--debug", is_flag=True, envvar="INSPECT_DEBUG", help="Wait to attach debugger"
     )
@@ -92,6 +103,10 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 
 
 def process_common_options(options: CommonOptions) -> None:
+    # set environment variables
+    env_args = parse_cli_args(options["env"])
+    init_cli_env(env_args)
+
     # propagate display
     if options["no_ansi"]:
         display = "rich"
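The new `--env` option (and its INSPECT_EVAL_ENV variable) injects NAME=value pairs into the process environment before the eval runs. A rough sketch of the equivalent parsing, assuming the NAME=value convention described in the option's help text (illustrative only, not the actual `parse_cli_args`/`init_cli_env` implementation):

import os

def apply_env_options(env: tuple[str, ...] | None) -> None:
    # each --env value is expected as NAME=value; repeated flags accumulate
    # because the click option is declared with multiple=True
    for item in env or ():
        name, sep, value = item.partition("=")
        if not name or not sep:
            raise ValueError(f"--env expects NAME=value, got {item!r}")
        os.environ[name] = value

# e.g. apply_env_options(("OPENAI_BASE_URL=http://localhost:8080/v1",))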
inspect_ai/_cli/eval.py
CHANGED
@@ -56,7 +56,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--model",
         type=str,
-        required=True,
         help="Model used to evaluate tasks.",
         envvar="INSPECT_EVAL_MODEL",
     )
@@ -441,7 +440,7 @@
 def eval_command(
     tasks: tuple[str] | None,
     solver: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -608,7 +607,7 @@ def eval_set_command(
     solver: str | None,
     trace: bool | None,
     approval: str | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
@@ -671,7 +670,7 @@
 ) -> int:
     """Evaluate a set of tasks with retries.
 
-    Learn more about eval sets at https://inspect.
+    Learn more about eval sets at https://inspect.aisi.org.uk/eval-sets.html.
     """
     # read config
     config = config_from_locals(dict(locals()))
@@ -741,7 +740,7 @@ def eval_exec(
     log_level_transcript: str,
     log_dir: str,
     log_format: Literal["eval", "json"] | None,
-    model: str,
+    model: str | None,
     model_base_url: str | None,
     m: tuple[str] | None,
     model_config: str | None,
inspect_ai/_cli/log.py
CHANGED
@@ -30,7 +30,7 @@ def log_command() -> None:
 
     The 'log' commands enable you to read Inspect logs uniformly as JSON no matter their physical storage format, and also enable you to read only the headers (everything but the samples) from log files, which is useful for very large logs.
 
-    Learn more about managing log files at https://inspect.
+    Learn more about managing log files at https://inspect.aisi.org.uk/eval-logs.html.
     """
     return None
 
inspect_ai/_cli/sandbox.py
CHANGED
@@ -9,7 +9,7 @@ from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
 def sandbox_command() -> None:
     """Manage Sandbox Environments.
 
-    Learn more about sandboxing at https://inspect.
+    Learn more about sandboxing at https://inspect.aisi.org.uk/sandboxing.html.
     """
     return None
 
inspect_ai/_cli/trace.py
CHANGED
@@ -28,7 +28,7 @@ def trace_command() -> None:
 
     Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
 
-    Learn more about execution traces at https://inspect.
+    Learn more about execution traces at https://inspect.aisi.org.uk/tracing.html.
     """
     return None
 
inspect_ai/_cli/view.py
CHANGED
@@ -41,7 +41,7 @@ def start_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 def view_command(ctx: click.Context, **kwargs: Unpack[CommonOptions]) -> None:
     """Inspect log viewer.
 
-    Learn more about using the log viewer at https://inspect.
+    Learn more about using the log viewer at https://inspect.aisi.org.uk/log-viewer.html.
     """
     if ctx.invoked_subcommand is None:
         ctx.invoke(start, **kwargs)
inspect_ai/_display/core/config.py
CHANGED
@@ -1,4 +1,4 @@
-from inspect_ai._util.registry import is_registry_dict
+from inspect_ai._util.registry import is_model_dict, is_registry_dict
 from inspect_ai.log._log import eval_config_defaults
 
 from .display import TaskProfile
@@ -14,6 +14,8 @@ def task_config(
         value = task_args[key]
         if is_registry_dict(value):
             task_args[key] = value["name"]
+        if is_model_dict(value):
+            task_args[key] = value["model"]
     # get eval_config overrides
     eval_config = dict(profile.eval_config.model_dump(exclude_none=True))
     for name, default_value in eval_config_defaults().items():
inspect_ai/_eval/eval.py
CHANGED
@@ -4,6 +4,8 @@ import sys
 from pathlib import Path
 from typing import Any, Literal
 
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
 
@@ -47,16 +49,18 @@ from inspect_ai.util._display import (
 )
 
 from .context import init_eval_context
-from .loader import
+from .loader import resolve_tasks
 from .run import eval_run
-from .task import Epochs, PreviousTask
+from .task import Epochs, PreviousTask
+from .task.resolved import ResolvedTask, resolved_model_names
+from .task.tasks import Tasks
 
 log = logging.getLogger(__name__)
 
 
 def eval(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -96,9 +100,9 @@ def eval(
     Args:
         tasks: Task(s) to evaluate. If None, attempt
             to evaluate a task in the current working directory
-        model: Model(s) for
-
-
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
         model_base_url: Base URL for communicating
             with the model API.
         model_args: Model creation args
@@ -144,7 +148,7 @@ def eval(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (
+            (defaults to number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -223,7 +227,7 @@ _eval_async_running = False
 
 async def eval_async(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -259,67 +263,53 @@ async def eval_async(
     r"""Evaluate tasks using a Model (async).
 
     Args:
-        tasks:
+        tasks: Task(s) to evaluate. If None, attempt
             to evaluate a task in the current working directory
-        model (
-
-
-        model_base_url:
-
-
-
-
-
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
-            (defaults to True)
-        solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
-            Optional (uses task solver by default).
+        model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
+            environment variable. Specify `None` to define no default model(s), which will
+            leave model usage entirely up to tasks.
+        model_base_url: Base URL for communicating with the model API.
+        model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file)
+        task_args: Task creation arguments (as a dictionary or as a path to a JSON or YAML config file)
+        sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
+        sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
+        solver: Alternative solver for task(s). Optional (uses task solver by default).
         tags (list[str] | None): Tags to associate with this evaluation run.
-        approval:
+        approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
-        log_level
+        log_level: Level for logging to the console: "debug", "http", "sandbox",
             "info", "warning", "error", or "critical" (defaults to "warning")
-        log_level_transcript
-        log_dir
-
-
-
-
-            (defaults to all samples).
-        sample_id (str | list[str] | None): Evaluate specific sample(s) from the dataset.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+        log_level_transcript: Level for logging to the log file (defaults to "info")
+        log_dir: Output path for logging results (defaults to file log in ./logs directory).
+        log_format: Format for writing log files (defaults to "eval", the native high-performance format).
+        limit: Limit evaluated samples (defaults to all samples).
+        sample_id: Evaluate specific sample(s) from the dataset.
+        epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error
+        fail_on_error: `True` to fail on first sample error
            (default); `False` to never fail on sample errors; Value between 0 and 1
            to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
-        debug_errors
-
-
-        token_limit (int | None): Limit on total tokens used for each sample.
+        debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
+        message_limit: Limit on total messages used for each sample.
+        token_limit: Limit on total tokens used for each sample.
         time_limit: Limit on clock time (in seconds) for samples.
         working_limit: Limit on working time (in seconds) for sample. Working
             time includes model generation, tool calls, etc. but does not include
            time spent waiting on retries or shared resources.
-        max_samples
-
-
-
-
-
-
-
-        log_samples: (bool | None): Log detailed samples and scores (defaults to True)
-        log_images: (bool | None): Log base64 encoded version of images,
-            even if specified as a filename or URL (defaults to False)
-        log_buffer: (int | None): Number of samples to buffer before writing log file.
+        max_samples: Maximum number of samples to run in parallel (default is max_connections)
+        max_tasks: Maximum number of tasks to run in parallel
+            (defaults to number of models being evaluated)
+        max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
+        max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
+        log_samples: Log detailed samples and scores (defaults to True)
+        log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
+        log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        score
-        score_display
-        **kwargs
+        score: Score output (defaults to True)
+        score_display: Show scoring metrics in realtime (defaults to True)
+        **kwargs: Model generation options.
 
     Returns:
         List of EvalLog (one for each task)
@@ -365,6 +355,12 @@ async def eval_async(
         log.warning("No inspect tasks were found at the specified paths.")
         return []
 
+    # if there is no max tasks then base it on unique model names
+    if max_tasks is None:
+        model_count = len(resolved_model_names(resolved_tasks))
+        if model_count > 1:
+            max_tasks = model_count
+
     # apply conversation display constraints
     if display_type() == "conversation":
         # single task at a time
@@ -450,7 +446,6 @@
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -475,7 +470,6 @@
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
-            model_args=model_args,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -529,7 +523,7 @@ def eval_retry(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (
+            (defaults to number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -764,7 +758,7 @@ async def eval_retry_async(
         log = (
             await eval_async(
                 tasks=PreviousTask(
-                    id=task_id, task=task, task_args=task_args, log=eval_log
+                    id=task_id, task=task, task_args=task_args, model=None, log=eval_log
                 ),
                 model=model,
                 model_base_url=model_base_url,
@@ -809,7 +803,7 @@
 
 def eval_init(
     tasks: Tasks,
-    model: str | Model | list[str] | list[Model] | None =
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
     task_args: dict[str, Any] | str = dict(),
@@ -886,7 +880,7 @@ def init_eval_display(
     # multiple models not allowed in trace mode
     if isinstance(model, list) and len(model) > 1:
         raise PrerequisiteError(
-            "
+            "Conversation mode cannot be used when evaluating multiple models."
         )
 
     return max_tasks, max_samples
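Taken together with the CLI change above, the `model` argument is now optional end to end: omitting it (NOT_GIVEN) falls back to the INSPECT_EVAL_MODEL environment variable, while passing `None` declares no default model and leaves model selection to the tasks themselves. A minimal sketch of the new calling convention (the toy task and model name are illustrative only):

import os

from inspect_ai import Task, eval, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def hello() -> Task:
    # hypothetical single-sample task used only for illustration
    return Task(
        dataset=[Sample(input="Reply with the word hello.", target="hello")],
        solver=generate(),
        scorer=match(),
    )

# model omitted (NOT_GIVEN): resolved from INSPECT_EVAL_MODEL
os.environ["INSPECT_EVAL_MODEL"] = "openai/gpt-4o-mini"
logs = eval(hello(), log_dir="./logs")

# model=None: no default model is defined; appropriate for tasks that
# specify their own model(s), which this toy task does not
# logs = eval(hello(), model=None)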