PyPI - inspect-ai - Versions diffs - 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl - Mend

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

inspect_ai/__init__.py +3 -2
inspect_ai/_cli/cache.py +1 -1
inspect_ai/_cli/common.py +15 -0
inspect_ai/_cli/eval.py +4 -5
inspect_ai/_cli/log.py +1 -1
inspect_ai/_cli/sandbox.py +1 -1
inspect_ai/_cli/trace.py +1 -1
inspect_ai/_cli/view.py +1 -1
inspect_ai/_display/core/config.py +3 -1
inspect_ai/_eval/eval.py +55 -61
inspect_ai/_eval/evalset.py +63 -154
inspect_ai/_eval/loader.py +27 -54
inspect_ai/_eval/registry.py +1 -10
inspect_ai/_eval/run.py +3 -4
inspect_ai/_eval/task/__init__.py +8 -2
inspect_ai/_eval/task/log.py +9 -1
inspect_ai/_eval/task/resolved.py +35 -0
inspect_ai/_eval/task/task.py +50 -69
inspect_ai/_eval/task/tasks.py +30 -0
inspect_ai/_util/constants.py +3 -0
inspect_ai/_util/dotenv.py +17 -0
inspect_ai/_util/registry.py +43 -2
inspect_ai/_view/server.py +28 -10
inspect_ai/_view/www/dist/assets/index.css +4 -3
inspect_ai/_view/www/dist/assets/index.js +13030 -25523
inspect_ai/_view/www/package.json +2 -2
inspect_ai/_view/www/src/appearance/styles.ts +6 -5
inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
inspect_ai/_view/www/src/constants.ts +3 -0
inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
inspect_ai/_view/www/yarn.lock +12 -5
inspect_ai/log/_log.py +10 -1
inspect_ai/log/_recorders/eval.py +27 -8
inspect_ai/log/_recorders/json.py +2 -2
inspect_ai/model/_cache.py +3 -1
inspect_ai/model/_chat_message.py +12 -1
inspect_ai/model/_model.py +25 -11
inspect_ai/model/_providers/anthropic.py +34 -2
inspect_ai/model/_providers/google.py +6 -2
inspect_ai/model/_providers/none.py +31 -0
inspect_ai/model/_providers/providers.py +7 -0
inspect_ai/solver/_bridge/bridge.py +1 -1
inspect_ai/solver/_chain.py +7 -6
inspect_ai/tool/_tools/_computer/_computer.py +1 -1
inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
inspect_ai/tool/_tools/_web_search.py +2 -2
inspect_ai/util/_sandbox/context.py +2 -1
inspect_ai/util/_sandbox/environment.py +17 -2
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/METADATA +4 -4
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/RECORD +63 -60
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/top_level.txt +0 -0

inspect_ai/_eval/task/task.py CHANGED Viewed

@@ -13,6 +13,7 @@ from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_c
 from inspect_ai.dataset import Dataset, MemoryDataset, Sample
 from inspect_ai.log import EvalLog
 from inspect_ai.model import GenerateConfig
+from inspect_ai.model._model import Model, get_model
 from inspect_ai.scorer import Metric, Scorer
 from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
 from inspect_ai.solver import Plan, Solver, generate
@@ -50,6 +51,7 @@ class Task:
         cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
+        model: str | Model | None = None,
         config: GenerateConfig = GenerateConfig(),
         sandbox: SandboxEnvironmentType | None = None,
         approval: str | list[ApprovalPolicy] | None = None,
@@ -67,42 +69,38 @@ class Task:
         """Create a task.
         Args:
-            dataset (Dataset | Sequence[Sample]): Dataset to evaluate
-            setup: (Solver | list[Solver] | None): Setup step (always run
-                even when the main `solver` is replaced).
-            solver: (Solver | list[Solver]): Solver or list of solvers.
-                Defaults to generate(), a normal call to the model.
+            dataset: Dataset to evaluate
+            setup: Setup step (always run even when the main `solver` is replaced).
+            solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
             cleanup: Optional cleanup function for task. Called after
                 all solvers have run for each sample (including if an
                 exception occurs during the run)
-            scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
-            metrics (list[Metric] | dict[str, list[Metric]] | None):
-                Alternative metrics (overrides the metrics provided by the specified scorer).
-            config (GenerateConfig): Model generation config.
-            sandbox (SandboxEnvironmentType | None): Sandbox environment type
-                (or optionally a str or tuple with a shorthand spec)
-            approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-                Either a path to an approval policy config file or a list of approval policies.
-                Defaults to no approval policy.
-            epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+            scorer: Scorer used to evaluate model output.
+            metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
+            model: Default model for task (Optional, defaults to eval model).
+            config: Model generation config.
+            sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
+            approval: Tool use approval policies.
+                Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
+            epochs: Epochs to repeat samples for and optional score
                 reducer function(s) used to combine sample scores (defaults to "mean")
-            fail_on_error (bool | float | None): `True` to fail on first sample error
+            fail_on_error: `True` to fail on first sample error
                 (default); `False` to never fail on sample errors; Value between 0 and 1
                 to fail if a proportion of total samples fails. Value greater than 1 to fail
                 eval if a count of samples fails.
-            message_limit (int | None): Limit on total messages used for each sample.
-            token_limit (int | None): Limit on total tokens used for each sample.
+            message_limit: Limit on total messages used for each sample.
+            token_limit: Limit on total tokens used for each sample.
             time_limit: Limit on clock time (in seconds) for samples.
             working_limit: Limit on working time (in seconds) for sample. Working
                 time includes model generation, tool calls, etc. but does not include
                 time spent waiting on retries or shared resources.
-            name: (str | None): Task name. If not specified is automatically
+            name: Task name. If not specified is automatically
                 determined based on the name of the task directory (or "task")
                 if its anonymous task (e.g. created in a notebook and passed to
                 eval() directly)
-            version: (int): Version of task (to distinguish evolutions
+            version: Version of task (to distinguish evolutions
                 of the task spec or breaking changes to it)
-            metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+            metadata:  Additional metadata to associate with the task.
             **kwargs: Deprecated arguments.
         """
         # handle deprecated args
@@ -135,6 +133,7 @@ class Task:
         self.cleanup = cleanup
         self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
+        self.model = resolve_model(model)
         self.config = config
         self.sandbox = resolve_sandbox_environment(sandbox)
         self.approval = resolve_approval(approval)
@@ -176,6 +175,7 @@ def task_with(
     cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
     scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
+    model: str | Model | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
     sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
     approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
@@ -192,43 +192,39 @@ def task_with(
     """Task adapted with alternate values for one or more options.
     Args:
-        task (Task): Task to adapt (it is deep copied prior to mutating options)
-        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
-        setup: (Solver | list[Solver] | None): Setup step (always run
-            even when the main `solver` is replaced).
-        solver: (Solver | list[Solver]): Solver or list of solvers.
-            Defaults to generate(), a normal call to the model.
+        task: Task to adapt (it is deep copied prior to mutating options)
+        dataset: Dataset to evaluate
+        setup: Setup step (always run even when the main `solver` is replaced).
+        solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
         cleanup: Optional cleanup function for task. Called after
             all solvers have run for each sample (including if an
             exception occurs during the run)
-        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
-        metrics (list[Metric] | dict[str, list[Metric]] | None):
-            Alternative metrics (overrides the metrics provided by the specified scorer).
-        config (GenerateConfig): Model generation config.
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-            Either a path to an approval policy config file or a list of approval policies.
-            Defaults to no approval policy.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+        scorer: Scorer used to evaluate model output.
+        metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
+        model: Default model for task (Optional, defaults to eval model).
+        config: Model generation config.
+        sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
+        approval: Tool use approval policies.
+            Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
+        epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error (bool | float | None): `True` to fail on first sample error
+        fail_on_error: `True` to fail on first sample error
             (default); `False` to never fail on sample errors; Value between 0 and 1
             to fail if a proportion of total samples fails. Value greater than 1 to fail
             eval if a count of samples fails.
-        message_limit (int | None): Limit on total messages used for each sample.
-        token_limit (int | None): Limit on total tokens used for each sample.
+        message_limit: Limit on total messages used for each sample.
+        token_limit: Limit on total tokens used for each sample.
         time_limit: Limit on clock time (in seconds) for samples.
-        working_limit: Limit on execution time (in seconds) for sample. Execution
+        working_limit: Limit on working time (in seconds) for sample. Working
             time includes model generation, tool calls, etc. but does not include
             time spent waiting on retries or shared resources.
-        name: (str | None): Task name. If not specified is automatically
+        name: Task name. If not specified is automatically
             determined based on the name of the task directory (or "task")
             if its anonymous task (e.g. created in a notebook and passed to
             eval() directly)
-        version: (int): Version of task (to distinguish evolutions
+        version: Version of task (to distinguish evolutions
             of the task spec or breaking changes to it)
-        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+        metadata:  Additional metadata to associate with the task.
     Returns:
         Task: Task adapted with alternate options.
@@ -248,6 +244,8 @@ def task_with(
         task.scorer = resolve_scorer(scorer)
     if not isinstance(metrics, NotGiven):
         task.metrics = metrics
+    if not isinstance(model, NotGiven):
+        task.model = resolve_model(model)
     if not isinstance(config, NotGiven):
         task.config = config
     if not isinstance(sandbox, NotGiven):
@@ -307,34 +305,10 @@ class PreviousTask:
     id: str
     task: str | Task
     task_args: dict[str, Any]
+    model: Model | None
     log: EvalLog
-Tasks = (
-    str
-    | PreviousTask
-    | TaskInfo
-    | Task
-    | Callable[..., Task]
-    | type[Task]
-    | list[str]
-    | list[PreviousTask]
-    | list[TaskInfo]
-    | list[Task]
-    | list[Callable[..., Task]]
-    | list[type[Task]]
-    | None
-)
-r"""One or more tasks.
-Tasks to be evaluated. Many forms of task specification are
-supported including directory names, task functions, task
-classes, and task instances (a single task or list of tasks
-can be specified). None is a request to read a task out
-of the current working directory.
-"""
 def resolve_approval(
     approval: str | list[ApprovalPolicy] | None,
 ) -> list[ApprovalPolicy] | None:
@@ -370,6 +344,13 @@ def resolve_solver(solver: Solver | list[Solver]) -> Solver:
     return chain(solver) if isinstance(solver, list) else solver
+def resolve_model(model: str | Model | None) -> Model | None:
+    if isinstance(model, str):
+        return get_model(model)
+    else:
+        return model
 def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
     return (
         scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None

inspect_ai/_eval/task/tasks.py ADDED Viewed

@@ -0,0 +1,30 @@
+from typing import Callable, TypeAlias
+from .resolved import ResolvedTask
+from .task import PreviousTask, Task, TaskInfo
+Tasks: TypeAlias = (
+    str
+    | PreviousTask
+    | ResolvedTask
+    | TaskInfo
+    | Task
+    | Callable[..., Task]
+    | type[Task]
+    | list[str]
+    | list[PreviousTask]
+    | list[ResolvedTask]
+    | list[TaskInfo]
+    | list[Task]
+    | list[Callable[..., Task]]
+    | list[type[Task]]
+    | None
+)
+r"""One or more tasks.
+Tasks to be evaluated. Many forms of task specification are
+supported including directory names, task functions, task
+classes, and task instances (a single task or list of tasks
+can be specified). None is a request to read a task out
+of the current working directory.
+"""

inspect_ai/_util/constants.py CHANGED Viewed

@@ -36,3 +36,6 @@ CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
 SANDBOX_SETUP_TIMEOUT = 300
 NO_CONTENT = "(no content)"
+DESERIALIZING = "deserializing"
+DESERIALIZING_CONTEXT = {DESERIALIZING: True}

inspect_ai/_util/dotenv.py CHANGED Viewed

@@ -52,6 +52,9 @@ def init_dotenv() -> None:
         if inspect_log_dir:
             os.environ[INSPECT_LOG_DIR_VAR] = inspect_log_dir
+        # re-apply any env vars specified at the cli w/ --env
+        apply_cli_env()
 @contextlib.contextmanager
 def dotenv_environ(
@@ -76,3 +79,17 @@ def dotenv_environ(
     finally:
         os.environ.update(update_after)
         [os.environ.pop(k) for k in remove_after]
+_cli_env: dict[str, Any] = {}
+def init_cli_env(env: dict[str, Any]) -> None:
+    global _cli_env
+    _cli_env = env
+    apply_cli_env()
+def apply_cli_env() -> None:
+    for var, value in _cli_env.items():
+        os.environ[var] = str(value)

inspect_ai/_util/registry.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import Any, Callable, Literal, TypedDict, TypeGuard, cast
 from pydantic import BaseModel, Field
 from pydantic_core import to_jsonable_python
+from inspect_ai._util.json import jsonable_python
 from inspect_ai._util.package import get_installed_package_name
 from .constants import PKG_NAME
@@ -198,13 +199,15 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
     def with_registry_info(o: object) -> object:
         return set_registry_info(o, registry_info(obj))
-    # instantiate registry objects
+    # instantiate registry and model objects
     for param in kwargs.keys():
         value = kwargs[param]
         if is_registry_dict(value):
             kwargs[param] = registry_create(
                 value["type"], value["name"], **value["params"]
             )
+        elif is_model_dict(value):
+            kwargs[param] = model_create_from_dict(value)
     if isclass(obj):
         return with_registry_info(obj(**kwargs))
@@ -380,6 +383,8 @@ def is_registry_dict(o: object) -> TypeGuard[RegistryDict]:
 def registry_value(o: object) -> Any:
+    from inspect_ai.model._model import Model
     # treat tuple as list
     if isinstance(o, tuple):
         o = list(o)
@@ -390,14 +395,50 @@ def registry_value(o: object) -> Any:
     elif isinstance(o, dict):
         return {k: registry_value(v) for k, v in o.items()}
     elif has_registry_params(o):
-        return dict(
+        return RegistryDict(
             type=registry_info(o).type,
             name=registry_log_name(o),
             params=registry_params(o),
         )
+    elif isinstance(o, Model):
+        return ModelDict(
+            model=str(o),
+            config=jsonable_python(o.config),
+            base_url=o.api.base_url,
+            model_args=o.model_args,
+        )
     else:
         return o
 def registry_create_from_dict(d: RegistryDict) -> object:
     return registry_create(d["type"], d["name"], **d["params"])
+class ModelDict(TypedDict):
+    model: str
+    config: dict[str, Any]
+    base_url: str | None
+    model_args: dict[str, Any]
+def is_model_dict(o: object) -> TypeGuard[ModelDict]:
+    return (
+        isinstance(o, dict)
+        and "model" in o
+        and "config" in o
+        and "base_url" in o
+        and "model_args" in o
+    )
+def model_create_from_dict(d: ModelDict) -> object:
+    from inspect_ai.model._generate_config import GenerateConfig
+    from inspect_ai.model._model import get_model
+    return get_model(
+        d["model"],
+        config=GenerateConfig(**d["config"]),
+        base_url=d["base_url"],
+        **d["model_args"],
+    )

inspect_ai/_view/server.py CHANGED Viewed

@@ -57,8 +57,7 @@ def view_server(
     @routes.get("/api/logs/{log}")
     async def api_log(request: web.Request) -> web.Response:
         # log file requested
-        file = request.match_info["log"]
-        file = urllib.parse.unquote(file)
+        file = normalize_uri(request.match_info["log"])
         validate_log_file_request(file)
         # header_only is based on a size threshold
@@ -68,8 +67,7 @@ def view_server(
     @routes.get("/api/log-size/{log}")
     async def api_log_size(request: web.Request) -> web.Response:
         # log file requested
-        file = request.match_info["log"]
-        file = urllib.parse.unquote(file)
+        file = normalize_uri(request.match_info["log"])
         validate_log_file_request(file)
         return await log_size_response(file)
@@ -77,8 +75,7 @@ def view_server(
     @routes.get("/api/log-delete/{log}")
     async def api_log_delete(request: web.Request) -> web.Response:
         # log file requested
-        file = request.match_info["log"]
-        file = urllib.parse.unquote(file)
+        file = normalize_uri(request.match_info["log"])
         validate_log_file_request(file)
         return await log_delete_response(file)
@@ -86,8 +83,7 @@ def view_server(
     @routes.get("/api/log-bytes/{log}")
     async def api_log_bytes(request: web.Request) -> web.Response:
         # log file requested
-        file = request.match_info["log"]
-        file = urllib.parse.unquote(file)
+        file = normalize_uri(request.match_info["log"])
         validate_log_file_request(file)
         # header_only is based on a size threshold
@@ -106,7 +102,7 @@ def view_server(
         if authorization:
             request_log_dir = request.query.getone("log_dir", None)
             if request_log_dir:
-                request_log_dir = urllib.parse.unquote(request_log_dir)
+                request_log_dir = normalize_uri(request_log_dir)
             else:
                 request_log_dir = log_dir
         else:
@@ -121,7 +117,7 @@ def view_server(
     @routes.get("/api/log-headers")
     async def api_log_headers(request: web.Request) -> web.Response:
         files = request.query.getall("file", [])
-        files = [urllib.parse.unquote(file) for file in files]
+        files = [normalize_uri(file) for file in files]
         map(validate_log_file_request, files)
         return await log_headers_response(files)
@@ -166,6 +162,28 @@ def view_server(
     )
+def normalize_uri(uri: str) -> str:
+    """Normalize incoming URIs to a consistent format."""
+    # Decode any URL-encoded characters
+    parsed = urllib.parse.urlparse(urllib.parse.unquote(uri))
+    if parsed.scheme != "file":
+        # If this isn't a file uri, just unquote it
+        return urllib.parse.unquote(uri)
+    else:
+        # If this is a file uri, see whether we should process triple slashes
+        # down to double slashes
+        path = parsed.path
+        # Detect and normalize Windows-style file URIs
+        if path.startswith("/") and len(path) > 3 and path[2] == ":":
+            # Strip leading `/` before drive letter
+            path = path[1:]
+        return f"file://{path}"
 def log_listing_response(logs: list[EvalLogInfo], log_dir: str) -> web.Response:
     response = dict(
         log_dir=aliased_path(log_dir),

inspect_ai/_view/www/dist/assets/index.css CHANGED Viewed

@@ -16346,7 +16346,7 @@ ul.jsondiffpatch-textdiff {
   column-gap: 0.5em;
   min-width: 200px;
 }
-._flatBody_gk2ju_1 {
+._flatBody_1uw6w_1 {
   color: var(--bs-danger);
   display: grid;
   grid-template-columns: max-content max-content;
@@ -16354,16 +16354,17 @@ ul.jsondiffpatch-textdiff {
   margin-top: 0.4rem;
 }
-._iconSmall_gk2ju_9 {
+._iconSmall_1uw6w_9 {
   font-size: var(--inspect-font-size-small);
   line-height: var(--inspect-font-size-small);
   height: var(--inspect-font-size-small);
 }
-._lineBase_gk2ju_15 {
+._lineBase_1uw6w_15 {
   font-size: var(--inspect-font-size-base);
   line-height: var(--inspect-font-size-base);
   height: var(--inspect-font-size-base);
+  max-width: 30em;
 }
 ._target_9qy4e_1 {
   padding-left: 0;

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl