inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
@@ -190,7 +190,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if task.setup:
         plan.steps = unroll(task.setup) + plan.steps

-    #
+    # resolve the scorer
     score = score and task.scorer is not None
     scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
     scorer_profiles = (
@@ -519,6 +519,7 @@ async def task_run_sample(
                 key: SampleScore(
                     score=score,
                     sample_id=previous_sample.id,
+                    sample_metadata=previous_sample.metadata,
                 )
                 for key, score in previous_sample.scores.items()
             }
@@ -696,6 +697,7 @@ async def task_run_sample(
                 sample_score = SampleScore(
                     score=score_result,
                     sample_id=sample.id,
+                    sample_metadata=sample.metadata,
                     scorer=registry_unqualified_name(scorer),
                 )
                 transcript()._event(
@@ -709,7 +711,9 @@ async def task_run_sample(
             if state.scores is not None:
                 for name, score in state.scores.items():
                     results[name] = SampleScore(
-                        score=score,
+                        score=score,
+                        sample_id=state.sample_id,
+                        sample_metadata=state.metadata,
                     )
                     transcript()._event(
                         ScoreEvent(score=score, target=sample.target)
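Note: the recurring change in this file is that SampleScore records are now constructed with the originating sample's id and metadata. A minimal sketch of the new shape, with hypothetical values (the keyword arguments come from the diff above; the import location of SampleScore is an assumption):

from inspect_ai.scorer import Score
from inspect_ai.scorer._metric import SampleScore  # assumed import location

sample_score = SampleScore(
    score=Score(value=1.0, answer="42"),     # the underlying Score
    sample_id="sample-1",                    # hypothetical sample id
    sample_metadata={"difficulty": "easy"},  # hypothetical sample metadata
    scorer="match",
)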
inspect_ai/_eval/task/sandbox.py
CHANGED
@@ -5,11 +5,20 @@ from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast

 import httpx
+from tenacity import (
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential_jitter,
+)

 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
+from inspect_ai._util.constants import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai._util.retry import httpx_should_retry, log_retry_attempt
 from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
@@ -115,8 +124,7 @@ async def read_sandboxenv_file(contents: str) -> bytes:
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
     elif is_http_url(contents):
-
-        file_bytes = (await client.get(contents, follow_redirects=True)).content
+        file_bytes = await _retrying_httpx_get(contents)
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
         # the filesystem then we fall back to contents)
@@ -172,3 +180,28 @@ def resolve_sandbox(
         return sample.sandbox
     else:
         return None
+
+
+async def _retrying_httpx_get(
+    url: str,
+    client: httpx.AsyncClient = httpx.AsyncClient(),
+    timeout: int = 30,  # per-attempt timeout
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    total_timeout: int = DEFAULT_TIMEOUT,  # timeout for the whole retry loop. not for an individual attempt
+) -> bytes:
+    @retry(
+        wait=wait_exponential_jitter(),
+        stop=(stop_after_attempt(max_retries) | stop_after_delay(total_timeout)),
+        retry=retry_if_exception(httpx_should_retry),
+        before_sleep=log_retry_attempt(url),
+    )
+    async def do_get() -> bytes:
+        response = await client.get(
+            url=url,
+            follow_redirects=True,
+            timeout=(timeout, timeout, timeout, timeout),
+        )
+        response.raise_for_status()
+        return response.content
+
+    return await do_get()
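Note: the new _retrying_httpx_get helper replaces the bare client.get call with a tenacity-managed retry loop. A self-contained sketch of the same pattern (tenacity and httpx are the real libraries used above; the predicate below is a simplified stand-in for inspect_ai's internal httpx_should_retry):

import httpx
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    stop_after_delay,
    wait_exponential_jitter,
)

def should_retry(ex: BaseException) -> bool:
    # retry connection/timeout failures and 429/5xx responses
    if isinstance(ex, (httpx.ConnectError, httpx.TimeoutException)):
        return True
    if isinstance(ex, httpx.HTTPStatusError):
        status = ex.response.status_code
        return status == 429 or status >= 500
    return False

@retry(
    wait=wait_exponential_jitter(),                        # exponential backoff with jitter
    stop=(stop_after_attempt(5) | stop_after_delay(120)),  # cap attempts and total elapsed time
    retry=retry_if_exception(should_retry),
)
async def fetch_bytes(url: str) -> bytes:
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True, timeout=30)
        response.raise_for_status()  # surfaces 429/5xx to the retry predicate
        return response.content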
inspect_ai/_eval/task/task.py
CHANGED
@@ -39,38 +39,6 @@ class Task:
     r"""Evaluation task.

     Tasks are the basis for defining and running evaluations.
-
-    Args:
-        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
-        setup: (Solver | list[Solver] | None): Setup step (always run
-            even when the main `solver` is replaced).
-        solver: (Solver | list[Solver]): Solver or list of solvers.
-            Defaults to generate(), a normal call to the model.
-        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
-        metrics (list[Metric] | dict[str, list[Metric]] | None):
-            Alternative metrics (overrides the metrics provided by the specified scorer).
-        config (GenerateConfig): Model generation config.
-        sandbox (SandboxEnvironmentType | None): Sandbox environment type
-            (or optionally a str or tuple with a shorthand spec)
-        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-            Either a path to an approval policy config file or a list of approval policies.
-            Defaults to no approval policy.
-        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-            reducer function(s) used to combine sample scores (defaults to "mean")
-        fail_on_error (bool | float | None): `True` to fail on first sample error
-            (default); `False` to never fail on sample errors; Value between 0 and 1
-            to fail if a proportion of total samples fails. Value greater than 1 to fail
-            eval if a count of samples fails.
-        message_limit (int | None): Limit on total messages used for each sample.
-        token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
-        name: (str | None): Task name. If not specified is automatically
-            determined based on the name of the task directory (or "task")
-            if its anonymous task (e.g. created in a notebook and passed to
-            eval() directly)
-        version: (int): Version of task (to distinguish evolutions
-            of the task spec or breaking changes to it)
-        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
     """

     def __init__(
@@ -93,6 +61,41 @@ class Task:
         metadata: dict[str, Any] | None = None,
         **kwargs: Unpack[TaskDeprecatedArgs],
     ) -> None:
+        """Create a task.
+
+        Args:
+            dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+            setup: (Solver | list[Solver] | None): Setup step (always run
+                even when the main `solver` is replaced).
+            solver: (Solver | list[Solver]): Solver or list of solvers.
+                Defaults to generate(), a normal call to the model.
+            scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+            metrics (list[Metric] | dict[str, list[Metric]] | None):
+                Alternative metrics (overrides the metrics provided by the specified scorer).
+            config (GenerateConfig): Model generation config.
+            sandbox (SandboxEnvironmentType | None): Sandbox environment type
+                (or optionally a str or tuple with a shorthand spec)
+            approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+                Either a path to an approval policy config file or a list of approval policies.
+                Defaults to no approval policy.
+            epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+                reducer function(s) used to combine sample scores (defaults to "mean")
+            fail_on_error (bool | float | None): `True` to fail on first sample error
+                (default); `False` to never fail on sample errors; Value between 0 and 1
+                to fail if a proportion of total samples fails. Value greater than 1 to fail
+                eval if a count of samples fails.
+            message_limit (int | None): Limit on total messages used for each sample.
+            token_limit (int | None): Limit on total tokens used for each sample.
+            time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+            name: (str | None): Task name. If not specified is automatically
+                determined based on the name of the task directory (or "task")
+                if its anonymous task (e.g. created in a notebook and passed to
+                eval() directly)
+            version: (int): Version of task (to distinguish evolutions
+                of the task spec or breaking changes to it)
+            metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+            **kwargs: Deprecated arguments.
+        """
         # handle deprecated args
         for arg, value in kwargs.items():
             newarg = ""
@@ -179,33 +182,33 @@ def task_with(
         task (Task): Task to adapt (it is deep copied prior to mutating options)
         dataset (Dataset | Sequence[Sample]): Dataset to evaluate
         setup: (Solver | list[Solver] | None): Setup step (always run
-
+            even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
-
+            Defaults to generate(), a normal call to the model.
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
-
+            Alternative metrics (overrides the metrics provided by the specified scorer).
         config (GenerateConfig): Model generation config.
         sandbox (SandboxEnvironmentType | None): Sandbox environment type
-
+            (or optionally a str or tuple with a shorthand spec)
         approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
-
-
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
         epochs (int | Epochs | None): Epochs to repeat samples for and optional score
-
+            reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error (bool | float | None): `True` to fail on first sample error
-
-
-
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
         time_limit (int | None): Limit on time (in seconds) for execution of each sample.
         name: (str | None): Task name. If not specified is automatically
-
-
-
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
         version: (int): Version of task (to distinguish evolutions
-
+            of the task spec or breaking changes to it)
         metadata: (dict[str, Any] | None): Additional metadata to associate with the task.

     Returns:
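Note: the constructor docstring moved from the class body onto __init__, so it now documents Task() directly. A minimal sketch of constructing a Task with the core arguments documented above (Task, Sample, generate, and match are the library's public API; the dataset values are hypothetical):

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate

@task
def addition() -> Task:
    return Task(
        dataset=[Sample(input="What is 1 + 1?", target="2")],
        solver=generate(),  # the default solver: a plain model call
        scorer=match(),     # score by matching the target string
        metadata={"suite": "arithmetic"},  # hypothetical task metadata
    )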
inspect_ai/_util/__init__.py
File without changes
inspect_ai/_util/constants.py
CHANGED
inspect_ai/_util/content.py
CHANGED
@@ -4,6 +4,8 @@ from pydantic import BaseModel, Field


 class ContentText(BaseModel):
+    """Text content."""
+
     type: Literal["text"] = Field(default="text")
     """Type."""

@@ -12,6 +14,8 @@ class ContentText(BaseModel):


 class ContentImage(BaseModel):
+    """Image content."""
+
     type: Literal["image"] = Field(default="image")
     """Type."""

@@ -26,6 +30,8 @@ class ContentImage(BaseModel):


 class ContentAudio(BaseModel):
+    """Audio content."""
+
     type: Literal["audio"] = Field(default="audio")
     """Type."""

@@ -37,6 +43,8 @@ class ContentAudio(BaseModel):


 class ContentVideo(BaseModel):
+    """Video content."""
+
     type: Literal["video"] = Field(default="video")
     """Type."""

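Note: each content model gains a class docstring; they are pydantic models discriminated by a literal type field. A small sketch (the type default comes from the diff; the text payload field is an assumption based on the library's public usage):

from inspect_ai._util.content import ContentText

content = ContentText(text="hello world")  # `text` field assumed
assert content.type == "text"              # discriminator defaults to "text"
print(content.model_dump_json())           # standard pydantic v2 serialization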
inspect_ai/_util/error.py
CHANGED
inspect_ai/_util/file.py
CHANGED
@@ -18,6 +18,7 @@ from fsspec.core import split_protocol  # type: ignore
 from fsspec.implementations.local import make_path_posix  # type: ignore
 from pydantic import BaseModel
 from s3fs import S3FileSystem  # type: ignore
+from shortuuid import uuid

 # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem
 # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.generic.GenericFileSystem
@@ -169,6 +170,9 @@ class FileSystem:
     def exists(self, path: str) -> bool:
         return self.fs.exists(path) is True

+    def touch(self, path: str) -> None:
+        self.fs.touch(path)
+
     def rm(
         self, path: str, recursive: bool = False, maxdepth: int | None = None
     ) -> None:
@@ -218,6 +222,16 @@ class FileSystem:
     def is_local(self) -> bool:
         return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem)

+    def is_writeable(self, path: str) -> bool:
+        try:
+            path = path.rstrip("/\\")
+            touch_file = f"{path}{self.fs.sep}{uuid()}"
+            self.touch(touch_file)
+            self.rm(touch_file)
+            return True
+        except PermissionError:
+            return False
+
     def is_async(self) -> bool:
         return isinstance(self.fs, fsspec.asyn.AsyncFileSystem)

@@ -354,7 +368,7 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     Returns:
         str: A safe filename string

-
+    Examples:
         >>> safe_filename("Hello/World?.txt")
         'Hello_World.txt'
     """
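Note: the new is_writeable probes a directory by touching (then removing) a uniquely named file and returns False on PermissionError. A sketch of using it to validate a log directory up front (filesystem and is_writeable come from the diff above; the path is hypothetical):

from inspect_ai._util.file import filesystem

fs = filesystem("./logs")  # resolves local paths or fsspec URLs (e.g. s3://)
if not fs.is_writeable("./logs"):
    raise PermissionError("log directory './logs' is not writeable")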
inspect_ai/_util/logger.py
CHANGED
@@ -161,7 +161,7 @@ def init_logger(
     getLogger().addHandler(_logHandler)

     # establish default capture level
-    capture_level = min(TRACE, levelno)
+    capture_level = min(TRACE, levelno, transcript_levelno)

     # see all the messages (we won't actually display/write all of them)
     getLogger().setLevel(capture_level)
@@ -181,7 +181,9 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._transcript import LoggerEvent, transcript

     if write:
-        transcript()._event(
+        transcript()._event(
+            LoggerEvent(message=LoggingMessage._from_log_record(record))
+        )
     global _rate_limit_count
     if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
         record.levelno == DEBUG
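Note: init_logger now folds the transcript log level into the capture level, so the root logger is opened wide enough for both the console display and the transcript. A toy illustration of the min() logic (the numeric value of inspect_ai's custom TRACE level is an assumption; it sits below DEBUG):

from logging import DEBUG, INFO

TRACE = DEBUG - 5  # stand-in for inspect_ai's custom TRACE level (assumed below DEBUG)
levelno, transcript_levelno = INFO, DEBUG  # hypothetical console/transcript levels
capture_level = min(TRACE, levelno, transcript_levelno)
assert capture_level == TRACE  # capture everything either sink might need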
inspect_ai/_util/registry.py
CHANGED
@@ -209,7 +209,13 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
     if isclass(obj):
         return with_registry_info(obj(**kwargs))
     elif callable(obj):
-        return_type =
+        return_type = get_annotations(obj).get("return")
+        # Until we remove the MetricDeprecated symbol we need this extra
+        # bit to map the Metric union back to Metric
+        if "_metric.Metric" in str(return_type):
+            return_type = "Metric"
+        else:
+            return_type = getattr(return_type, "__name__", None)
         if return_type and return_type.lower() == type:
             return with_registry_info(obj(**kwargs))
     else:
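Note: registry_create now resolves a callable's return annotation and compares its name against the registry type. A standalone sketch of that check (inspect.get_annotations is the stdlib function, Python 3.10+; the Scorer class below is a stand-in for a registry type):

from inspect import get_annotations

class Scorer:  # stand-in for a registry type
    ...

def scorer_factory() -> Scorer:
    return Scorer()

return_type = get_annotations(scorer_factory).get("return")
name = getattr(return_type, "__name__", None)
assert name is not None and name.lower() == "scorer"  # matches registry type "scorer"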
inspect_ai/_view/view.py
CHANGED
@@ -28,11 +28,10 @@ def view(
     port: int = DEFAULT_VIEW_PORT,
     authorization: str | None = None,
     log_level: str | None = None,
-    log_level_transcript: str | None = None,
     fs_options: dict[str, Any] = {},
 ) -> None:
     init_dotenv()
-    init_logger(log_level
+    init_logger(log_level)

     # initialize the log_dir
     log_dir = log_dir if log_dir else os.getenv("INSPECT_LOG_DIR", "./logs")
inspect_ai/_view/www/App.css
CHANGED
@@ -25,6 +25,7 @@
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -64,15 +65,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -84,6 +85,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
inspect_ai/_view/www/README.md
CHANGED
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -14298,6 +14298,7 @@ pre[class*="language-"] {
   /* Inspect Font Sizes */
   --inspect-font-size-title: 1.5rem;
   --inspect-font-size-title-secondary: 1.3rem;
+  --inspect-font-size-largest: 1.2rem;
   --inspect-font-size-larger: 1.1rem;
   --inspect-font-size-large: 1rem;
   --inspect-font-size-base: 0.9rem;
@@ -14337,15 +14338,15 @@ body[class^="vscode-"] .app-main-grid {

 /* Inspect Text Styles */
 .text-style-label {
-  text-transform: uppercase;
+  text-transform: uppercase !important;
 }

 .text-style-secondary {
-  color: var(--bs-secondary);
+  color: var(--bs-secondary) !important;
 }

 .text-style-tertiary {
-  color: var(--bs-tertiary-color);
+  color: var(--bs-tertiary-color) !important;
 }

 /* Inspect Font Size Styles */
@@ -14357,6 +14358,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-title-secondary);
 }

+.text-size-largest {
+  font-size: var(--inspect-font-size-largest);
+}
+
 .text-size-larger {
   font-size: var(--inspect-font-size-larger);
 }
@@ -16195,58 +16200,58 @@ ul.jsondiffpatch-textdiff {
   grid-template-columns: max-content max-content;
   column-gap: 1em;
 }
-.
+._container_1jqar_1 {
   margin-top: 0.5em;
   padding-left: 0;
 }

-.
-  padding-right: 2em;
-  padding-left: 0;
-  padding-bottom: 0;
+._label_1jqar_6 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-bottom: 0 !important;
   font-weight: 400;
-  padding-bottom: 0;
+  padding-bottom: 0 !important;
 }

-.
+._wordBreak_1jqar_14 {
   word-break: break-all;
 }

-.
+._scoreTable_1jqar_18 {
   width: 100%;
   margin-bottom: 1em;
 }

-.
+._bottomBorder_1jqar_23 {
   border-bottom-color: #00000000;
 }

-.
+._headerScore_1jqar_27 {
   padding-left: 2em;
 }

-.
-  padding-right: 2em;
-  padding-left: 0;
-  padding-top: 0;
+._targetValue_1jqar_31 {
+  padding-right: 2em !important;
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 0;
-  padding-top: 0;
+._answerValue_1jqar_37 {
+  padding-left: 0 !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 2em;
-  padding-top: 0;
+._scoreValue_1jqar_42 {
+  padding-left: 2em !important;
+  padding-top: 0 !important;
 }

-.
-  padding-left: 0;
+._noLeft_1jqar_47 {
+  padding-left: 0 !important;
 }

-.
-  margin-top: 0;
+._noTop_1jqar_51 {
+  margin-top: 0 !important;
 }
 ._wrapper_b0it4_1 {
   display: grid;
@@ -19490,7 +19495,7 @@ span.ap-marker-container:hover span.ap-marker {
   display: grid;
   grid-template-columns: minmax(0, max-content) max-content;
 }
-.
+._simpleMetricsRows_tnqkm_1 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
@@ -19501,28 +19506,28 @@ span.ap-marker-container:hover span.ap-marker {
   overflow: scroll;
 }

-.
+._multiMetricsRows_tnqkm_12 {
   display: flex;
   flex-direction: row;
   flex-wrap: wrap;
   justify-content: end;
-  height: 100%;
   align-items: center;
   margin-top: 0.2rem;
   padding-bottom: 0.4rem;
   row-gap: 1em;
   max-height: 15em;
   overflow: scroll;
+  align-items: baseline;
 }

-.
+._verticalMetricReducer_tnqkm_26 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
   margin-bottom: -0.3rem;
 }

-.
+._verticalMetricName_tnqkm_33 {
   font-size: var(--inspect-font-size-smaller);
   text-align: center;
   padding-top: 0.3rem;
@@ -19530,32 +19535,55 @@ span.ap-marker-container:hover span.ap-marker {
   border-bottom: solid var(--bs-border-color) 1px;
 }

-.
-  font-size: var(--inspect-font-size-larger);
+._verticalMetricValue_tnqkm_41 {
   font-weight: 500;
   text-align: center;
 }

-.
+._multiScorer_tnqkm_46 {
+  padding-left: 0;
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+  padding: 0.5em 1em;
+}
+
+._multiScorerIndent_tnqkm_54 {
+  padding-left: 1.5em;
+}
+
+._multiScorerReducer_tnqkm_58 {
   text-align: center;
   margin-bottom: -0.3rem;
+  margin-top: 0.2em;
 }

-.
+._multiScorerLabel_tnqkm_64 {
   text-align: center;
   border-bottom: solid var(--bs-border-color) 1px;
   margin-bottom: -0.1rem;
 }

-.
+._multiScorerValue_tnqkm_70 {
   display: grid;
   grid-template-columns: auto auto;
+  grid-auto-rows: auto;
   grid-column-gap: 0.3rem;
   grid-row-gap: 0;
+  padding-top: 0.3em;
 }

-.
+._multiScorerValueContent_tnqkm_79 {
   font-weight: 600;
+  text-align: center;
+}
+
+._multiScoreMetricGrid_tnqkm_84 {
+  display: grid;
+  grid-template-rows: auto auto;
+  column-gap: 1em;
+  padding: 0 0.2em;
+  justify-content: center;
 }
 ._statusPanel_1fzh4_1 {
   padding: 1em;
|