inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- inspect_ai/_cli/eval.py +55 -1
- inspect_ai/_cli/main.py +2 -0
- inspect_ai/_cli/trace.py +244 -0
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/app.py +5 -1
- inspect_ai/_display/textual/widgets/task_detail.py +3 -0
- inspect_ai/_display/textual/widgets/tasks.py +97 -6
- inspect_ai/_eval/eval.py +33 -0
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/images.py +4 -14
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +40 -20
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +2 -0
- inspect_ai/_util/constants.py +3 -3
- inspect_ai/_util/display.py +1 -0
- inspect_ai/_util/logger.py +34 -8
- inspect_ai/_util/trace.py +275 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +80 -43
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +6 -0
- inspect_ai/log/_message.py +2 -2
- inspect_ai/log/_recorders/eval.py +8 -18
- inspect_ai/log/_recorders/json.py +19 -17
- inspect_ai/model/_cache.py +22 -16
- inspect_ai/model/_call_tools.py +9 -1
- inspect_ai/model/_generate_config.py +8 -2
- inspect_ai/model/_model.py +11 -12
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/bedrock.py +18 -2
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
- inspect_ai/util/_sandbox/context.py +7 -3
- inspect_ai/util/_sandbox/docker/compose.py +58 -19
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +20 -16
- inspect_ai/util/_sandbox/docker/util.py +3 -9
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- inspect_ai/util/_sandbox/service.py +17 -7
- inspect_ai/util/_subprocess.py +6 -1
- inspect_ai/util/_subtask.py +8 -2
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/sandbox.py
CHANGED
@@ -1,7 +1,8 @@
 import asyncio
 import base64
 import contextlib
-from
+from random import random
+from typing import AsyncGenerator, Callable, NamedTuple, cast
 
 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
@@ -9,6 +10,7 @@ from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai._util.url import data_uri_to_base64, is_data_uri
 from inspect_ai.dataset import Sample
+from inspect_ai.util._concurrency import concurrency
 from inspect_ai.util._sandbox.context import (
     cleanup_sandbox_environments_sample,
     init_sandbox_environments_sample,
@@ -18,12 +20,14 @@ from inspect_ai.util._sandbox.environment import (
     SandboxEnvironmentConfigType,
     SandboxEnvironmentSpec,
 )
+from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
 
 
 @contextlib.asynccontextmanager
 async def sandboxenv_context(
     task_name: str,
     sandbox: SandboxEnvironmentSpec | None,
+    max_sandboxes: int | None,
     cleanup: bool,
     sample: Sample,
 ) -> AsyncGenerator[None, None]:
@@ -32,52 +36,77 @@ async def sandboxenv_context(
     if not sandbox:
         raise ValueError("sandboxenv_context called with no sandbox specified")
 
-    # read files from sample
-    files: dict[str, bytes] = {}
-    if sample.files:
-        for path, contents in sample.files.items():
-            files[path] = read_sandboxenv_file(contents)
-
-    # read setup script from sample (add bash shebang if necessary)
-    setup: bytes | None = None
-    if sample.setup:
-        setup = read_sandboxenv_file(sample.setup)
-        setup_str = setup.decode(encoding="utf-8")
-        if not setup_str.strip().startswith("#!"):
-            setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
-        setup = setup_str.encode(encoding="utf-8")
-
-    interrupted = False
-    environments: dict[str, SandboxEnvironment] | None = None
-    try:
-        # initialize sandbox environment,
-        environments = await init_sandbox_environments_sample(
-            type=sandbox.type,
-            task_name=registry_unqualified_name(task_name),
-            config=sandbox.config,
-            files=files,
-            setup=setup,
-            metadata=sample.metadata if sample.metadata else {},
-        )
-
-        # run sample
-        yield
-
-    except asyncio.CancelledError as ex:
-        interrupted = True
-        raise ex
+    # get sandboxenv_type
+    sandboxenv_type = registry_find_sandboxenv(sandbox.type)
 
-    finally:
-        # cleanup sandbox environment
-        if environments and cleanup:
-            await cleanup_sandbox_environments_sample(
-                type=sandbox.type,
-                task_name=task_name,
+    # see if there is a max_sandboxes in play (passed or from type)
+    if max_sandboxes is None:
+        default_concurrency_fn = cast(
+            Callable[[], int | None], getattr(sandboxenv_type, "default_concurrency")
+        )
+        max_sandboxes = default_concurrency_fn()
+
+    # if we are enforcing max_sandboxes, then when samples are scheduled they may
+    # not get interleaved properly across tasks (because the first task will come
+    # in and grab all of the sandboxes). Therefore, in this case we wait a random
+    # delay so that all tasks/samples have an equal shot at getting scheduled.
+    if max_sandboxes is not None:
+        await asyncio.sleep(random())
+
+    # enforce concurrency if required
+    sandboxes_cm = (
+        concurrency(sandbox.type, max_sandboxes, f"sandboxes/{sandbox.type}")
+        if max_sandboxes is not None
+        else contextlib.nullcontext()
+    )
+
+    async with sandboxes_cm:
+        # read files from sample
+        files: dict[str, bytes] = {}
+        if sample.files:
+            for path, contents in sample.files.items():
+                files[path] = read_sandboxenv_file(contents)
+
+        # read setup script from sample (add bash shebang if necessary)
+        setup: bytes | None = None
+        if sample.setup:
+            setup = read_sandboxenv_file(sample.setup)
+            setup_str = setup.decode(encoding="utf-8")
+            if not setup_str.strip().startswith("#!"):
+                setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
+            setup = setup_str.encode(encoding="utf-8")
+
+        interrupted = False
+        environments: dict[str, SandboxEnvironment] | None = None
+        try:
+            # initialize sandbox environment,
+            environments = await init_sandbox_environments_sample(
+                sandboxenv_type=sandboxenv_type,
+                task_name=registry_unqualified_name(task_name),
                 config=sandbox.config,
-                environments=environments,
-                interrupted=interrupted,
+                files=files,
+                setup=setup,
+                metadata=sample.metadata if sample.metadata else {},
             )
 
+            # run sample
+            yield
+
+        except asyncio.CancelledError as ex:
+            interrupted = True
+            raise ex
+
+        finally:
+            # cleanup sandbox environment
+            if environments and cleanup:
+                await cleanup_sandbox_environments_sample(
+                    type=sandbox.type,
+                    task_name=task_name,
+                    config=sandbox.config,
+                    environments=environments,
+                    interrupted=interrupted,
+                )
+
 
 def read_sandboxenv_file(contents: str) -> bytes:
     if is_data_uri(contents):
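Note on the max_sandboxes change above: the gating pattern is (1) resolve a per-sandbox-type limit, (2) sleep a random sub-second delay so the first task cannot claim every slot, and (3) enter a concurrency context only when a limit is in force. A minimal sketch of that pattern, using an asyncio.Semaphore as a stand-in for the concurrency() helper used in the real code (the semaphore registry and run_sample below are illustrative only; nullcontext works as an async context manager on Python 3.10+):

import asyncio
import contextlib
from random import random

# Illustrative stand-in for the concurrency() helper: one named semaphore
# shared by every sandbox of the same type.
_slots: dict[str, asyncio.Semaphore] = {}


def sandbox_slots(name: str, limit: int) -> asyncio.Semaphore:
    return _slots.setdefault(name, asyncio.Semaphore(limit))


async def run_sample(sandbox_type: str, max_sandboxes: int | None) -> None:
    # random stagger so all tasks/samples get an equal shot at a slot
    if max_sandboxes is not None:
        await asyncio.sleep(random())

    # enforce the limit only when one is in force
    cm = (
        sandbox_slots(sandbox_type, max_sandboxes)
        if max_sandboxes is not None
        else contextlib.nullcontext()
    )
    async with cm:
        ...  # init sandbox, run the sample, clean up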
inspect_ai/_eval/task/task.py
CHANGED
@@ -39,6 +39,8 @@ class Task:
 
     Args:
         dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+        setup: (Solver | list[Solver] | None): Setup step (always run
+          even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
           Defaults to generate(), a normal call to the model.
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
@@ -68,6 +70,7 @@ class Task:
     def __init__(
         self,
         dataset: Dataset | Sequence[Sample] | None = None,
+        setup: Solver | list[Solver] | None = None,
         solver: Solver | list[Solver] = generate(),
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
@@ -119,6 +122,7 @@ class Task:
         self.dataset: Dataset = (
             dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
         )
+        self.setup = setup
         self.solver = chain(solver) if isinstance(solver, list) else solver
         self.scorer = (
             scorer
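The new setup argument accepts a solver (or list of solvers) that runs before the main solver and is preserved even when the main solver is replaced at eval time. An illustrative task using it (the task name, dataset, prompt, and scorer here are placeholders, not taken from the package):

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import generate, system_message


@task
def addition() -> Task:
    return Task(
        dataset=[Sample(input="What is 1 + 1?", target="2")],
        # setup always runs, even if the main solver is replaced
        setup=system_message("Answer with the number only."),
        solver=generate(),
        scorer=match(),
    )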
inspect_ai/_eval/task/util.py
CHANGED
@@ -42,6 +42,8 @@ def slice_dataset(
     sample_id: str | int | list[str | int] | None,
 ) -> Dataset:
     def normalise(id: str | int | None) -> str:
+        if isinstance(id, str) and id.isdigit():
+            id = int(id)
         return id if isinstance(id, str) else str(id).zfill(20)
 
     if sample_id is not None:
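The two added lines make a digit-only string sample id (for example, a value passed on the command line) normalise the same way as an integer id, so "5" and 5 now select the same sample. A standalone illustration of the updated normalise:

def normalise(id: str | int | None) -> str:
    # digit-only strings are treated as integer ids
    if isinstance(id, str) and id.isdigit():
        id = int(id)
    return id if isinstance(id, str) else str(id).zfill(20)


assert normalise(5) == normalise("5")  # both zero-padded to 20 digits
assert normalise("abc") == "abc"       # non-numeric ids pass through unchanged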
inspect_ai/_util/constants.py
CHANGED
@@ -14,12 +14,12 @@ DEFAULT_VIEW_PORT = 7575
 DEFAULT_SERVER_HOST = "127.0.0.1"
 HTTP = 15
 HTTP_LOG_LEVEL = "HTTP"
-
-
+TRACE = 13
+TRACE_LOG_LEVEL = "TRACE"
 ALL_LOG_LEVELS = [
     "DEBUG",
+    TRACE_LOG_LEVEL,
     HTTP_LOG_LEVEL,
-    SANDBOX_LOG_LEVEL,
     "INFO",
     "WARNING",
     "ERROR",
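For orientation, the new TRACE level (13) sits between the stdlib DEBUG (10) and the package's custom HTTP level (15), so a logger set to TRACE also captures HTTP, INFO, and WARNING records while still filtering DEBUG. A stdlib-only sketch of that ordering (the real registration happens in _util/logger.py):

import logging

# DEBUG=10 < TRACE=13 < HTTP=15 < INFO=20 < WARNING=30
logging.addLevelName(13, "TRACE")
logging.addLevelName(15, "HTTP")
logging.basicConfig(level=13)

log = logging.getLogger("demo")
log.log(13, "a TRACE message")  # shown: 13 >= 13
log.log(15, "an HTTP message")  # shown: 15 >= 13
log.debug("a DEBUG message")    # dropped: 10 < 13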
inspect_ai/_util/display.py
CHANGED
@@ -14,6 +14,7 @@ _display_type: DisplayType | None = None
 
 def init_display_type(display: str | None = None) -> DisplayType:
     global _display_type
+    global _display_metrics
     display = (
         display or os.environ.get("INSPECT_DISPLAY", DEFAULT_DISPLAY).lower().strip()
     )
inspect_ai/_util/logger.py
CHANGED
@@ -11,6 +11,7 @@ from logging import (
     getLevelName,
     getLogger,
 )
+from pathlib import Path
 
 import rich
 from rich.console import ConsoleRenderable
@@ -18,17 +19,20 @@ from rich.logging import RichHandler
 from rich.text import Text
 from typing_extensions import override
 
-from
+from .constants import (
     ALL_LOG_LEVELS,
     DEFAULT_LOG_LEVEL,
     DEFAULT_LOG_LEVEL_TRANSCRIPT,
     HTTP,
     HTTP_LOG_LEVEL,
     PKG_NAME,
-
-
+    TRACE,
+    TRACE_LOG_LEVEL,
 )
-from
+from .error import PrerequisiteError
+from .trace import TraceFileHandler, TraceFormatter, inspect_trace_dir
+
+TRACE_FILE_NAME = "trace.log"
 
 
 # log handler that filters messages to stderr and the log file
@@ -52,6 +56,24 @@ class LogHandler(RichHandler):
         else:
             self.file_logger_level = 0
 
+        # add a trace handler
+        default_trace_file = inspect_trace_dir() / TRACE_FILE_NAME
+        have_existing_trace_file = default_trace_file.exists()
+        env_trace_file = os.environ.get("INSPECT_TRACE_FILE", None)
+        trace_file = Path(env_trace_file) if env_trace_file else default_trace_file
+        trace_total_files = 10
+        self.trace_logger = TraceFileHandler(
+            trace_file.as_posix(),
+            backupCount=trace_total_files - 1,  # exclude the current file (10 total)
+        )
+        self.trace_logger.setFormatter(TraceFormatter())
+        if have_existing_trace_file:
+            self.trace_logger.doRollover()
+
+        # set trace level
+        trace_level = os.environ.get("INSPECT_TRACE_LEVEL", TRACE_LOG_LEVEL)
+        self.trace_logger_level = int(getLevelName(trace_level.upper()))
+
     @override
     def emit(self, record: LogRecord) -> None:
         # demote httpx and return notifications to log_level http
@@ -79,6 +101,10 @@ class LogHandler(RichHandler):
         ):
             self.file_logger.emit(record)
 
+        # write to trace if the trace level matches.
+        if self.trace_logger and record.levelno >= self.trace_logger_level:
+            self.trace_logger.emit(record)
+
         # eval log always gets info level and higher records
         # eval log only gets debug or http if we opt-in
         write = record.levelno >= self.transcript_levelno
@@ -95,12 +121,12 @@ def init_logger(
     log_level: str | None = None, log_level_transcript: str | None = None
 ) -> None:
     # backwards compatibility for 'tools'
-    if log_level == "tools":
-        log_level = "
+    if log_level == "sandbox" or log_level == "tools":
+        log_level = "trace"
 
     # register http and tools levels
     addLevelName(HTTP, HTTP_LOG_LEVEL)
-    addLevelName(
+    addLevelName(TRACE, TRACE_LOG_LEVEL)
 
     def validate_level(option: str, level: str) -> None:
         if level not in ALL_LOG_LEVELS:
@@ -134,7 +160,7 @@ def init_logger(
     getLogger().addHandler(_logHandler)
 
     # establish default capture level
-    capture_level = min(
+    capture_level = min(TRACE, levelno)
 
     # see all the messages (we won't actually display/write all of them)
     getLogger().setLevel(capture_level)
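Both environment variables referenced above are read when the log handler is constructed, so they have to be set before inspect initializes logging. A hypothetical configuration (the path and level values are examples only):

import os

# Redirect the rotating trace log away from the default <data dir>/traces/trace.log
os.environ["INSPECT_TRACE_FILE"] = "/tmp/inspect-trace.log"
# Capture DEBUG records into the trace file as well (default threshold is TRACE)
os.environ["INSPECT_TRACE_LEVEL"] = "DEBUG"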
inspect_ai/_util/trace.py
ADDED
@@ -0,0 +1,275 @@
+import asyncio
+import datetime
+import gzip
+import json
+import logging
+import os
+import shutil
+import time
+import traceback
+from contextlib import contextmanager
+from logging import Logger
+from logging.handlers import RotatingFileHandler
+from pathlib import Path
+from typing import Any, Generator, Literal, TextIO
+
+import jsonlines
+from pydantic import BaseModel, Field, JsonValue
+from shortuuid import uuid
+
+from .appdirs import inspect_data_dir
+from .constants import TRACE
+
+
+def inspect_trace_dir() -> Path:
+    return inspect_data_dir("traces")
+
+
+@contextmanager
+def trace_action(
+    logger: Logger, action: str, message: str, *args: Any, **kwargs: Any
+) -> Generator[None, None, None]:
+    trace_id = uuid()
+    start_monotonic = time.monotonic()
+    start_wall = time.time()
+    pid = os.getpid()
+    detail = message % args if args else message % kwargs if kwargs else message
+
+    def trace_message(event: str) -> str:
+        return f"{action}: {detail} ({event})"
+
+    logger.log(
+        TRACE,
+        trace_message("enter"),
+        extra={
+            "action": action,
+            "detail": detail,
+            "event": "enter",
+            "trace_id": str(trace_id),
+            "start_time": start_wall,
+            "pid": pid,
+        },
+    )
+
+    try:
+        yield
+        duration = time.monotonic() - start_monotonic
+        logger.log(
+            TRACE,
+            trace_message("exit"),
+            extra={
+                "action": action,
+                "detail": detail,
+                "event": "exit",
+                "trace_id": str(trace_id),
+                "duration": duration,
+                "pid": pid,
+            },
+        )
+    except (KeyboardInterrupt, asyncio.CancelledError):
+        duration = time.monotonic() - start_monotonic
+        logger.log(
+            TRACE,
+            trace_message("cancel"),
+            extra={
+                "action": action,
+                "detail": detail,
+                "event": "cancel",
+                "trace_id": str(trace_id),
+                "duration": duration,
+                "pid": pid,
+            },
+        )
+        raise
+    except TimeoutError:
+        duration = time.monotonic() - start_monotonic
+        logger.log(
+            TRACE,
+            trace_message("timeout"),
+            extra={
+                "action": action,
+                "detail": detail,
+                "event": "timeout",
+                "trace_id": str(trace_id),
+                "duration": duration,
+                "pid": pid,
+            },
+        )
+        raise
+    except Exception as ex:
+        duration = time.monotonic() - start_monotonic
+        logger.log(
+            TRACE,
+            trace_message("error"),
+            extra={
+                "action": action,
+                "detail": detail,
+                "event": "error",
+                "trace_id": str(trace_id),
+                "duration": duration,
+                "error": getattr(ex, "message", str(ex)) or repr(ex),
+                "error_type": type(ex).__name__,
+                "stacktrace": traceback.format_exc(),
+                "pid": pid,
+            },
+        )
+        raise
+
+
+def trace_message(
+    logger: Logger, category: str, message: str, *args: Any, **kwargs: Any
+) -> None:
+    logger.log(TRACE, f"[{category}] {message}", *args, **kwargs)
+
+
+class TraceFormatter(logging.Formatter):
+    def format(self, record: logging.LogRecord) -> str:
+        # Base log entry with standard fields
+        output: dict[str, JsonValue] = {
+            "timestamp": self.formatTime(record),
+            "level": record.levelname,
+            "message": record.getMessage(),  # This handles the % formatting of the message
+        }
+
+        # Add basic context if it's not a TRACE message
+        if record.levelname != "TRACE":
+            if hasattr(record, "module"):
+                output["module"] = record.module
+            if hasattr(record, "funcName"):
+                output["function"] = record.funcName
+            if hasattr(record, "lineno"):
+                output["line"] = record.lineno
+
+        # Add any structured fields from extra
+        elif hasattr(record, "action"):
+            # This is a trace_action log
+            for key in [
+                "action",
+                "detail",
+                "event",
+                "trace_id",
+                "start_time",
+                "duration",
+                "error",
+                "error_type",
+                "stacktrace",
+                "pid",
+            ]:
+                if hasattr(record, key):
+                    output[key] = getattr(record, key)
+
+        # Handle any unexpected extra attributes
+        for key, value in record.__dict__.items():
+            if key not in output and key not in (
+                "args",
+                "lineno",
+                "funcName",
+                "module",
+                "asctime",
+                "created",
+                "exc_info",
+                "exc_text",
+                "filename",
+                "levelno",
+                "levelname",
+                "msecs",
+                "msg",
+                "name",
+                "pathname",
+                "process",
+                "processName",
+                "relativeCreated",
+                "stack_info",
+                "thread",
+                "threadName",
+            ):
+                output[key] = value
+
+        return json.dumps(
+            output, default=str
+        )  # default=str handles non-serializable objects
+
+    def formatTime(self, record: logging.LogRecord, datefmt: str | None = None) -> str:
+        # ISO format with timezone
+        dt = datetime.datetime.fromtimestamp(record.created)
+        return dt.isoformat()
+
+
+class TraceRecord(BaseModel):
+    timestamp: str
+    level: str
+    message: str
+
+
+class SimpleTraceRecord(TraceRecord):
+    action: None = Field(default=None)
+
+
+class ActionTraceRecord(TraceRecord):
+    action: str
+    event: Literal["enter", "cancel", "error", "timeout", "exit"]
+    trace_id: str
+    detail: str = Field(default="")
+    start_time: float | None = Field(default=None)
+    duration: float | None = Field(default=None)
+    error: str | None = Field(default=None)
+    error_type: str | None = Field(default=None)
+    stacktrace: str | None = Field(default=None)
+    pid: int | None = Field(default=None)
+
+
+def read_trace_file(file: Path) -> list[TraceRecord]:
+    def read_file(f: TextIO) -> list[TraceRecord]:
+        jsonlines_reader = jsonlines.Reader(f)
+        trace_records: list[TraceRecord] = []
+        for trace in jsonlines_reader.iter(type=dict):
+            if "action" in trace:
+                trace_records.append(ActionTraceRecord(**trace))
+            else:
+                trace_records.append(SimpleTraceRecord(**trace))
+        return trace_records
+
+    if file.name.endswith(".gz"):
+        with gzip.open(file, "rt") as f:
+            return read_file(f)
+    else:
+        with open(file, "r") as f:
+            return read_file(f)
+
+
+class TraceFileHandler(RotatingFileHandler):
+    def __init__(
+        self,
+        filename: str,
+        mode: str = "a",
+        maxBytes: int = 0,
+        backupCount: int = 0,
+        encoding: str | None = None,
+        delay: bool = False,
+    ) -> None:
+        super().__init__(filename, mode, maxBytes, backupCount, encoding, delay)
+
+    def rotation_filename(self, default_name: str) -> str:
+        """
+        Returns the name of the rotated file.
+
+        Args:
+            default_name: The default name that would be used for rotation
+
+        Returns:
+            The modified filename with .gz extension
+        """
+        return default_name + ".gz"
+
+    def rotate(self, source: str, dest: str) -> None:
+        """
+        Compresses the source file and moves it to destination.
+
+        Args:
+            source: The source file to be compressed
+            dest: The destination path for the compressed file
+        """
+        with open(source, "rb") as f_in:
+            with gzip.open(dest, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        os.remove(source)
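A sketch of how the helpers in this new module fit together, based on the definitions above (the module is private, so the import path may change; start_services is a hypothetical placeholder):

import logging
from pathlib import Path

from inspect_ai._util.trace import read_trace_file, trace_action, trace_message

logger = logging.getLogger(__name__)

# wrap a long-running operation: matching enter/exit (or cancel/timeout/error)
# records share a trace_id and report a duration
with trace_action(logger, "compose up", "project %s", "my-project"):
    start_services()  # hypothetical placeholder

# one-off categorized message
trace_message(logger, "HTTP", "retrying request to %s", "https://example.com")

# later: read a (possibly gzip-rotated) trace file back into pydantic records
for record in read_trace_file(Path("trace.log")):
    print(record.level, record.message)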
inspect_ai/_view/www/App.css
CHANGED
@@ -711,6 +711,19 @@ pre[class*="language-"].tool-output,
   background-color: #333333;
 }
 
+pre[class*="language-"].tool-output {
+  border: none !important;
+  box-shadow: none !important;
+  border-radius: var(--bs-border-radius) !important;
+}
+
+.vscode-dark pre.jsonPanel {
+  background: none !important;
+  border: none !important;
+  box-shadow: none !important;
+  border-radius: var(--bs-border-radius) !important;
+}
+
 
 /* jsondiffpatch */
 
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -14984,6 +14984,19 @@ pre[class*="language-"].tool-output,
   background-color: #333333;
 }
 
+pre[class*="language-"].tool-output {
+  border: none !important;
+  box-shadow: none !important;
+  border-radius: var(--bs-border-radius) !important;
+}
+
+.vscode-dark pre.jsonPanel {
+  background: none !important;
+  border: none !important;
+  box-shadow: none !important;
+  border-radius: var(--bs-border-radius) !important;
+}
+
 
 /* jsondiffpatch */
 