PyPI - inspect-ai - Versions diffs - 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl - Mend

inspect-ai 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

inspect_ai/_display/core/panel.py +1 -1
inspect_ai/_eval/run.py +16 -11
inspect_ai/_util/datetime.py +1 -1
inspect_ai/_util/deprecation.py +1 -1
inspect_ai/_util/json.py +11 -1
inspect_ai/_util/logger.py +2 -1
inspect_ai/_util/trace.py +39 -3
inspect_ai/_util/transcript.py +36 -7
inspect_ai/_view/www/.prettierrc.js +12 -0
inspect_ai/_view/www/dist/assets/index.js +286 -224
inspect_ai/_view/www/log-schema.json +124 -125
inspect_ai/_view/www/src/App.mjs +18 -9
inspect_ai/_view/www/src/Types.mjs +0 -1
inspect_ai/_view/www/src/api/Types.mjs +15 -4
inspect_ai/_view/www/src/api/api-http.mjs +2 -0
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
inspect_ai/_view/www/src/components/Tools.mjs +18 -3
inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
inspect_ai/_view/www/src/types/log.d.ts +2 -8
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
inspect_ai/log/_log.py +25 -0
inspect_ai/log/_recorders/eval.py +2 -0
inspect_ai/model/_call_tools.py +27 -5
inspect_ai/model/_providers/google.py +24 -6
inspect_ai/model/_providers/openai.py +17 -3
inspect_ai/model/_providers/openai_o1.py +10 -12
inspect_ai/tool/_tool_info.py +2 -1
inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
inspect_ai/util/__init__.py +4 -0
inspect_ai/util/_sandbox/docker/compose.py +1 -3
inspect_ai/util/_sandbox/docker/util.py +2 -1
inspect_ai/util/_sandbox/self_check.py +18 -18
inspect_ai/util/_store.py +2 -2
inspect_ai/util/_subprocess.py +3 -3
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +57 -56
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0

inspect_ai/_display/core/panel.py CHANGED Viewed

@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
 def task_title(profile: TaskProfile, show_model: bool) -> str:
     eval_epochs = profile.eval_config.epochs or 1
     epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
-    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
+    samples = f"{profile.samples // eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
     title = f"{registry_unqualified_name(profile.name)} ({samples})"
     if show_model:
         title = f"{title}: {profile.model}"

inspect_ai/_eval/run.py CHANGED Viewed

@@ -42,7 +42,7 @@ from .task.log import TaskLogger
 from .task.run import TaskRunOptions, task_run
 from .task.rundir import task_run_dir_switching
 from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
-from .task.util import task_run_dir
+from .task.util import slice_dataset, task_run_dir
 log = logging.getLogger(__name__)
@@ -70,12 +70,23 @@ async def eval_run(
     # get cwd before switching to task dir
     eval_wd = os.getcwd()
+    # ensure sample ids
+    for resolved_task in tasks:
+        # add sample ids to dataset if they aren't there (start at 1 not 0)
+        task = resolved_task.task
+        for id, sample in enumerate(task.dataset):
+            if sample.id is None:
+                sample.id = id + 1
+        # Ensure sample ids are unique
+        ensure_unique_ids(task.dataset)
     # run startup pass for the sandbox environments
     shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
     if has_sandbox:
         cleanup = eval_config.sandbox_cleanup is not False
         shutdown_sandbox_environments = await startup_sandbox_environments(
-            resolve_sandbox_environment(eval_sandbox), tasks, cleanup
+            resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
         )
     # resolve solver and solver spec
@@ -146,14 +157,6 @@ async def eval_run(
                 else:
                     task.fail_on_error = task_eval_config.fail_on_error
-                # add sample ids to dataset if they aren't there (start at 1 not 0)
-                for id, sample in enumerate(task.dataset):
-                    if sample.id is None:
-                        sample.id = id + 1
-                # Ensure sample ids are unique
-                ensure_unique_ids(task.dataset)
                 # create and track the logger
                 logger = TaskLogger(
                     task_name=task.name,
@@ -340,13 +343,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 async def startup_sandbox_environments(
     eval_sandbox: SandboxEnvironmentSpec | None,
     tasks: list[ResolvedTask],
+    config: EvalConfig,
     cleanup: bool,
 ) -> Callable[[], Awaitable[None]]:
     # find unique sandboxenvs
     sandboxenvs: Set[TaskSandboxEnvironment] = set()
     for task in tasks:
         # resolve each sample and add to sandboxenvs
-        for sample in task.task.dataset:
+        dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
+        for sample in dataset:
             sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
             if sandbox is not None and sandbox not in sandboxenvs:
                 sandboxenvs.add(sandbox)

inspect_ai/_util/datetime.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Literal
 def iso_now(
     timespec: Literal[
-        "auto", "hours", "minutes", "seconds", "milliseconds" "microseconds"
+        "auto", "hours", "minutes", "seconds", "milliseconds", "microseconds"
     ] = "seconds",
 ) -> str:
     return datetime.now().astimezone().isoformat(timespec=timespec)

inspect_ai/_util/deprecation.py CHANGED Viewed

@@ -174,7 +174,7 @@ def default_deprecation_msg(
         _qual = getattr(obj, "__qualname__", "") or ""
         if _qual.endswith(".__init__") or _qual.endswith(".__new__"):
-            _obj = f' class ({_qual.rsplit(".", 1)[0]})'
+            _obj = f" class ({_qual.rsplit('.', 1)[0]})"
         elif _qual and _obj:
             _obj += f" ({_qual})"

inspect_ai/_util/json.py CHANGED Viewed

@@ -103,10 +103,20 @@ def json_changes(
                 paths = json_change.path.split("/")[1:]
                 replaced = before
                 for path in paths:
-                    index: Any = int(path) if path.isnumeric() else path
+                    decoded_path = decode_json_pointer_segment(path)
+                    index: Any = (
+                        int(decoded_path) if decoded_path.isnumeric() else decoded_path
+                    )
                     replaced = replaced[index]
                 json_change.replaced = replaced
             changes.append(json_change)
         return changes
     else:
         return None
+def decode_json_pointer_segment(segment: str) -> str:
+    """Decode a single JSON Pointer segment."""
+    # JSON Pointers encode ~ and / because they are special characters
+    # this decodes these values (https://www.rfc-editor.org/rfc/rfc6901)
+    return segment.replace("~1", "/").replace("~0", "~")

inspect_ai/_util/logger.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import atexit
 import os
+import re
 from logging import (
     DEBUG,
     INFO,
@@ -182,7 +183,7 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
     if write:
         transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record)))
     global _rate_limit_count
-    if (record.levelno <= INFO and "429" in record.getMessage()) or (
+    if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
         record.levelno == DEBUG
         # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html#validating-retry-attempts
         # for boto retry logic / log messages (this is tracking standard or adaptive retries)

inspect_ai/_util/trace.py CHANGED Viewed

@@ -33,6 +33,22 @@ def inspect_trace_file() -> Path:
 def trace_action(
     logger: Logger, action: str, message: str, *args: Any, **kwargs: Any
 ) -> Generator[None, None, None]:
+    """Trace a long running or potentially unreliable action.
+    Trace actions for which you want to collect data on the resolution
+    (e.g. succeeded, cancelled, failed, timed out, etc.) and duration of.
+    Traces are written to the `TRACE` log level (which is just below
+    `HTTP` and `INFO`). List and read trace logs with `inspect trace list`
+    and related commands (see `inspect trace --help` for details).
+    Args:
+       logger (Logger): Logger to use for tracing (e.g. from `getLogger(__name__)`)
+       action (str): Name of action to trace (e.g. 'Model', 'Subprocess', etc.)
+       message (str): Message describing action (can be a format string w/ args or kwargs)
+       *args (Any): Positional arguments for `message` format string.
+       **kwargs (Any): Named args for `message` format string.
+    """
     trace_id = uuid()
     start_monotonic = time.monotonic()
     start_wall = time.time()
@@ -117,6 +133,19 @@ def trace_action(
 def trace_message(
     logger: Logger, category: str, message: str, *args: Any, **kwargs: Any
 ) -> None:
+    """Log a message using the TRACE log level.
+    The `TRACE` log level is just below `HTTP` and `INFO`. List and
+    read trace logs with `inspect trace list` and related commands
+    (see `inspect trace --help` for details).
+    Args:
+       logger (Logger): Logger to use for tracing (e.g. from `getLogger(__name__)`)
+       category (str): Category of trace message.
+       message (str): Trace message (can be a format string w/ args or kwargs)
+       *args (Any): Positional arguments for `message` format string.
+       **kwargs (Any): Named args for `message` format string.
+    """
     logger.log(TRACE, f"[{category}] {message}", *args, **kwargs)
@@ -250,9 +279,16 @@ def read_trace_file(file: Path) -> list[TraceRecord]:
 def rotate_trace_files() -> None:
-    rotate_files = list_trace_files()[10:]
-    for file in rotate_files:
-        file.file.unlink(missing_ok=True)
+    # if multiple inspect processes start up at once they
+    # will all be attempting to rotate at the same time,
+    # which can lead to FileNotFoundError -- ignore these
+    # errors if they occur
+    try:
+        rotate_files = list_trace_files()[10:]
+        for file in rotate_files:
+            file.file.unlink(missing_ok=True)
+    except FileNotFoundError:
+        pass
 def compress_trace_log(log_handler: FileHandler) -> Callable[[], None]:

inspect_ai/_util/transcript.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import html
+import re
 from typing import Any
 from rich.align import AlignMethod
@@ -19,13 +20,43 @@ def transcript_code_theme() -> str:
 def transcript_markdown(content: str, *, escape: bool = False) -> Markdown:
     code_theme = transcript_code_theme()
     return Markdown(
-        html.escape(content) if escape else content,
+        html_escape_markdown(content) if escape else content,
         code_theme=code_theme,
         inline_code_lexer="python",
         inline_code_theme=code_theme,
     )
+def html_escape_markdown(content: str) -> str:
+    """Escape markdown lines that aren't in a code block."""
+    codeblock_pattern = re.compile("`{3,}")
+    current_codeblock = ""
+    escaped: list[str] = []
+    lines = content.splitlines()
+    for line in lines:
+        # look for matching end of codeblock
+        if current_codeblock:
+            if current_codeblock in line:
+                current_codeblock = ""
+                escaped.append(line)
+                continue
+        # look for beginning of codeblock
+        match = codeblock_pattern.search(line)
+        if match:
+            current_codeblock = match[0]
+            escaped.append(line)
+            continue
+        # escape if we are not in a codeblock
+        if current_codeblock:
+            escaped.append(line)
+        else:
+            escaped.append(html.escape(line, quote=False))
+    return "\n".join(escaped)
 def set_transcript_markdown_options(markdown: Markdown) -> None:
     code_theme = transcript_code_theme()
     markdown.code_theme = code_theme
@@ -89,12 +120,10 @@ def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableT
     return transcript_markdown("```python\n" + call + "\n```\n")
-DOUBLE_LINE = Box(
-    " ══ \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n"
-)
+DOUBLE_LINE = Box(" ══ \n    \n    \n    \n    \n    \n    \n    \n")
-LINE = Box(" ── \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n")
+LINE = Box(" ── \n    \n    \n    \n    \n    \n    \n    \n")
-DOTTED = Box(" ·· \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n")
+DOTTED = Box(" ·· \n    \n    \n    \n    \n    \n    \n    \n")
-NOBORDER = Box("    \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n" "    \n")
+NOBORDER = Box("    \n    \n    \n    \n    \n    \n    \n    \n")

inspect_ai/_view/www/.prettierrc.js ADDED Viewed

@@ -0,0 +1,12 @@
+// Do not remove this file even if the config is empty!
+// VSCode's "Format Document" will respect this config and use the default
+// settings, which is what we want. Without prettierrc, VSCode falls back to
+// users settings, which could be different.
+/**
+ * @see https://prettier.io/docs/en/configuration.html
+ * @type {import("prettier").Config}
+ */
+const config = {};
+export default config;

inspect-ai 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl

inspect-ai 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl