PyPI - inspect-ai - Versions diffs - 0.3.108__py3-none-any.whl → 0.3.110__py3-none-any.whl - Mend

inspect-ai 0.3.108__py3-none-any.whl → 0.3.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

inspect_ai/analysis/beta/_dataframe/util.py CHANGED Viewed

@@ -139,18 +139,13 @@ def add_unreferenced_columns(
 def records_to_pandas(records: list[dict[str, ColumnType]]) -> "pd.DataFrame":
+    import pandas as pd
     import pyarrow as pa
-    # create arrow table
-    records = normalize_records(records)
-    table = pa.Table.from_pylist(records)
-    # convert arrow to pandas
-    df = table.to_pandas(types_mapper=arrow_types_mapper)
-    # swap numpy-backed nullable columns for arrow-backed equivalents
-    # df = df.convert_dtypes(dtype_backend="pyarrow")
-    return df
+    # arrow backed df w/ our types mapper
+    df = pd.DataFrame(records)
+    table = pa.Table.from_pandas(df)
+    return table.to_pandas(types_mapper=arrow_types_mapper)
 def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
@@ -160,3 +155,104 @@ def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
     if pa.types.is_null(arrow_type):
         arrow_type = pa.string()
     return pd.ArrowDtype(arrow_type)
+# sample_id                       string[pyarrow]
+# eval_id                         string[pyarrow]
+# id                              string[pyarrow]
+# epoch                            int64[pyarrow]
+# input                           string[pyarrow]
+# target                          string[pyarrow]
+# metadata_challenge_address      string[pyarrow]
+# metadata_challenge_type         string[pyarrow]
+# metadata_color                  string[pyarrow]
+# metadata_cookie                 string[pyarrow]
+# metadata_foo                    string[pyarrow]
+# metadata_get_flag_cmd           string[pyarrow]
+# metadata_get_flag_service       string[pyarrow]
+# metadata_label_confidence       double[pyarrow]
+# metadata_long                   string[pyarrow]
+# metadata_objective_prompt       string[pyarrow]
+# metadata_prompt                 string[pyarrow]
+# metadata_variant                string[pyarrow]
+# score_another_rand_score        double[pyarrow]
+# score_check_flag                string[pyarrow]
+# score_choice                    string[pyarrow]
+# score_compare_quantities        double[pyarrow]
+# score_complex_scorer            string[pyarrow]
+# score_exact                     string[pyarrow]
+# score_foo                       double[pyarrow]
+# score_generating_scorer         double[pyarrow]
+# score_includes                  string[pyarrow]
+# score_letter_count              string[pyarrow]
+# score_match                     string[pyarrow]
+# score_model_graded_fact         string[pyarrow]
+# score_model_graded_qa           string[pyarrow]
+# score_nested_dict_scorer        string[pyarrow]
+# score_nested_list_scorer        string[pyarrow]
+# score_rand_score                double[pyarrow]
+# score_score_color               string[pyarrow]
+# score_score_table               string[pyarrow]
+# score_simple_score              string[pyarrow]
+# score_simple_score1             string[pyarrow]
+# score_simple_score2             string[pyarrow]
+# score_slow_scorer               double[pyarrow]
+# score_token_consuming_scorer    double[pyarrow]
+# score_wildcard_scorer           string[pyarrow]
+# model_usage                     string[pyarrow]
+# total_time                      double[pyarrow]
+# working_time                    double[pyarrow]
+# error                           string[pyarrow]
+# limit                           string[pyarrow]
+# retries                          int64[pyarrow]
+# dtype: object
+# sample_id                       string[pyarrow]
+# eval_id                         string[pyarrow]
+# id                              string[pyarrow]
+# epoch                            int64[pyarrow]
+# input                           string[pyarrow]
+# target                          string[pyarrow]
+# metadata_challenge_address      string[pyarrow]
+# metadata_challenge_type         string[pyarrow]
+# metadata_color                  string[pyarrow]
+# metadata_cookie                 string[pyarrow]
+# metadata_foo                    string[pyarrow]
+# metadata_get_flag_cmd           string[pyarrow]
+# metadata_get_flag_service       string[pyarrow]
+# metadata_label_confidence       double[pyarrow]
+# metadata_long                   string[pyarrow]
+# metadata_objective_prompt       string[pyarrow]
+# metadata_prompt                 string[pyarrow]
+# metadata_variant                string[pyarrow]
+# score_another_rand_score         int64[pyarrow]
+# score_check_flag                string[pyarrow]
+# score_choice                    string[pyarrow]
+# score_compare_quantities        double[pyarrow]
+# score_complex_scorer            string[pyarrow]
+# score_exact                     string[pyarrow]
+# score_foo                       double[pyarrow]
+# score_generating_scorer          int64[pyarrow]
+# score_includes                  string[pyarrow]
+# score_letter_count              string[pyarrow]
+# score_match                     string[pyarrow]
+# score_model_graded_fact         string[pyarrow]
+# score_model_graded_qa           string[pyarrow]
+# score_nested_dict_scorer        string[pyarrow]
+# score_nested_list_scorer        string[pyarrow]
+# score_rand_score                 int64[pyarrow]
+# score_score_color               string[pyarrow]
+# score_score_table               string[pyarrow]
+# score_simple_score              string[pyarrow]
+# score_simple_score1             string[pyarrow]
+# score_simple_score2             string[pyarrow]
+# score_slow_scorer                int64[pyarrow]
+# score_token_consuming_scorer     int64[pyarrow]
+# score_wildcard_scorer           string[pyarrow]
+# model_usage                     string[pyarrow]
+# total_time                      double[pyarrow]
+# working_time                    double[pyarrow]
+# error                           string[pyarrow]
+# limit                           string[pyarrow]
+# retries                          int64[pyarrow]
+# dtype: object

inspect_ai/log/_recorders/buffer/database.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import datetime
 import hashlib
 import json
 import os
@@ -15,6 +16,7 @@ from typing_extensions import override
 from inspect_ai._display.core.display import TaskDisplayMetric
 from inspect_ai._util.appdirs import inspect_data_dir
+from inspect_ai._util.dateutil import is_file_older_than
 from inspect_ai._util.file import basename, dirname, filesystem
 from inspect_ai._util.json import to_json_str_safe
 from inspect_ai._util.trace import trace_action
@@ -301,17 +303,44 @@ class SampleBufferDatabase(SampleBuffer):
     @contextmanager
     def _get_connection(self, *, write: bool = False) -> Iterator[Connection]:
         """Get a database connection."""
-        conn = sqlite3.connect(self.db_path, timeout=10)
-        conn.row_factory = sqlite3.Row  # Enable row factory for named columns
-        try:
-            # Enable foreign key constraints
-            conn.execute("PRAGMA foreign_keys = ON")
+        max_retries = 5
+        retry_delay = 0.1
+        conn: Connection | None = None
+        last_error: Exception | None = None
-            # concurrency setup
-            conn.execute("PRAGMA journal_mode=MEMORY")
-            conn.execute("PRAGMA busy_timeout=10000")
-            conn.execute("PRAGMA synchronous=OFF")
+        for attempt in range(max_retries):
+            try:
+                conn = sqlite3.connect(self.db_path, timeout=30)
+                conn.row_factory = sqlite3.Row  # enable row factory for named columns
+                # Enable foreign key constraints
+                conn.execute("PRAGMA foreign_keys = ON")
+                # concurrency setup
+                conn.execute("PRAGMA busy_timeout=30000")
+                conn.execute("PRAGMA synchronous=OFF")
+                conn.execute("PRAGMA cache_size=-64000")
+                conn.execute("PRAGMA temp_store=MEMORY")
+                break
+            except sqlite3.OperationalError as e:
+                last_error = e
+                if "locked" in str(e) and attempt < max_retries - 1:
+                    if conn:
+                        conn.close()
+                    time.sleep(retry_delay * (2**attempt))
+                    continue
+                raise
+        # ensure we have a connection
+        if conn is None:
+            raise sqlite3.OperationalError(
+                f"Failed to establish connection after {max_retries} attempts"
+            ) from last_error
+        try:
             # do work
             yield conn
@@ -663,13 +692,23 @@ def maximum_ids(
 def cleanup_sample_buffer_databases(db_dir: Path | None = None) -> None:
-    db_dir = resolve_db_dir(db_dir)
-    for db in db_dir.glob("*.*.db"):
-        _, pid_str, _ = db.name.rsplit(".", 2)
-        if pid_str.isdigit():
-            pid = int(pid_str)
-            if not psutil.pid_exists(pid):
-                cleanup_sample_buffer_db(db)
+    try:
+        db_dir = resolve_db_dir(db_dir)
+        for db in db_dir.glob("*.*.db"):
+            # this is a failsafe cleanup method for buffer db's leaked during
+            # abnormal terminations. therefore, it's not critical that we clean
+            # it up immediately. it's also possible that users are _sharing_
+            # their inspect_data_dir across multiple pid namespaces (e.g. in an
+            # effort to share their cache) one eval could remove the db of
+            # another running eval if we don't put in a delay.
+            if is_file_older_than(db, datetime.timedelta(days=3), default=False):
+                _, pid_str, _ = db.name.rsplit(".", 2)
+                if pid_str.isdigit():
+                    pid = int(pid_str)
+                    if not psutil.pid_exists(pid):
+                        cleanup_sample_buffer_db(db)
+    except Exception as ex:
+        logger.warning(f"Error cleaning up sample buffer databases at {db_dir}: {ex}")
 def cleanup_sample_buffer_db(path: Path) -> None:

inspect_ai/model/_model.py CHANGED Viewed

@@ -560,7 +560,7 @@ class Model:
             input = collapse_consecutive_assistant_messages(input)
         # retry for transient http errors:
-        # - no default timeout or max_retries (try forever)
+        # - use config.max_retries and config.timeout if specified, otherwise retry forever
         # - exponential backoff starting at 3 seconds (will wait 25 minutes
         #   on the 10th retry,then will wait no longer than 30 minutes on
         #   subsequent retries)

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -96,9 +96,9 @@ def vertex() -> type[ModelAPI]:
     verify_required_version(FEATURE, PACKAGE, MIN_VERSION)
     # in the clear
-    from .vertex import VertexAPI
+    from .vertex import VertexAPI  # type: ignore
-    return VertexAPI
+    return VertexAPI  # type: ignore
 @modelapi(name="google")

inspect_ai/model/_providers/vertex.py CHANGED Viewed

@@ -1,3 +1,5 @@
+# type: ignore
 import functools
 import json
 from copy import copy
@@ -340,6 +342,7 @@ async def content_part(content: Content | str) -> Part:
         if isinstance(content, ContentAudio):
             file = content.audio
         elif isinstance(content, ContentData):
+            file = ""
             assert False, "Vertex provider should never encounter ContentData"
         else:
             # it's ContentVideo

inspect_ai/tool/_mcp/_mcp.py CHANGED Viewed

@@ -15,6 +15,7 @@ from mcp.types import (
     AudioContent,
     EmbeddedResource,
     ImageContent,
+    ResourceLink,
     TextContent,
     TextResourceContents,
 )
@@ -283,7 +284,9 @@ def create_server_sandbox(
 def tool_result_as_text(
-    content: list[TextContent | ImageContent | AudioContent | EmbeddedResource],
+    content: list[
+        TextContent | ImageContent | AudioContent | ResourceLink | EmbeddedResource
+    ],
 ) -> str:
     content_list: list[str] = []
     for c in content:
@@ -293,6 +296,8 @@ def tool_result_as_text(
             content_list.append("(base64 encoded image omitted)")
         elif isinstance(c, AudioContent):
             content_list.append("(base64 encoded audio omitted)")
+        elif isinstance(c, ResourceLink):
+            content_list.append(f"{c.description} ({c.uri})")
         elif isinstance(c.resource, TextResourceContents):
             content_list.append(c.resource.text)

inspect_ai/tool/_mcp/sampling.py CHANGED Viewed

@@ -10,6 +10,7 @@ from mcp.types import (
     EmbeddedResource,
     ErrorData,
     ImageContent,
+    ResourceLink,
     TextContent,
     TextResourceContents,
 )
@@ -94,7 +95,11 @@ async def sampling_fn(
 def as_inspect_content(
-    content: TextContent | ImageContent | AudioContent | EmbeddedResource,
+    content: TextContent
+    | ImageContent
+    | AudioContent
+    | ResourceLink
+    | EmbeddedResource,
 ) -> Content:
     if isinstance(content, TextContent):
         return ContentText(text=content.text)
@@ -107,6 +112,8 @@ def as_inspect_content(
             audio=f"data:audio/{content.mimeType};base64,{content.data}",
             format=_get_audio_format(content.mimeType),
         )
+    elif isinstance(content, ResourceLink):
+        return ContentText(text=f"{content.description} ({content.uri})")
     elif isinstance(content.resource, TextResourceContents):
         return ContentText(text=content.resource.text)
     else:

inspect_ai/tool/_tools/_bash_session.py CHANGED Viewed

@@ -3,7 +3,6 @@ from typing import Annotated, Literal
 from pydantic import BaseModel, Discriminator, Field, RootModel
 from semver import Version
-from shortuuid import uuid
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai.tool import ToolResult
@@ -82,7 +81,7 @@ def bash_session(
     *,
     timeout: int | None = None,  # default is max_wait + 5 seconds
     wait_for_output: int | None = None,  # default is 30 seconds
-    instance: str | None = uuid(),
+    instance: str | None = None,
 ) -> Tool:
     """Interactive bash shell session tool.
@@ -91,10 +90,8 @@ def bash_session(
     which could be a command followed by a newline character or any other input
     text such as the response to a password prompt.
-    By default, a separate bash process is created within the sandbox for each
-    call to `bash_session()`. You can modify this behavior by passing
-    `instance=None` (which will result in a single bash process for the entire
-    sample) or use other `instance` values that implement another scheme).
+    To create a separate bash process for each
+    call to `bash_session()`, pass a unique value for `instance`
     See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.

inspect_ai/tool/_tools/_web_browser/_web_browser.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import re
 from pydantic import BaseModel, Field
-from shortuuid import uuid
 from inspect_ai._util.content import ContentText
 from inspect_ai._util.error import PrerequisiteError
@@ -32,15 +31,11 @@ class CrawlerResult(BaseModel):
     error: str | None = None
-def web_browser(
-    *, interactive: bool = True, instance: str | None = uuid()
-) -> list[Tool]:
+def web_browser(*, interactive: bool = True, instance: str | None = None) -> list[Tool]:
     """Tools used for web browser navigation.
-    By default, a separate web browser process is created within the sandbox for each
-    call to `web_browser()`. You can modify this behavior by passing `instance=None`
-    (which will result in a single web browser for the entire sample) or use other
-    `instance` values that implement another scheme).
+    To create a separate web browser process for each
+    call to `web_browser()`, pass a unique value for `instance`.
     See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-web-browser>.

inspect_ai/util/_anyio.py CHANGED Viewed

@@ -10,11 +10,20 @@ if sys.version_info < (3, 11):
 def inner_exception(exc: Exception) -> Exception:
-    return _flatten_exception(exc)[0]
+    return _flatten_exception(exc, set())[0]
-def _flatten_exception(exc: Exception) -> list[Exception]:
+def _flatten_exception(exc: Exception, seen: set[int] | None = None) -> list[Exception]:
     """Recursively flatten an exception to get all related (__context__) and contained (ExceptionGroup) exceptions."""
+    if seen is None:
+        seen = set()
+    # Prevent infinite recursion by tracking seen exceptions by their id
+    exc_id = id(exc)
+    if exc_id in seen:
+        return []
+    seen.add(exc_id)
     context_to_follow = (
         [exc.__context__]
         # conceptually, if __cause__ is present, it means that this exception
@@ -36,7 +45,7 @@ def _flatten_exception(exc: Exception) -> list[Exception]:
     other_exceptions = [
         flattened_e
         for e in set(itertools.chain(context_to_follow, children_to_follow))
-        for flattened_e in _flatten_exception(e)
+        for flattened_e in _flatten_exception(e, seen)
     ]
     return maybe_this_exception + other_exceptions

{inspect_ai-0.3.108.dist-info → inspect_ai-0.3.110.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_ai
-Version: 0.3.108
+Version: 0.3.110
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License
@@ -63,7 +63,7 @@ Requires-Dist: groq; extra == "dev"
 Requires-Dist: ipython; extra == "dev"
 Requires-Dist: jsonpath-ng; extra == "dev"
 Requires-Dist: markdown; extra == "dev"
-Requires-Dist: mcp>=1.9.4; extra == "dev"
+Requires-Dist: mcp>=1.10.0; extra == "dev"
 Requires-Dist: mistralai; extra == "dev"
 Requires-Dist: moto[server]; extra == "dev"
 Requires-Dist: mypy>=1.16.0; extra == "dev"

inspect-ai 0.3.108__py3-none-any.whl → 0.3.110__py3-none-any.whl

inspect-ai 0.3.108__py3-none-any.whl → 0.3.110__py3-none-any.whl