inspect-ai 0.3.95__py3-none-any.whl → 0.3.97__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (142)
  1. inspect_ai/_eval/eval.py +10 -2
  2. inspect_ai/_eval/task/util.py +32 -3
  3. inspect_ai/_util/local_server.py +16 -0
  4. inspect_ai/_util/registry.py +7 -0
  5. inspect_ai/_util/timer.py +13 -0
  6. inspect_ai/_view/www/dist/assets/index.css +275 -195
  7. inspect_ai/_view/www/dist/assets/index.js +8568 -7376
  8. inspect_ai/_view/www/src/app/App.css +1 -0
  9. inspect_ai/_view/www/src/app/App.tsx +27 -10
  10. inspect_ai/_view/www/src/app/appearance/icons.ts +5 -0
  11. inspect_ai/_view/www/src/app/content/RecordTree.module.css +22 -0
  12. inspect_ai/_view/www/src/app/content/RecordTree.tsx +370 -0
  13. inspect_ai/_view/www/src/app/content/RenderedContent.module.css +5 -0
  14. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +32 -19
  15. inspect_ai/_view/www/src/app/content/record_processors/store.ts +101 -0
  16. inspect_ai/_view/www/src/app/content/record_processors/types.ts +3 -0
  17. inspect_ai/_view/www/src/app/content/types.ts +5 -0
  18. inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -0
  19. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +35 -28
  20. inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +1 -8
  21. inspect_ai/_view/www/src/app/log-view/navbar/PrimaryBar.tsx +2 -4
  22. inspect_ai/_view/www/src/app/log-view/navbar/ResultsPanel.tsx +13 -3
  23. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.module.css +15 -0
  24. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.tsx +14 -10
  25. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +9 -3
  26. inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +1 -3
  27. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +8 -2
  28. inspect_ai/_view/www/src/app/log-view/types.ts +1 -0
  29. inspect_ai/_view/www/src/app/plan/ModelCard.module.css +7 -0
  30. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +5 -2
  31. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +13 -8
  32. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +63 -8
  33. inspect_ai/_view/www/src/app/routing/url.ts +45 -0
  34. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +2 -1
  35. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.tsx +15 -8
  36. inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +3 -0
  37. inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +16 -5
  38. inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +9 -1
  39. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +68 -31
  40. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +12 -7
  41. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -5
  42. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.module.css +9 -0
  43. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +48 -18
  44. inspect_ai/_view/www/src/app/samples/chat/ChatView.tsx +0 -1
  45. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.module.css +4 -0
  46. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +41 -1
  47. inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -0
  48. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +0 -3
  49. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -1
  50. inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +1 -1
  51. inspect_ai/_view/www/src/app/samples/chat/tools/ToolOutput.module.css +1 -1
  52. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +5 -1
  53. inspect_ai/_view/www/src/app/samples/descriptor/score/PassFailScoreDescriptor.tsx +11 -6
  54. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +7 -0
  55. inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +5 -18
  56. inspect_ai/_view/www/src/app/samples/sample-tools/SortFilter.tsx +1 -1
  57. inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.tsx +18 -5
  58. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.module.css +0 -6
  59. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.tsx +4 -1
  60. inspect_ai/_view/www/src/app/samples/transcript/ApprovalEventView.tsx +4 -2
  61. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +6 -4
  62. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.module.css +1 -1
  63. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +13 -6
  64. inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +6 -4
  65. inspect_ai/_view/www/src/app/samples/transcript/LoggerEventView.tsx +4 -2
  66. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +11 -8
  67. inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +14 -8
  68. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +13 -8
  69. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +25 -16
  70. inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +7 -5
  71. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +11 -28
  72. inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +12 -20
  73. inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +12 -31
  74. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +25 -29
  75. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +297 -0
  76. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +0 -8
  77. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +43 -25
  78. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +43 -0
  79. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +109 -43
  80. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +19 -8
  81. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +128 -60
  82. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +14 -4
  83. inspect_ai/_view/www/src/app/samples/transcript/types.ts +6 -4
  84. inspect_ai/_view/www/src/app/types.ts +12 -1
  85. inspect_ai/_view/www/src/components/Card.css +6 -3
  86. inspect_ai/_view/www/src/components/Card.tsx +15 -2
  87. inspect_ai/_view/www/src/components/CopyButton.tsx +4 -6
  88. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +20 -14
  89. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +17 -22
  90. inspect_ai/_view/www/src/components/LargeModal.tsx +5 -1
  91. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +25 -1
  92. inspect_ai/_view/www/src/components/MarkdownDiv.css +4 -0
  93. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +2 -2
  94. inspect_ai/_view/www/src/components/TabSet.module.css +6 -1
  95. inspect_ai/_view/www/src/components/TabSet.tsx +8 -2
  96. inspect_ai/_view/www/src/state/hooks.ts +83 -13
  97. inspect_ai/_view/www/src/state/logPolling.ts +2 -2
  98. inspect_ai/_view/www/src/state/logSlice.ts +1 -2
  99. inspect_ai/_view/www/src/state/logsSlice.ts +9 -9
  100. inspect_ai/_view/www/src/state/samplePolling.ts +1 -1
  101. inspect_ai/_view/www/src/state/sampleSlice.ts +134 -7
  102. inspect_ai/_view/www/src/state/scoring.ts +1 -1
  103. inspect_ai/_view/www/src/state/scrolling.ts +39 -6
  104. inspect_ai/_view/www/src/state/store.ts +5 -0
  105. inspect_ai/_view/www/src/state/store_filter.ts +47 -44
  106. inspect_ai/_view/www/src/utils/debugging.ts +95 -0
  107. inspect_ai/_view/www/src/utils/format.ts +2 -2
  108. inspect_ai/_view/www/src/utils/json.ts +29 -0
  109. inspect_ai/agent/__init__.py +2 -1
  110. inspect_ai/agent/_agent.py +12 -0
  111. inspect_ai/agent/_react.py +184 -48
  112. inspect_ai/agent/_types.py +15 -2
  113. inspect_ai/analysis/beta/__init__.py +11 -3
  114. inspect_ai/analysis/beta/_dataframe/columns.py +11 -16
  115. inspect_ai/analysis/beta/_dataframe/evals/table.py +101 -39
  116. inspect_ai/analysis/beta/_dataframe/events/columns.py +50 -0
  117. inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
  118. inspect_ai/analysis/beta/_dataframe/events/table.py +77 -3
  119. inspect_ai/analysis/beta/_dataframe/extract.py +44 -25
  120. inspect_ai/analysis/beta/_dataframe/messages/columns.py +1 -1
  121. inspect_ai/analysis/beta/_dataframe/messages/table.py +30 -29
  122. inspect_ai/analysis/beta/_dataframe/progress.py +56 -0
  123. inspect_ai/analysis/beta/_dataframe/record.py +13 -9
  124. inspect_ai/analysis/beta/_dataframe/samples/columns.py +8 -4
  125. inspect_ai/analysis/beta/_dataframe/samples/extract.py +5 -33
  126. inspect_ai/analysis/beta/_dataframe/samples/table.py +211 -60
  127. inspect_ai/analysis/beta/_dataframe/util.py +33 -28
  128. inspect_ai/log/_file.py +9 -2
  129. inspect_ai/model/_call_tools.py +1 -1
  130. inspect_ai/model/_providers/anthropic.py +18 -5
  131. inspect_ai/model/_providers/azureai.py +7 -2
  132. inspect_ai/model/_providers/util/llama31.py +3 -3
  133. inspect_ai/solver/_task_state.py +1 -1
  134. inspect_ai/tool/_mcp/_sandbox.py +17 -14
  135. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/METADATA +2 -2
  136. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/RECORD +140 -133
  137. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/WHEEL +1 -1
  138. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.module.css +0 -48
  139. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +0 -276
  140. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/entry_points.txt +0 -0
  141. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/licenses/LICENSE +0 -0
  142. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/top_level.txt +0 -0

inspect_ai/analysis/beta/_dataframe/samples/table.py CHANGED
@@ -1,29 +1,37 @@
 from __future__ import annotations
 
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass
+from functools import lru_cache
+from itertools import chain
 from typing import (
     TYPE_CHECKING,
     Callable,
     Generator,
     Literal,
+    Sequence,
+    cast,
     overload,
 )
 
-from inspect_ai._display import display
-from inspect_ai._util.path import pretty_path
-from inspect_ai.analysis.beta._dataframe.events.columns import EventColumn
-from inspect_ai.analysis.beta._dataframe.messages.columns import MessageColumn
+from inspect_ai._util.hash import mm3_hash
+from inspect_ai.analysis.beta._dataframe.progress import import_progress, no_progress
 from inspect_ai.log._file import (
+    list_eval_logs,
     read_eval_log_sample_summaries,
     read_eval_log_samples,
 )
 from inspect_ai.log._log import EvalSample, EvalSampleSummary
-from inspect_ai.log._transcript import BaseEvent, Event
+from inspect_ai.log._transcript import Event
 from inspect_ai.model._chat_message import ChatMessage
 
-from ..columns import Column, ColumnErrors, ColumnType
+from ..columns import Column, ColumnError, ColumnType
 from ..evals.columns import EvalColumn
-from ..evals.table import EVAL_ID, EVAL_SUFFIX, ensure_eval_id, evals_df
+from ..evals.table import EVAL_ID, EVAL_SUFFIX, _read_evals_df, ensure_eval_id
+from ..events.columns import EventColumn
+from ..extract import message_as_str
+from ..messages.columns import MessageColumn
 from ..record import import_record, resolve_duplicate_columns
 from ..util import (
     LogPaths,
@@ -46,49 +54,55 @@ SAMPLE_SUFFIX = "_sample"
 
 @overload
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: Literal[True] = True,
+    parallel: bool | int = False,
+    quiet: bool = False,
 ) -> "pd.DataFrame": ...
 
 
 @overload
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: Literal[False] = False,
-) -> tuple["pd.DataFrame", ColumnErrors]: ...
+    parallel: bool | int = False,
+    quiet: bool = False,
+) -> tuple["pd.DataFrame", list[ColumnError]]: ...
 
 
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: bool = True,
-) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
+    parallel: bool | int = False,
+    quiet: bool = False,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
     """Read a dataframe containing samples from a set of evals.
 
     Args:
        logs: One or more paths to log files or log directories.
+          Defaults to the contents of the currently active log directory
+          (e.g. ./logs or INSPECT_LOG_DIR).
        columns: Specification for what columns to read from log files.
-       recursive: Include recursive contents of directories (defaults to `True`)
-       reverse: Reverse the order of the dataframe (by default, items
-          are ordered from oldest to newest).
        strict: Raise import errors immediately. Defaults to `True`.
          If `False` then a tuple of `DataFrame` and errors is returned.
+       parallel: If `True`, use `ProcessPoolExecutor` to read logs in parallel
+          (with workers based on `mp.cpu_count()`, capped at 8). If `int`, read
+          in parallel with the specified number of workers. If `False` (the default)
+          do not read in parallel.
+       quiet: If `True` do not print any output or progress (defaults to `False`).
 
     Returns:
        For `strict`, a Pandas `DataFrame` with information for the specified logs.
        For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
       encountered (by log file) during import.
     """
+    verify_prerequisites()
+
     return _read_samples_df(
-        logs, columns, recursive=recursive, reverse=reverse, strict=strict
+        logs, columns, strict=strict, progress=not quiet, parallel=parallel
     )
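
A minimal usage sketch of the new signature (the `./logs` path is illustrative, not part of this diff):

```python
# sketch: samples_df() with the new parallel/quiet parameters
from inspect_ai.analysis.beta import samples_df

# reads from the active log directory (./logs or INSPECT_LOG_DIR) by default,
# fanning reads out across worker processes and suppressing progress output
df = samples_df(parallel=True, quiet=True)

# strict=False returns errors alongside the dataframe instead of raising
df, errors = samples_df("./logs", strict=False)
```
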
@@ -96,30 +110,108 @@ def samples_df(
 class MessagesDetail:
     name: str = "message"
     col_type = MessageColumn
-    filter: Callable[[ChatMessage], bool] = lambda m: True
+    filter: Callable[[ChatMessage], bool] | None = None
 
 
 @dataclass
 class EventsDetail:
-    name: str = "message"
+    name: str = "event"
     col_type = EventColumn
-    filter: Callable[[BaseEvent], bool] = lambda e: True
+    filter: Callable[[Event], bool] | None = None
 
 
 def _read_samples_df(
     logs: LogPaths,
-    columns: list[Column],
+    columns: Sequence[Column],
     *,
-    recursive: bool = True,
-    reverse: bool = False,
     strict: bool = True,
     detail: MessagesDetail | EventsDetail | None = None,
-) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
-    verify_prerequisites()
+    progress: bool = True,
+    parallel: bool | int = False,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
+    import pandas as pd
 
     # resolve logs
-    logs = resolve_logs(logs, recursive=recursive, reverse=reverse)
+    logs = resolve_logs(logs)
+
+    if parallel:
+        # resolve number of workers (cap at 8 as eventually we run into disk/memory contention)
+        if parallel is True:
+            parallel = max(min(mp.cpu_count(), 8), 2)
+
+        # flatted out list of logs
+        logs = resolve_logs(logs)
+
+        # establish progress
+        entity = detail.name if detail else "sample"
+        progress_cm = (
+            import_progress(f"reading {entity}s", total=len(logs))
+            if progress
+            else no_progress()
+        )
 
+        # run the parallel reads (setup arrays for holding results in order)
+        df_results: list[pd.DataFrame | None] = [None] * len(logs)
+        error_results: list[list[ColumnError] | None] = [None] * len(logs)
+        executor = ProcessPoolExecutor(max_workers=parallel)
+        try:
+            with progress_cm as p:
+                futures = {
+                    executor.submit(
+                        _read_samples_df_serial,  # type: ignore[arg-type]
+                        logs=[log],
+                        columns=columns,
+                        strict=strict,
+                        detail=detail,
+                        progress=False,
+                    ): idx
+                    for idx, log in enumerate(logs)
+                }
+                for fut in as_completed(futures):
+                    idx = futures[fut]
+                    if strict:
+                        df_results[idx] = cast(pd.DataFrame, fut.result())
+                    else:
+                        df, errs = cast(
+                            tuple[pd.DataFrame, list[ColumnError]], fut.result()
+                        )
+                        df_results[idx] = df
+                        error_results[idx] = errs
+                    p.update()
+        finally:
+            executor.shutdown(wait=False, cancel_futures=True)
+
+        # recombine df
+        df = pd.concat(df_results, ignore_index=True)
+        subset = f"{detail.name}_id" if detail else SAMPLE_ID
+        df.drop_duplicates(subset=subset, ignore_index=True, inplace=True)
+
+        # recombine errors
+        errors: list[ColumnError] = list(
+            chain.from_iterable(e for e in error_results if e)
+        )
+
+        # return as required
+        if strict:
+            return df
+        else:
+            return df, errors
+
+    # non-parallel
+    else:
+        return _read_samples_df_serial(
+            logs=logs, columns=columns, strict=strict, detail=detail, progress=progress
+        )
+
+
+def _read_samples_df_serial(
+    logs: list[str],
+    columns: Sequence[Column],
+    *,
+    strict: bool = True,
+    detail: MessagesDetail | EventsDetail | None = None,
+    progress: bool = True,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
     # split columns by type
     columns_eval: list[Column] = []
     columns_sample: list[Column] = []
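
The parallel branch above is a standard fan-out/fan-in: results are slotted into a preallocated list keyed by input index, so completion order never reorders the output. A self-contained sketch of the same pattern, independent of inspect_ai:

```python
# order-preserving process fan-out, mirroring the pattern used above
from concurrent.futures import ProcessPoolExecutor, as_completed


def work(n: int) -> int:
    return n * n


def map_ordered(items: list[int], workers: int) -> list[int | None]:
    # slots indexed by input position
    results: list[int | None] = [None] * len(items)
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(work, item): idx for idx, item in enumerate(items)}
        for fut in as_completed(futures):
            results[futures[fut]] = fut.result()  # completion order doesn't matter
    return results


if __name__ == "__main__":
    print(map_ordered(list(range(8)), workers=4))  # [0, 1, 4, 9, ...]
```
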
@@ -148,33 +240,56 @@ def _read_samples_df(
     )
 
     # make sure eval_id is present
-    ensure_eval_id(columns_eval)
-
-    # read samples from each log
-    sample_records: list[dict[str, ColumnType]] = []
-    detail_records: list[dict[str, ColumnType]] = []
-    all_errors = ColumnErrors()
-    evals_table = evals_df(logs, columns=columns_eval)
-    with display().progress(total=len(evals_table)) as p:
+    columns_eval = list(ensure_eval_id(columns_eval))
+
+    # establish progress
+    progress_cm = (
+        import_progress("scanning logs", total=len(logs)) if progress else no_progress()
+    )
+
+    # determine how we will allocate progress
+    with progress_cm as p:
+        # read samples from each log
+        sample_records: list[dict[str, ColumnType]] = []
+        detail_records: list[dict[str, ColumnType]] = []
+        all_errors: list[ColumnError] = []
+
+        # read logs and note total samples
+        evals_table, eval_logs, total_samples = _read_evals_df(
+            logs, columns=columns_eval, strict=True, progress=p.update
+        )
+
+        # update progress now that we know the total samples
+        entity = detail.name if detail else "sample"
+        p.reset(description=f"reading {entity}s", completed=0, total=total_samples)
+
         # read samples
-        for eval_id, log in zip(evals_table[EVAL_ID].to_list(), logs):
+        for eval_id, eval_log in zip(evals_table[EVAL_ID].to_list(), eval_logs):
             # get a generator for the samples (might require reading the full log
             # or might be fine to just read the summaries)
             if require_full_samples:
                 samples: Generator[EvalSample | EvalSampleSummary, None, None] = (
                     read_eval_log_samples(
-                        log, all_samples_required=False, resolve_attachments=True
+                        eval_log.location,
+                        all_samples_required=False,
+                        resolve_attachments=True,
                     )
                 )
             else:
-                samples = (summary for summary in read_eval_log_sample_summaries(log))
+                samples = (
+                    summary
+                    for summary in read_eval_log_sample_summaries(eval_log.location)
+                )
             for sample in samples:
                 if strict:
-                    record = import_record(sample, columns_sample, strict=True)
+                    record = import_record(
+                        eval_log, sample, columns_sample, strict=True
+                    )
                 else:
-                    record, errors = import_record(sample, columns_sample, strict=False)
-                    error_key = f"{pretty_path(log)} [{sample.id}, {sample.epoch}]"
-                    all_errors[error_key] = errors
+                    record, errors = import_record(
+                        eval_log, sample, columns_sample, strict=False
+                    )
+                    all_errors.extend(errors)
 
                 # inject ids
                 sample_id = sample.uuid or auto_sample_id(eval_id, sample)
@@ -191,11 +306,15 @@
                 # filter detail records
                 assert isinstance(sample, EvalSample)
                 if isinstance(detail, MessagesDetail):
-                    detail_items: list[ChatMessage] | list[Event] = [
-                        m for m in sample.messages if detail.filter(m)
-                    ]
+                    detail_items: list[ChatMessage] | list[Event] = (
+                        sample_messages_from_events(sample.events, detail.filter)
+                    )
                 elif isinstance(detail, EventsDetail):
-                    detail_items = [e for e in sample.events if detail.filter(e)]
+                    detail_items = [
+                        e
+                        for e in sample.events
+                        if detail.filter is None or detail.filter(e)
+                    ]
                 else:
                     detail_items = []
 
@@ -203,16 +322,13 @@
                 for index, item in enumerate(detail_items):
                     if strict:
                         detail_record = import_record(
-                            item, columns_detail, strict=True
+                            eval_log, item, columns_detail, strict=True
                         )
                     else:
                         detail_record, errors = import_record(
-                            item, columns_detail, strict=False
-                        )
-                        error_key = (
-                            f"{pretty_path(log)} [{sample.id}, {sample.epoch}]"
+                            eval_log, item, columns_detail, strict=False
                         )
-                        all_errors[error_key] = errors
+                        all_errors.extend(errors)
 
                 # inject ids
                 detail_id = detail_record.get(
@@ -226,14 +342,20 @@
 
             # record sample record
             sample_records.append(record)
-        p.update()
+                p.update()
 
     # normalize records and produce samples table
     samples_table = records_to_pandas(sample_records)
+    samples_table.drop_duplicates(
+        "sample_id", keep="first", inplace=True, ignore_index=True
+    )
 
     # if we have detail records then join them into the samples table
     if detail is not None:
         details_table = records_to_pandas(detail_records)
+        details_table.drop_duplicates(
+            f"{detail.name}_id", keep="first", inplace=True, ignore_index=True
+        )
         samples_table = details_table.merge(
             samples_table,
             on=SAMPLE_ID,
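
The new drop_duplicates calls guard against the same sample (or detail row) appearing twice, e.g. when a log is listed more than once. A tiny illustration of the `keep="first"` behavior (data is illustrative):

```python
# duplicate sample_id rows collapse to the first occurrence
import pandas as pd

df = pd.DataFrame({"sample_id": ["s1", "s1", "s2"], "score": [1, 1, 0]})
df.drop_duplicates("sample_id", keep="first", inplace=True, ignore_index=True)
print(df)  # rows for s1 and s2 only
```
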
@@ -262,6 +384,35 @@
         return samples_table, all_errors
 
 
+def sample_messages_from_events(
+    events: list[Event], filter: Callable[[ChatMessage], bool] | None
+) -> list[ChatMessage]:
+    # don't yield the same event twice
+    ids: set[str] = set()
+
+    # we need to look at the full input to every model event and add
+    # messages we haven't seen before
+    messages: list[ChatMessage] = []
+    for event in events:
+        if event.event == "model":
+            event_messages = event.input + (
+                [event.output.message] if not event.output.empty else []
+            )
+            for message in event_messages:
+                id = message.id or message_hash(message_as_str(message))
+                if id not in ids:
+                    messages.append(message)
+                    ids.add(id)
+
+    # then apply the filter
+    return [message for message in messages if filter is None or filter(message)]
+
+
+@lru_cache(maxsize=100)
+def message_hash(message: str) -> str:
+    return mm3_hash(message)
+
+
 def reorder_samples_df_columns(
     df: "pd.DataFrame",
     eval_columns: list[Column],
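
The helper dedupes by message id when present, falling back to a hash of the rendered message. The same idea in a standalone sketch (hashlib stands in here for inspect_ai's internal mm3_hash):

```python
# id-or-content-hash dedup, as in sample_messages_from_events above
import hashlib


def content_hash(text: str) -> str:
    return hashlib.md5(text.encode()).hexdigest()


seen: set[str] = set()
unique: list[dict] = []
for message in [
    {"id": "m1", "content": "hello"},
    {"id": None, "content": "hi"},
    {"id": None, "content": "hi"},  # dropped: same content hash
]:
    key = message["id"] or content_hash(message["content"])
    if key not in seen:
        seen.add(key)
        unique.append(message)
print(len(unique))  # 2
```
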

inspect_ai/analysis/beta/_dataframe/util.py CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Sequence, TypeAlias
 from inspect_ai._util.error import pip_dependency_error
 from inspect_ai._util.file import FileInfo, filesystem
 from inspect_ai._util.version import verify_required_version
-from inspect_ai.log._file import log_files_from_ls
+from inspect_ai.log._file import EvalLogInfo, log_files_from_ls
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
 
 from .columns import ColumnType
 
-LogPaths: TypeAlias = PathLike[str] | str | Sequence[PathLike[str] | str]
+LogPaths: TypeAlias = (
+    PathLike[str] | str | EvalLogInfo | Sequence[PathLike[str] | str | EvalLogInfo]
+)
 
 
 def verify_prerequisites() -> None:
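
With EvalLogInfo admitted into LogPaths, the records returned by list_eval_logs() can be passed straight through; a sketch (the directory is illustrative):

```python
# pass EvalLogInfo records directly now that LogPaths accepts them
from inspect_ai.log import list_eval_logs
from inspect_ai.analysis.beta import samples_df

logs = list_eval_logs("./logs")  # list[EvalLogInfo]
df = samples_df(logs=logs[:5])   # no manual .name extraction needed
```
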
@@ -37,34 +39,35 @@ def verify_prerequisites() -> None:
         raise pip_dependency_error("inspect_ai.analysis", required_packages)
 
     # enforce version constraints
-    verify_required_version("inspect_ai.analysis", "pandas", "2.0.0")
+    verify_required_version("inspect_ai.analysis", "pandas", "2.1.0")
     verify_required_version("inspect_ai.analysis", "pyarrow", "10.0.1")
 
 
-def resolve_logs(logs: LogPaths, recursive: bool, reverse: bool) -> list[str]:
+def resolve_logs(logs: LogPaths) -> list[str]:
     # normalize to list of str
-    logs = [logs] if isinstance(logs, str | PathLike) else logs
-    logs = [Path(log).as_posix() if isinstance(log, PathLike) else log for log in logs]
+    logs = [logs] if isinstance(logs, str | PathLike | EvalLogInfo) else logs
+    logs_str = [
+        Path(log).as_posix()
+        if isinstance(log, PathLike)
+        else log.name
+        if isinstance(log, EvalLogInfo)
+        else log
+        for log in logs
+    ]
 
     # expand directories
     log_paths: list[FileInfo] = []
-    for log in logs:
-        if isinstance(log, PathLike):
-            log = Path(log).as_posix()
-        fs = filesystem(log)
-        info = fs.info(log)
+    for log_str in logs_str:
+        fs = filesystem(log_str)
+        info = fs.info(log_str)
         if info.type == "directory":
             log_paths.extend(
-                [
-                    fi
-                    for fi in fs.ls(info.name, recursive=recursive)
-                    if fi.type == "file"
-                ]
+                [fi for fi in fs.ls(info.name, recursive=True) if fi.type == "file"]
             )
         else:
            log_paths.append(info)
 
-    log_files = log_files_from_ls(log_paths, descending=reverse)
+    log_files = log_files_from_ls(log_paths, sort=False)
     return [log_file.name for log_file in log_files]
 
@@ -138,20 +141,22 @@ def add_unreferenced_columns(
 def records_to_pandas(records: list[dict[str, ColumnType]]) -> "pd.DataFrame":
     import pyarrow as pa
 
+    # create arrow table
     records = normalize_records(records)
-    table = pa.Table.from_pylist(records).to_pandas(types_mapper=arrow_types_mapper)
-    return table
+    table = pa.Table.from_pylist(records)
 
+    # convert arrow to pandas
+    df = table.to_pandas(types_mapper=arrow_types_mapper)
 
-def arrow_types_mapper(
-    arrow_type: "pa.DataType",
-) -> "pd.api.extensions.ExtensionDtype" | None:
+    # swap numpy-backed nullable columns for arrow-backed equivalents
+    # df = df.convert_dtypes(dtype_backend="pyarrow")
+    return df
+
+
+def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
     import pandas as pd
     import pyarrow as pa
 
-    # convert str => str
-    if pa.types.is_string(arrow_type):
-        return pd.StringDtype()
-    # default conversion for other types
-    else:
-        return None
+    if pa.types.is_null(arrow_type):
+        arrow_type = pa.string()
+    return pd.ArrowDtype(arrow_type)
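
The mapper now hands every column an Arrow-backed dtype (pd.ArrowDtype) rather than special-casing strings, consistent with the pandas floor moving to 2.1.0 elsewhere in this diff. A runnable sketch of the hook (data is illustrative):

```python
# types_mapper hook with pd.ArrowDtype
import pandas as pd
import pyarrow as pa


def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
    if pa.types.is_null(arrow_type):  # all-null columns arrive as pa.null()
        arrow_type = pa.string()
    return pd.ArrowDtype(arrow_type)


table = pa.Table.from_pylist([{"id": "a", "x": 1.5}, {"id": "b", "x": None}])
df = table.to_pandas(types_mapper=arrow_types_mapper)
print(df.dtypes)  # string[pyarrow], double[pyarrow]
```
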
inspect_ai/log/_file.py CHANGED
@@ -526,12 +526,19 @@ def log_files_from_ls(
     ls: list[FileInfo],
     formats: list[Literal["eval", "json"]] | None = None,
     descending: bool = True,
+    sort: bool = True,
 ) -> list[EvalLogInfo]:
     extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
     return [
         log_file_info(file)
-        for file in sorted(
-            ls, key=lambda file: (file.mtime if file.mtime else 0), reverse=descending
+        for file in (
+            sorted(
+                ls,
+                key=lambda file: (file.mtime if file.mtime else 0),
+                reverse=descending,
+            )
+            if sort
+            else ls
         )
         if file.type == "file" and is_log_file(file.name, extensions)
     ]

inspect_ai/model/_call_tools.py CHANGED
@@ -303,7 +303,7 @@ async def execute_tools(
             )
             result_messages.append(tool_message)
             display_conversation_message(tool_message)
-        else:
+        elif result is not None:
             for message in result.messages:
                 result_messages.append(message)
                 display_conversation_message(message)

inspect_ai/model/_providers/anthropic.py CHANGED
@@ -276,13 +276,25 @@ class AnthropicAPI(ModelAPI):
         params = dict(model=self.service_model_name(), max_tokens=max_tokens)
         headers: dict[str, str] = {}
         betas: list[str] = []
-        # some params not compatible with thinking models
-        if not self.is_using_thinking(config):
-            if config.temperature is not None:
+
+        # temperature not compatible with extended thinking
+        THINKING_WARNING = "anthropic models do not support the '{parameter}' parameter when using extended thinking."
+        if config.temperature is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="temperature"))
+            else:
                 params["temperature"] = config.temperature
-            if config.top_p is not None:
+        # top_p not compatible with extended thinking
+        if config.top_p is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="top_p"))
+            else:
                 params["top_p"] = config.top_p
-            if config.top_k is not None:
+        # top_k not compatible with extended thinking
+        if config.top_k is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="top_k"))
+            else:
                 params["top_k"] = config.top_k
 
         # some thinking-only stuff
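
The restructuring turns a silent drop into an explicit warning when temperature/top_p/top_k are set alongside extended thinking. A generic sketch of that warn-instead-of-drop pattern (warn_once and is_using_thinking are inspect_ai internals; plain logging stands in here):

```python
# warn-instead-of-silently-drop, as in the Anthropic provider change above
import logging

logger = logging.getLogger(__name__)
THINKING_WARNING = (
    "anthropic models do not support the '{parameter}' parameter "
    "when using extended thinking."
)


def apply_param(params: dict, name: str, value: float | None, thinking: bool) -> None:
    if value is None:
        return
    if thinking:
        logger.warning(THINKING_WARNING.format(parameter=name))  # surfaced, not set
    else:
        params[name] = value


params: dict = {}
apply_param(params, "temperature", 0.7, thinking=True)  # warns
apply_param(params, "top_p", 0.9, thinking=False)       # sets
print(params)  # {'top_p': 0.9}
```
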
@@ -346,6 +358,7 @@
         # for "overloaded_error" so we check for it explicitly
         if (
             isinstance(ex.body, dict)
+            and isinstance(ex.body.get("error", {}), dict)
             and ex.body.get("error", {}).get("type", "") == "overloaded_error"
         ):
             return True

inspect_ai/model/_providers/azureai.py CHANGED
@@ -138,6 +138,7 @@ class AzureAIAPI(ModelAPI):
     ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # emulate tools (auto for llama, opt-in for others)
         if self.emulate_tools is None and self.is_llama():
+            self.emulate_tools = True
             handler: ChatAPIHandler | None = Llama31Handler(self.model_name)
         elif self.emulate_tools:
             handler = Llama31Handler(self.model_name)
@@ -151,10 +152,14 @@
         # prepare request
         request = dict(
             messages=await chat_request_messages(input, handler),
-            tools=chat_tools(tools) if len(tools) > 0 else None,
-            tool_choice=chat_tool_choice(tool_choice) if len(tools) > 0 else None,
             **self.completion_params(config),
         )
+        # newer versions of vllm reject requests with tools or tool_choice if the
+        # server hasn't been started explicitly with the --tool-call-parser and
+        # --enable-auto-tool-choice flags
+        if (not self.emulate_tools) and len(tools) > 0:
+            request["tools"] = chat_tools(tools)
+            request["tool_choice"] = chat_tool_choice(tool_choice)
 
         # create client (note the client needs to be created and closed
         # with each call so it can be cleaned up and not end up on another

inspect_ai/model/_providers/util/llama31.py CHANGED
@@ -79,7 +79,7 @@ class Llama31Handler(ChatAPIHandler):
         prompt that asks the model to use the <tool_call>...</tool_call> syntax)
         """
         # extract tool calls
-        tool_call_regex = rf"<{TOOL_CALL}>((?:.|\n)*?)</{TOOL_CALL}>"
+        tool_call_regex = rf"<{TOOL_CALL}s?>((?:.|\n)*?)</{TOOL_CALL}s?>"
         tool_calls_content: list[str] = re.findall(tool_call_regex, response)
 
         # if there are tool calls proceed with parsing
@@ -93,7 +93,7 @@
         ]
 
         # find other content that exists outside tool calls
-        tool_call_content_regex = rf"<{TOOL_CALL}>(?:.|\n)*?</{TOOL_CALL}>"
+        tool_call_content_regex = rf"<{TOOL_CALL}s?>(?:.|\n)*?</{TOOL_CALL}s?>"
         other_content = re.split(tool_call_content_regex, response, flags=re.DOTALL)
         other_content = [
             str(content).strip()
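
The added `s?` makes both patterns tolerate the `<tool_calls>` spelling that some models emit. A quick check with illustrative responses:

```python
import re

TOOL_CALL = "tool_call"
tool_call_regex = rf"<{TOOL_CALL}s?>((?:.|\n)*?)</{TOOL_CALL}s?>"

for response in (
    '<tool_call>{"name": "add", "arguments": {"x": 1}}</tool_call>',
    '<tool_calls>{"name": "add", "arguments": {}}</tool_calls>',
):
    print(re.findall(tool_call_regex, response))  # both variants match
```
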
@@ -164,7 +164,7 @@ def parse_tool_call_content(content: str, tools: list[ToolInfo]) -> ToolCall:
     # see if we can get the fields (if not report error)
     name = tool_call_data.get("name", None)
     arguments = tool_call_data.get("arguments", None)
-    if not name or not arguments:
+    if not name or (arguments is None):
         raise ValueError(
             "Required 'name' and 'arguments' not provided in JSON dictionary."
         )
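
The old truthiness test rejected legitimate calls whose arguments were an empty dict; only a genuinely missing key should fail now. An illustration (values are made up):

```python
# why the check changed: {} is falsy, so `not arguments` rejected no-arg calls
tool_call_data = {"name": "list_files", "arguments": {}}
name = tool_call_data.get("name", None)
arguments = tool_call_data.get("arguments", None)

old_rejects = not name or not arguments        # True: {} is falsy
new_rejects = not name or (arguments is None)  # False: {} is present
print(old_rejects, new_rejects)                # True False
```
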

inspect_ai/solver/_task_state.py CHANGED
@@ -138,7 +138,7 @@ class TaskState:
     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
 
     The `TaskState` is passed to and returned from each solver during a sample's
-    evaluation. It allows us to manipulated the message history, the tools
+    evaluation. It allows us to manipulate the message history, the tools
     available to the model, the final output of the model, and whether the task
     is completed or has hit a limit.
     """