inspect-ai 0.3.96__py3-none-any.whl → 0.3.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. inspect_ai/_eval/eval.py +10 -2
  2. inspect_ai/_eval/run.py +6 -1
  3. inspect_ai/_eval/task/util.py +32 -3
  4. inspect_ai/_util/registry.py +7 -0
  5. inspect_ai/_util/timer.py +13 -0
  6. inspect_ai/_view/www/dist/assets/index.css +275 -195
  7. inspect_ai/_view/www/dist/assets/index.js +8568 -7376
  8. inspect_ai/_view/www/src/app/App.css +1 -0
  9. inspect_ai/_view/www/src/app/App.tsx +27 -10
  10. inspect_ai/_view/www/src/app/appearance/icons.ts +5 -0
  11. inspect_ai/_view/www/src/app/content/RecordTree.module.css +22 -0
  12. inspect_ai/_view/www/src/app/content/RecordTree.tsx +370 -0
  13. inspect_ai/_view/www/src/app/content/RenderedContent.module.css +5 -0
  14. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +32 -19
  15. inspect_ai/_view/www/src/app/content/record_processors/store.ts +101 -0
  16. inspect_ai/_view/www/src/app/content/record_processors/types.ts +3 -0
  17. inspect_ai/_view/www/src/app/content/types.ts +5 -0
  18. inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -0
  19. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +35 -28
  20. inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +1 -8
  21. inspect_ai/_view/www/src/app/log-view/navbar/PrimaryBar.tsx +2 -4
  22. inspect_ai/_view/www/src/app/log-view/navbar/ResultsPanel.tsx +13 -3
  23. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.module.css +15 -0
  24. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.tsx +14 -10
  25. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +9 -3
  26. inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +1 -3
  27. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +8 -2
  28. inspect_ai/_view/www/src/app/log-view/types.ts +1 -0
  29. inspect_ai/_view/www/src/app/plan/ModelCard.module.css +7 -0
  30. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +5 -2
  31. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +13 -8
  32. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +63 -8
  33. inspect_ai/_view/www/src/app/routing/url.ts +45 -0
  34. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +2 -1
  35. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.tsx +15 -8
  36. inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +3 -0
  37. inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +16 -5
  38. inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +9 -1
  39. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +68 -31
  40. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +12 -7
  41. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -5
  42. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.module.css +9 -0
  43. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +48 -18
  44. inspect_ai/_view/www/src/app/samples/chat/ChatView.tsx +0 -1
  45. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.module.css +4 -0
  46. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +41 -1
  47. inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -0
  48. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +0 -3
  49. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -1
  50. inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +1 -1
  51. inspect_ai/_view/www/src/app/samples/chat/tools/ToolOutput.module.css +1 -1
  52. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +5 -1
  53. inspect_ai/_view/www/src/app/samples/descriptor/score/PassFailScoreDescriptor.tsx +11 -6
  54. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +7 -0
  55. inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +5 -18
  56. inspect_ai/_view/www/src/app/samples/sample-tools/SortFilter.tsx +1 -1
  57. inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.tsx +18 -5
  58. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.module.css +0 -6
  59. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.tsx +4 -1
  60. inspect_ai/_view/www/src/app/samples/transcript/ApprovalEventView.tsx +4 -2
  61. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +6 -4
  62. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.module.css +1 -1
  63. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +13 -6
  64. inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +6 -4
  65. inspect_ai/_view/www/src/app/samples/transcript/LoggerEventView.tsx +4 -2
  66. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +11 -8
  67. inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +14 -8
  68. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +13 -8
  69. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +25 -16
  70. inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +7 -5
  71. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +11 -28
  72. inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +12 -20
  73. inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +12 -31
  74. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +25 -29
  75. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +297 -0
  76. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +0 -8
  77. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +43 -25
  78. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +43 -0
  79. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +109 -43
  80. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +19 -8
  81. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +128 -60
  82. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +14 -4
  83. inspect_ai/_view/www/src/app/samples/transcript/types.ts +6 -4
  84. inspect_ai/_view/www/src/app/types.ts +12 -1
  85. inspect_ai/_view/www/src/components/Card.css +6 -3
  86. inspect_ai/_view/www/src/components/Card.tsx +15 -2
  87. inspect_ai/_view/www/src/components/CopyButton.tsx +4 -6
  88. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +20 -14
  89. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +17 -22
  90. inspect_ai/_view/www/src/components/LargeModal.tsx +5 -1
  91. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +25 -1
  92. inspect_ai/_view/www/src/components/MarkdownDiv.css +4 -0
  93. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +2 -2
  94. inspect_ai/_view/www/src/components/TabSet.module.css +6 -1
  95. inspect_ai/_view/www/src/components/TabSet.tsx +8 -2
  96. inspect_ai/_view/www/src/state/hooks.ts +83 -13
  97. inspect_ai/_view/www/src/state/logPolling.ts +2 -2
  98. inspect_ai/_view/www/src/state/logSlice.ts +1 -2
  99. inspect_ai/_view/www/src/state/logsSlice.ts +9 -9
  100. inspect_ai/_view/www/src/state/samplePolling.ts +1 -1
  101. inspect_ai/_view/www/src/state/sampleSlice.ts +134 -7
  102. inspect_ai/_view/www/src/state/scoring.ts +1 -1
  103. inspect_ai/_view/www/src/state/scrolling.ts +39 -6
  104. inspect_ai/_view/www/src/state/store.ts +5 -0
  105. inspect_ai/_view/www/src/state/store_filter.ts +47 -44
  106. inspect_ai/_view/www/src/utils/debugging.ts +95 -0
  107. inspect_ai/_view/www/src/utils/format.ts +2 -2
  108. inspect_ai/_view/www/src/utils/json.ts +29 -0
  109. inspect_ai/agent/__init__.py +2 -1
  110. inspect_ai/agent/_agent.py +12 -0
  111. inspect_ai/agent/_react.py +184 -48
  112. inspect_ai/agent/_types.py +14 -1
  113. inspect_ai/analysis/beta/__init__.py +0 -2
  114. inspect_ai/analysis/beta/_dataframe/columns.py +11 -16
  115. inspect_ai/analysis/beta/_dataframe/evals/table.py +65 -40
  116. inspect_ai/analysis/beta/_dataframe/events/table.py +24 -36
  117. inspect_ai/analysis/beta/_dataframe/messages/table.py +24 -15
  118. inspect_ai/analysis/beta/_dataframe/progress.py +35 -5
  119. inspect_ai/analysis/beta/_dataframe/record.py +13 -9
  120. inspect_ai/analysis/beta/_dataframe/samples/columns.py +1 -1
  121. inspect_ai/analysis/beta/_dataframe/samples/table.py +156 -46
  122. inspect_ai/analysis/beta/_dataframe/util.py +14 -12
  123. inspect_ai/dataset/_dataset.py +0 -1
  124. inspect_ai/model/_call_tools.py +1 -1
  125. inspect_ai/model/_providers/anthropic.py +18 -5
  126. inspect_ai/model/_providers/azureai.py +7 -2
  127. inspect_ai/model/_providers/google.py +6 -0
  128. inspect_ai/model/_providers/util/llama31.py +3 -3
  129. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/METADATA +2 -2
  130. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/RECORD +134 -129
  131. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/WHEEL +1 -1
  132. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.module.css +0 -48
  133. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +0 -276
  134. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/entry_points.txt +0 -0
  135. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/licenses/LICENSE +0 -0
  136. {inspect_ai-0.3.96.dist-info → inspect_ai-0.3.98.dist-info}/top_level.txt +0 -0
inspect_ai/agent/_react.py

@@ -22,6 +22,7 @@ from ._agent import Agent, AgentState, agent, agent_with
 from ._filter import MessageFilter
 from ._handoff import has_handoff
 from ._types import (
+    DEFAULT_CONTINUE_PROMOT_NO_SUBMIT,
     DEFAULT_CONTINUE_PROMPT,
     AgentAttempts,
     AgentContinue,
@@ -41,7 +42,7 @@ def react(
     tools: Sequence[Tool | ToolDef | ToolSource] | None = None,
     model: str | Model | Agent | None = None,
     attempts: int | AgentAttempts = 1,
-    submit: AgentSubmit = AgentSubmit(),
+    submit: AgentSubmit | bool | None = None,
     on_continue: str | AgentContinue | None = None,
     truncation: Literal["auto", "disabled"] | MessageFilter = "disabled",
 ) -> Agent:
@@ -73,14 +74,16 @@ def react(
         tools: Tools available for the agent.
         model: Model to use for agent (defaults to currently evaluated model).
         attempts: Configure agent to make multiple attempts.
-        submit: Configure submit tool used by agent.
-        on_continue: Message to play back to the model to urge it to continue.
-            Use the placeholder {submit} to refer to the submit tool within the message.
-            Alternatively, an async function to call to determine whether the loop
-            should continue and what message to play back. Note that this function
-            is called on _every_ iteration of the loop so if you only want to send
-            a message back when the model fails to call tools you need to code
-            that behavior explicitly.
+        submit: Use a submit tool for reporting the final answer. Defaults to `True`
+            which uses the default submit behavior. Pass an `AgentSubmit` to
+            customize the behavior or pass `False` to disable the submit tool.
+        on_continue: Message to play back to the model to urge it to continue
+            when it stops calling tools. Use the placeholder {submit} to refer to
+            the submit tool within the message. Alternatively, an async function
+            to call to determine whether the loop should continue and what message
+            to play back. Note that this function is called on _every_ iteration of
+            the loop so if you only want to send a message back when the model fails
+            to call tools you need to code that behavior explicitly.
         truncation: Truncate the conversation history in the event of a context
             window overflow. Defaults to "disabled" which does no truncation. Pass
             "auto" to use `trim_messages()` to reduce the context size. Pass a
@@ -89,6 +92,29 @@ def react(
     Returns:
         ReAct agent.
     """
+    # if there is no submit tool then delegate to react_no_submit
+    if submit is False:
+        # if the user passes a `str` for on_continue this won't do anything
+        if isinstance(on_continue, str):
+            raise ValueError(
+                "Passing a string to on_continue with no submit tool is not permitted, "
+                + "because in this case the agent will always terminate when no tool "
+                + "calls are made."
+            )
+
+        return react_no_submit(
+            name=name,
+            description=description,
+            prompt=prompt,
+            tools=tools,
+            model=model,
+            on_continue=on_continue,
+            truncation=truncation,
+        )
+
+    # if submit is True or None then use default AgentSubmit
+    if submit is True or submit is None:
+        submit = AgentSubmit()

     # default submit tool
     @tool(name="submit")
@@ -115,19 +141,7 @@ def react(
     tools.append(submit_tool)

     # resolve prompt / system message
-    prompt = AgentPrompt(prompt) if isinstance(prompt, str) else prompt
-    if prompt:
-        prompt_lines: list[str] = []
-        if prompt.instructions:
-            prompt_lines.append(prompt.instructions)
-        if prompt.handoff_prompt and has_handoff(tools):
-            prompt_lines.append(prompt.handoff_prompt)
-        if prompt.assistant_prompt:
-            prompt_lines.append(prompt.assistant_prompt)
-        prompt_content = "\n\n".join(prompt_lines).format(submit=submit_tool.name)
-        system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
-    else:
-        system_message = None
+    system_message = _prompt_to_system_message(prompt, tools, submit_tool.name)

     # resolve attempts
     attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
@@ -150,12 +164,7 @@ def react(
                 state.messages.insert(0, system_message)

             # resolve overflow handling
-            if truncation == "auto":
-                overflow = cast(MessageFilter | None, trim_messages)
-            elif truncation == "disabled":
-                overflow = None
-            else:
-                overflow = truncation
+            overflow = _resolve_overflow(truncation)

             # track attempts
             attempt_count = 0
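As context for the `_resolve_overflow` helper above, a hedged usage sketch of the `truncation` options it resolves:

from inspect_ai.agent import react

# "auto" handles context window overflow by trimming messages via
# trim_messages(); "disabled" (the default) does no truncation
agent = react(prompt="Solve the task.", truncation="auto")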
@@ -168,20 +177,11 @@ def react(

                 # check for context window overflow
                 if state.output.stop_reason == "model_length":
-                    from inspect_ai.log._transcript import transcript
-
-                    if overflow is not None:
-                        previous_messages = state.messages[:-1]
-                        state.messages = await overflow(previous_messages)
-                        if len(state.messages) < len(previous_messages):
-                            transcript().info(
-                                "Agent exceeded model context window, truncating messages and continuing."
-                            )
-                            continue
-
-                    # no overflow policy or overflow didn't reduce conversation length
-                    transcript().info("Agent terminated: model context window exceeded")
-                    break
+                    state, handled = await _handle_overflow(state, overflow)
+                    if handled:
+                        continue
+                    else:
+                        break

                 # resolve tool calls (if any)
                 if state.output.message.tool_calls:
@@ -233,9 +233,7 @@ def react(

                 # call the on_continue hook (if any)
                 if callable(on_continue):
-                    if not is_callable_coroutine(on_continue):
-                        raise ValueError("The on_continue function must be async.")
-                    do_continue = await cast(AgentContinue, on_continue)(state)
+                    do_continue = await _call_on_continue(on_continue, state)
                     if do_continue is True:
                         # if there were no tool calls we need to send back a user message
                         if not state.output.message.tool_calls:
@@ -274,10 +272,133 @@ def react(
             state.messages = _remove_submit_tool(state.messages, submit_tool.name)
             return state

-    if name is not None or description is not None:
-        return agent_with(execute, name=name, description=description)
+    return _resolve_agent(execute, name, description)
+
+
+def react_no_submit(
+    *,
+    name: str | None,
+    description: str | None,
+    prompt: str | AgentPrompt | None,
+    tools: Sequence[Tool | ToolDef | ToolSource] | None,
+    model: str | Model | Agent | None,
+    on_continue: AgentContinue | None,
+    truncation: Literal["auto", "disabled"] | MessageFilter,
+) -> Agent:
+    # resolve tools
+    tools = list(tools) if tools is not None else []
+
+    # resolve prompt / system message
+    system_message = _prompt_to_system_message(prompt, tools, None)
+
+    async def execute(state: AgentState) -> AgentState:
+        async with mcp_connection(tools):
+            # prepend system message if we have one
+            if system_message:
+                state.messages.insert(0, system_message)
+
+            # resolve overflow handling
+            overflow = _resolve_overflow(truncation)
+
+            # main loop
+            while True:
+                # generate output and append assistant message
+                state = await _agent_generate(model, state, tools)
+
+                # check for context window overflow
+                if state.output.stop_reason == "model_length":
+                    state, handled = await _handle_overflow(state, overflow)
+                    if handled:
+                        continue
+                    else:
+                        break
+
+                # resolve tool calls (if any)
+                if state.output.message.tool_calls:
+                    # call tool functions
+                    messages, output = await execute_tools(state.messages, tools)
+                    state.messages.extend(messages)
+                    if output:
+                        state.output = output
+
+                # call the on_continue hook (if any)
+                if on_continue:
+                    do_continue = await _call_on_continue(on_continue, state)
+                    if do_continue is True:
+                        do_continue = DEFAULT_CONTINUE_PROMOT_NO_SUBMIT
+                    if do_continue:
+                        state.messages.append(ChatMessageUser(content=do_continue))
+                    else:
+                        break
+                elif not state.output.message.tool_calls:
+                    break
+
+            return state
+
+    return _resolve_agent(execute, name, description)
+
+
+def _prompt_to_system_message(
+    prompt: str | AgentPrompt | None,
+    tools: list[Tool | ToolDef | ToolSource],
+    submit_tool: str | None,
+) -> ChatMessage | None:
+    prompt = AgentPrompt(prompt) if isinstance(prompt, str) else prompt
+    if prompt:
+        prompt_lines: list[str] = []
+        if prompt.instructions:
+            prompt_lines.append(prompt.instructions)
+        if prompt.handoff_prompt and has_handoff(tools):
+            prompt_lines.append(prompt.handoff_prompt)
+        if prompt.assistant_prompt:
+            if (
+                submit_tool
+                and ("{submit}" not in prompt.assistant_prompt)
+                and prompt.submit_prompt
+            ):
+                assistant_prompt = f"{prompt.assistant_prompt}\n{prompt.submit_prompt}"
+            else:
+                assistant_prompt = prompt.assistant_prompt
+            prompt_lines.append(assistant_prompt)
+        prompt_content = "\n\n".join(prompt_lines).format(
+            submit=submit_tool or "submit"
+        )
+        system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
     else:
-        return execute
+        system_message = None
+    return system_message
+
+
+def _resolve_overflow(
+    truncation: Literal["auto", "disabled"] | MessageFilter,
+) -> MessageFilter | None:
+    # resolve overflow handling
+    if truncation == "auto":
+        overflow = cast(MessageFilter | None, trim_messages)
+    elif truncation == "disabled":
+        overflow = None
+    else:
+        overflow = truncation
+    return overflow
+
+
+async def _handle_overflow(
+    state: AgentState, overflow: MessageFilter | None
+) -> tuple[AgentState, bool]:
+    from inspect_ai.log._transcript import transcript
+
+    if overflow is not None:
+        previous_messages = state.messages[:-1]
+        state.messages = await overflow(previous_messages)
+        if len(state.messages) < len(previous_messages):
+            transcript().info(
+                "Agent exceeded model context window, truncating messages and continuing."
+            )
+            return state, True
+
+    # no overflow policy or overflow didn't reduce conversation length
+    transcript().info("Agent terminated: model context window exceeded")
+    return state, False


 async def _agent_generate(
@@ -319,6 +440,21 @@ def _model_generate(model: str | Model | None) -> Agent:
     return generate


+async def _call_on_continue(
+    on_continue: AgentContinue, state: AgentState
+) -> str | bool:
+    if not is_callable_coroutine(on_continue):
+        raise ValueError("The on_continue function must be async.")
+    return await on_continue(state)
+
+
+def _resolve_agent(agent: Agent, name: str | None, description: str | None) -> Agent:
+    if name is not None or description is not None:
+        return agent_with(agent, name=name, description=description)
+    else:
+        return agent
+
+
 def _remove_submit_tool(
     messages: list[ChatMessage], submit_name: str
 ) -> list[ChatMessage]:
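A small sketch of the contract `_call_on_continue` enforces; the hooks are hypothetical and the error surfaces when the agent loop runs, not at construction:

from inspect_ai.agent import AgentState, react

def sync_hook(state: AgentState) -> bool:  # not async
    return True

async def async_hook(state: AgentState) -> bool | str:
    return True

agent = react(on_continue=sync_hook)   # ValueError("The on_continue function must be async.") when run
agent = react(on_continue=async_hook)  # ok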
inspect_ai/agent/_types.py

@@ -22,7 +22,9 @@ see the result of tool calls right after sending the message. If you need
 to perform multiple actions, you can always send more messages with additional
 tool calls. Do some reasoning before your actions, describing what tool calls
 you are going to use and how they fit into your plan.
+"""

+DEFAULT_SUBMIT_PROMPT = """
 When you have completed the task and have an answer, call the {submit}()
 tool to report it.
 """
@@ -38,7 +40,14 @@ class AgentPrompt(NamedTuple):
     """Prompt used when there are additional handoff agents active."""

     assistant_prompt: str | None = DEFAULT_ASSISTANT_PROMPT
-    """Prompt for assistant (covers tool use, submit tool, CoT, etc.)."""
+    """Prompt for assistant (covers tool use, CoT, etc.)."""
+
+    submit_prompt: str | None = DEFAULT_SUBMIT_PROMPT
+    """Prompt to tell the model about the submit tool.
+
+    This prompt is not used if the `assistant_prompt` contains a
+    {submit} placeholder.
+    """


 DEFAULT_CONTINUE_PROMPT = """
@@ -46,6 +55,10 @@ Please proceed to the next step using your best judgement. If you believe you
 have completed the task, please call the `{submit}()` tool with your final answer.
 """

+DEFAULT_CONTINUE_PROMOT_NO_SUBMIT = """
+Please proceed to the next step using your best judgement.
+"""
+

 AgentContinue: TypeAlias = Callable[[AgentState], Awaitable[bool | str]]
 """Function called to determine whether the agent should continue.
inspect_ai/analysis/beta/__init__.py

@@ -1,7 +1,6 @@
 from ._dataframe.columns import (
     Column,
     ColumnError,
-    ColumnErrors,
     ColumnType,
 )
 from ._dataframe.evals.columns import (
@@ -63,5 +62,4 @@ __all__ = [
     "Column",
     "ColumnType",
     "ColumnError",
-    "ColumnErrors",
 ]
inspect_ai/analysis/beta/_dataframe/columns.py

@@ -7,6 +7,8 @@ from jsonpath_ng import JSONPath  # type: ignore
 from jsonpath_ng.ext import parse  # type: ignore
 from pydantic import JsonValue

+from inspect_ai.log._log import EvalLog
+
 from .validate import jsonpath_in_schema

 ColumnType: TypeAlias = int | float | bool | str | date | time | datetime | None
@@ -122,24 +124,17 @@ class ColumnError:
     path: str | None
     """Path to select column value. """

-    message: str
-    """Error message."""
+    error: Exception
+    """Underlying error."""
+
+    log: EvalLog
+    """Eval log where the error occurred.
+
+    Use log.location to determine the path where the log was read from.
+    """

     def __str__(self) -> str:
         msg = f"Error reading column '{self.column}'"
         if self.path:
             msg = f"{msg} from path '{self.path}'"
-        return f"{msg}: {self.message}"
-
-
-class ColumnErrors(dict[str, list[ColumnError]]):
-    """Dictionary of column errors keyed by log file."""
-
-    def __str__(self) -> str:
-        lines: list[str] = [""]
-        for file, errors in self.items():
-            lines.append(file)
-            for error in errors:
-                lines.append(f"  - {error}")
-            lines.append("")
-        return "\n".join(lines)
+        return f"{msg}: {self.error} (log: {self.log.location})"
inspect_ai/analysis/beta/_dataframe/evals/table.py

@@ -1,15 +1,16 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Callable, Literal, overload
+from logging import getLogger
+from typing import TYPE_CHECKING, Callable, Literal, Sequence, overload

-from inspect_ai._util.path import pretty_path
-from inspect_ai.analysis.beta._dataframe.progress import import_progress
+from inspect_ai.analysis.beta._dataframe.progress import import_progress, no_progress
 from inspect_ai.log._file import (
     list_eval_logs,
     read_eval_log,
 )
+from inspect_ai.log._log import EvalLog

-from ..columns import Column, ColumnErrors, ColumnType
+from ..columns import Column, ColumnError, ColumnType
 from ..record import import_record, resolve_duplicate_columns
 from ..util import (
     LogPaths,
@@ -21,6 +22,8 @@ from ..util import (
 )
 from .columns import EvalColumns, EvalId

+logger = getLogger(__name__)
+
 if TYPE_CHECKING:
     import pandas as pd

@@ -31,24 +34,27 @@ EVAL_SUFFIX = "_eval"
 @overload
 def evals_df(
     logs: LogPaths = list_eval_logs(),
-    columns: list[Column] = EvalColumns,
+    columns: Sequence[Column] = EvalColumns,
     strict: Literal[True] = True,
+    quiet: bool = False,
 ) -> "pd.DataFrame": ...


 @overload
 def evals_df(
     logs: LogPaths = list_eval_logs(),
-    columns: list[Column] = EvalColumns,
+    columns: Sequence[Column] = EvalColumns,
     strict: Literal[False] = False,
-) -> tuple["pd.DataFrame", ColumnErrors]: ...
+    quiet: bool = False,
+) -> tuple["pd.DataFrame", Sequence[ColumnError]]: ...


 def evals_df(
     logs: LogPaths = list_eval_logs(),
-    columns: list[Column] = EvalColumns,
+    columns: Sequence[Column] = EvalColumns,
     strict: bool = True,
-) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
+    quiet: bool = False,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", Sequence[ColumnError]]:
     """Read a dataframe containing evals.

     Args:
@@ -58,6 +64,7 @@ def evals_df(
         columns: Specification for what columns to read from log files.
         strict: Raise import errors immediately. Defaults to `True`.
             If `False` then a tuple of `DataFrame` and errors is returned.
+        quiet: If `True`, do not show any output or progress. Defaults to `False`.

     Returns:
         For `strict`, a Pandas `DataFrame` with information for the specified logs.
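For example (assumed usage), suppressing the progress display when importing logs from a script:

from inspect_ai.analysis.beta import evals_df

df = evals_df("./logs", quiet=True)  # no progress output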
@@ -69,70 +76,86 @@ def evals_df(
     # resolve logs
     log_paths = resolve_logs(logs)

-    with import_progress("reading logs", total=len(log_paths)) as (p, task_id):
+    # establish progress
+    progress_cm = (
+        import_progress("reading logs", total=len(log_paths))
+        if not quiet
+        else no_progress()
+    )
+
+    with progress_cm as p:
         if strict:
-            evals_table, _ = _read_evals_df(
-                log_paths, columns, True, lambda: p.update(task_id, advance=1)
-            )
+            evals_table, _, _ = _read_evals_df(log_paths, columns, True, p.update)
             return evals_table
         else:
-            evals_table, all_errors, _ = _read_evals_df(
-                log_paths, columns, False, lambda: p.update(task_id, advance=1)
+            evals_table, _, all_errors, _ = _read_evals_df(
+                log_paths, columns, False, p.update
             )
             return evals_table, all_errors


 @overload
 def _read_evals_df(
-    log_paths: list[str],
-    columns: list[Column],
+    log_paths: Sequence[str],
+    columns: Sequence[Column],
     strict: Literal[True],
     progress: Callable[[], None],
-) -> tuple["pd.DataFrame", int]: ...
+) -> tuple["pd.DataFrame", Sequence[EvalLog], int]: ...


 @overload
 def _read_evals_df(
-    log_paths: list[str],
-    columns: list[Column],
+    log_paths: Sequence[str],
+    columns: Sequence[Column],
     strict: Literal[False],
     progress: Callable[[], None],
-) -> tuple["pd.DataFrame", ColumnErrors, int]: ...
+) -> tuple["pd.DataFrame", Sequence[EvalLog], Sequence[ColumnError], int]: ...


 def _read_evals_df(
-    log_paths: list[str],
-    columns: list[Column],
+    log_paths: Sequence[str],
+    columns: Sequence[Column],
     strict: bool,
     progress: Callable[[], None],
-) -> tuple["pd.DataFrame", int] | tuple["pd.DataFrame", ColumnErrors, int]:
+) -> (
+    tuple["pd.DataFrame", Sequence[EvalLog], int]
+    | tuple["pd.DataFrame", Sequence[EvalLog], Sequence[ColumnError], int]
+):
     verify_prerequisites()

     # resolve duplicate columns
     columns = resolve_duplicate_columns(columns)

     # accumulate errors for strict=False
-    all_errors = ColumnErrors()
+    all_errors: list[ColumnError] = []

     # ensure eval_id
-    ensure_eval_id(columns)
+    columns = ensure_eval_id(columns)

     # read logs
     total_samples = 0
+    eval_ids: set[str] = set()
+    eval_logs: list[EvalLog] = []
     records: list[dict[str, ColumnType]] = []
     for log_path in log_paths:
         log = read_eval_log(log_path, header_only=True)
         if strict:
-            record = import_record(log, columns, strict=True)
+            record = import_record(log, log, columns, strict=True)
         else:
-            record, errors = import_record(log, columns, strict=False)
-            all_errors[pretty_path(log_path)] = errors
-        records.append(record)
-        total_samples += (
-            len(log.eval.dataset.sample_ids)
-            if log.eval.dataset.sample_ids is not None
-            else (log.eval.dataset.samples or 100)
-        )
+            record, errors = import_record(log, log, columns, strict=False)
+            all_errors.extend(errors)
+
+        # don't add duplicate ids
+        eval_id = str(record.get(EVAL_ID, ""))
+        if eval_id not in eval_ids:
+            eval_ids.add(eval_id)
+            eval_logs.append(log)
+            records.append(record)
+            total_samples += (
+                len(log.eval.dataset.sample_ids)
+                if log.eval.dataset.sample_ids is not None
+                else (log.eval.dataset.samples or 100)
+            )
         progress()

@@ -140,18 +163,20 @@ def _read_evals_df(
     evals_table = reorder_evals_df_columns(evals_table, columns)

     if strict:
-        return evals_table, total_samples
+        return evals_table, eval_logs, total_samples
     else:
-        return evals_table, all_errors, total_samples
+        return evals_table, eval_logs, all_errors, total_samples


-def ensure_eval_id(columns: list[Column]) -> None:
+def ensure_eval_id(columns: Sequence[Column]) -> Sequence[Column]:
     if not any([column.name == EVAL_ID for column in columns]):
-        columns.extend(EvalId)
+        return list(columns) + EvalId
+    else:
+        return columns


 def reorder_evals_df_columns(
-    df: "pd.DataFrame", eval_columns: list[Column]
+    df: "pd.DataFrame", eval_columns: Sequence[Column]
 ) -> "pd.DataFrame":
     actual_columns = list(df.columns)
     ordered_columns: list[str] = []