inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/agent/_as_solver.py
CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations

 from typing import TYPE_CHECKING, Any

+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span
+
 if TYPE_CHECKING:
     from inspect_ai.solver._solver import Solver

@@ -14,7 +17,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
 from ._agent import Agent, AgentState


-def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
+def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
     """Convert an agent to a solver.

     Note that agents used as solvers will only receive their first parameter
@@ -23,6 +26,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:

     Args:
         agent: Agent to convert.
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the Sample ends and proceeds to scoring.
        **agent_kwargs: Arguments to curry to Agent function (required
            if the agent has parameters without default values).

@@ -52,17 +57,22 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
     @solver(name=agent_name)
     def agent_to_solver() -> Solver:
         async def solve(state: TaskState, generate: Generate) -> TaskState:
-
-
-
-
-
-
-
-
-            #
-
-
+            agent_state = AgentState(messages=state.messages)
+
+            try:
+                # run the agent with limits
+                with apply_limits(limits):
+                    async with span(name=agent_name, type="agent"):
+                        agent_state = await agent(agent_state, **agent_kwargs)
+            # if an exception occurs, we still want to update the TaskState with the
+            # AgentState's messages + output so that it appears in the log and is scored
+            finally:
+                # update messages
+                state.messages = agent_state.messages
+
+                # update output if its not empty
+                if agent_state.output:
+                    state.output = agent_state.output

             return state

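Taken together, this lets a converted agent run under resource limits inside an ordinary task. A minimal sketch of the new `limits` parameter in use (hypothetical task; `token_limit` and `message_limit` are assumed to be the limit constructors exported alongside the new `inspect_ai/util/_limit.py` module):

from inspect_ai import Task, task
from inspect_ai.agent import as_solver, react
from inspect_ai.dataset import Sample
from inspect_ai.util import message_limit, token_limit  # assumed exports of _limit.py


@task
def limited_agent_task() -> Task:
    # if either limit is exceeded, the sample ends and proceeds to scoring
    return Task(
        dataset=[Sample(input="What is 2 + 2?", target="4")],
        solver=as_solver(react(), limits=[token_limit(50_000), message_limit(30)]),
    )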
inspect_ai/agent/_as_tool.py
CHANGED
@@ -10,12 +10,19 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
 from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
 from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
 from inspect_ai.tool._tool_params import ToolParam
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span

 from ._agent import AGENT_DESCRIPTION, Agent, AgentState


 @tool
-def as_tool(
+def as_tool(
+    agent: Agent,
+    description: str | None = None,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
+) -> Tool:
     """Convert an agent to a tool.

     By default the model will see all of the agent's arguments as
@@ -27,6 +34,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
     Args:
         agent: Agent to convert.
         description: Tool description (defaults to agent description)
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the tool call ends and returns an error
+            explaining that a limit was exceeded.
        **agent_kwargs: Arguments to curry to Agent function (arguments
            provided here will not be presented to the model as part
            of the tool interface).
@@ -40,10 +50,17 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         "Agent passed to as_tool was not created by an @agent decorated function"
     )

+    # get tool_info
+    tool_info = agent_tool_info(agent, description, **agent_kwargs)
+
     async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
-        # prepare state
+        # prepare state
         state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
-
+
+        # run the agent with limits
+        with apply_limits(limits):
+            async with span(name=tool_info.name, type="agent"):
+                state = await agent(state, *args, **(agent_kwargs | kwargs))

         # find assistant message to read content from (prefer output)
         if not state.output.empty:
@@ -55,9 +72,6 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         else:
             return ""

-    # get tool_info
-    tool_info = agent_tool_info(agent, description, **agent_kwargs)
-
     # add "input" param
     tool_info.parameters.properties = {
         "input": ToolParam(type="string", description="Input message.")
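The same pattern applies when an agent is exposed as a tool, except that an exceeded limit surfaces as a tool error rather than ending the sample. A sketch, under the same assumptions about the limit constructors:

from inspect_ai.agent import as_tool, react
from inspect_ai.solver import generate, use_tools
from inspect_ai.util import token_limit  # assumed export

# if the limit trips mid-run, the tool call returns an error message
# explaining that a limit was exceeded (the sample itself continues)
researcher = as_tool(
    react(name="researcher", description="Research assistant."),
    limits=[token_limit(20_000)],
)

solver = [use_tools(researcher), generate()]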
inspect_ai/agent/_handoff.py
CHANGED
@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
 from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
+from inspect_ai.util._limit import Limit

 from ._agent import Agent
 from ._as_tool import agent_tool_info
@@ -21,6 +22,7 @@ def handoff(
     input_filter: MessageFilter | None = None,
     output_filter: MessageFilter | None = None,
     tool_name: str | None = None,
+    limits: list[Limit] = [],
     **agent_kwargs: Any,
 ) -> Tool:
     """Create a tool that enables models to handoff to agents.
@@ -35,6 +37,9 @@ def handoff(
            Use the built-in `last_message` filter to return only the last message
            or alternatively specify a custom `MessageFilter` function.
        tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
+        limits: List of limits to apply to the agent. Should a limit be exceeded,
+            the agent stops and a user message is appended explaining that a limit was
+            exceeded.
        **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
            will not be presented to the model as part of the tool interface).

@@ -52,7 +57,9 @@ def handoff(
     tool_info = agent_tool_info(agent, description, **agent_kwargs)

     # AgentTool calls will be intercepted by execute_tools
-    agent_tool = AgentTool(
+    agent_tool = AgentTool(
+        agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
+    )
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
     set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
     set_tool_description(
@@ -70,13 +77,17 @@ class AgentTool(Tool):
     def __init__(
         self,
         agent: Agent,
+        name: str,
         input_filter: MessageFilter | None = None,
         output_filter: MessageFilter | None = None,
+        limits: list[Limit] = [],
         **kwargs: Any,
     ):
         self.agent = agent
+        self.name = name
         self.input_filter = input_filter
         self.output_filter = output_filter
+        self.limits = limits
         self.kwargs = kwargs

     @property
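A sketch of a multi-agent handoff with a per-agent budget (same assumptions as above):

from inspect_ai.agent import handoff, react
from inspect_ai.solver import generate, use_tools
from inspect_ai.util import message_limit  # assumed export

web_surfer = react(name="web_surfer", description="Web research specialist.")

# the supervising model sees a transfer_to_web_surfer tool; if the limit
# trips, the handoff stops and a user message explains what happened
solver = [
    use_tools(handoff(web_surfer, limits=[message_limit(20)])),
    generate(),
]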
inspect_ai/agent/_react.py
CHANGED
@@ -195,9 +195,10 @@ def react(
                 answer = submission(messages)
                 if answer is not None:
                     # set the output to the answer for scoring
-
-
-
+                    if submit.answer_only:
+                        state.output.completion = answer
+                    else:
+                        state.output.completion = f"{state.output.completion}{submit.answer_delimiter}{answer}".strip()

                 # exit if we are at max_attempts
                 attempt_count += 1
inspect_ai/agent/_run.py
CHANGED
@@ -1,13 +1,19 @@
 from copy import copy
 from typing import Any

+from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span

 from ._agent import Agent, AgentState


 async def run(
-    agent: Agent,
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
 ) -> AgentState:
     """Run an agent.

@@ -17,6 +23,9 @@ async def run(
     Args:
        agent: Agent to run.
        input: Agent input (string, list of messages, or an `AgentState`).
+        limits: List of limits to apply to the agent. Should a limit be
+            exceeded, a LimitExceededError is raised which the caller may
+            handle as appropriate.
        **agent_kwargs: Additional arguments to pass to agent.

     Returns:
@@ -43,5 +52,9 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)

-    # run the agent
-
+    # run the agent with limits
+    with apply_limits(limits):
+        # run the agent
+        agent_name = registry_unqualified_name(agent)
+        async with span(name=agent_name, type="agent"):
+            return await agent(state, **agent_kwargs)
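Unlike `as_solver()` and `as_tool()`, `run()` propagates the violation to its caller. A sketch (per the docstring above a `LimitExceededError` is raised; it is assumed here to be importable from `inspect_ai.util`):

from inspect_ai.agent import react, run
from inspect_ai.util import LimitExceededError, token_limit  # assumed exports


async def run_with_budget() -> str:
    try:
        state = await run(
            react(),
            input="Summarize the plot of Hamlet.",
            limits=[token_limit(10_000)],
        )
        return state.output.completion
    except LimitExceededError:
        # the caller decides what exceeding the budget means
        return "agent stopped: limit exceeded"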
inspect_ai/agent/_types.py
CHANGED
@@ -96,3 +96,12 @@ class AgentSubmit(NamedTuple):

     The tool should return the `answer` provided to it for scoring.
     """
+
+    answer_only: bool = False
+    """Set the completion to only the answer provided by the submit tool.
+
+    By default, the answer is appended (with `answer_delimiter`) to whatever
+    other content the model generated along with the call to `submit()`."""
+
+    answer_delimiter: str = "\n\n"
+    """Delimter used when appending submit tool answer to other content the model generated along with the call to `submit()`."""
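These fields drive the completion assembly shown in the `_react.py` hunk above. A sketch (assuming `AgentSubmit`'s other fields keep their defaults):

from inspect_ai.agent import AgentSubmit, react

# default: completion is "<other generated content>\n\n<answer>"
default_agent = react()

# answer_only: completion is just the submit() answer, which helps scorers
# that match on exact output (no surrounding commentary from the model)
answer_only_agent = react(submit=AgentSubmit(answer_only=True))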
inspect_ai/dataset/_dataset.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
 from pydantic import BaseModel, Field, ValidationError
 from typing_extensions import override

+from inspect_ai._util.answer import answer_character, answer_index
 from inspect_ai.model import ChatMessage
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
         shuffled_choices = [sample.choices[i] for i in positions]

         # Map of original position / target letter
-        position_map = {
+        position_map = {
+            i: answer_character(new_i) for new_i, i in enumerate(positions)
+        }

         # Update to the shuffled choices and target
         sample.choices = shuffled_choices
@@ -338,9 +341,9 @@ class MemoryDataset(Dataset):
         self, target: str | list[str], position_map: dict[int, str]
     ) -> str | list[str]:
         if isinstance(target, list):
-            return [position_map[
+            return [position_map[answer_index(t)] for t in target]
         else:
-            return position_map[
+            return position_map[answer_index(target)]

     @override
     def sort(
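The shuffling now round-trips choice letters through the new `inspect_ai/_util/answer.py` helpers (see the file list above). Their implementation isn't shown in this diff; the presumed contract, inferred from the call sites, is a zero-based index/letter mapping:

# presumed behavior of the new helpers, inferred from their call sites here
# (the real module likely also validates input and handles longer choice lists)
def answer_character(index: int) -> str:
    return chr(ord("A") + index)


def answer_index(char: str) -> int:
    return ord(char.upper()) - ord("A")


assert answer_character(0) == "A"
assert answer_index("C") == 2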
inspect_ai/log/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from ._file import (
     read_eval_log,
     read_eval_log_async,
     read_eval_log_sample,
+    read_eval_log_sample_summaries,
     read_eval_log_samples,
     write_eval_log,
     write_eval_log_async,
@@ -28,6 +29,7 @@ from ._log import (
     EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
+    EvalSampleSummary,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -46,6 +48,8 @@ from ._transcript import (
     SampleLimitEvent,
     SandboxEvent,
     ScoreEvent,
+    SpanBeginEvent,
+    SpanEndEvent,
     StateEvent,
     StepEvent,
     StoreEvent,
@@ -54,6 +58,7 @@ from ._transcript import (
     Transcript,
     transcript,
 )
+from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree

 __all__ = [
     "EvalConfig",
@@ -70,6 +75,7 @@ __all__ = [
     "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
+    "EvalSampleSummary",
     "EvalScore",
     "EvalSpec",
     "EvalStats",
@@ -89,6 +95,8 @@ __all__ = [
     "SampleLimitEvent",
     "SandboxEvent",
     "ScoreEvent",
+    "SpanBeginEvent",
+    "SpanEndEvent",
     "StateEvent",
     "StepEvent",
     "StoreEvent",
@@ -100,6 +108,7 @@ __all__ = [
     "read_eval_log_async",
     "read_eval_log_sample",
     "read_eval_log_samples",
+    "read_eval_log_sample_summaries",
     "condense_sample",
     "resolve_sample_attachments",
     "write_eval_log",
@@ -107,4 +116,9 @@ __all__ = [
     "write_log_dir_manifest",
     "retryable_eval_logs",
     "bundle_log_dir",
+    "event_tree",
+    "event_sequence",
+    "EventTree",
+    "EventNode",
+    "SpanNode",
 ]
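The new `_tree.py` exports reconstruct the span hierarchy from a sample's flat event list. A sketch (the node attributes and return shape are assumptions based on the exported names):

from inspect_ai.log import SpanNode, event_tree, read_eval_log

log = read_eval_log("logs/my-eval.eval")  # hypothetical log file
if log.samples:
    # nest events under their enclosing spans
    tree = event_tree(log.samples[0].events)
    for node in tree:
        if isinstance(node, SpanNode):  # assumed: span nodes carry name/type
            print(node.name, node.type)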
inspect_ai/log/_convert.py
CHANGED
@@ -2,7 +2,7 @@ import os
 from typing import Literal

 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import
+from inspect_ai._util.file import exists, filesystem
 from inspect_ai.log._file import (
     log_files_from_ls,
     read_eval_log,
@@ -66,14 +66,9 @@ def convert_eval_logs(
                 "Output file {output_file} already exists (use --overwrite to overwrite existing files)"
             )

-        #
-
-
-
-        # otherwise do a full read/write
-        else:
-            log = read_eval_log(input_file)
-            write_eval_log(log, output_file)
+        # do a full read/write (normalized deprecated constructs and adds sample summaries)
+        log = read_eval_log(input_file)
+        write_eval_log(log, output_file)

     if fs.info(path).type == "file":
         convert_file(path)
inspect_ai/log/_file.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.file import (
 )
 from inspect_ai._util.json import jsonable_python
 from inspect_ai.log._condense import resolve_sample_attachments
+from inspect_ai.log._log import EvalSampleSummary

 from ._log import EvalLog, EvalSample
 from ._recorders import recorder_type_for_format, recorder_type_for_location
@@ -393,6 +394,61 @@ async def read_eval_log_sample_async(
     return sample


+def read_eval_log_sample_summaries(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # don't mix trio and asyncio
+    if current_async_backend() == "trio":
+        raise RuntimeError(
+            "read_eval_log_sample_summaries cannot be called from a trio async context (please use read_eval_log_sample_summaries_asymc instead)"
+        )
+
+    # will use s3fs and is not called from main inspect solver/scorer/tool/sandbox
+    # flow, so force the use of asyncio
+    return run_coroutine(read_eval_log_sample_summaries_async(log_file, format))
+
+
+async def read_eval_log_sample_summaries_async(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # resolve to file path
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
+
+    if format == "auto":
+        recorder_type = recorder_type_for_location(log_file)
+    else:
+        recorder_type = recorder_type_for_format(format)
+    return await recorder_type.read_log_sample_summaries(log_file)
+
+
 def read_eval_log_samples(
     log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
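Summaries make it cheap to scan large logs without materializing full samples. A sketch:

from inspect_ai.log import list_eval_logs, read_eval_log_sample_summaries

for info in list_eval_logs("logs"):  # hypothetical log directory
    summaries = read_eval_log_sample_summaries(info)
    errors = [s for s in summaries if s.error is not None]
    print(info.name, f"{len(errors)}/{len(summaries)} samples errored")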
inspect_ai/log/_log.py
CHANGED
@@ -30,6 +30,7 @@ from inspect_ai.util._store import Store
 from inspect_ai.util._store_model import SMT

 from ._transcript import Event
+from ._util import text_input_only, thin_metadata

 logger = getLogger(__name__)

@@ -42,6 +43,7 @@ class EvalConfigDefaults(TypedDict):
     fail_on_error: bool
     sandbox_cleanup: bool
     log_samples: bool
+    log_realtime: bool
     log_images: bool
     score_display: bool

@@ -53,6 +55,7 @@ def eval_config_defaults() -> EvalConfigDefaults:
         "fail_on_error": True,
         "sandbox_cleanup": True,
         "log_samples": True,
+        "log_realtime": True,
         "log_images": True,
         "score_display": True,
     }
@@ -120,6 +123,9 @@ class EvalConfig(BaseModel):
     log_samples: bool | None = Field(default=None)
     """Log detailed information on each sample."""

+    log_realtime: bool | None = Field(default=None)
+    """Log events in realtime (enables live viewing of samples in inspect view)."""
+
     log_images: bool | None = Field(default=None)
     """Log base64 encoded versions of images."""

@@ -161,6 +167,70 @@ class EvalSampleLimit(BaseModel):
     """The limit value"""


+class EvalSampleSummary(BaseModel):
+    """Summary information (including scoring) for a sample."""
+
+    id: int | str
+    """Unique id for sample."""
+
+    epoch: int
+    """Epoch number for sample."""
+
+    input: str | list[ChatMessage]
+    """Sample input (text inputs only)."""
+
+    target: str | list[str]
+    """Sample target value(s)"""
+
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    """Sample metadata (scalar types only, strings truncated to 1k)."""
+
+    scores: dict[str, Score] | None = Field(default=None)
+    """Scores for sample (score values only, no answers, explanations, or metadata)."""
+
+    model_usage: dict[str, ModelUsage] = Field(default_factory=dict)
+    """Model token usage for sample."""
+
+    total_time: float | None = Field(default=None)
+    """Total time that the sample was running."""
+
+    working_time: float | None = Field(default=None)
+    """Time spent working (model generation, sandbox calls, etc.)"""
+
+    uuid: str | None = Field(default=None)
+    """Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)"""
+
+    error: str | None = Field(default=None)
+    """Error that halted sample."""
+
+    limit: str | None = Field(default=None)
+    """Limit that halted the sample"""
+
+    retries: int | None = Field(default=None)
+    """Number of retries for the sample."""
+
+    completed: bool = Field(default=False)
+    """Is the sample complete."""
+
+    @model_validator(mode="after")
+    def thin_data(self) -> "EvalSampleSummary":
+        # thin input
+        self.input = text_input_only(self.input)
+
+        # thin metadata
+        self.metadata = thin_metadata(self.metadata)
+
+        # thin score explanations and metadata
+        if self.scores is not None:
+            self.scores = {
+                key: Score(value=score.value) for key, score in self.scores.items()
+            }
+        return self
+
+    # allow field model_usage
+    model_config = ConfigDict(protected_namespaces=())
+
+
 class EvalSample(BaseModel):
     """Sample from evaluation task."""

@@ -271,6 +341,35 @@ class EvalSample(BaseModel):
     limit: EvalSampleLimit | None = Field(default=None)
     """The limit that halted the sample"""

+    def summary(self) -> EvalSampleSummary:
+        """Summary of sample.
+
+        The summary excludes potentially large fields like messages, output,
+        events, store, and metadata so that it is always fast to load.
+
+        If there are images, audio, or video in the input, they are
+        replaced with a placeholder.
+
+        Returns:
+            Summary of sample.
+        """
+        return EvalSampleSummary(
+            id=self.id,
+            epoch=self.epoch,
+            input=self.input,
+            target=self.target,
+            metadata=self.metadata,
+            scores=self.scores,
+            model_usage=self.model_usage,
+            total_time=self.total_time,
+            working_time=self.working_time,
+            uuid=self.uuid,
+            error=self.error.message if self.error is not None else None,
+            limit=f"{self.limit.type}" if self.limit is not None else None,
+            retries=len(self.error_retries) if self.error_retries is not None else None,
+            completed=True,
+        )
+
     # deprecated properties

     @property
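Because `summary()` constructs an `EvalSampleSummary` (and so runs the same `thin_data` validator), realtime summaries and those derived from completed samples stay consistent. A sketch:

from inspect_ai.log import read_eval_log

log = read_eval_log("logs/my-eval.eval")  # hypothetical log file
for sample in log.samples or []:
    s = sample.summary()
    # input is text-only, metadata is thinned, scores carry values only
    print(s.id, s.epoch, s.total_time, s.scores)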
inspect_ai/log/_recorders/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from .._log import EvalSampleSummary
 from .create import (
     create_recorder_for_format,
     create_recorder_for_location,
@@ -7,6 +8,7 @@ from .create import (
 from .recorder import Recorder

 __all__ = [
+    "EvalSampleSummary",
     "Recorder",
     "create_recorder_for_format",
     "create_recorder_for_location",
inspect_ai/log/_recorders/buffer/database.py
CHANGED
@@ -26,7 +26,8 @@ from ..._condense import (
     walk_input,
     walk_json_dict,
 )
-from
+from ..._log import EvalSampleSummary
+from ..types import SampleEvent
 from .filestore import (
     Manifest,
     SampleBufferFilestore,
@@ -141,7 +142,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
         self._sync_time = time.monotonic()

-    def start_sample(self, sample:
+    def start_sample(self, sample: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             sample = self._consense_sample(conn, sample)
             conn.execute(
@@ -177,7 +178,7 @@ class SampleBufferDatabase(SampleBuffer):
             # Insert all rows
             conn.execute(sql, values)

-    def complete_sample(self, summary:
+    def complete_sample(self, summary: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             summary = self._consense_sample(conn, summary)
             conn.execute(
@@ -307,9 +308,9 @@ class SampleBufferDatabase(SampleBuffer):
             conn.execute("PRAGMA foreign_keys = ON")

             # concurrency setup
-            conn.execute("PRAGMA journal_mode=
+            conn.execute("PRAGMA journal_mode=MEMORY")
             conn.execute("PRAGMA busy_timeout=10000")
-            conn.execute("PRAGMA synchronous=
+            conn.execute("PRAGMA synchronous=OFF")

             # do work
             yield conn
@@ -359,7 +360,7 @@ class SampleBufferDatabase(SampleBuffer):

     def _get_samples(
         self, conn: Connection, resolve_attachments: bool = False
-    ) -> Iterator[
+    ) -> Iterator[EvalSampleSummary]:
         cursor = conn.execute(
             """
             SELECT s.data as sample_data
@@ -369,7 +370,7 @@ class SampleBufferDatabase(SampleBuffer):
         )

         for row in cursor:
-            summary =
+            summary = EvalSampleSummary.model_validate_json(row["sample_data"])
             if resolve_attachments:
                 summary = self._resolve_sample_attachments(conn, summary)
             yield summary
@@ -437,8 +438,8 @@ class SampleBufferDatabase(SampleBuffer):
         )

     def _consense_sample(
-        self, conn: Connection, sample:
-    ) ->
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         # alias attachments
         attachments: dict[str, str] = {}
         sample = sample.model_copy(
@@ -456,8 +457,8 @@ class SampleBufferDatabase(SampleBuffer):
         return sample

     def _resolve_sample_attachments(
-        self, conn: Connection, sample:
-    ) ->
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         return sample.model_copy(
             update={
                 "input": walk_input(
|