inspect-ai 0.3.93__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. inspect_ai/_display/textual/widgets/samples.py +3 -3
  2. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  3. inspect_ai/_eval/task/run.py +10 -7
  4. inspect_ai/_util/answer.py +26 -0
  5. inspect_ai/_util/constants.py +0 -1
  6. inspect_ai/_util/local_server.py +51 -21
  7. inspect_ai/_view/www/dist/assets/index.css +14 -13
  8. inspect_ai/_view/www/dist/assets/index.js +400 -84
  9. inspect_ai/_view/www/log-schema.json +375 -0
  10. inspect_ai/_view/www/src/@types/log.d.ts +90 -12
  11. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  12. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  13. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  14. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  15. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  16. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  17. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  18. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  19. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  20. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  21. inspect_ai/agent/_as_solver.py +3 -1
  22. inspect_ai/agent/_as_tool.py +6 -4
  23. inspect_ai/agent/_handoff.py +5 -1
  24. inspect_ai/agent/_react.py +4 -3
  25. inspect_ai/agent/_run.py +6 -1
  26. inspect_ai/agent/_types.py +9 -0
  27. inspect_ai/dataset/_dataset.py +6 -3
  28. inspect_ai/log/__init__.py +10 -0
  29. inspect_ai/log/_convert.py +4 -9
  30. inspect_ai/log/_samples.py +14 -17
  31. inspect_ai/log/_transcript.py +77 -35
  32. inspect_ai/log/_tree.py +118 -0
  33. inspect_ai/model/_call_tools.py +42 -34
  34. inspect_ai/model/_model.py +45 -40
  35. inspect_ai/model/_providers/hf.py +27 -1
  36. inspect_ai/model/_providers/sglang.py +8 -2
  37. inspect_ai/model/_providers/vllm.py +6 -2
  38. inspect_ai/scorer/_choice.py +1 -2
  39. inspect_ai/solver/_chain.py +1 -1
  40. inspect_ai/solver/_fork.py +1 -1
  41. inspect_ai/solver/_multiple_choice.py +5 -22
  42. inspect_ai/solver/_plan.py +2 -2
  43. inspect_ai/solver/_transcript.py +6 -7
  44. inspect_ai/tool/_mcp/_mcp.py +6 -5
  45. inspect_ai/tool/_tools/_execute.py +4 -1
  46. inspect_ai/util/__init__.py +4 -0
  47. inspect_ai/util/_anyio.py +11 -0
  48. inspect_ai/util/_collect.py +50 -0
  49. inspect_ai/util/_span.py +58 -0
  50. inspect_ai/util/_subtask.py +27 -42
  51. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  52. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +56 -51
  53. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  54. inspect_ai/_display/core/group.py +0 -79
  55. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  56. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  57. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/log/_samples.py

@@ -5,12 +5,11 @@ from typing import AsyncGenerator, Iterator, Literal
 
 from shortuuid import uuid
 
-from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.util._sandbox import SandboxConnection
 from inspect_ai.util._sandbox.context import sandbox_connections
 
-from ._transcript import Transcript, transcript
+from ._transcript import ModelEvent, Transcript
 
 
 class ActiveSample:
@@ -47,7 +46,6 @@ class ActiveSample:
         self.total_tokens = 0
         self.transcript = transcript
         self.sandboxes = sandboxes
-        self.retry_count = 0
         self._interrupt_action: Literal["score", "error"] | None = None
 
     @property
@@ -151,27 +149,26 @@ def set_active_sample_total_messages(total_messages: int) -> None:
         active.total_messages = total_messages
 
 
+_active_model_event: ContextVar[ModelEvent | None] = ContextVar(
+    "_active_model_event", default=None
+)
+
+
 @contextlib.contextmanager
-def track_active_sample_retries() -> Iterator[None]:
-    reset_active_sample_retries()
+def track_active_model_event(event: ModelEvent) -> Iterator[None]:
+    token = _active_model_event.set(event)
     try:
         yield
     finally:
-        reset_active_sample_retries()
-
-
-def reset_active_sample_retries() -> None:
-    active = sample_active()
-    if active:
-        active.retry_count = 0
+        _active_model_event.reset(token)
 
 
 def report_active_sample_retry() -> None:
-    active = sample_active()
-    if active:
-        # only do this for the top level subtask
-        if transcript().name == SAMPLE_SUBTASK:
-            active.retry_count = active.retry_count + 1
+    model_event = _active_model_event.get()
+    if model_event is not None:
+        if model_event.retries is None:
+            model_event.retries = 0
+        model_event.retries = model_event.retries + 1
 
 
 _sample_active: ContextVar[ActiveSample | None] = ContextVar(
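
For orientation, a minimal sketch (not part of the diff) of how the reworked retry tracking above fits together: a `ModelEvent` is made active for the duration of a model API call, and `report_active_sample_retry()` now increments that event's `retries` field instead of a counter on `ActiveSample`. The wrapper function below is hypothetical.

```python
from inspect_ai.log._samples import (
    report_active_sample_retry,
    track_active_model_event,
)
from inspect_ai.log._transcript import ModelEvent


def record_one_retry(event: ModelEvent) -> None:
    # make the event "active" while the model API call is in flight
    with track_active_model_event(event):
        # a provider's retry handler would call this on each retry
        report_active_sample_retry()
    # retries starts at None and is initialized to 0 before incrementing
    assert event.retries == 1
```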
inspect_ai/log/_transcript.py

@@ -23,9 +23,10 @@ from pydantic import (
 )
 from shortuuid import uuid
 
-from inspect_ai._util.constants import SAMPLE_SUBTASK
+from inspect_ai._util.constants import DESERIALIZING
 from inspect_ai._util.error import EvalError
-from inspect_ai._util.json import JsonChange, json_changes
+from inspect_ai._util.json import JsonChange
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.working import sample_working_time
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.log._message import LoggingMessage
@@ -34,7 +35,6 @@ from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.model._model_call import ModelCall
 from inspect_ai.model._model_output import ModelOutput
 from inspect_ai.scorer._metric import Score
-from inspect_ai.solver._task_state import state_jsonable
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_call import (
     ToolCall,
@@ -44,6 +44,7 @@ from inspect_ai.tool._tool_call import (
 )
 from inspect_ai.tool._tool_choice import ToolChoice
 from inspect_ai.tool._tool_info import ToolInfo
+from inspect_ai.util._span import current_span_id
 from inspect_ai.util._store import store, store_changes, store_jsonable
 
 logger = getLogger(__name__)
@@ -57,6 +58,9 @@ class BaseEvent(BaseModel):
     }
     id_: str = Field(default_factory=lambda: str(uuid()), exclude=True)
 
+    span_id: str | None = Field(default=None)
+    """Span the event occurred within."""
+
     timestamp: datetime = Field(default_factory=datetime.now)
     """Clock time at which event occurred."""
 
@@ -66,6 +70,17 @@ class BaseEvent(BaseModel):
     pending: bool | None = Field(default=None)
     """Is this event pending?"""
 
+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+
+        # Generate context id fields if not deserializing
+        if not is_deserializing:
+            if self.span_id is None:
+                self.span_id = current_span_id()
+
     @field_serializer("timestamp")
     def serialize_timestamp(self, dt: datetime) -> str:
         return dt.astimezone().isoformat()
@@ -147,6 +162,9 @@ class ModelEvent(BaseEvent):
     output: ModelOutput
     """Output from model."""
 
+    retries: int | None = Field(default=None)
+    """Retries for the model API request."""
+
     error: str | None = Field(default=None)
     """Error which occurred during model call."""
 
@@ -203,7 +221,13 @@ class ToolEvent(BaseEvent):
     """Error that occurred during tool call."""
 
     events: list["Event"] = Field(default_factory=list)
-    """Transcript of events for tool."""
+    """Transcript of events for tool.
+
+    Note that events are no longer recorded separately within
+    tool events but rather all events are recorded in the main
+    transcript. This field is deprecated and here for backwards
+    compatibility with transcripts that have sub-events.
+    """
 
     completed: datetime | None = Field(default=None)
     """Time that tool call completed (see `timestamp` for started)"""
@@ -222,7 +246,6 @@ class ToolEvent(BaseEvent):
         result: ToolResult,
         truncated: tuple[int, int] | None,
         error: ToolCallError | None,
-        events: list["Event"],
         waiting_time: float,
         agent: str | None,
         failed: bool | None,
@@ -230,7 +253,6 @@ class ToolEvent(BaseEvent):
         self.result = result
         self.truncated = truncated
         self.error = error
-        self.events = events
        self.pending = None
        completed = datetime.now()
        self.completed = completed
@@ -402,6 +424,35 @@ class ScoreEvent(BaseEvent):
     """Was this an intermediate scoring?"""
 
 
+class SpanBeginEvent(BaseEvent):
+    """Mark the beginning of a transcript span."""
+
+    event: Literal["span_begin"] = Field(default="span_begin")
+    """Event type."""
+
+    id: str
+    """Unique identifier for span."""
+
+    parent_id: str | None = Field(default=None)
+    """Identifier for parent span."""
+
+    type: str | None = Field(default=None)
+    """Optional 'type' field for span."""
+
+    name: str
+    """Span name."""
+
+
+class SpanEndEvent(BaseEvent):
+    """Mark the end of a transcript span."""
+
+    event: Literal["span_end"] = Field(default="span_end")
+    """Event type."""
+
+    id: str
+    """Unique identifier for span."""
+
+
 class StepEvent(BaseEvent):
     """Step within current sample or subtask."""
 
@@ -437,7 +488,13 @@ class SubtaskEvent(BaseEvent):
     """Subtask function result."""
 
     events: list["Event"] = Field(default_factory=list)
-    """Transcript of events for subtask."""
+    """Transcript of events for subtask.
+
+    Note that events are no longer recorded separately within
+    subtasks but rather all events are recorded in the main
+    transcript. This field is deprecated and here for backwards
+    compatibility with transcripts that have sub-events.
+    """
 
     completed: datetime | None = Field(default=None)
     """Time that subtask completed (see `timestamp` for started)"""
@@ -467,6 +524,8 @@ Event: TypeAlias = Union[
     | ErrorEvent
     | LoggerEvent
     | InfoEvent
+    | SpanBeginEvent
+    | SpanEndEvent
     | StepEvent
     | SubtaskEvent,
 ]
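
Together with the new `span_id` field on `BaseEvent`, the `SpanBeginEvent`/`SpanEndEvent` pair above lets the flat event list encode nesting. A hedged sketch of what this looks like from user code, assuming the `span()` context manager added in `inspect_ai/util/_span.py` (listed in the files above) with the `name`/`type` signature used elsewhere in this diff:

```python
from inspect_ai.log._transcript import transcript
from inspect_ai.util._span import span


async def tagged_info() -> None:
    # the span brackets its contents with SpanBeginEvent/SpanEndEvent,
    # and the InfoEvent recorded inside it carries that span's id in span_id
    async with span(name="analysis", type="agent"):
        transcript().info("recorded inside the 'analysis' span")
```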
@@ -480,8 +539,7 @@ class Transcript:
 
     _event_logger: Callable[[Event], None] | None
 
-    def __init__(self, name: str = "") -> None:
-        self.name = name
+    def __init__(self) -> None:
         self._event_logger = None
         self._events: list[Event] = []
 
@@ -498,19 +556,20 @@
     def step(self, name: str, type: str | None = None) -> Iterator[None]:
         """Context manager for recording StepEvent.
 
+        The `step()` context manager is deprecated and will be removed in a future version.
+        Please use the `span()` context manager instead.
+
        Args:
            name (str): Step name.
            type (str | None): Optional step type.
        """
-        # step event
-        self._event(StepEvent(action="begin", name=name, type=type))
-
-        # run the step (tracking state/store changes)
-        with track_state_changes(type), track_store_changes():
-            yield
-
-        # end step event
-        self._event(StepEvent(action="end", name=name, type=type))
+        warn_once(
+            logger,
+            "The `transcript().step()` context manager is deprecated and will "
+            + "be removed in a future version. Please replace the call to step() "
+            + "with a call to span().",
+        )
+        yield
 
     @property
     def events(self) -> Sequence[Event]:
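
The deprecation above is a mechanical migration from the synchronous `step()` context manager to the asynchronous `span()` context manager; a hedged before/after sketch:

```python
from inspect_ai.util._span import span


async def analyze() -> None:
    # before (deprecated; step() now only warns and yields):
    # with transcript().step("analyze", type="solver"):
    #     ...

    # after:
    async with span(name="analyze", type="solver"):
        ...  # work previously wrapped in step()
```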
@@ -551,23 +610,6 @@ def track_store_changes() -> Iterator[None]:
         transcript()._event(StoreEvent(changes=changes))
 
 
-@contextlib.contextmanager
-def track_state_changes(type: str | None = None) -> Iterator[None]:
-    # we only want to track for step() inside the the sample
-    # (solver level tracking is handled already and there are
-    # no state changes in subtasks)
-    if transcript().name == SAMPLE_SUBTASK and type != "solver":
-        before = state_jsonable()
-        yield
-        after = state_jsonable()
-
-        changes = json_changes(before, after)
-        if changes:
-            transcript()._event(StateEvent(changes=changes))
-    else:
-        yield
-
-
 def init_transcript(transcript: Transcript) -> None:
     _transcript.set(transcript)
 
inspect_ai/log/_tree.py (new file)

@@ -0,0 +1,118 @@
+from dataclasses import dataclass, field
+from logging import getLogger
+from typing import Iterable, Sequence, TypeAlias
+
+from ._transcript import Event, SpanBeginEvent, SpanEndEvent
+
+logger = getLogger(__name__)
+
+EventNode: TypeAlias = "SpanNode" | Event
+"""Node in an event tree."""
+
+EventTree: TypeAlias = list[EventNode]
+"""Tree of events (has individual events and event spans)."""
+
+
+@dataclass
+class SpanNode:
+    """Event tree node representing a span of events."""
+
+    id: str
+    """Span id."""
+
+    parent_id: str | None
+    """Parent span id."""
+
+    type: str | None
+    """Optional 'type' field for span."""
+
+    name: str
+    """Span name."""
+
+    begin: SpanBeginEvent
+    """Span begin event."""
+
+    end: SpanEndEvent | None = None
+    """Span end event (if any)."""
+
+    children: list[EventNode] = field(default_factory=list)
+    """Children in the span."""
+
+
+def event_tree(events: Sequence[Event]) -> EventTree:
+    """Build a tree representation of a sequence of events.
+
+    Organize events hierarchically into event spans.
+
+    Args:
+        events: Sequence of `Event`.
+
+    Returns:
+        Event tree.
+    """
+    # Convert one flat list of (possibly interleaved) events into a *forest*
+    # (list of root-level items).
+
+    # Pre-create one node per span so we can attach events no matter when they
+    # arrive in the file. A single forward scan guarantees that the order of
+    # `children` inside every span reflects the order in which things appeared
+    # in the transcript.
+    nodes: dict[str, SpanNode] = {
+        ev.id: SpanNode(
+            id=ev.id, parent_id=ev.parent_id, type=ev.type, name=ev.name, begin=ev
+        )
+        for ev in events
+        if isinstance(ev, SpanBeginEvent)
+    }
+
+    roots: list[EventNode] = []
+
+    # Where should an event with `span_id` go?
+    def bucket(span_id: str | None) -> list[EventNode]:
+        if span_id and span_id in nodes:
+            return nodes[span_id].children
+        return roots  # root level
+
+    # Single pass in original order
+    for ev in events:
+        if isinstance(ev, SpanBeginEvent):  # span starts
+            bucket(ev.parent_id).append(nodes[ev.id])
+
+        elif isinstance(ev, SpanEndEvent):  # span ends
+            if n := nodes.get(ev.id):
+                n.end = ev
+            else:
+                logger.warning(f"Span end event (id: {ev.id}) with no span begin")
+
+        else:  # ordinary event
+            bucket(ev.span_id).append(ev)
+
+    return roots
+
+
+def event_sequence(tree: EventTree) -> Iterable[Event]:
+    """Flatten a span forest back into a properly ordered sequence.
+
+    Args:
+        tree: Event tree
+
+    Returns:
+        Sequence of events.
+    """
+    for item in tree:
+        if isinstance(item, SpanNode):
+            yield item.begin
+            yield from event_sequence(item.children)
+            if item.end:
+                yield item.end
+        else:
+            yield item
+
+
+def _print_event_tree(tree: EventTree, indent: str = "") -> None:
+    for item in tree:
+        if isinstance(item, SpanNode):
+            print(f"{indent}span ({item.type}): {item.name}")
+            _print_event_tree(item.children, f"{indent}  ")
+        else:
+            print(f"{indent}{item.event}")
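
A short usage sketch for the new module (the log path and the use of the first sample are placeholders; `read_eval_log()` is the existing log-reading API):

```python
from inspect_ai.log import read_eval_log
from inspect_ai.log._tree import event_sequence, event_tree

log = read_eval_log("logs/example.eval")  # placeholder path
if log.samples:
    # nest the sample's flat event list under its spans...
    tree = event_tree(log.samples[0].events)
    # ...and flatten it back into a span-ordered sequence
    ordered = list(event_sequence(tree))
```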
inspect_ai/model/_call_tools.py

@@ -61,6 +61,7 @@ from inspect_ai.tool._tool_params import ToolParams
 from inspect_ai.util import OutputLimitExceededError
 from inspect_ai.util._anyio import inner_exception
 from inspect_ai.util._limit import LimitExceededError, apply_limits
+from inspect_ai.util._span import span
 
 from ._chat_message import (
     ChatMessage,
@@ -109,26 +110,18 @@ async def execute_tools(
     """
     message = messages[-1]
     if isinstance(message, ChatMessageAssistant) and message.tool_calls:
-        from inspect_ai.log._transcript import (
-            ToolEvent,
-            Transcript,
-            init_transcript,
-            track_store_changes,
-            transcript,
-        )
+        from inspect_ai.log._transcript import ToolEvent, transcript
 
         tdefs = await tool_defs(tools)
 
         async def call_tool_task(
             call: ToolCall,
+            event: ToolEvent,
             conversation: list[ChatMessage],
             send_stream: MemoryObjectSendStream[
                 tuple[ExecuteToolsResult, ToolEvent, Exception | None]
             ],
         ) -> None:
-            # create a transript for this call
-            init_transcript(Transcript(name=call.function))
-
             result: ToolResult = ""
             messages: list[ChatMessage] = []
             output: ModelOutput | None = None
@@ -136,15 +129,14 @@ async def execute_tools(
             tool_error: ToolCallError | None = None
             tool_exception: Exception | None = None
             try:
-                with track_store_changes():
-                    try:
-                        result, messages, output, agent = await call_tool(
-                            tdefs, message.text, call, conversation
-                        )
-                    # unwrap exception group
-                    except Exception as ex:
-                        inner_ex = inner_exception(ex)
-                        raise inner_ex.with_traceback(inner_ex.__traceback__)
+                try:
+                    result, messages, output, agent = await call_tool(
+                        tdefs, message.text, call, event, conversation
+                    )
+                # unwrap exception group
+                except Exception as ex:
+                    inner_ex = inner_exception(ex)
+                    raise inner_ex.with_traceback(inner_ex.__traceback__)
 
             except TimeoutError:
                 tool_error = ToolCallError(
@@ -227,7 +219,6 @@ async def execute_tools(
                 truncated=truncated,
                 view=call.view,
                 error=tool_error,
-                events=list(transcript().events),
                 agent=agent,
             )
 
@@ -270,7 +261,6 @@ async def execute_tools(
                internal=call.internal,
                pending=True,
            )
-            transcript()._event(event)
 
            # execute the tool call. if the operator cancels the
            # tool call then synthesize the appropriate message/event
@@ -280,7 +270,7 @@ async def execute_tools(
 
            result_exception = None
            async with anyio.create_task_group() as tg:
-                tg.start_soon(call_tool_task, call, messages, send_stream)
+                tg.start_soon(call_tool_task, call, event, messages, send_stream)
                event._set_cancel_fn(tg.cancel_scope.cancel)
                async with receive_stream:
                    (
@@ -306,7 +296,6 @@ async def execute_tools(
                    truncated=None,
                    view=call.view,
                    error=tool_message.error,
-                    events=[],
                )
                transcript().info(
                    f"Tool call '{call.function}' was cancelled by operator."
@@ -326,7 +315,6 @@ async def execute_tools(
                result=result_event.result,
                truncated=result_event.truncated,
                error=result_event.error,
-                events=result_event.events,
                waiting_time=waiting_time_end - waiting_time_start,
                agent=result_event.agent,
                failed=True if result_exception else None,
@@ -347,19 +335,34 @@ async def execute_tools(
 
 
 async def call_tool(
-    tools: list[ToolDef], message: str, call: ToolCall, conversation: list[ChatMessage]
+    tools: list[ToolDef],
+    message: str,
+    call: ToolCall,
+    event: BaseModel,
+    conversation: list[ChatMessage],
 ) -> tuple[ToolResult, list[ChatMessage], ModelOutput | None, str | None]:
     from inspect_ai.agent._handoff import AgentTool
-    from inspect_ai.log._transcript import SampleLimitEvent, transcript
+    from inspect_ai.log._transcript import SampleLimitEvent, ToolEvent, transcript
+
+    # dodge circular import
+    assert isinstance(event, ToolEvent)
+
+    # this function is responsible for transcript events so that it can
+    # put them in the right enclosure (e.g. handoff/agent/tool). This
+    # means that if we throw early we need to do the enclosure when raising.
+    async def record_tool_parsing_error(error: str) -> Exception:
+        async with span(name=call.function, type="tool"):
+            transcript()._event(event)
+        return ToolParsingError(error)
 
     # if there was an error parsing the ToolCall, raise that
     if call.parse_error:
-        raise ToolParsingError(call.parse_error)
+        raise await record_tool_parsing_error(call.parse_error)
 
     # find the tool
     tool_def = next((tool for tool in tools if tool.name == call.function), None)
     if tool_def is None:
-        raise ToolParsingError(f"Tool {call.function} not found")
+        raise await record_tool_parsing_error(f"Tool {call.function} not found")
 
     # if we have a tool approver, apply it now
     from inspect_ai.approval._apply import apply_tool_approval
@@ -382,7 +385,7 @@ async def call_tool(
     # validate the schema of the passed object
     validation_errors = validate_tool_input(call.arguments, tool_def.parameters)
     if validation_errors:
-        raise ToolParsingError(validation_errors)
+        raise await record_tool_parsing_error(validation_errors)
 
     # get arguments (with creation of dataclasses, pydantic objects, etc.)
     arguments = tool_params(call.arguments, tool_def.tool)
@@ -391,14 +394,18 @@ async def call_tool(
     with trace_action(
         logger, "Tool Call", format_function_call(tool_def.name, arguments, width=1000)
     ):
-        # agent tools get special handling
         if isinstance(tool_def.tool, AgentTool):
-            return await agent_handoff(tool_def, call, conversation)
+            async with span(tool_def.tool.name, type="handoff"):
+                async with span(name=call.function, type="tool"):
+                    transcript()._event(event)
+                return await agent_handoff(tool_def, call, conversation)
 
         # normal tool call
         else:
-            result: ToolResult = await tool_def.tool(**arguments)
-            return result, [], None, None
+            async with span(name=call.function, type="tool"):
+                transcript()._event(event)
+                result: ToolResult = await tool_def.tool(**arguments)
+                return result, [], None, None
 
 
 async def agent_handoff(
@@ -463,7 +470,8 @@ async def agent_handoff(
     agent_state = AgentState(messages=copy(agent_conversation))
     try:
         with apply_limits(agent_tool.limits):
-            agent_state = await agent_tool.agent(agent_state, **arguments)
+            async with span(name=agent_name, type="agent"):
+                agent_state = await agent_tool.agent(agent_state, **arguments)
     except LimitExceededError as ex:
         limit_error = ex
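
Net effect of the `_call_tools.py` changes: tool calls and agent handoffs no longer accumulate events in per-tool sub-transcripts; everything is recorded into the main transcript inside nested spans. An illustrative (not verbatim) outline of the resulting structure, in the format printed by `_print_event_tree()`; the agent and tool names are hypothetical:

```python
# illustrative only; "my_tool" / "my_agent" are hypothetical names
#
# span (tool): my_tool          <- normal tool call
#   tool                        <- the ToolEvent itself
#
# span (handoff): my_agent      <- agent handoff
#   span (tool): transfer_to_my_agent
#     tool
#   span (agent): my_agent
#     model
#     ...
```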