inspect-ai 0.3.93__py3-none-any.whl → 0.3.95__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +21 -12
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +51 -21
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +23 -22
- inspect_ai/_view/www/dist/assets/index.js +517 -204
- inspect_ai/_view/www/log-schema.json +375 -0
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +90 -12
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_as_solver.py +3 -1
- inspect_ai/agent/_as_tool.py +6 -4
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_handoff.py +5 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +6 -1
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +57 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
- inspect_ai/analysis/beta/_dataframe/util.py +157 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +10 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +1 -1
- inspect_ai/log/_log.py +21 -1
- inspect_ai/log/_samples.py +14 -17
- inspect_ai/log/_transcript.py +77 -35
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/model/_call_tools.py +44 -35
- inspect_ai/model/_model.py +51 -44
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/sglang.py +8 -2
- inspect_ai/model/_providers/vllm.py +6 -2
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +9 -23
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +7 -3
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_mcp.py +6 -5
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +114 -82
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
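The most substantial addition in this release is the new `inspect_ai/analysis/beta` package, which builds data frames from eval logs. A minimal sketch of how that surface is likely intended to be used, assuming `evals_df` and `samples_df` are exported from `inspect_ai.analysis.beta` (names inferred from the module layout above, not confirmed by this diff; the `logs` directory is hypothetical):

    # hypothetical usage of the new dataframe API (exports inferred, not confirmed)
    from inspect_ai.analysis.beta import evals_df, samples_df

    # one row per eval log found under the directory
    evals = evals_df("logs")

    # one row per sample across those logs
    samples = samples_df("logs")
    print(samples.head())

Selected per-file diffs follow.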
inspect_ai/analysis/beta/_dataframe/validate.py
ADDED
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import Any, Iterator, Mapping, Type
+
+import jsonref  # type: ignore
+from jsonpath_ng import Fields, Index, JSONPath, Slice, Where, WhereNot  # type: ignore
+from jsonpath_ng.ext.filter import Filter  # type: ignore
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+Schema = Mapping[str, Any]
+
+
+def resolved_schema(model: Type[BaseModel]) -> Schema:
+    schema_dict = model.model_json_schema()
+    base = "file:///memory/inspect_schema.json"
+    schema: Schema = jsonref.replace_refs(
+        schema_dict, base_uri=base, jsonschema=True, proxies=False
+    )
+    return schema
+
+
+def jsonpath_in_schema(expr: JSONPath, schema: Schema) -> bool:
+    # don't validate unsupported constructs
+    if find_unsupported(expr):
+        return True
+
+    def descend(sch: Schema, tok: str | int | None) -> list[Schema]:
+        # First, branch through anyOf/oneOf/allOf
+        outs: list[Schema] = []
+        for branch in _expand_union(sch):
+            outs.extend(descend_concrete(branch, tok))
+        return outs
+
+    def descend_concrete(sch: Schema, tok: str | int | None) -> list[Schema]:
+        # totally open object – accept any child
+        if sch == {}:
+            return [{}]  # stay alive, accept any key
+
+        outs: list[Schema] = []
+
+        def open_dict(node: Schema) -> None:
+            """Append the schema that governs unknown keys.
+
+            - None / missing -> open object -> {}
+            - True -> open object -> {}
+            - Mapping -> that mapping (could be {} or a real subschema)
+            - False -> closed object -> (do nothing)
+            """
+            if "additionalProperties" not in node:
+                if not node.get("properties"):
+                    outs.append({})
+            else:
+                ap = node["additionalProperties"]
+                if ap is True:
+                    outs.append({})
+                elif isinstance(ap, Mapping):  # {} or {...}
+                    outs.append(ap)
+                # ap is False -> closed dict -> ignore
+
+        # Wildcard -----------------------------------------------------------
+        if tok is None:
+            if "properties" in sch:
+                outs.extend(sch["properties"].values())
+            if "object" in _types(sch):
+                open_dict(sch)
+            if "array" in _types(sch) and "items" in sch:
+                outs.extend(_normalize_items(sch["items"]))
+            return outs
+
+        # Property access ----------------------------------------------------
+        if isinstance(tok, str):
+            if "properties" in sch and tok in sch["properties"]:
+                outs.append(sch["properties"][tok])
+            elif "additionalProperties" in sch:  # PRESENCE, not truthiness
+                open_dict(sch)
+            elif "object" in _types(sch):
+                open_dict(sch)
+
+        # Array index --------------------------------------------------------
+        else:  # tok is int or None from an Index node
+            if "array" in _types(sch) and "items" in sch:
+                outs.extend(_normalize_items(sch["items"], index=tok))
+
+        return outs
+
+    def _types(sch: Schema) -> set[str]:
+        t = sch.get("type")
+        return set(t) if isinstance(t, list) else {t} if t else set()
+
+    def _normalize_items(items: Any, index: int | None = None) -> list[Schema]:
+        if isinstance(items, list):
+            if index is None:  # wildcard/slice
+                return items
+            if 0 <= index < len(items):
+                return [items[index]]
+            return []
+        if isinstance(items, Mapping):
+            return [items]
+        return []
+
+    states = [schema]
+    for tok in iter_tokens(expr):
+        next_states: list[Schema] = []
+        for st in states:
+            next_states.extend(descend(st, tok))
+        if not next_states:  # nothing matched this segment
+            return False
+        states = next_states
+    return True  # every segment found at least one schema
+
+
+def iter_tokens(node: JSONPath) -> Iterator[str | int | None]:
+    """Linearise a jsonpath-ng AST into a stream of tokens we care about."""
+    if hasattr(node, "left"):  # Child, Descendants, etc.
+        yield from iter_tokens(node.left)
+        yield from iter_tokens(node.right)
+    elif isinstance(node, Fields):
+        yield from node.fields  # e.g. ["foo"]
+    elif isinstance(node, Index):
+        yield node.index  # 0 / -1 / None for wildcard
+    elif isinstance(node, Slice):
+        yield None  # treat any slice as wildcard
+
+
+COMBINATORS = ("anyOf", "oneOf", "allOf")
+
+
+def _expand_union(sch: Schema) -> list[Schema]:
+    """Return sch itself or the list of subschemas if it is a combinator."""
+    for key in COMBINATORS:
+        if key in sch:
+            subs: list[Schema] = []
+            for sub in sch[key]:
+                # a sub-schema might itself be an anyOf/oneOf/allOf
+                subs.extend(_expand_union(sub))
+            return subs
+    return [sch]
+
+
+UNSUPPORTED: tuple[type[JSONPath], ...] = (
+    Filter,  # [?foo > 0]
+    Where,  # .foo[(@.bar < 42)]
+    WhereNot,
+    Slice,  # [1:5] (wildcard “[*]” is Index/None, not Slice)
+)
+
+
+def find_unsupported(node: JSONPath) -> list[type[JSONPath]]:
+    """Return a list of node types present in `node` that we do not validate."""
+    bad: list[type[JSONPath]] = []
+    stack: list[JSONPath] = [node]
+    while stack:
+        n = stack.pop()
+        if isinstance(n, UNSUPPORTED):
+            bad.append(type(n))
+        # Drill into children (jsonpath-ng uses .left / .right / .child attributes)
+        for attr in ("left", "right", "child", "expression"):
+            stack.extend(
+                [getattr(n, attr)]
+                if hasattr(n, attr) and isinstance(getattr(n, attr), JSONPath)
+                else []
+            )
+        # handle containers like Fields(fields=[...]) and Index(index=[...])
+        if hasattr(n, "__dict__"):
+            for v in n.__dict__.values():
+                if isinstance(v, list):
+                    stack.extend(x for x in v if isinstance(x, JSONPath))
+    return bad
inspect_ai/dataset/_dataset.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
 from pydantic import BaseModel, Field, ValidationError
 from typing_extensions import override

+from inspect_ai._util.answer import answer_character, answer_index
 from inspect_ai.model import ChatMessage
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
         shuffled_choices = [sample.choices[i] for i in positions]

         # Map of original position / target letter
-        position_map = {
+        position_map = {
+            i: answer_character(new_i) for new_i, i in enumerate(positions)
+        }

         # Update to the shuffled choices and target
         sample.choices = shuffled_choices
@@ -338,9 +341,9 @@
         self, target: str | list[str], position_map: dict[int, str]
     ) -> str | list[str]:
         if isinstance(target, list):
-            return [position_map[
+            return [position_map[answer_index(t)] for t in target]
         else:
-            return position_map[
+            return position_map[answer_index(target)]

     @override
     def sort(
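For context on the remapping above: `answer_character` and `answer_index` come from the new `inspect_ai/_util/answer.py` listed earlier. A self-contained sketch of the assumed contract (index 0 maps to "A" and back), showing how a shuffled target gets relabeled:

    # illustrative only: assumes answer_character(i) == chr(ord("A") + i) and
    # answer_index(c) == ord(c) - ord("A") for single-letter answers
    positions = [2, 0, 1]  # shuffled order: original choice 2 now appears first
    position_map = {i: chr(ord("A") + new_i) for new_i, i in enumerate(positions)}
    # -> {2: "A", 0: "B", 1: "C"}

    old_target = "C"  # pointed at original choice index 2
    new_target = position_map[ord(old_target) - ord("A")]
    assert new_target == "A"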
inspect_ai/log/__init__.py
CHANGED
@@ -48,6 +48,8 @@ from ._transcript import (
     SampleLimitEvent,
     SandboxEvent,
     ScoreEvent,
+    SpanBeginEvent,
+    SpanEndEvent,
     StateEvent,
     StepEvent,
     StoreEvent,
@@ -56,6 +58,7 @@ from ._transcript import (
     Transcript,
     transcript,
 )
+from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree

 __all__ = [
     "EvalConfig",
@@ -92,6 +95,8 @@ __all__ = [
     "SampleLimitEvent",
     "SandboxEvent",
     "ScoreEvent",
+    "SpanBeginEvent",
+    "SpanEndEvent",
     "StateEvent",
     "StepEvent",
     "StoreEvent",
@@ -111,4 +116,9 @@ __all__ = [
     "write_log_dir_manifest",
     "retryable_eval_logs",
     "bundle_log_dir",
+    "event_tree",
+    "event_sequence",
+    "EventTree",
+    "EventNode",
+    "SpanNode",
 ]
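The new `_tree` exports suggest that a flat event list can now be folded into a span hierarchy. A rough sketch, assuming `event_tree()` accepts a sequence of events and returns the root nodes (signature inferred from the export names, not confirmed by this diff; the log path is hypothetical):

    # hypothetical: fold a sample's flat event list into a span tree
    from inspect_ai.log import event_tree, read_eval_log

    log = read_eval_log("logs/example.eval")  # hypothetical path
    for sample in log.samples or []:
        for node in event_tree(sample.events):
            print(type(node).__name__)  # e.g. SpanNode / EventNode (assumed node types)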
inspect_ai/log/_convert.py
CHANGED
@@ -2,7 +2,7 @@ import os
 from typing import Literal

 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import
+from inspect_ai._util.file import exists, filesystem
 from inspect_ai.log._file import (
     log_files_from_ls,
     read_eval_log,
@@ -66,14 +66,9 @@ def convert_eval_logs(
                 "Output file {output_file} already exists (use --overwrite to overwrite existing files)"
             )

-        #
-
-
-
-        # otherwise do a full read/write
-        else:
-            log = read_eval_log(input_file)
-            write_eval_log(log, output_file)
+        # do a full read/write (normalized deprecated constructs and adds sample summaries)
+        log = read_eval_log(input_file)
+        write_eval_log(log, output_file)

     if fs.info(path).type == "file":
         convert_file(path)
inspect_ai/log/_file.py
CHANGED
@@ -524,7 +524,7 @@ def manifest_eval_log_name(info: EvalLogInfo, log_dir: str, sep: str) -> str:

 def log_files_from_ls(
     ls: list[FileInfo],
-    formats: list[Literal["eval", "json"]] | None,
+    formats: list[Literal["eval", "json"]] | None = None,
     descending: bool = True,
 ) -> list[EvalLogInfo]:
     extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
inspect_ai/log/_log.py
CHANGED
@@ -17,9 +17,11 @@ from pydantic import (
 )
 from rich.console import Console, RenderableType
 from rich.traceback import Traceback
+from shortuuid import uuid

-from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
+from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, DESERIALIZING, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
+from inspect_ai._util.hash import base57_id_hash
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
 from inspect_ai.dataset._dataset import MT, metadata_as
@@ -677,6 +679,9 @@ class EvalModelConfig(BaseModel):
 class EvalSpec(BaseModel):
     """Eval target and configuration."""

+    eval_id: str = Field(default_factory=str)
+    """Globally unique id for eval."""
+
     run_id: str = Field(default_factory=str)
     """Unique run id"""

@@ -757,6 +762,21 @@ class EvalSpec(BaseModel):
     # allow field model_args
     model_config = ConfigDict(protected_namespaces=())

+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+
+        # Generate eval_id if needed
+        if self.eval_id == "":
+            if is_deserializing:
+                # we want the eval_id to be stable across reads of the eval log so we compose it
+                # as a hash that matches the size/apperance of shortuuid-based uuids
+                self.eval_id = base57_id_hash(self.run_id + self.task_id + self.created)
+            else:
+                self.eval_id = uuid()
+
     @model_validator(mode="before")
     @classmethod
     def read_sandbox_spec(
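The practical effect of the `eval_id` logic above is that an id is minted once per eval and, when missing from older logs, is re-derived deterministically on read. A small sketch of that expectation (the log file name is hypothetical):

    # illustrative only: re-reading the same log should yield the same eval_id
    from inspect_ai.log import read_eval_log

    first = read_eval_log("logs/example.eval")
    second = read_eval_log("logs/example.eval")
    assert first.eval.eval_id == second.eval.eval_id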
inspect_ai/log/_samples.py
CHANGED
@@ -5,12 +5,11 @@ from typing import AsyncGenerator, Iterator, Literal

 from shortuuid import uuid

-from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.util._sandbox import SandboxConnection
 from inspect_ai.util._sandbox.context import sandbox_connections

-from ._transcript import
+from ._transcript import ModelEvent, Transcript


 class ActiveSample:
@@ -47,7 +46,6 @@ class ActiveSample:
         self.total_tokens = 0
         self.transcript = transcript
         self.sandboxes = sandboxes
-        self.retry_count = 0
         self._interrupt_action: Literal["score", "error"] | None = None

     @property
@@ -151,27 +149,26 @@ def set_active_sample_total_messages(total_messages: int) -> None:
         active.total_messages = total_messages


+_active_model_event: ContextVar[ModelEvent | None] = ContextVar(
+    "_active_model_event", default=None
+)
+
+
 @contextlib.contextmanager
-def
-
+def track_active_model_event(event: ModelEvent) -> Iterator[None]:
+    token = _active_model_event.set(event)
     try:
         yield
     finally:
-
-
-
-def reset_active_sample_retries() -> None:
-    active = sample_active()
-    if active:
-        active.retry_count = 0
+        _active_model_event.reset(token)


 def report_active_sample_retry() -> None:
-
-    if
-
-
-
+    model_event = _active_model_event.get()
+    if model_event is not None:
+        if model_event.retries is None:
+            model_event.retries = 0
+        model_event.retries = model_event.retries + 1


 _sample_active: ContextVar[ActiveSample | None] = ContextVar(
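The retry-tracking change above follows a common pattern: a `ContextVar` plus a context manager scope the "current" object so that code deeper in the call stack can report against it without threading it through every signature. A generic, self-contained sketch of that pattern (not inspect_ai's actual code):

    # generic sketch of the ContextVar + context manager pattern used above
    import contextlib
    from contextvars import ContextVar
    from typing import Iterator

    _active: ContextVar[dict | None] = ContextVar("_active", default=None)

    @contextlib.contextmanager
    def track_active(obj: dict) -> Iterator[None]:
        token = _active.set(obj)
        try:
            yield
        finally:
            _active.reset(token)

    def report_retry() -> None:
        obj = _active.get()
        if obj is not None:
            obj["retries"] = obj.get("retries", 0) + 1

    with track_active({"retries": 0}):
        report_retry()  # increments the active object's retry count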
inspect_ai/log/_transcript.py
CHANGED
@@ -23,9 +23,10 @@ from pydantic import (
 )
 from shortuuid import uuid

-from inspect_ai._util.constants import
+from inspect_ai._util.constants import DESERIALIZING
 from inspect_ai._util.error import EvalError
-from inspect_ai._util.json import JsonChange
+from inspect_ai._util.json import JsonChange
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.working import sample_working_time
 from inspect_ai.dataset._dataset import Sample
 from inspect_ai.log._message import LoggingMessage
@@ -34,7 +35,6 @@ from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.model._model_call import ModelCall
 from inspect_ai.model._model_output import ModelOutput
 from inspect_ai.scorer._metric import Score
-from inspect_ai.solver._task_state import state_jsonable
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_call import (
     ToolCall,
@@ -44,6 +44,7 @@ from inspect_ai.tool._tool_call import (
 )
 from inspect_ai.tool._tool_choice import ToolChoice
 from inspect_ai.tool._tool_info import ToolInfo
+from inspect_ai.util._span import current_span_id
 from inspect_ai.util._store import store, store_changes, store_jsonable

 logger = getLogger(__name__)
@@ -57,6 +58,9 @@ class BaseEvent(BaseModel):
     }
     id_: str = Field(default_factory=lambda: str(uuid()), exclude=True)

+    span_id: str | None = Field(default=None)
+    """Span the event occurred within."""
+
     timestamp: datetime = Field(default_factory=datetime.now)
     """Clock time at which event occurred."""

@@ -66,6 +70,17 @@ class BaseEvent(BaseModel):
     pending: bool | None = Field(default=None)
     """Is this event pending?"""

+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+
+        # Generate context id fields if not deserializing
+        if not is_deserializing:
+            if self.span_id is None:
+                self.span_id = current_span_id()
+
     @field_serializer("timestamp")
     def serialize_timestamp(self, dt: datetime) -> str:
         return dt.astimezone().isoformat()
@@ -147,6 +162,9 @@ class ModelEvent(BaseEvent):
     output: ModelOutput
     """Output from model."""

+    retries: int | None = Field(default=None)
+    """Retries for the model API request."""
+
     error: str | None = Field(default=None)
     """Error which occurred during model call."""

@@ -203,7 +221,13 @@ class ToolEvent(BaseEvent):
     """Error that occurred during tool call."""

     events: list["Event"] = Field(default_factory=list)
-    """Transcript of events for tool.
+    """Transcript of events for tool.
+
+    Note that events are no longer recorded separately within
+    tool events but rather all events are recorded in the main
+    transcript. This field is deprecated and here for backwards
+    compatibility with transcripts that have sub-events.
+    """

     completed: datetime | None = Field(default=None)
     """Time that tool call completed (see `timestamp` for started)"""
@@ -222,7 +246,6 @@ class ToolEvent(BaseEvent):
         result: ToolResult,
         truncated: tuple[int, int] | None,
         error: ToolCallError | None,
-        events: list["Event"],
         waiting_time: float,
         agent: str | None,
         failed: bool | None,
@@ -230,7 +253,6 @@ class ToolEvent(BaseEvent):
         self.result = result
         self.truncated = truncated
         self.error = error
-        self.events = events
         self.pending = None
         completed = datetime.now()
         self.completed = completed
@@ -402,6 +424,35 @@ class ScoreEvent(BaseEvent):
     """Was this an intermediate scoring?"""


+class SpanBeginEvent(BaseEvent):
+    """Mark the beginning of a transcript span."""
+
+    event: Literal["span_begin"] = Field(default="span_begin")
+    """Event type."""
+
+    id: str
+    """Unique identifier for span."""
+
+    parent_id: str | None = Field(default=None)
+    """Identifier for parent span."""
+
+    type: str | None = Field(default=None)
+    """Optional 'type' field for span."""
+
+    name: str
+    """Span name."""
+
+
+class SpanEndEvent(BaseEvent):
+    """Mark the end of a transcript span."""
+
+    event: Literal["span_end"] = Field(default="span_end")
+    """Event type."""
+
+    id: str
+    """Unique identifier for span."""
+
+
 class StepEvent(BaseEvent):
     """Step within current sample or subtask."""

@@ -437,7 +488,13 @@ class SubtaskEvent(BaseEvent):
     """Subtask function result."""

     events: list["Event"] = Field(default_factory=list)
-    """Transcript of events for subtask.
+    """Transcript of events for subtask.
+
+    Note that events are no longer recorded separately within
+    subtasks but rather all events are recorded in the main
+    transcript. This field is deprecated and here for backwards
+    compatibility with transcripts that have sub-events.
+    """

     completed: datetime | None = Field(default=None)
     """Time that subtask completed (see `timestamp` for started)"""
@@ -467,6 +524,8 @@ Event: TypeAlias = Union[
     | ErrorEvent
     | LoggerEvent
     | InfoEvent
+    | SpanBeginEvent
+    | SpanEndEvent
     | StepEvent
     | SubtaskEvent,
 ]
@@ -480,8 +539,7 @@ class Transcript:

     _event_logger: Callable[[Event], None] | None

-    def __init__(self
-        self.name = name
+    def __init__(self) -> None:
         self._event_logger = None
         self._events: list[Event] = []

@@ -498,19 +556,20 @@ class Transcript:
     def step(self, name: str, type: str | None = None) -> Iterator[None]:
         """Context manager for recording StepEvent.

+        The `step()` context manager is deprecated and will be removed in a future version.
+        Please use the `span()` context manager instead.
+
         Args:
            name (str): Step name.
            type (str | None): Optional step type.
         """
-
-
-
-
-
-
-
-        # end step event
-        self._event(StepEvent(action="end", name=name, type=type))
+        warn_once(
+            logger,
+            "The `transcript().step()` context manager is deprecated and will "
+            + "be removed in a future version. Please replace the call to step() "
+            + "with a call to span().",
+        )
+        yield

     @property
     def events(self) -> Sequence[Event]:
@@ -551,23 +610,6 @@ def track_store_changes() -> Iterator[None]:
         transcript()._event(StoreEvent(changes=changes))


-@contextlib.contextmanager
-def track_state_changes(type: str | None = None) -> Iterator[None]:
-    # we only want to track for step() inside the the sample
-    # (solver level tracking is handled already and there are
-    # no state changes in subtasks)
-    if transcript().name == SAMPLE_SUBTASK and type != "solver":
-        before = state_jsonable()
-        yield
-        after = state_jsonable()
-
-        changes = json_changes(before, after)
-        if changes:
-            transcript()._event(StateEvent(changes=changes))
-    else:
-        yield
-
-
 def init_transcript(transcript: Transcript) -> None:
     _transcript.set(transcript)

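Finally, the `step()` deprecation above points users at the new `span()` context manager (added in `inspect_ai/util/_span.py`). A rough migration sketch, assuming `span()` is exported from `inspect_ai.util` as an async context manager taking a name and optional type (only the deprecation message, not `span()`'s signature, appears in this diff):

    # hypothetical migration from transcript().step() to span()
    from inspect_ai.log import transcript
    from inspect_ai.util import span

    async def prepare_sample() -> None:
        # before (deprecated): emits StepEvent begin/end markers
        with transcript().step("prepare", type="setup"):
            ...

        # after (assumed replacement): emits SpanBeginEvent / SpanEndEvent
        async with span("prepare", type="setup"):
            ...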