inspect-ai 0.3.93__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. inspect_ai/_display/textual/widgets/samples.py +3 -3
  2. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  3. inspect_ai/_eval/task/run.py +10 -7
  4. inspect_ai/_util/answer.py +26 -0
  5. inspect_ai/_util/constants.py +0 -1
  6. inspect_ai/_util/local_server.py +51 -21
  7. inspect_ai/_view/www/dist/assets/index.css +14 -13
  8. inspect_ai/_view/www/dist/assets/index.js +400 -84
  9. inspect_ai/_view/www/log-schema.json +375 -0
  10. inspect_ai/_view/www/src/@types/log.d.ts +90 -12
  11. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  12. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  13. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  14. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  15. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  16. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  17. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  18. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  19. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  20. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  21. inspect_ai/agent/_as_solver.py +3 -1
  22. inspect_ai/agent/_as_tool.py +6 -4
  23. inspect_ai/agent/_handoff.py +5 -1
  24. inspect_ai/agent/_react.py +4 -3
  25. inspect_ai/agent/_run.py +6 -1
  26. inspect_ai/agent/_types.py +9 -0
  27. inspect_ai/dataset/_dataset.py +6 -3
  28. inspect_ai/log/__init__.py +10 -0
  29. inspect_ai/log/_convert.py +4 -9
  30. inspect_ai/log/_samples.py +14 -17
  31. inspect_ai/log/_transcript.py +77 -35
  32. inspect_ai/log/_tree.py +118 -0
  33. inspect_ai/model/_call_tools.py +42 -34
  34. inspect_ai/model/_model.py +45 -40
  35. inspect_ai/model/_providers/hf.py +27 -1
  36. inspect_ai/model/_providers/sglang.py +8 -2
  37. inspect_ai/model/_providers/vllm.py +6 -2
  38. inspect_ai/scorer/_choice.py +1 -2
  39. inspect_ai/solver/_chain.py +1 -1
  40. inspect_ai/solver/_fork.py +1 -1
  41. inspect_ai/solver/_multiple_choice.py +5 -22
  42. inspect_ai/solver/_plan.py +2 -2
  43. inspect_ai/solver/_transcript.py +6 -7
  44. inspect_ai/tool/_mcp/_mcp.py +6 -5
  45. inspect_ai/tool/_tools/_execute.py +4 -1
  46. inspect_ai/util/__init__.py +4 -0
  47. inspect_ai/util/_anyio.py +11 -0
  48. inspect_ai/util/_collect.py +50 -0
  49. inspect_ai/util/_span.py +58 -0
  50. inspect_ai/util/_subtask.py +27 -42
  51. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  52. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +56 -51
  53. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  54. inspect_ai/_display/core/group.py +0 -79
  55. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  56. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  57. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
591
591
  )
592
592
  if isinstance(last_event, ModelEvent):
593
593
  # see if there are retries in play
594
- if sample.retry_count > 0:
595
- suffix = "retry" if sample.retry_count == 1 else "retries"
594
+ if last_event.retries:
595
+ suffix = "retry" if last_event.retries == 1 else "retries"
596
596
  pending_caption_text = (
597
- f"Generating ({sample.retry_count:,} {suffix})..."
597
+ f"Generating ({last_event.retries:,} {suffix})..."
598
598
  )
599
599
  else:
600
600
  pending_caption_text = "Generating..."
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
30
30
  SampleInitEvent,
31
31
  SampleLimitEvent,
32
32
  ScoreEvent,
33
- StepEvent,
33
+ SpanBeginEvent,
34
34
  SubtaskEvent,
35
35
  ToolEvent,
36
36
  )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
211
211
  # render the call
212
212
  content = transcript_tool_call(event)
213
213
 
214
- # render sub-events
215
- if event.events:
216
- content.extend(render_sub_events(event.events))
217
-
218
214
  # render the output
219
215
  if isinstance(event.result, list):
220
216
  result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
235
231
  return [EventDisplay("tool call", Group(*content))]
236
232
 
237
233
 
238
- def render_step_event(event: StepEvent) -> EventDisplay:
239
- if event.type == "solver":
240
- return render_solver_event(event)
241
- if event.type == "scorer":
242
- return render_scorer_event(event)
243
- else:
244
- return EventDisplay(step_title(event))
245
-
246
-
247
- def render_solver_event(event: StepEvent) -> EventDisplay:
248
- return EventDisplay(step_title(event))
249
-
250
-
251
- def render_scorer_event(event: StepEvent) -> EventDisplay:
252
- return EventDisplay(step_title(event))
253
-
254
-
255
234
  def render_score_event(event: ScoreEvent) -> EventDisplay:
256
235
  table = Table(box=None, show_header=False)
257
236
  table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
272
251
  # render header
273
252
  content: list[RenderableType] = [transcript_function(event.name, event.input)]
274
253
 
275
- # render sub-events
276
- if event.events:
277
- content.extend(render_sub_events(event.events))
278
-
279
254
  if event.result:
280
255
  content.append(Text())
281
256
  if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
345
320
  return content
346
321
 
347
322
 
348
- def step_title(event: StepEvent) -> str:
349
- return f"{event.type or 'step'}: {event.name}"
323
+ def span_title(event: SpanBeginEvent) -> str:
324
+ return f"{event.type or 'span'}: {event.name}"
350
325
 
351
326
 
352
327
  EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
354
329
  _renderers: list[tuple[Type[Event], EventRenderer]] = [
355
330
  (SampleInitEvent, render_sample_init_event),
356
331
  (SampleLimitEvent, render_sample_limit_event),
357
- (StepEvent, render_step_event),
358
332
  (ModelEvent, render_model_event),
359
333
  (ToolEvent, render_tool_event),
360
334
  (SubtaskEvent, render_subtask_event),
@@ -24,7 +24,6 @@ from inspect_ai._util._async import tg_collect
24
24
  from inspect_ai._util.constants import (
25
25
  DEFAULT_EPOCHS,
26
26
  DEFAULT_MAX_CONNECTIONS,
27
- SAMPLE_SUBTASK,
28
27
  )
29
28
  from inspect_ai._util.datetime import iso_now
30
29
  from inspect_ai._util.error import exception_message
@@ -65,8 +64,8 @@ from inspect_ai.log._transcript import (
65
64
  SampleInitEvent,
66
65
  SampleLimitEvent,
67
66
  ScoreEvent,
68
- StepEvent,
69
67
  Transcript,
68
+ init_transcript,
70
69
  transcript,
71
70
  )
72
71
  from inspect_ai.model import (
@@ -91,7 +90,8 @@ from inspect_ai.solver._task_state import sample_state, set_sample_state, state_
91
90
  from inspect_ai.util._limit import LimitExceededError
92
91
  from inspect_ai.util._sandbox.context import sandbox_connections
93
92
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
94
- from inspect_ai.util._subtask import init_subtask
93
+ from inspect_ai.util._span import span
94
+ from inspect_ai.util._store import init_subtask_store
95
95
 
96
96
  from ..context import init_task_context
97
97
  from ..task import Task
@@ -558,7 +558,9 @@ async def task_run_sample(
558
558
  # initialise subtask and scoring context
559
559
  init_sample_model_usage()
560
560
  set_sample_state(state)
561
- sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
561
+ sample_transcript = Transcript()
562
+ init_transcript(sample_transcript)
563
+ init_subtask_store(state.store)
562
564
  if logger:
563
565
  sample_transcript._subscribe(
564
566
  lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -617,7 +619,8 @@ async def task_run_sample(
617
619
  results: dict[str, SampleScore] = {}
618
620
  try:
619
621
  # begin init
620
- transcript()._event(StepEvent(action="begin", name="init"))
622
+ init_span = span("init", type="init")
623
+ await init_span.__aenter__()
621
624
 
622
625
  # sample init event (remove file bodies as they have content or absolute paths)
623
626
  event_sample = sample.model_copy(
@@ -639,7 +642,7 @@ async def task_run_sample(
639
642
  active.sandboxes = await sandbox_connections()
640
643
 
641
644
  # end init
642
- transcript()._event(StepEvent(action="end", name="init"))
645
+ await init_span.__aexit__(None, None, None)
643
646
 
644
647
  # initialise timeout context manager
645
648
  timeout_cm = (
@@ -742,7 +745,7 @@ async def task_run_sample(
742
745
  scorer_name = unique_scorer_name(
743
746
  scorer, list(results.keys())
744
747
  )
745
- with transcript().step(name=scorer_name, type="scorer"):
748
+ async with span(name=scorer_name, type="scorer"):
746
749
  score_result = (
747
750
  await scorer(state, Target(sample.target))
748
751
  if scorer
@@ -0,0 +1,26 @@
1
+ def answer_character(index: int) -> str:
2
+ r"""
3
+ Helper to go from array index to char, for example:
4
+
5
+ 0 -> 'A', 1 -> 'B', etc
6
+ """
7
+ if index < 26:
8
+ return chr(ord("A") + index)
9
+ else:
10
+ return str(index - 25)
11
+
12
+
13
+ def answer_index(char: str) -> int:
14
+ r"""
15
+ Helper to go from char to array index, for example:
16
+
17
+ 'A' -> 0, 'B' -> 1, etc
18
+ """
19
+ if char.isalpha() or char == "," or char == " ":
20
+ return ord(char.upper()) - ord("A")
21
+ elif char.isnumeric():
22
+ return 25 + int(char)
23
+ else:
24
+ raise ValueError(
25
+ f"Unepxected multiple choice answer: {char} (must be a letter or number)"
26
+ )
@@ -34,7 +34,6 @@ EVAL_LOG_FORMAT = "eval"
34
34
  DEFAULT_DISPLAY = "full"
35
35
  LOG_SCHEMA_VERSION = 2
36
36
  SCORED_SUFFIX = "-scored"
37
- SAMPLE_SUBTASK = "sample"
38
37
  CONSOLE_DISPLAY_WIDTH = 120
39
38
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
40
39
  SANDBOX_SETUP_TIMEOUT = 300
@@ -62,16 +62,24 @@ def release_port(lock_socket: socket.socket) -> None:
62
62
  logger.error(f"Error closing socket: {e}")
63
63
 
64
64
 
65
- def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
65
+ def execute_shell_command(
66
+ command: list[str], env: Optional[dict[str, str]] = None
67
+ ) -> subprocess.Popen[str]:
66
68
  """
67
69
  Execute a command and return its process handle.
68
70
 
69
71
  Args:
70
72
  command: List of command arguments
73
+ env: Optional environment variables to pass to the subprocess
71
74
 
72
75
  Returns:
73
76
  A subprocess.Popen object representing the running process
74
77
  """
78
+ # Create a process environment by copying current environment and updating with new values
79
+ process_env = os.environ.copy()
80
+ if env:
81
+ process_env.update(env)
82
+
75
83
  # Create a process that redirects output to pipes so we can capture it
76
84
  process = subprocess.Popen(
77
85
  command,
@@ -79,6 +87,7 @@ def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
79
87
  stdout=subprocess.PIPE,
80
88
  stderr=subprocess.PIPE,
81
89
  bufsize=1, # Line buffered
90
+ env=process_env, # Pass the environment variables
82
91
  )
83
92
 
84
93
  # Set up background thread to read and log stdout
@@ -134,7 +143,10 @@ def kill_process_tree(pid: int) -> None:
134
143
 
135
144
 
136
145
  def launch_server_cmd(
137
- command: list[str], host: str = "0.0.0.0", port: Optional[int] = None
146
+ command: list[str],
147
+ host: str = "0.0.0.0",
148
+ port: Optional[int] = None,
149
+ env: Optional[dict[str, str]] = None,
138
150
  ) -> Tuple[subprocess.Popen[str], int, list[str]]:
139
151
  """
140
152
  Launch a server process with the given base command and return the process, port, and full command.
@@ -143,6 +155,7 @@ def launch_server_cmd(
143
155
  command: Base command to execute
144
156
  host: Host to bind to
145
157
  port: Port to bind to. If None, a free port is reserved.
158
+ env: Optional environment variables to pass to the subprocess
146
159
 
147
160
  Returns:
148
161
  Tuple of (process, port, full_command)
@@ -155,7 +168,7 @@ def launch_server_cmd(
155
168
  full_command = command + ["--port", str(port)]
156
169
  logger.info(f"Launching server on port {port}")
157
170
 
158
- process = execute_shell_command(full_command)
171
+ process = execute_shell_command(full_command, env=env)
159
172
 
160
173
  if lock_socket is not None:
161
174
  process_socket_map[process] = lock_socket
@@ -181,6 +194,7 @@ def wait_for_server(
181
194
  base_url: str,
182
195
  process: subprocess.Popen[str],
183
196
  full_command: Optional[list[str]] = None,
197
+ env: Optional[dict[str, str]] = None,
184
198
  timeout: Optional[int] = None,
185
199
  api_key: Optional[str] = None,
186
200
  ) -> None:
@@ -191,6 +205,7 @@ def wait_for_server(
191
205
  base_url: The base URL of the server
192
206
  process: The subprocess running the server
193
207
  full_command: The full command used to launch the server
208
+ env: The environment variables to use for the request
194
209
  timeout: Maximum time to wait in seconds. None means wait forever.
195
210
  api_key: The API key to use for the request
196
211
  """
@@ -198,7 +213,10 @@ def wait_for_server(
198
213
  start_time = time.time()
199
214
  debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
200
215
  if full_command:
201
- debug_advice += f" Alternatively, you can run the following launch command manually to see the full traceback:\n\n{' '.join(full_command)}\n\n"
216
+ debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
217
+ if env:
218
+ debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
219
+ debug_advice += " ".join(full_command) + "\n\n"
202
220
 
203
221
  while True:
204
222
  # Check for timeout first
@@ -245,6 +263,7 @@ def start_local_server(
245
263
  server_type: str = "server",
246
264
  timeout: Optional[int] = DEFAULT_TIMEOUT,
247
265
  server_args: Optional[dict[str, Any]] = None,
266
+ env: Optional[dict[str, str]] = None,
248
267
  ) -> Tuple[str, subprocess.Popen[str], int]:
249
268
  """
250
269
  Start a server with the given command and handle potential errors.
@@ -257,6 +276,7 @@ def start_local_server(
257
276
  server_type: Type of server being started (for error messages)
258
277
  timeout: Maximum time to wait for server to become ready
259
278
  server_args: Additional server arguments to pass to the command
279
+ env: Optional environment variables to pass to the subprocess
260
280
  Returns:
261
281
  Tuple of (base_url, process, port)
262
282
 
@@ -266,15 +286,22 @@ def start_local_server(
266
286
  full_command = base_cmd
267
287
  server_process = None
268
288
 
289
+ # Initialize environment variables if not provided
290
+ process_env = {} if env is None else env.copy()
291
+
269
292
  if server_args:
270
293
  for key, value in server_args.items():
271
294
  # Convert Python style args (underscore) to CLI style (dash)
272
295
  cli_key = key.replace("_", "-")
273
- full_command.extend([f"--{cli_key}", str(value)])
296
+ if value == "":
297
+ # If the value is empty, just add the flag
298
+ full_command.extend([f"--{cli_key}"])
299
+ else:
300
+ full_command.extend([f"--{cli_key}", str(value)])
274
301
 
275
302
  try:
276
303
  server_process, found_port, full_command = launch_server_cmd(
277
- full_command, host=host, port=port
304
+ full_command, host=host, port=port, env=process_env
278
305
  )
279
306
  base_url = f"http://localhost:{found_port}/v1"
280
307
  wait_for_server(
@@ -283,6 +310,7 @@ def start_local_server(
283
310
  api_key=api_key,
284
311
  timeout=timeout,
285
312
  full_command=full_command,
313
+ env=process_env,
286
314
  )
287
315
  return base_url, server_process, found_port
288
316
  except Exception as e:
@@ -330,17 +358,18 @@ def merge_env_server_args(
330
358
 
331
359
  def configure_devices(
332
360
  server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
333
- ) -> dict[str, Any]:
334
- """Configure device settings and return updated server args.
361
+ ) -> tuple[dict[str, Any], dict[str, str]]:
362
+ """Configure device settings and return updated server args and environment variables.
335
363
 
336
364
  Args:
337
365
  server_args: Dictionary of server arguments
338
366
  parallel_size_param: Name of parameter to set with device count if not specified
339
367
 
340
368
  Returns:
341
- Updated server arguments dict
369
+ Tuple of (updated server arguments dict, environment variables dict)
342
370
  """
343
371
  result = server_args.copy()
372
+ env_vars = {}
344
373
 
345
374
  devices = None
346
375
  if "device" in result and "devices" in result:
@@ -350,19 +379,20 @@ def configure_devices(
350
379
  elif "device" in result:
351
380
  devices = result.pop("device")
352
381
 
353
- # Convert device list to comma-separated string if needed
354
- if isinstance(devices, list):
355
- device_str = ",".join(map(str, devices))
356
- else:
357
- device_str = str(devices)
382
+ if devices is not None:
383
+ # Convert device list to comma-separated string if needed
384
+ if isinstance(devices, list):
385
+ device_str = ",".join(map(str, devices))
386
+ else:
387
+ device_str = str(devices)
358
388
 
359
- # Set CUDA_VISIBLE_DEVICES environment variable
360
- os.environ["CUDA_VISIBLE_DEVICES"] = device_str
389
+ # Add to env_vars instead of setting os.environ directly
390
+ env_vars["CUDA_VISIBLE_DEVICES"] = device_str
361
391
 
362
- device_count = len(device_str.split(","))
392
+ device_count = len(device_str.split(","))
363
393
 
364
- # Set parallel size parameter if not explicitly provided
365
- if parallel_size_param not in result:
366
- result[parallel_size_param] = device_count
394
+ # Set parallel size parameter if not explicitly provided
395
+ if parallel_size_param not in result:
396
+ result[parallel_size_param] = device_count
367
397
 
368
- return result
398
+ return result, env_vars
@@ -17342,37 +17342,38 @@ pre[class*="language-"] {
17342
17342
  ._metadata_1a3fk_21 {
17343
17343
  margin: 0.5em 0;
17344
17344
  }
17345
- ._contents_iwnfd_1 {
17345
+ ._contents_1irga_1 {
17346
17346
  margin-top: 0.5em;
17347
17347
  }
17348
17348
 
17349
- ._contents_iwnfd_1 > :last-child {
17349
+ ._contents_1irga_1 > :last-child {
17350
17350
  margin-bottom: 0;
17351
17351
  }
17352
17352
 
17353
- ._twoColumn_iwnfd_9 {
17353
+ ._twoColumn_1irga_9 {
17354
17354
  display: grid;
17355
17355
  grid-template-columns: auto 1fr;
17356
17356
  column-gap: 1.5em;
17357
17357
  }
17358
17358
 
17359
- ._exec_iwnfd_15 {
17360
- margin-top: 0.5em;
17359
+ ._exec_1irga_15 {
17360
+ margin-top: 0;
17361
17361
  }
17362
17362
 
17363
- ._result_iwnfd_19 {
17363
+ ._result_1irga_19 {
17364
17364
  margin-top: 0.5em;
17365
17365
  }
17366
17366
 
17367
- ._fileLabel_iwnfd_23 {
17367
+ ._fileLabel_1irga_23 {
17368
17368
  margin-top: 0;
17369
17369
  margin-bottom: 0;
17370
17370
  }
17371
17371
 
17372
- ._wrapPre_iwnfd_28 {
17372
+ ._wrapPre_1irga_28 {
17373
17373
  white-space: pre-wrap;
17374
17374
  word-wrap: break-word;
17375
17375
  overflow-wrap: break-word;
17376
+ margin-bottom: 0;
17376
17377
  }
17377
17378
  ._explanation_1ww42_1 {
17378
17379
  display: grid;
@@ -20001,20 +20002,20 @@ span.ap-marker-container:hover span.ap-marker {
20001
20002
  padding-top: 0rem;
20002
20003
  margin-top: -8px;
20003
20004
  }
20004
- ._darkenedBg_1sie6_1 {
20005
+ ._darkenedBg_u9na2_1 {
20005
20006
  background-color: var(--bs-light-bg-subtle);
20006
20007
  }
20007
20008
 
20008
- ._normalBg_1sie6_5 {
20009
+ ._normalBg_u9na2_5 {
20009
20010
  background-color: var(--bs-body-bg);
20010
20011
  }
20011
20012
 
20012
- ._node_1sie6_9 {
20013
+ ._node_u9na2_9 {
20013
20014
  padding-top: 0.7rem;
20014
- padding-bottom: 0em;
20015
+ padding-bottom: 1px;
20015
20016
  }
20016
20017
 
20017
- ._attached_1sie6_14 {
20018
+ ._attached_u9na2_14 {
20018
20019
  padding-top: 0rem;
20019
20020
  margin-top: -8px;
20020
20021
  }