PyPI - inspect-ai - Versions diffs - 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl - Mend

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (149) hide show

inspect_ai/_cli/eval.py +27 -0
inspect_ai/_display/textual/widgets/samples.py +3 -3
inspect_ai/_display/textual/widgets/transcript.py +3 -29
inspect_ai/_eval/eval.py +19 -2
inspect_ai/_eval/evalset.py +4 -1
inspect_ai/_eval/run.py +41 -0
inspect_ai/_eval/task/generate.py +38 -44
inspect_ai/_eval/task/log.py +26 -28
inspect_ai/_eval/task/run.py +23 -27
inspect_ai/_util/answer.py +26 -0
inspect_ai/_util/constants.py +0 -1
inspect_ai/_util/local_server.py +398 -0
inspect_ai/_util/working.py +10 -4
inspect_ai/_view/www/dist/assets/index.css +173 -159
inspect_ai/_view/www/dist/assets/index.js +1417 -1142
inspect_ai/_view/www/log-schema.json +379 -3
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/@types/log.d.ts +93 -14
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
inspect_ai/_view/www/src/components/Card.css +0 -1
inspect_ai/_view/www/src/constants.ts +2 -0
inspect_ai/_view/www/src/utils/numeric.ts +17 -0
inspect_ai/agent/_agent.py +3 -3
inspect_ai/agent/_as_solver.py +22 -12
inspect_ai/agent/_as_tool.py +20 -6
inspect_ai/agent/_handoff.py +12 -1
inspect_ai/agent/_react.py +4 -3
inspect_ai/agent/_run.py +16 -3
inspect_ai/agent/_types.py +9 -0
inspect_ai/dataset/_dataset.py +6 -3
inspect_ai/log/__init__.py +14 -0
inspect_ai/log/_convert.py +4 -9
inspect_ai/log/_file.py +56 -0
inspect_ai/log/_log.py +99 -0
inspect_ai/log/_recorders/__init__.py +2 -0
inspect_ai/log/_recorders/buffer/database.py +12 -11
inspect_ai/log/_recorders/buffer/filestore.py +2 -2
inspect_ai/log/_recorders/buffer/types.py +2 -2
inspect_ai/log/_recorders/eval.py +20 -65
inspect_ai/log/_recorders/file.py +28 -6
inspect_ai/log/_recorders/recorder.py +7 -0
inspect_ai/log/_recorders/types.py +1 -23
inspect_ai/log/_samples.py +14 -25
inspect_ai/log/_transcript.py +84 -36
inspect_ai/log/_tree.py +118 -0
inspect_ai/log/_util.py +52 -0
inspect_ai/model/__init__.py +5 -1
inspect_ai/model/_call_tools.py +72 -44
inspect_ai/model/_generate_config.py +14 -8
inspect_ai/model/_model.py +66 -88
inspect_ai/model/_model_output.py +25 -0
inspect_ai/model/_openai.py +2 -0
inspect_ai/model/_providers/anthropic.py +13 -23
inspect_ai/model/_providers/hf.py +27 -1
inspect_ai/model/_providers/openai_o1.py +8 -2
inspect_ai/model/_providers/providers.py +18 -4
inspect_ai/model/_providers/sglang.py +247 -0
inspect_ai/model/_providers/vllm.py +211 -400
inspect_ai/scorer/_choice.py +1 -2
inspect_ai/solver/__init__.py +7 -2
inspect_ai/solver/_basic_agent.py +3 -10
inspect_ai/solver/_chain.py +1 -1
inspect_ai/solver/_fork.py +1 -1
inspect_ai/solver/_multiple_choice.py +5 -22
inspect_ai/solver/_plan.py +2 -2
inspect_ai/solver/_task_state.py +26 -88
inspect_ai/solver/_transcript.py +6 -7
inspect_ai/tool/_json_rpc_helpers.py +45 -17
inspect_ai/tool/_mcp/_mcp.py +8 -5
inspect_ai/tool/_mcp/_sandbox.py +8 -2
inspect_ai/tool/_mcp/server.py +3 -1
inspect_ai/tool/_tool_call.py +4 -1
inspect_ai/tool/_tool_support_helpers.py +51 -12
inspect_ai/tool/_tools/_bash_session.py +190 -68
inspect_ai/tool/_tools/_computer/_computer.py +25 -1
inspect_ai/tool/_tools/_execute.py +4 -1
inspect_ai/tool/_tools/_text_editor.py +4 -3
inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
inspect_ai/util/__init__.py +16 -0
inspect_ai/util/_anyio.py +11 -0
inspect_ai/util/_collect.py +50 -0
inspect_ai/util/_limit.py +393 -0
inspect_ai/util/_limited_conversation.py +57 -0
inspect_ai/util/_span.py +58 -0
inspect_ai/util/_subtask.py +27 -42
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
inspect_ai/_display/core/group.py +0 -79
inspect_ai/solver/_limit.py +0 -39
inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_computer/test_args.py +0 -151
/inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0

inspect_ai/_eval/task/run.py CHANGED Viewed

@@ -24,7 +24,6 @@ from inspect_ai._util._async import tg_collect
 from inspect_ai._util.constants import (
     DEFAULT_EPOCHS,
     DEFAULT_MAX_CONNECTIONS,
-    SAMPLE_SUBTASK,
 )
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
@@ -51,8 +50,12 @@ from inspect_ai.log import (
 )
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
-from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._recorders.types import SampleSummary
+from inspect_ai.log._log import (
+    EvalSampleLimit,
+    EvalSampleReductions,
+    EvalSampleSummary,
+    eval_error,
+)
 from inspect_ai.log._samples import (
     active_sample,
 )
@@ -61,8 +64,8 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
-    StepEvent,
     Transcript,
+    init_transcript,
     transcript,
 )
 from inspect_ai.model import (
@@ -82,12 +85,13 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
-from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import LimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
-from inspect_ai.util._subtask import init_subtask
+from inspect_ai.util._span import span
+from inspect_ai.util._store import init_subtask_store
 from ..context import init_task_context
 from ..task import Task
@@ -554,7 +558,9 @@ async def task_run_sample(
     # initialise subtask and scoring context
     init_sample_model_usage()
     set_sample_state(state)
-    sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
+    sample_transcript = Transcript()
+    init_transcript(sample_transcript)
+    init_subtask_store(state.store)
     if logger:
         sample_transcript._subscribe(
             lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -613,7 +619,8 @@ async def task_run_sample(
         results: dict[str, SampleScore] = {}
         try:
             # begin init
-            transcript()._event(StepEvent(action="begin", name="init"))
+            init_span = span("init", type="init")
+            await init_span.__aenter__()
             # sample init event (remove file bodies as they have content or absolute paths)
             event_sample = sample.model_copy(
@@ -635,7 +642,7 @@ async def task_run_sample(
                     active.sandboxes = await sandbox_connections()
                     # end init
-                    transcript()._event(StepEvent(action="end", name="init"))
+                    await init_span.__aexit__(None, None, None)
                     # initialise timeout context manager
                     timeout_cm = (
@@ -649,17 +656,18 @@ async def task_run_sample(
                     init_sample_working_limit(start_time, working_limit)
                     # run sample w/ optional timeout
-                    with timeout_cm:
+                    with timeout_cm, state._token_limit, state._message_limit:
                         # mark started
                         active.started = datetime.now().timestamp()
                         if logger is not None:
                             await logger.start_sample(
-                                SampleSummary(
+                                EvalSampleSummary(
                                     id=sample_id,
                                     epoch=state.epoch,
                                     input=sample.input,
                                     target=sample.target,
+                                    metadata=sample.metadata or {},
                                 )
                             )
@@ -707,18 +715,9 @@ async def task_run_sample(
                         # handle the cancel exception
                         raise
-                except SampleLimitExceededError as ex:
-                    # sample limit event
-                    transcript()._event(
-                        SampleLimitEvent(
-                            type=ex.type,
-                            limit=ex.limit,
-                            message=f"Sample completed: {ex.message}",
-                        )
-                    )
+                except LimitExceededError:
                     # capture most recent state for scoring
-                    state = ex.state or sample_state() or state
+                    state = sample_state() or state
                 except BaseException as ex:
                     error, raise_error = handle_error(ex)
@@ -735,9 +734,6 @@ async def task_run_sample(
                 if time_limit is not None:
                     timeout_cm = anyio.fail_after(time_limit / 2)
-                # turn off message and token limits
-                state.message_limit = None
-                state.token_limit = None
                 set_sample_state(state)
                 # scoring
@@ -749,7 +745,7 @@ async def task_run_sample(
                                 scorer_name = unique_scorer_name(
                                     scorer, list(results.keys())
                                 )
-                                with transcript().step(name=scorer_name, type="scorer"):
+                                async with span(name=scorer_name, type="scorer"):
                                     score_result = (
                                         await scorer(state, Target(sample.target))
                                         if scorer
@@ -929,7 +925,7 @@ async def log_sample(
         input=sample.input,
         choices=sample.choices,
         target=sample.target,
-        metadata=state.metadata if state.metadata else {},
+        metadata=sample.metadata or {},
         sandbox=sample.sandbox,
         files=list(sample.files.keys()) if sample.files else None,
         setup=sample.setup,

inspect_ai/_util/answer.py ADDED Viewed

@@ -0,0 +1,26 @@
+def answer_character(index: int) -> str:
+    r"""
+    Helper to go from array index to char, for example:
+        0 -> 'A', 1 -> 'B', etc
+    """
+    if index < 26:
+        return chr(ord("A") + index)
+    else:
+        return str(index - 25)
+def answer_index(char: str) -> int:
+    r"""
+    Helper to go from char to array index, for example:
+        'A' -> 0, 'B' -> 1, etc
+    """
+    if char.isalpha() or char == "," or char == " ":
+        return ord(char.upper()) - ord("A")
+    elif char.isnumeric():
+        return 25 + int(char)
+    else:
+        raise ValueError(
+            f"Unepxected multiple choice answer: {char} (must be a letter or number)"
+        )

inspect_ai/_util/constants.py CHANGED Viewed

@@ -34,7 +34,6 @@ EVAL_LOG_FORMAT = "eval"
 DEFAULT_DISPLAY = "full"
 LOG_SCHEMA_VERSION = 2
 SCORED_SUFFIX = "-scored"
-SAMPLE_SUBTASK = "sample"
 CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
 SANDBOX_SETUP_TIMEOUT = 300

inspect_ai/_util/local_server.py ADDED Viewed

@@ -0,0 +1,398 @@
+import json
+import logging
+import os
+import random
+import socket
+import subprocess
+import time
+from typing import Any, Dict, Optional, Tuple
+import httpx
+# Set up logger for this module
+logger = logging.getLogger(__name__)
+# Global dictionary to keep track of process -> reserved port mappings
+process_socket_map = {}
+DEFAULT_TIMEOUT = 60 * 10  # fairly conservative default timeout of 10 minutes
+def reserve_port(
+    host: str, start: int = 30000, end: int = 40000
+) -> Tuple[int, socket.socket]:
+    """
+    Reserve an available port by trying to bind a socket.
+    Args:
+        host: Host to bind to
+        start: Minimum port number to try
+        end: Maximum port number to try
+    Returns:
+        A tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
+    """
+    candidates = list(range(start, end))
+    random.shuffle(candidates)
+    for port in candidates:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        try:
+            # Attempt to bind to the port on localhost
+            sock.bind((host, port))
+            return port, sock
+        except socket.error:
+            sock.close()  # Failed to bind, try next port
+            continue
+    raise RuntimeError("No free port available.")
+def release_port(lock_socket: socket.socket) -> None:
+    """
+    Release the reserved port by closing the lock socket.
+    Args:
+        lock_socket: The socket to close
+    """
+    try:
+        lock_socket.close()
+    except Exception as e:
+        logger.error(f"Error closing socket: {e}")
+def execute_shell_command(
+    command: list[str], env: Optional[dict[str, str]] = None
+) -> subprocess.Popen[str]:
+    """
+    Execute a command and return its process handle.
+    Args:
+        command: List of command arguments
+        env: Optional environment variables to pass to the subprocess
+    Returns:
+        A subprocess.Popen object representing the running process
+    """
+    # Create a process environment by copying current environment and updating with new values
+    process_env = os.environ.copy()
+    if env:
+        process_env.update(env)
+    # Create a process that redirects output to pipes so we can capture it
+    process = subprocess.Popen(
+        command,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        bufsize=1,  # Line buffered
+        env=process_env,  # Pass the environment variables
+    )
+    # Set up background thread to read and log stdout
+    def log_output() -> None:
+        if process.stdout is None:
+            return
+        for line in iter(process.stdout.readline, ""):
+            if line:
+                logger.debug(line.strip())
+        process.stdout.close()
+    # Set up background thread to read and log stderr
+    def log_error() -> None:
+        if process.stderr is None:
+            return
+        for line in iter(process.stderr.readline, ""):
+            if line:
+                logger.info(line.strip())
+        process.stderr.close()
+    # Start background threads to handle output
+    import threading
+    threading.Thread(target=log_output, daemon=True).start()
+    threading.Thread(target=log_error, daemon=True).start()
+    logger.info(f"Started server with command: {' '.join(command)}")
+    return process
+def kill_process_tree(pid: int) -> None:
+    """
+    Kill a process and all its children.
+    Args:
+        pid: Process ID to kill
+    """
+    try:
+        # Send SIGTERM
+        subprocess.run(["pkill", "-TERM", "-P", str(pid)], check=False)
+        subprocess.run(["kill", "-TERM", str(pid)], check=False)
+        time.sleep(1)
+        # If process still exists, send SIGKILL
+        try:
+            os.kill(pid, 0)  # Check if process exists
+            subprocess.run(["pkill", "-KILL", "-P", str(pid)], check=False)
+            subprocess.run(["kill", "-KILL", str(pid)], check=False)
+        except OSError:
+            pass  # Process already terminated
+    except Exception as e:
+        logger.error(f"Error killing process tree: {e}")
+def launch_server_cmd(
+    command: list[str],
+    host: str = "0.0.0.0",
+    port: Optional[int] = None,
+    env: Optional[dict[str, str]] = None,
+) -> Tuple[subprocess.Popen[str], int, list[str]]:
+    """
+    Launch a server process with the given base command and return the process, port, and full command.
+    Args:
+        command: Base command to execute
+        host: Host to bind to
+        port: Port to bind to. If None, a free port is reserved.
+        env: Optional environment variables to pass to the subprocess
+    Returns:
+        Tuple of (process, port, full_command)
+    """
+    if port is None:
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+    full_command = command + ["--port", str(port)]
+    logger.info(f"Launching server on port {port}")
+    process = execute_shell_command(full_command, env=env)
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+    return process, port, full_command
+def terminate_process(process: subprocess.Popen[str]) -> None:
+    """
+    Terminate the process and automatically release the reserved port.
+    Args:
+        process: The process to terminate
+    """
+    kill_process_tree(process.pid)
+    lock_socket = process_socket_map.pop(process, None)
+    if lock_socket is not None:
+        release_port(lock_socket)
+def wait_for_server(
+    base_url: str,
+    process: subprocess.Popen[str],
+    full_command: Optional[list[str]] = None,
+    env: Optional[dict[str, str]] = None,
+    timeout: Optional[int] = None,
+    api_key: Optional[str] = None,
+) -> None:
+    """
+    Wait for the server to be ready by polling the /v1/models endpoint.
+    Args:
+        base_url: The base URL of the server
+        process: The subprocess running the server
+        full_command: The full command used to launch the server
+        env: The environment variables to use for the request
+        timeout: Maximum time to wait in seconds. None means wait forever.
+        api_key: The API key to use for the request
+    """
+    logger.info(f"Waiting for server at {base_url} to become ready...")
+    start_time = time.time()
+    debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
+    if full_command:
+        debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
+        if env:
+            debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
+        debug_advice += " ".join(full_command) + "\n\n"
+    while True:
+        # Check for timeout first
+        if timeout and time.time() - start_time > timeout:
+            error_msg = f"Server did not become ready within timeout period ({timeout} seconds). Try increasing the timeout with '-M timeout=...'. {debug_advice}"
+            logger.error(error_msg)
+            raise TimeoutError(error_msg)
+        # Check if the process is still alive
+        if process.poll() is not None:
+            exit_code = process.poll()
+            error_msg = f"Server process exited unexpectedly with code {exit_code}. {debug_advice}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+        try:
+            response = httpx.get(
+                f"{base_url}/v1/models",
+                headers={"Authorization": f"Bearer {api_key or 'None'}"},
+                timeout=5.0,  # Short timeout for individual requests
+            )
+            if response.status_code == 200:
+                logger.info("Server is ready.")
+                break
+            # Log non-200 status but don't treat as hard error yet
+            logger.debug(
+                f"Server check returned status {response.status_code}, retrying..."
+            )
+        except httpx.RequestError as e:
+            # Log connection errors but don't treat as hard error yet
+            logger.debug(f"Server check failed: {e}, retrying...")
+            pass  # Request failed (e.g., connection refused), will retry
+        # Wait before the next poll attempt
+        time.sleep(1)
+def start_local_server(
+    base_cmd: list[str],
+    host: str,
+    port: Optional[int] = None,
+    api_key: Optional[str] = None,
+    server_type: str = "server",
+    timeout: Optional[int] = DEFAULT_TIMEOUT,
+    server_args: Optional[dict[str, Any]] = None,
+    env: Optional[dict[str, str]] = None,
+) -> Tuple[str, subprocess.Popen[str], int]:
+    """
+    Start a server with the given command and handle potential errors.
+    Args:
+        base_cmd: List of base command arguments
+        host: Host to bind to
+        port: Port to bind to. If None, a free port is reserved.
+        api_key: API key to use for server authentication
+        server_type: Type of server being started (for error messages)
+        timeout: Maximum time to wait for server to become ready
+        server_args: Additional server arguments to pass to the command
+        env: Optional environment variables to pass to the subprocess
+    Returns:
+        Tuple of (base_url, process, port)
+    Raises:
+        RuntimeError: If server fails to start
+    """
+    full_command = base_cmd
+    server_process = None
+    # Initialize environment variables if not provided
+    process_env = {} if env is None else env.copy()
+    if server_args:
+        for key, value in server_args.items():
+            # Convert Python style args (underscore) to CLI style (dash)
+            cli_key = key.replace("_", "-")
+            if value == "":
+                # If the value is empty, just add the flag
+                full_command.extend([f"--{cli_key}"])
+            else:
+                full_command.extend([f"--{cli_key}", str(value)])
+    try:
+        server_process, found_port, full_command = launch_server_cmd(
+            full_command, host=host, port=port, env=process_env
+        )
+        base_url = f"http://localhost:{found_port}/v1"
+        wait_for_server(
+            f"http://localhost:{found_port}",
+            server_process,
+            api_key=api_key,
+            timeout=timeout,
+            full_command=full_command,
+            env=process_env,
+        )
+        return base_url, server_process, found_port
+    except Exception as e:
+        # Cleanup any partially started server
+        if server_process:
+            terminate_process(server_process)
+        # Re-raise with more context
+        raise RuntimeError(f"Failed to start {server_type} server: {str(e)}") from e
+def merge_env_server_args(
+    env_var_name: str,
+    provided_args: Dict[str, Any],
+    logger: logging.Logger,
+) -> Dict[str, Any]:
+    """
+    Load server arguments from an environment variable and merge them with provided arguments.
+    Args:
+        env_var_name: Name of the environment variable containing JSON server args
+        provided_args: Dictionary of server arguments provided by the user
+        logger: Logger instance to log messages
+    Returns:
+        Dictionary of merged server arguments, with provided args taking precedence
+    """
+    env_server_args = {}
+    server_args_json = os.environ.get(env_var_name)
+    if server_args_json:
+        try:
+            env_server_args = json.loads(server_args_json)
+            logger.info(
+                f"Loaded server args from environment {env_var_name}: {env_server_args}"
+            )
+        except json.JSONDecodeError:
+            logger.warning(
+                f"Failed to parse {env_var_name} as JSON: {server_args_json}"
+            )
+    # Merge environment args with provided args (provided args take precedence)
+    return {**env_server_args, **provided_args}
+def configure_devices(
+    server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
+) -> tuple[dict[str, Any], dict[str, str]]:
+    """Configure device settings and return updated server args and environment variables.
+    Args:
+        server_args: Dictionary of server arguments
+        parallel_size_param: Name of parameter to set with device count if not specified
+    Returns:
+        Tuple of (updated server arguments dict, environment variables dict)
+    """
+    result = server_args.copy()
+    env_vars = {}
+    devices = None
+    if "device" in result and "devices" in result:
+        raise ValueError("Cannot specify both device and devices in server args")
+    elif "devices" in result:
+        devices = result.pop("devices")
+    elif "device" in result:
+        devices = result.pop("device")
+    if devices is not None:
+        # Convert device list to comma-separated string if needed
+        if isinstance(devices, list):
+            device_str = ",".join(map(str, devices))
+        else:
+            device_str = str(devices)
+        # Add to env_vars instead of setting os.environ directly
+        env_vars["CUDA_VISIBLE_DEVICES"] = device_str
+        device_count = len(device_str.split(","))
+        # Set parallel size parameter if not explicitly provided
+        if parallel_size_param not in result:
+            result[parallel_size_param] = device_count
+    return result, env_vars

inspect_ai/_util/working.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import time
 from contextvars import ContextVar
+from inspect_ai.util._limit import LimitExceededError
 def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
     _sample_working_limit.set(working_limit)
@@ -22,6 +24,8 @@ def report_sample_waiting_time(waiting_time: float) -> None:
 def check_sample_working_limit() -> None:
+    from inspect_ai.log._transcript import SampleLimitEvent, transcript
     # no check if we don't have a limit
     working_limit = _sample_working_limit.get()
     if working_limit is None:
@@ -31,13 +35,15 @@ def check_sample_working_limit() -> None:
     running_time = time.monotonic() - _sample_start_time.get()
     working_time = running_time - sample_waiting_time()
     if working_time > working_limit:
-        from inspect_ai.solver._limit import SampleLimitExceededError
-        raise SampleLimitExceededError(
+        message = f"Exceeded working time limit ({working_limit:,} seconds)"
+        transcript()._event(
+            SampleLimitEvent(type="working", limit=int(working_limit), message=message)
+        )
+        raise LimitExceededError(
             type="working",
             value=int(working_time),
             limit=int(working_limit),
-            message=f"Exceeded working time limit ({working_limit:,} seconds)",
+            message=message,
         )

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl