PyPI - inspect-ai - Versions diffs - 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl - Mend

inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)

inspect_ai/_cli/eval.py +13 -1
inspect_ai/_display/plain/display.py +9 -11
inspect_ai/_display/textual/app.py +5 -5
inspect_ai/_display/textual/widgets/samples.py +47 -18
inspect_ai/_display/textual/widgets/transcript.py +25 -12
inspect_ai/_eval/eval.py +14 -2
inspect_ai/_eval/evalset.py +6 -1
inspect_ai/_eval/run.py +6 -0
inspect_ai/_eval/task/run.py +44 -15
inspect_ai/_eval/task/task.py +26 -3
inspect_ai/_util/interrupt.py +15 -0
inspect_ai/_util/logger.py +23 -0
inspect_ai/_util/rich.py +7 -8
inspect_ai/_util/text.py +301 -1
inspect_ai/_util/transcript.py +10 -2
inspect_ai/_util/working.py +46 -0
inspect_ai/_view/www/dist/assets/index.css +56 -12
inspect_ai/_view/www/dist/assets/index.js +905 -751
inspect_ai/_view/www/log-schema.json +337 -2
inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
inspect_ai/_view/www/src/appearance/icons.ts +3 -1
inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
inspect_ai/_view/www/src/types/log.d.ts +188 -108
inspect_ai/_view/www/src/utils/format.ts +7 -4
inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_condense.py +1 -0
inspect_ai/log/_log.py +72 -12
inspect_ai/log/_samples.py +5 -5
inspect_ai/log/_transcript.py +31 -1
inspect_ai/model/_call_tools.py +1 -1
inspect_ai/model/_conversation.py +1 -1
inspect_ai/model/_model.py +35 -16
inspect_ai/model/_model_call.py +10 -3
inspect_ai/model/_providers/anthropic.py +13 -2
inspect_ai/model/_providers/bedrock.py +7 -0
inspect_ai/model/_providers/cloudflare.py +20 -7
inspect_ai/model/_providers/google.py +358 -302
inspect_ai/model/_providers/groq.py +57 -23
inspect_ai/model/_providers/hf.py +6 -0
inspect_ai/model/_providers/mistral.py +81 -52
inspect_ai/model/_providers/openai.py +9 -0
inspect_ai/model/_providers/providers.py +6 -6
inspect_ai/model/_providers/util/tracker.py +92 -0
inspect_ai/model/_providers/vllm.py +13 -5
inspect_ai/solver/_basic_agent.py +1 -3
inspect_ai/solver/_bridge/patch.py +0 -2
inspect_ai/solver/_limit.py +4 -4
inspect_ai/solver/_plan.py +3 -3
inspect_ai/solver/_solver.py +3 -0
inspect_ai/solver/_task_state.py +10 -1
inspect_ai/tool/_tools/_web_search.py +3 -3
inspect_ai/util/_concurrency.py +14 -8
inspect_ai/util/_sandbox/context.py +15 -0
inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
inspect_ai/util/_sandbox/docker/compose.py +5 -9
inspect_ai/util/_sandbox/docker/docker.py +20 -6
inspect_ai/util/_sandbox/docker/util.py +10 -1
inspect_ai/util/_sandbox/environment.py +32 -1
inspect_ai/util/_sandbox/events.py +149 -0
inspect_ai/util/_sandbox/local.py +3 -3
inspect_ai/util/_sandbox/self_check.py +2 -1
inspect_ai/util/_subprocess.py +4 -1
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0

inspect_ai/_eval/task/task.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
-from typing import Any, Callable, Sequence, cast
+from typing import Any, Awaitable, Callable, Sequence, cast
 from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
@@ -17,6 +17,7 @@ from inspect_ai.scorer import Metric, Scorer
 from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
 from inspect_ai.solver import Plan, Solver, generate
 from inspect_ai.solver._chain import chain
+from inspect_ai.solver._task_state import TaskState
 from inspect_ai.util._sandbox.environment import (
     SandboxEnvironmentSpec,
     SandboxEnvironmentType,
@@ -46,6 +47,7 @@ class Task:
         dataset: Dataset | Sequence[Sample] | None = None,
         setup: Solver | list[Solver] | None = None,
         solver: Solver | list[Solver] = generate(),
+        cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
         config: GenerateConfig = GenerateConfig(),
@@ -56,6 +58,7 @@ class Task:
         message_limit: int | None = None,
         token_limit: int | None = None,
         time_limit: int | None = None,
+        working_limit: int | None = None,
         name: str | None = None,
         version: int = 0,
         metadata: dict[str, Any] | None = None,
@@ -69,6 +72,9 @@ class Task:
                 even when the main `solver` is replaced).
             solver: (Solver | list[Solver]): Solver or list of solvers.
                 Defaults to generate(), a normal call to the model.
+            cleanup: Optional cleanup function for task. Called after
+                all solvers have run for each sample (including if an
+                exception occurs during the run)
             scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
             metrics (list[Metric] | dict[str, list[Metric]] | None):
                 Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -86,7 +92,10 @@ class Task:
                 eval if a count of samples fails.
             message_limit (int | None): Limit on total messages used for each sample.
             token_limit (int | None): Limit on total tokens used for each sample.
-            time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+            time_limit: Limit on clock time (in seconds) for samples.
+            working_limit: Limit on working time (in seconds) for sample. Working
+                time includes model generation, tool calls, etc. but does not include
+                time spent waiting on retries or shared resources.
             name: (str | None): Task name. If not specified is automatically
                 determined based on the name of the task directory (or "task")
                 if its anonymous task (e.g. created in a notebook and passed to
@@ -123,6 +132,7 @@ class Task:
         self.dataset = resolve_dataset(dataset)
         self.setup = setup
         self.solver = resolve_solver(solver)
+        self.cleanup = cleanup
         self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
@@ -135,6 +145,7 @@ class Task:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.version = version
         self._name = name
         self.metadata = metadata
@@ -162,6 +173,7 @@ def task_with(
     dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
     setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
     solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
     scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
@@ -172,6 +184,7 @@ def task_with(
     message_limit: int | None | NotGiven = NOT_GIVEN,
     token_limit: int | None | NotGiven = NOT_GIVEN,
     time_limit: int | None | NotGiven = NOT_GIVEN,
+    working_limit: int | None | NotGiven = NOT_GIVEN,
     name: str | None | NotGiven = NOT_GIVEN,
     version: int | NotGiven = NOT_GIVEN,
     metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
@@ -185,6 +198,9 @@ def task_with(
             even when the main `solver` is replaced).
         solver: (Solver | list[Solver]): Solver or list of solvers.
             Defaults to generate(), a normal call to the model.
+        cleanup: Optional cleanup function for task. Called after
+            all solvers have run for each sample (including if an
+            exception occurs during the run)
         scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
         metrics (list[Metric] | dict[str, list[Metric]] | None):
             Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -202,7 +218,10 @@ def task_with(
             eval if a count of samples fails.
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on execution time (in seconds) for sample. Execution
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         name: (str | None): Task name. If not specified is automatically
             determined based on the name of the task directory (or "task")
             if its anonymous task (e.g. created in a notebook and passed to
@@ -223,6 +242,8 @@ def task_with(
         task.setup = setup
     if not isinstance(solver, NotGiven):
         task.solver = resolve_solver(solver)
+    if not isinstance(cleanup, NotGiven):
+        task.cleanup = cleanup
     if not isinstance(scorer, NotGiven):
         task.scorer = resolve_scorer(scorer)
     if not isinstance(metrics, NotGiven):
@@ -245,6 +266,8 @@ def task_with(
         task.token_limit = token_limit
     if not isinstance(time_limit, NotGiven):
         task.time_limit = time_limit
+    if not isinstance(working_limit, NotGiven):
+        task.working_limit = working_limit
     if not isinstance(version, NotGiven):
         task.version = version
     if not isinstance(name, NotGiven):

inspect_ai/_util/interrupt.py ADDED Viewed

@@ -0,0 +1,15 @@
+import asyncio
+from .working import check_sample_working_limit
+def check_sample_interrupt() -> None:
+    from inspect_ai.log._samples import sample_active
+    # check for user interrupt
+    sample = sample_active()
+    if sample and sample.interrupt_action:
+        raise asyncio.CancelledError()
+    # check for working_limit
+    check_sample_working_limit()

inspect_ai/_util/logger.py CHANGED Viewed

@@ -90,6 +90,10 @@ class LogHandler(RichHandler):
         if "Event loop is closed" in record.getMessage():
             return
+        # skip google-genai AFC message
+        if "AFC is enabled with max remote calls" in record.getMessage():
+            return
         # write to stderr if we are at or above the threshold
         if record.levelno >= self.display_level:
             super().emit(record)
@@ -156,7 +160,9 @@ def init_logger(
     # init logging handler on demand
     global _logHandler
+    removed_root_handlers = False
     if not _logHandler:
+        removed_root_handlers = remove_non_pytest_root_logger_handlers()
         _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
         getLogger().addHandler(_logHandler)
@@ -169,6 +175,11 @@ def init_logger(
     getLogger("httpx").setLevel(capture_level)
     getLogger("botocore").setLevel(DEBUG)
+    if removed_root_handlers:
+        getLogger(PKG_NAME).warning(
+            "Inspect removed pre-existing root logger handlers and replaced them with its own handler."
+        )
     # set the levelno on the global handler
     _logHandler.display_level = levelno
@@ -176,6 +187,18 @@ def init_logger(
 _logHandler: LogHandler | None = None
+def remove_non_pytest_root_logger_handlers() -> bool:
+    root_logger = getLogger()
+    non_pytest_handlers = [
+        handler
+        for handler in root_logger.handlers
+        if handler.__module__ != "_pytest.logging"
+    ]
+    for handler in non_pytest_handlers:
+        root_logger.removeHandler(handler)
+    return len(non_pytest_handlers) > 0
 def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._message import LoggingMessage
     from inspect_ai.log._transcript import LoggerEvent, transcript

inspect_ai/_util/rich.py CHANGED Viewed

@@ -2,23 +2,22 @@ from rich.console import RenderableType
 from rich.style import Style
 from rich.text import Text
+from inspect_ai._util.text import truncate_lines
 def lines_display(
     text: str, max_lines: int = 100, style: str | Style = ""
 ) -> list[RenderableType]:
-    lines = text.splitlines()
-    if len(lines) > max_lines:
-        content: list[RenderableType] = [
-            Text("\n".join(lines[0:max_lines]), style=style)
-        ]
+    lines, truncated = truncate_lines(text, max_lines)
+    content: list[RenderableType] = [Text(lines, style=style)]
+    if truncated is not None:
         content.append(Text())
         content.append(
             Text.from_markup(
-                f"[italic]Output truncated ({len(lines) - max_lines} additional lines)...[/italic]",
+                f"[italic]Output truncated ({truncated} additional lines)...[/italic]",
                 style=style,
             )
         )
-    else:
-        content = [Text(text, style=style)]
     return content

inspect_ai/_util/text.py CHANGED Viewed

@@ -1,7 +1,8 @@
+import random
 import re
 import string
 from logging import getLogger
-from typing import NamedTuple
+from typing import List, NamedTuple
 logger = getLogger(__name__)
@@ -131,3 +132,302 @@ def truncate(text: str, length: int, overflow: str = "...", pad: bool = True) ->
     truncated = text[: length - overflow_length] + overflow
     return truncated
+def truncate_lines(
+    text: str, max_lines: int = 100, max_characters: int | None = 100 * 100
+) -> tuple[str, int | None]:
+    if max_characters is not None:
+        text = truncate(text, max_characters)
+    lines = text.splitlines()
+    if len(lines) > max_lines:
+        output = "\n".join(lines[0:max_lines])
+        return output, len(lines) - max_lines
+    else:
+        return text, None
+def generate_large_text(target_tokens: int) -> str:
+    """Generate a large amount of text with approximately the target number of tokens"""
+    generated_text = []
+    estimated_tokens = 0
+    while estimated_tokens < target_tokens:
+        sentence = generate_sentence()
+        # Add paragraph breaks occasionally
+        if random.random() < 0.1:
+            sentence += "\n\n"
+        generated_text.append(sentence)
+        # Rough estimate of tokens (words + punctuation)
+        estimated_tokens += len(sentence.split()) + 2
+    return " ".join(generated_text)
+def generate_sentence() -> str:
+    """Generate a random sentence using predefined templates"""
+    adjectives, nouns, verbs = create_word_lists()
+    templates = [
+        f"The {random.choice(adjectives)} {random.choice(nouns)} {random.choice(verbs)} the {random.choice(adjectives)} {random.choice(nouns)}.",
+        f"A {random.choice(adjectives)} {random.choice(nouns)} {random.choice(verbs)} near the {random.choice(nouns)}.",
+        f"In the {random.choice(adjectives)} {random.choice(nouns)}, the {random.choice(nouns)} {random.choice(verbs)} {random.choice(adjectives)}.",
+        f"When the {random.choice(nouns)} {random.choice(verbs)}, a {random.choice(adjectives)} {random.choice(nouns)} {random.choice(verbs)}.",
+        f"The {random.choice(nouns)} {random.choice(verbs)} while the {random.choice(adjectives)} {random.choice(nouns)} {random.choice(verbs)}.",
+    ]
+    return random.choice(templates)
+def create_word_lists() -> tuple[List[str], List[str], List[str]]:
+    """Create basic word lists for sentence generation"""
+    # Common adjectives
+    adjectives = [
+        "red",
+        "blue",
+        "green",
+        "dark",
+        "bright",
+        "quiet",
+        "loud",
+        "small",
+        "large",
+        "quick",
+        "slow",
+        "happy",
+        "sad",
+        "clever",
+        "wise",
+        "ancient",
+        "modern",
+        "complex",
+        "simple",
+        "elegant",
+        "rough",
+        "smooth",
+        "sharp",
+        "dull",
+        "fresh",
+        "stale",
+        "clean",
+        "dirty",
+        "heavy",
+        "light",
+        "hot",
+        "cold",
+        "dry",
+        "wet",
+        "rich",
+        "poor",
+        "thick",
+        "thin",
+        "strong",
+        "weak",
+        "early",
+        "late",
+        "young",
+        "old",
+        "good",
+        "bad",
+        "high",
+        "low",
+        "long",
+        "short",
+        "deep",
+        "shallow",
+        "hard",
+        "soft",
+        "near",
+        "far",
+        "wide",
+        "narrow",
+        "big",
+        "little",
+        "fast",
+        "slow",
+        "busy",
+        "lazy",
+        "new",
+        "old",
+        "full",
+        "empty",
+        "loud",
+        "quiet",
+        "sweet",
+        "sour",
+        "brave",
+        "scared",
+    ]
+    # Common nouns
+    nouns = [
+        "time",
+        "person",
+        "year",
+        "way",
+        "day",
+        "thing",
+        "man",
+        "world",
+        "life",
+        "hand",
+        "part",
+        "child",
+        "eye",
+        "woman",
+        "place",
+        "work",
+        "week",
+        "case",
+        "point",
+        "group",
+        "number",
+        "room",
+        "fact",
+        "idea",
+        "water",
+        "money",
+        "month",
+        "book",
+        "line",
+        "city",
+        "business",
+        "night",
+        "question",
+        "story",
+        "job",
+        "word",
+        "house",
+        "power",
+        "game",
+        "country",
+        "plant",
+        "animal",
+        "tree",
+        "stone",
+        "river",
+        "fire",
+        "problem",
+        "theory",
+        "street",
+        "family",
+        "history",
+        "mind",
+        "car",
+        "music",
+        "art",
+        "nation",
+        "science",
+        "nature",
+        "truth",
+        "peace",
+        "voice",
+        "class",
+        "paper",
+        "space",
+        "ground",
+        "market",
+        "court",
+        "force",
+        "price",
+        "action",
+        "reason",
+        "love",
+        "law",
+        "bird",
+        "literature",
+        "knowledge",
+        "society",
+        "valley",
+        "ocean",
+        "machine",
+        "energy",
+        "metal",
+        "mountain",
+    ]
+    # Common verbs (present tense)
+    verbs = [
+        "run",
+        "walk",
+        "jump",
+        "sing",
+        "dance",
+        "write",
+        "read",
+        "speak",
+        "listen",
+        "watch",
+        "think",
+        "grow",
+        "live",
+        "play",
+        "work",
+        "move",
+        "stop",
+        "start",
+        "create",
+        "destroy",
+        "build",
+        "break",
+        "push",
+        "pull",
+        "open",
+        "close",
+        "rise",
+        "fall",
+        "increase",
+        "decrease",
+        "begin",
+        "end",
+        "love",
+        "hate",
+        "help",
+        "hurt",
+        "make",
+        "take",
+        "give",
+        "receive",
+        "buy",
+        "sell",
+        "eat",
+        "drink",
+        "sleep",
+        "wake",
+        "laugh",
+        "cry",
+        "learn",
+        "teach",
+        "change",
+        "stay",
+        "come",
+        "go",
+        "arrive",
+        "leave",
+        "enter",
+        "exit",
+        "succeed",
+        "fail",
+        "win",
+        "lose",
+        "fight",
+        "defend",
+        "attack",
+        "protect",
+        "save",
+        "waste",
+        "gather",
+        "scatter",
+        "collect",
+        "distribute",
+        "join",
+        "separate",
+        "unite",
+        "divide",
+        "share",
+    ]
+    return adjectives, nouns, verbs

inspect_ai/_util/transcript.py CHANGED Viewed

@@ -122,8 +122,16 @@ def transcript_reasoning(reasoning: str) -> list[RenderableType]:
     return content
-def transcript_separator(title: str, color: str) -> RenderableType:
-    return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")
+def transcript_separator(
+    title: str, color: str, characters: str = "─"
+) -> RenderableType:
+    return Rule(
+        title=title,
+        characters=characters,
+        style=f"{color} bold",
+        align="center",
+        end="\n\n",
+    )
 def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableType:

inspect_ai/_util/working.py ADDED Viewed

@@ -0,0 +1,46 @@
+import time
+from contextvars import ContextVar
+def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
+    _sample_working_limit.set(working_limit)
+    _sample_start_time.set(start_time)
+    _sample_waiting_time.set(0)
+def sample_waiting_time() -> float:
+    return _sample_waiting_time.get()
+def report_sample_waiting_time(waiting_time: float) -> None:
+    _sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
+    check_sample_working_limit()
+def check_sample_working_limit() -> None:
+    # no check if we don't have a limit
+    working_limit = _sample_working_limit.get()
+    if working_limit is None:
+        return
+    # are we over the limit?
+    running_time = time.monotonic() - _sample_start_time.get()
+    working_time = running_time - sample_waiting_time()
+    if working_time > working_limit:
+        from inspect_ai.solver._limit import SampleLimitExceededError
+        raise SampleLimitExceededError(
+            type="working",
+            value=int(working_time),
+            limit=int(working_limit),
+            message=f"Exceeded working time limit ({working_limit:,} seconds)",
+        )
+_sample_working_limit: ContextVar[float | None] = ContextVar(
+    "sample_working_limit", default=None
+)
+_sample_start_time: ContextVar[float] = ContextVar("sample_start_time", default=0)
+_sample_waiting_time: ContextVar[float] = ContextVar("sample_waiting_time", default=0)

inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl

inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl