inspect-ai 0.3.72__py3-none-any.whl → 0.3.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -3
- inspect_ai/_cli/sandbox.py +3 -3
- inspect_ai/_cli/score.py +6 -4
- inspect_ai/_cli/trace.py +53 -6
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +2 -1
- inspect_ai/_display/core/footer.py +6 -6
- inspect_ai/_display/plain/display.py +11 -6
- inspect_ai/_display/rich/display.py +23 -13
- inspect_ai/_display/textual/app.py +10 -9
- inspect_ai/_display/textual/display.py +2 -2
- inspect_ai/_display/textual/widgets/footer.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +14 -5
- inspect_ai/_eval/context.py +1 -2
- inspect_ai/_eval/eval.py +54 -41
- inspect_ai/_eval/loader.py +9 -2
- inspect_ai/_eval/run.py +148 -81
- inspect_ai/_eval/score.py +13 -8
- inspect_ai/_eval/task/images.py +31 -21
- inspect_ai/_eval/task/run.py +62 -59
- inspect_ai/_eval/task/rundir.py +16 -9
- inspect_ai/_eval/task/sandbox.py +7 -8
- inspect_ai/_eval/task/util.py +7 -0
- inspect_ai/_util/_async.py +118 -10
- inspect_ai/_util/constants.py +0 -2
- inspect_ai/_util/file.py +15 -29
- inspect_ai/_util/future.py +37 -0
- inspect_ai/_util/http.py +3 -99
- inspect_ai/_util/httpx.py +60 -0
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/json.py +5 -52
- inspect_ai/_util/logger.py +30 -86
- inspect_ai/_util/retry.py +10 -61
- inspect_ai/_util/trace.py +2 -2
- inspect_ai/_view/server.py +86 -3
- inspect_ai/_view/www/dist/assets/index.js +25837 -13269
- inspect_ai/_view/www/log-schema.json +253 -186
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +8 -3
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +2 -3
- inspect_ai/_view/www/src/types/log.d.ts +122 -94
- inspect_ai/approval/_human/manager.py +6 -10
- inspect_ai/approval/_human/panel.py +2 -2
- inspect_ai/dataset/_sources/util.py +7 -6
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +35 -61
- inspect_ai/log/_log.py +18 -1
- inspect_ai/log/_recorders/eval.py +14 -23
- inspect_ai/log/_recorders/json.py +3 -18
- inspect_ai/log/_samples.py +27 -2
- inspect_ai/log/_transcript.py +8 -8
- inspect_ai/model/__init__.py +2 -1
- inspect_ai/model/_call_tools.py +60 -40
- inspect_ai/model/_chat_message.py +3 -2
- inspect_ai/model/_generate_config.py +25 -0
- inspect_ai/model/_model.py +74 -36
- inspect_ai/model/_openai.py +9 -1
- inspect_ai/model/_providers/anthropic.py +24 -26
- inspect_ai/model/_providers/azureai.py +11 -9
- inspect_ai/model/_providers/bedrock.py +33 -24
- inspect_ai/model/_providers/cloudflare.py +8 -9
- inspect_ai/model/_providers/goodfire.py +7 -3
- inspect_ai/model/_providers/google.py +47 -13
- inspect_ai/model/_providers/groq.py +15 -15
- inspect_ai/model/_providers/hf.py +24 -17
- inspect_ai/model/_providers/mistral.py +36 -20
- inspect_ai/model/_providers/openai.py +30 -25
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +3 -4
- inspect_ai/model/_providers/util/__init__.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +6 -19
- inspect_ai/model/_providers/util/hooks.py +165 -0
- inspect_ai/model/_providers/vertex.py +20 -3
- inspect_ai/model/_providers/vllm.py +16 -19
- inspect_ai/scorer/_multi.py +5 -2
- inspect_ai/solver/_bridge/patch.py +31 -1
- inspect_ai/solver/_fork.py +5 -3
- inspect_ai/solver/_human_agent/agent.py +3 -2
- inspect_ai/tool/__init__.py +8 -2
- inspect_ai/tool/_tool_info.py +4 -90
- inspect_ai/tool/_tool_params.py +4 -34
- inspect_ai/tool/_tools/_web_search.py +30 -24
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_concurrency.py +5 -6
- inspect_ai/util/_display.py +6 -0
- inspect_ai/util/_json.py +170 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +13 -9
- inspect_ai/util/_sandbox/docker/docker.py +5 -0
- inspect_ai/util/_sandbox/environment.py +56 -9
- inspect_ai/util/_sandbox/service.py +12 -5
- inspect_ai/util/_subprocess.py +94 -113
- inspect_ai/util/_subtask.py +2 -4
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/METADATA +6 -2
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/RECORD +99 -99
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/WHEEL +1 -1
- inspect_ai/_util/timeouts.py +0 -160
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/model/_providers/util/tracker.py +0 -92
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/loader.py
CHANGED

@@ -13,6 +13,7 @@ from typing import Any, Callable, Tuple, cast
 from typing_extensions import overload
 
 from inspect_ai._eval.task.util import task_file, task_run_dir
+from inspect_ai._util._async import configured_async_backend
 from inspect_ai._util.decorator import parse_decorators
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
@@ -306,11 +307,17 @@ def create_file_tasks(
             setattr(task, TASK_RUN_DIR_ATTR, run_dir)
             tasks.append(task)
 
-            # warn
+            # warn that chdir is deprecated
             if "chdir" in task.attribs:
+                if configured_async_backend() == "trio":
+                    raise RuntimeError(
+                        "The task 'chdir' attribute is not compatible with the trio async backend."
+                    )
+
                 warn_once(
                     logger,
-                    "The 'chdir' task attribute is deprecated
+                    "The 'chdir' task attribute is deprecated and will be removed in a future release "
+                    + "(you should write your tasks to not depend on their runtime working directory)",
                 )
 
     return tasks
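The guard added above turns a deprecated combination into a hard error: tasks that set the `chdir` attribute cannot run under the trio backend. A rough sketch of the gating pattern follows; the environment variable is an assumption for illustration only, since the real `configured_async_backend()` lives in `inspect_ai/_util/_async.py` and its selection mechanics are not shown in this diff.

```python
import os


def configured_async_backend() -> str:
    # assumed selection mechanism, for illustration only -- not
    # inspect_ai's actual configuration contract
    return os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()


def check_chdir_supported(attribs: dict[str, object]) -> None:
    # mirrors the guard added in create_file_tasks(): 'chdir' tasks
    # cannot run under the trio backend, so fail fast with a clear error
    if "chdir" in attribs and configured_async_backend() == "trio":
        raise RuntimeError(
            "The task 'chdir' attribute is not compatible with the trio async backend."
        )
```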
inspect_ai/_eval/run.py
CHANGED

@@ -1,8 +1,15 @@
-import asyncio
+import functools
 import logging
 import os
+import sys
 from typing import Any, Awaitable, Callable, Set, cast
 
+from inspect_ai._util.trace import trace_action
+
+if sys.version_info < (3, 11):
+    from exceptiongroup import ExceptionGroup
+
+import anyio
 from shortuuid import uuid
 from typing_extensions import Unpack
 
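The version gate above is the standard recipe for using `ExceptionGroup` on Python 3.10 and earlier via the `exceptiongroup` backport; `run_single()` below relies on it to unwrap the first child error when `debug_errors` is set. A small self-contained demonstration of the same shim:

```python
import sys

if sys.version_info < (3, 11):
    # pip install exceptiongroup -- provides ExceptionGroup before 3.11
    from exceptiongroup import ExceptionGroup

try:
    raise ExceptionGroup("two failures", [ValueError("a"), KeyError("b")])
except ExceptionGroup as eg:
    # as in run_single(): surface just the first child exception
    print(type(eg.exceptions[0]).__name__)  # ValueError
```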
@@ -12,6 +19,7 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
+from inspect_ai._util._async import tg_collect
 from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
@@ -44,7 +52,7 @@ from .task.log import TaskLogger
 from .task.run import TaskRunOptions, task_run
 from .task.rundir import task_run_dir_switching
 from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
-from .task.util import slice_dataset, task_run_dir
+from .task.util import slice_dataset, task_chdir, task_run_dir
 
 log = logging.getLogger(__name__)
 
@@ -67,6 +75,7 @@ async def eval_run(
     # see if we need to use run_dir switching
     run_dir = task_run_dir(tasks[0].task)
     multiple_run_dirs = any([task_run_dir(task.task) != run_dir for task in tasks])
+    tasks_chdir = any([task_chdir(task.task) is not None for task in tasks])
     has_sandbox = next((task.has_sandbox for task in tasks), None)
 
     # get cwd before switching to task dir
@@ -219,19 +228,25 @@
         # multiple mode is for running/displaying multiple
         # task definitions, which requires some smart scheduling
         # to ensure that we spread work among models
-        if
-            if
- …
+        if tasks_chdir:
+            if parallel > 1:
+                if multiple_run_dirs:
+                    with task_run_dir_switching():
+                        return await run_multiple(task_run_options, parallel)
+                else:
+                    with chdir(run_dir):
+                        return await run_multiple(task_run_options, parallel)
+
+            # single mode is for a single task definitions (which
+            # could in turn be executed for multiple models)
             else:
                 with chdir(run_dir):
-                return await
- …
-        # single mode is for a single task definitions (which
-        # could in turn be executed for multiple models)
+                    return await run_single(task_run_options, debug_errors)
         else:
- …
-            return await
+            if parallel > 1:
+                return await run_multiple(task_run_options, parallel)
+            else:
+                return await run_single(task_run_options, debug_errors)
 
     finally:
         # shutdown sandbox environments
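With this change, `eval_run()` takes the working-directory-switching paths only when at least one task still sets `chdir`; otherwise it dispatches directly to `run_multiple()` or `run_single()`. The `chdir(run_dir)` helper imported from `inspect_ai._util.path` is presumably a scoped working-directory change; a minimal equivalent of that pattern (Python 3.11 ships one as `contextlib.chdir`):

```python
import os
from contextlib import contextmanager
from typing import Iterator


@contextmanager
def chdir(path: str) -> Iterator[None]:
    # save the current working directory, switch for the duration of
    # the block, and always restore it afterwards
    prev = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev)
```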
@@ -246,28 +261,37 @@
 
 # single mode -- run a single logical task (could consist of multiple
 # executable tasks if we are evaluating against multiple models)
-async def run_single(tasks: list[TaskRunOptions]) -> list[EvalLog]:
-    # https://discuss.python.org/t/asyncio-cancel-a-cancellation-utility-as-a-coroutine-this-time-with-feeling/26304/3
-
+async def run_single(tasks: list[TaskRunOptions], debug_errors: bool) -> list[EvalLog]:
     async with display().task_screen(task_specs(tasks), parallel=False) as screen:
+        # init ui
         init_task_screen(screen)
-        asyncio_tasks = [asyncio.create_task(task_run(task)) for task in tasks]
 
+        results: list[tuple[int, EvalLog]] = []
         try:
- …
+            async with anyio.create_task_group() as tg:
+
+                async def run_task(index: int) -> None:
+                    result = await task_run(tasks[index])
+                    results.append((index, result))
+
+                for i in range(0, len(tasks)):
+                    tg.start_soon(run_task, i)
+        # exceptions can escape when debug_errors is True and that's okay
+        except ExceptionGroup as ex:
+            if debug_errors:
+                raise ex.exceptions[0]
+            else:
+                raise
+        except anyio.get_cancelled_exc_class():
+            # child tasks have already each handled this and updated results
+            pass
         finally:
+            # clear ui
             clear_task_screen()
 
+    # sort results by original index and return just the values
+    return [r for _, r in sorted(results)]
+
 
 # multiple mode -- run multiple logical tasks (requires some smart
 # schedluing to ensure that we are spreading work among models)
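A note on the `run_single()` rewrite: results are collected as `(index, result)` pairs and sorted at the end because anyio task-group children complete in whatever order they happen to finish, not the order they were started. A quick demonstration of that nondeterminism:

```python
import anyio


async def main() -> None:
    done: list[int] = []

    async def job(i: int) -> None:
        # later-started jobs finish sooner
        await anyio.sleep(0.05 * (3 - i))
        done.append(i)

    async with anyio.create_task_group() as tg:
        for i in range(3):
            tg.start_soon(job, i)

    print(done)  # [2, 1, 0] -- completion order, not start order


anyio.run(main)
```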
@@ -280,82 +304,125 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
 
     # setup pending tasks, queue, and results
     pending_tasks = tasks.copy()
-    queue: asyncio.Queue[TaskRunOptions] = asyncio.Queue()
     results: list[EvalLog] = []
     tasks_completed = 0
     total_tasks = len(tasks)
 
+    # produce/consume tasks
+    send_channel, receive_channel = anyio.create_memory_object_stream[TaskRunOptions](
+        parallel * 2
+    )
+
+    # find a task that keeps as many different models as possible running concurrently
     async def enque_next_task() -> bool:
         if tasks_completed < total_tasks:
-            #
- …
-            return True
-        else:
+            # filter out models that have no pending tasks
+            models_with_pending = {
+                model
+                for model in model_counts
+                if any(str(t.model) == model for t in pending_tasks)
+            }
+            if not models_with_pending:
                 return False
+
+            # among those models, pick one with the least usage
+            model = min(models_with_pending, key=lambda m: model_counts[m])
+
+            # now we know there's at least one pending task for this model so it's safe to pick it
+            next_task = next(t for t in pending_tasks if str(t.model) == model)
+            pending_tasks.remove(next_task)
+            model_counts[str(next_task.model)] += 1
+            with trace_action(
+                log, "Enque Task", f"task: {next_task.task.name} ({next_task.model})"
+            ):
+                await send_channel.send(next_task)
+            return True
         else:
             return False
 
     async def worker() -> None:
- …
+        try:
+            nonlocal tasks_completed
+            async for task_options in receive_channel:
+                result: EvalLog | None = None
+
+                # run the task
+                try:
+                    with trace_action(
+                        log,
+                        "Run Task",
+                        f"task: {task_options.task.name} ({task_options.model})",
+                    ):
+                        tg_results = await tg_collect(
+                            [functools.partial(task_run, task_options)]
+                        )
+                    # check for empty results list (indicates cancellation)
+                    if len(tg_results) == 0:
+                        # task was cancelled, break out of the worker loop
+                        result = None
+
+                    else:
+                        result = tg_results[0]
+                        results.append(result)
+
+                except Exception as ex:
+                    # errors generally don't escape from tasks (the exception being if an error
+                    # occurs during the final write of the log)
+                    log.error(
+                        f"Task '{task_options.task.name}' encountered an error during finalisation: {ex}"
+                    )
 
- …
+                # tracking
+                tasks_completed += 1
+                model_counts[str(task_options.model)] -= 1
+
+                # if a task was cancelled we are done
+                if not result or result.status == "cancelled":
+                    break
+
+                # check if there are more tasks to process
+                if tasks_completed < total_tasks:
+                    await enque_next_task()
+                elif tasks_completed == total_tasks:
+                    # all tasks are complete, close the stream
+                    try:
+                        await send_channel.aclose()
+                    except anyio.ClosedResourceError:
+                        # another worker might have already closed it
+                        pass
+        except anyio.EndOfStream:
+            pass
 
     # with task display
     async with display().task_screen(task_specs(tasks), parallel=True) as screen:
         # init screen
         init_task_screen(screen)
 
-        #
-        workers = [asyncio.create_task(worker()) for _ in range(0, parallel)]
-
-        # enque initial set of tasks
-        for _ in range(0, parallel):
-            await enque_next_task()
-
-        # wait for all tasks to complete
+        # Use anyio task group instead of manual task management
         try:
- …
+            async with anyio.create_task_group() as tg:
+                # start worker tasks
+                for _ in range(parallel):
+                    tg.start_soon(worker)
+
+                # enqueue initial set of tasks
+                for _ in range(min(parallel, total_tasks)):
+                    await enque_next_task()
+        except anyio.get_cancelled_exc_class():
             pass
         finally:
- …
+            # Always ensure channels are closed
+            try:
+                await send_channel.aclose()
+            except anyio.ClosedResourceError:
+                pass
 
- …
+            try:
+                await receive_channel.aclose()
+            except anyio.ClosedResourceError:
+                pass
+
+            clear_task_screen()
 
     return results
 
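`run_multiple()` swaps `asyncio.Queue` plus hand-managed worker tasks for an anyio memory object stream consumed by workers inside a task group. A stripped-down sketch of that producer/consumer shape (the integer `job` payload is illustrative, not inspect_ai's `TaskRunOptions`):

```python
import anyio


async def main() -> None:
    # bounded stream, analogous to create_memory_object_stream[TaskRunOptions](parallel * 2)
    send, receive = anyio.create_memory_object_stream[int](max_buffer_size=4)
    results: list[int] = []

    async def worker() -> None:
        # the `async for` ends cleanly once the send side is closed
        async for job in receive:
            results.append(job * job)  # stand-in for task_run()

    async with anyio.create_task_group() as tg:
        for _ in range(2):  # two workers, as with parallel=2
            tg.start_soon(worker)
        for job in range(8):
            await send.send(job)
        await send.aclose()  # signal end-of-stream so workers (and the group) finish

    print(sorted(results))  # [0, 1, 4, 9, 16, 25, 36, 49]


anyio.run(main)
```

The real scheduler is more involved: `enque_next_task()` picks the pending task whose model currently has the least work in flight, and the workers themselves refill the stream as they complete tasks.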
inspect_ai/_eval/score.py
CHANGED

@@ -1,10 +1,13 @@
-import asyncio
+import functools
 from copy import deepcopy
 from pathlib import Path
 from typing import Any, Callable, Literal, cast
 
+import anyio
+
 from inspect_ai._display import display
 from inspect_ai._eval.loader import scorer_from_spec
+from inspect_ai._util._async import tg_collect
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
@@ -53,7 +56,7 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers
 
-    return
+    return anyio.run(score_async, log, scorers, epochs_reducer, action)
 
 
 async def score_async(
@@ -105,13 +108,15 @@ async def score_async(
         def progress() -> None:
             p.update(1)
 
-        tasks = [
-            run_score_task(state, Target(sample.target), scorers, progress)
-            for (sample, state) in zip(log.samples, states)
-        ]
-
         # do scoring
-        scores: list[dict[str, SampleScore]] = await
+        scores: list[dict[str, SampleScore]] = await tg_collect(
+            [
+                functools.partial(
+                    run_score_task, state, Target(sample.target), scorers, progress
+                )
+                for (sample, state) in zip(log.samples, states)
+            ]
+        )
 
         # write them back (gather ensures that they come back in the same order)
         for index, score in enumerate(scores):
inspect_ai/_eval/task/images.py
CHANGED

@@ -1,16 +1,19 @@
-import asyncio
+import functools
 
+from inspect_ai._util._async import tg_collect
 from inspect_ai._util.constants import BASE_64_DATA_REMOVED
 from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentVideo
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_data_uri
 from inspect_ai.dataset import Sample
-from inspect_ai.model import ChatMessage
+from inspect_ai.model import ChatMessage
 from inspect_ai.solver import TaskState
 
 
 async def states_with_base64_content(states: list[TaskState]) -> list[TaskState]:
-    return await
+    return await tg_collect(
+        [functools.partial(state_with_base64_content, state) for state in states]
+    )
 
 
 async def state_with_base64_content(state: TaskState) -> TaskState:
@@ -24,8 +27,8 @@ def state_without_base64_content(state: TaskState) -> TaskState:
 
 
 async def samples_with_base64_content(samples: list[Sample]) -> list[Sample]:
-    return await
- …
+    return await tg_collect(
+        [functools.partial(sample_with_base64_content, sample) for sample in samples]
     )
 
 
@@ -50,8 +53,11 @@ def sample_without_base64_content(sample: Sample) -> Sample:
 async def messages_with_base64_content(
     messages: list[ChatMessage],
 ) -> list[ChatMessage]:
-    return await
- …
+    return await tg_collect(
+        [
+            functools.partial(message_with_base64_content, message)
+            for message in messages
+        ]
     )
 
 
@@ -60,27 +66,31 @@ def messages_without_base64_content(messages: list[ChatMessage]) -> list[ChatMessage]:
 
 
 async def message_with_base64_content(message: ChatMessage) -> ChatMessage:
-    if
-        return
- …
+    if not isinstance(message.content, str):
+        return message.model_copy(
+            update=dict(
+                content=[
+                    await chat_content_with_base64_content(content)
+                    for content in message.content
+                ]
+            )
         )
+
     else:
         return message
 
 
 def message_without_base64_content(message: ChatMessage) -> ChatMessage:
-    if
-        return
- …
+    if not isinstance(message.content, str):
+        return message.model_copy(
+            update=dict(
+                content=[
+                    chat_content_without_base64_content(content)
+                    for content in message.content
+                ]
+            )
        )
+
     else:
         return message
 
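Both message helpers rely on pydantic's copy-on-write `model_copy(update=...)` to produce an updated message rather than mutating it in place, leaving plain-string content untouched. A minimal standalone illustration of that pattern with a stand-in model (not inspect_ai's actual `ChatMessage`, though its messages are pydantic models):

```python
from pydantic import BaseModel


class Message(BaseModel):
    role: str
    content: str | list[str]


def with_upper_content(message: Message) -> Message:
    # only list content is rewritten; string content passes through,
    # mirroring message_with_base64_content() above
    if not isinstance(message.content, str):
        return message.model_copy(
            update=dict(content=[c.upper() for c in message.content])
        )
    else:
        return message


msg = Message(role="user", content=["see", "image"])
print(with_upper_content(msg).content)  # ['SEE', 'IMAGE']
```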