inspect-ai 0.3.93__py3-none-any.whl → 0.3.95__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +21 -12
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +51 -21
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +23 -22
- inspect_ai/_view/www/dist/assets/index.js +517 -204
- inspect_ai/_view/www/log-schema.json +375 -0
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +90 -12
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_as_solver.py +3 -1
- inspect_ai/agent/_as_tool.py +6 -4
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_handoff.py +5 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +6 -1
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +57 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
- inspect_ai/analysis/beta/_dataframe/util.py +157 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +10 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +1 -1
- inspect_ai/log/_log.py +21 -1
- inspect_ai/log/_samples.py +14 -17
- inspect_ai/log/_transcript.py +77 -35
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/model/_call_tools.py +44 -35
- inspect_ai/model/_model.py +51 -44
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/sglang.py +8 -2
- inspect_ai/model/_providers/vllm.py +6 -2
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +9 -23
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +7 -3
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_mcp.py +6 -5
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +114 -82
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
inspect_ai/tool/_tools/_web_search/_tavily.py
ADDED
@@ -0,0 +1,77 @@
+import os
+from typing import Awaitable, Callable
+
+import httpx
+from pydantic import BaseModel, Field
+from tenacity import (
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential_jitter,
+)
+
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai._util.httpx import httpx_should_retry, log_httpx_retry_attempt
+from inspect_ai.util._concurrency import concurrency
+
+
+class TavilySearchResult(BaseModel):
+    title: str
+    url: str
+    content: str
+    score: float
+
+
+class TavilySearchResponse(BaseModel):
+    query: str
+    answer: str | None = Field(default=None)
+    images: list[object]
+    results: list[TavilySearchResult]
+    response_time: float
+
+
+def tavily_search_provider(
+    num_results: int, max_connections: int
+) -> Callable[[str], Awaitable[str | None]]:
+    tavily_api_key = os.environ.get("TAVILY_API_KEY", None)
+    if not tavily_api_key:
+        raise PrerequisiteError(
+            "TAVILY_API_KEY not set in the environment. Please ensure this variable is defined to use Tavily with the web_search tool.\n\nLearn more about the Tavily web search provider at https://inspect.aisi.org.uk/tools.html#tavily-provider"
+        )
+    if num_results > 20:
+        raise PrerequisiteError(
+            "The Tavily search provider is limited to 20 results per query."
+        )
+
+    # Create the client within the provider
+    client = httpx.AsyncClient(timeout=30)
+
+    async def search(query: str) -> str | None:
+        search_url = "https://api.tavily.com/search"
+        headers = {
+            "Authorization": f"Bearer {tavily_api_key}",
+        }
+        body = {
+            "query": query,
+            "max_results": 10,  # num_results,
+            # "search_depth": "advanced",
+            "include_answer": "advanced",
+        }
+
+        # retry up to 5 times over a period of up to 1 minute
+        @retry(
+            wait=wait_exponential_jitter(),
+            stop=stop_after_attempt(5) | stop_after_delay(60),
+            retry=retry_if_exception(httpx_should_retry),
+            before_sleep=log_httpx_retry_attempt(search_url),
+        )
+        async def _search() -> httpx.Response:
+            response = await client.post(search_url, headers=headers, json=body)
+            response.raise_for_status()
+            return response
+
+        async with concurrency("tavily_web_search", max_connections):
+            return TavilySearchResponse.model_validate((await _search()).json()).answer
+
+    return search
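The provider above is an internal module normally reached through the `web_search()` tool, but a minimal sketch of exercising it directly could look like the following (the import path is taken from the file list above; the query string is purely illustrative and `TAVILY_API_KEY` must be set in the environment):

import asyncio

from inspect_ai.tool._tools._web_search._tavily import tavily_search_provider

async def demo() -> None:
    # build a search callable that allows at most 10 concurrent requests
    search = tavily_search_provider(num_results=5, max_connections=10)
    # returns Tavily's synthesized answer for the query (or None)
    answer = await search("current weather in London")
    print(answer)

asyncio.run(demo())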
inspect_ai/tool/_tools/_web_search/_web_search.py
ADDED
@@ -0,0 +1,85 @@
+from typing import Literal
+
+from inspect_ai._util.deprecation import deprecation_warning
+
+from ..._tool import Tool, ToolResult, tool
+from ._google import google_search_provider, maybe_get_google_api_keys
+from ._tavily import tavily_search_provider
+
+
+@tool
+def web_search(
+    provider: Literal["tavily", "google"] | None = None,
+    num_results: int = 3,
+    max_provider_calls: int = 3,
+    max_connections: int = 10,
+    model: str | None = None,
+) -> Tool:
+    """Web search tool.
+
+    A tool that can be registered for use by models to search the web. Use
+    the `use_tools()` solver to make the tool available (e.g.
+    `use_tools(web_search(provider="tavily"))`)
+
+    A web search is conducted using the specified provider.
+    - When using Tavily, all logic for relevance and summarization is handled by
+      the Tavily API.
+    - When using Google, the results are parsed for relevance using the specified
+      model, and the top 'num_results' relevant pages are returned.
+
+    See further documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-web-search>.
+
+    Args:
+      provider: Search provider to use:
+        - "tavily": Uses Tavily's Research API.
+        - "google": Uses Google Custom Search.
+        Note: The `| None` type is only for backwards compatibility. Passing
+        `None` is deprecated.
+      num_results: The number of search result pages used to provide information
+        back to the model.
+      max_provider_calls: Maximum number of search calls to make to the search
+        provider.
+      max_connections: Maximum number of concurrent connections to API endpoint
+        of search provider.
+      model: Model used to parse web pages for relevance - used only by the
+        `google` provider.
+
+    Returns:
+      A tool that can be registered for use by models to search the web.
+    """
+    if provider is None:
+        if maybe_get_google_api_keys():
+            deprecation_warning(
+                "The `google` `web_search` provider was inferred based on the presence of environment variables. Please specify the provider explicitly to avoid this warning."
+            )
+            provider = "google"
+        else:
+            raise ValueError(
+                "Omitting `provider` is no longer supported. Please specify the `web_search` provider explicitly to avoid this error."
+            )
+
+    search_provider = (
+        google_search_provider(num_results, max_provider_calls, max_connections, model)
+        if provider == "google"
+        else tavily_search_provider(num_results, max_connections)
+    )
+
+    async def execute(query: str) -> ToolResult:
+        """
+        Use the web_search tool to perform keyword searches of the web.
+
+        Args:
+          query (str): Search query.
+        """
+        search_result = await search_provider(query)
+
+        return (
+            (
+                "Here are your web search results. Please read them carefully as they may be useful later!\n"
+                + search_result
+            )
+            if search_result
+            else ("I'm sorry, I couldn't find any relevant information on the web.")
+        )
+
+    return execute
inspect_ai/util/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.util._limit import (
     token_limit,
 )
 
+from ._collect import collect
 from ._concurrency import concurrency
 from ._console import input_screen
 from ._display import DisplayType, display_counter, display_type
@@ -28,6 +29,7 @@ from ._sandbox import (
     sandbox_with,
     sandboxenv,
 )
+from ._span import span
 from ._store import Store, store
 from ._store_model import StoreModel, store_as
 from ._subprocess import (
@@ -71,6 +73,8 @@ __all__ = [
     "store",
     "StoreModel",
     "store_as",
+    "span",
+    "collect",
     "Subtask",
     "subtask",
     "throttle",
inspect_ai/util/_anyio.py
CHANGED
@@ -1,6 +1,10 @@
 import itertools
 import sys
 
+import anyio
+
+from inspect_ai._util._async import current_async_backend
+
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
 
@@ -36,3 +40,10 @@ def _flatten_exception(exc: Exception) -> list[Exception]:
     ]
 
     return maybe_this_exception + other_exceptions
+
+
+def safe_current_task_id() -> int | None:
+    if current_async_backend() is not None:
+        return anyio.get_current_task().id
+    else:
+        return None
inspect_ai/util/_collect.py
ADDED
@@ -0,0 +1,50 @@
+import sys
+from typing import Awaitable, TypeVar, cast
+
+import anyio
+
+from ._span import span
+
+if sys.version_info < (3, 11):
+    from exceptiongroup import ExceptionGroup
+
+
+T = TypeVar("T")
+
+
+async def collect(*tasks: Awaitable[T]) -> list[T]:
+    """Run and collect the results of one or more async coroutines.
+
+    Similar to [`asyncio.gather()`](https://docs.python.org/3/library/asyncio-task.html#asyncio.gather),
+    but also works when [Trio](https://trio.readthedocs.io/en/stable/) is the async backend.
+
+    Automatically includes each task in a `span()`, which
+    ensures that its events are grouped together in the transcript.
+
+    Using `collect()` in preference to `asyncio.gather()` is highly recommended
+    for both Trio compatibility and more legible transcript output.
+
+    Args:
+      *tasks: Tasks to run
+
+    Returns:
+      List of task results.
+    """
+    results: list[None | T] = [None] * len(tasks)
+
+    try:
+        async with anyio.create_task_group() as tg:
+
+            async def run_task(index: int, task: Awaitable[T]) -> None:
+                async with span(f"task-{index + 1}", type="task"):
+                    results[index] = await task
+
+            for i, task in enumerate(tasks):
+                tg.start_soon(run_task, i, task)
+    except ExceptionGroup as ex:
+        if len(ex.exceptions) == 1:
+            raise ex.exceptions[0] from None
+        else:
+            raise
+
+    return cast(list[T], results)
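A short sketch of how `collect()` might be used in agent or solver code in place of `asyncio.gather()` (the fan-out helper and prompts below are hypothetical):

from inspect_ai.model import get_model
from inspect_ai.util import collect

async def fan_out(prompt_a: str, prompt_b: str) -> list[str]:
    model = get_model()
    # each awaitable runs inside its own "task-N" span in the transcript
    outputs = await collect(
        model.generate(prompt_a),
        model.generate(prompt_b),
    )
    return [output.completion for output in outputs]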
inspect_ai/util/_sandbox/events.py
CHANGED
@@ -1,7 +1,7 @@
 import contextlib
 import shlex
 from datetime import datetime
-from typing import Iterator, Literal, Type, Union, overload
+from typing import Any, Iterator, Literal, Type, Union, overload
 
 from pydantic import JsonValue
 from pydantic_core import to_jsonable_python
@@ -134,7 +134,8 @@ class SandboxEnvironmentProxy(SandboxEnvironment):
 
     @override
     async def connection(self, *, user: str | None = None) -> SandboxConnection:
-
+        params: dict[str, Any] = {"user": user} if user is not None else {}
+        return await self._sandbox.connection(**params)
 
     @override
     def as_type(self, sandbox_cls: Type[ST]) -> ST:
inspect_ai/util/_span.py
ADDED
@@ -0,0 +1,58 @@
+import contextlib
+from contextvars import ContextVar
+from typing import AsyncIterator
+from uuid import uuid4
+
+
+@contextlib.asynccontextmanager
+async def span(name: str, *, type: str | None = None) -> AsyncIterator[None]:
+    """Context manager for establishing a transcript span.
+
+    Args:
+      name (str): Step name.
+      type (str | None): Optional span type.
+    """
+    from inspect_ai.log._transcript import (
+        SpanBeginEvent,
+        SpanEndEvent,
+        track_store_changes,
+        transcript,
+    )
+
+    # span id
+    id = uuid4().hex
+
+    # capture parent id
+    parent_id = _current_span_id.get()
+
+    # set new current span (reset at the end)
+    token = _current_span_id.set(id)
+
+    # run the span
+    try:
+        # span begin event
+        transcript()._event(
+            SpanBeginEvent(
+                id=id,
+                parent_id=parent_id,
+                type=type,
+                name=name,
+            )
+        )
+
+        # run span w/ store change events
+        with track_store_changes():
+            yield
+
+    finally:
+        # send end event
+        transcript()._event(SpanEndEvent(id=id))
+
+        _current_span_id.reset(token)
+
+
+def current_span_id() -> str | None:
+    return _current_span_id.get()
+
+
+_current_span_id: ContextVar[str | None] = ContextVar("_current_span_id", default=None)
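A minimal sketch of using the new `span()` context manager from solver code to group related transcript events (the solver and span names are hypothetical):

from inspect_ai.solver import Generate, TaskState, solver
from inspect_ai.util import span

@solver
def research_then_answer():
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # events emitted inside this block are nested under a "research" span
        async with span("research", type="phase"):
            state = await generate(state)
        return state

    return solve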
inspect_ai/util/_subtask.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util._async import is_callable_coroutine, tg_collect
 from inspect_ai._util.content import Content
 from inspect_ai._util.trace import trace_action
 from inspect_ai._util.working import sample_waiting_time
+from inspect_ai.util._span import span
 from inspect_ai.util._store import Store, dict_jsonable, init_subtask_store
 
 SubtaskResult = str | int | float | bool | list[Content]
@@ -85,9 +86,7 @@ def subtask(
 
     def create_subtask_wrapper(func: Subtask, name: str | None = None) -> Subtask:
         from inspect_ai.log._transcript import (
-            Event,
             SubtaskEvent,
-            track_store_changes,
             transcript,
         )
 
@@ -118,43 +117,41 @@
             log_input = dict_jsonable(log_input | kwargs)
 
             # create coroutine so we can provision a subtask contextvars
-            async def run() ->
+            async def run() -> RT:
                 # initialise subtask (provisions store and transcript)
-
+                init_subtask_store(store if store else Store())
 
                 # run the subtask
                 with trace_action(logger, "Subtask", subtask_name):
-                    with
+                    async with span(name=subtask_name, type="subtask"):
+                        # create subtask event
+                        waiting_time_start = sample_waiting_time()
+                        event = SubtaskEvent(
+                            name=subtask_name, input=log_input, type=type, pending=True
+                        )
+                        transcript()._event(event)
+
+                        # run the subtask
                         result = await func(*args, **kwargs)
 
-
-
+                        # time accounting
+                        completed = datetime.now()
+                        waiting_time_end = sample_waiting_time()
+                        event.completed = completed
+                        event.working_time = (
+                            completed - event.timestamp
+                        ).total_seconds() - (waiting_time_end - waiting_time_start)
 
-
-
-
-
-            )
-            transcript()._event(event)
-
-            # create and run the task as a coroutine
-            result, events = (await tg_collect([run]))[0]
-
-            # time accounting
-            completed = datetime.now()
-            waiting_time_end = sample_waiting_time()
-            event.completed = completed
-            event.working_time = (completed - event.timestamp).total_seconds() - (
-                waiting_time_end - waiting_time_start
-            )
+                        # update event
+                        event.result = result
+                        event.pending = None
+                        transcript()._event_updated(event)
 
-
-
-            event.events = events
-            event.pending = None
-            transcript()._event_updated(event)
+                        # return result
+                        return result  # type: ignore[no-any-return]
 
-            #
+            # create and run the task as a coroutine
+            result = (await tg_collect([run]))[0]
             return result
 
         return run_subtask
@@ -167,15 +164,3 @@ def subtask(
         return wrapper
     else:
        return create_subtask_wrapper(name)
-
-
-def init_subtask(name: str, store: Store) -> Any:
-    from inspect_ai.log._transcript import (
-        Transcript,
-        init_transcript,
-    )
-
-    init_subtask_store(store)
-    transcript = Transcript(name=name)
-    init_transcript(transcript)
-    return transcript
{inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_ai
-Version: 0.3.93
+Version: 0.3.95
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License
@@ -32,6 +32,8 @@ Requires-Dist: httpx
 Requires-Dist: ijson>=3.2.0
 Requires-Dist: jsonlines>=3.0.0
 Requires-Dist: jsonpatch>=1.32
+Requires-Dist: jsonpath-ng>=1.7.0
+Requires-Dist: jsonref>=1.1.0
 Requires-Dist: jsonschema>3.1.1
 Requires-Dist: mmh3>3.1.0
 Requires-Dist: nest_asyncio
@@ -59,6 +61,7 @@ Requires-Dist: google-genai; extra == "dev"
 Requires-Dist: griffe; extra == "dev"
 Requires-Dist: groq; extra == "dev"
 Requires-Dist: ipython; extra == "dev"
+Requires-Dist: jsonpath-ng; extra == "dev"
 Requires-Dist: markdown; extra == "dev"
 Requires-Dist: mcp; extra == "dev"
 Requires-Dist: mistralai; extra == "dev"
@@ -66,9 +69,11 @@ Requires-Dist: moto[server]; extra == "dev"
 Requires-Dist: mypy; extra == "dev"
 Requires-Dist: nbformat; extra == "dev"
 Requires-Dist: openai; extra == "dev"
+Requires-Dist: pandas>=2.0.0; extra == "dev"
 Requires-Dist: panflute; extra == "dev"
 Requires-Dist: pip; extra == "dev"
 Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: pyarrow>=10.0.1; extra == "dev"
 Requires-Dist: pylint; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-asyncio; extra == "dev"
@@ -78,6 +83,8 @@ Requires-Dist: pytest-xdist; extra == "dev"
 Requires-Dist: ruff==0.9.6; extra == "dev"
 Requires-Dist: textual-dev>=0.86.2; extra == "dev"
 Requires-Dist: trio; extra == "dev"
+Requires-Dist: pandas-stubs; extra == "dev"
+Requires-Dist: pyarrow-stubs; extra == "dev"
 Requires-Dist: types-Markdown; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-beautifulsoup4; extra == "dev"