inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/plain/display.py +9 -11
- inspect_ai/_display/textual/app.py +5 -5
- inspect_ai/_display/textual/widgets/samples.py +47 -18
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +15 -0
- inspect_ai/_util/logger.py +23 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +301 -1
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +905 -751
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -5
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +35 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +358 -302
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +81 -52
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +6 -6
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +3 -3
- inspect_ai/solver/_solver.py +3 -0
- inspect_ai/solver/_task_state.py +10 -1
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
- inspect_ai/util/_sandbox/docker/compose.py +5 -9
- inspect_ai/util/_sandbox/docker/docker.py +20 -6
- inspect_ai/util/_sandbox/docker/util.py +10 -1
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +2 -1
- inspect_ai/util/_subprocess.py +4 -1
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/model/_providers/groq.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+from copy import copy
 from typing import Any, Dict, Iterable, List, Optional
 
 import httpx
@@ -19,9 +20,14 @@ from groq.types.chat import (
     ChatCompletionToolMessageParam,
     ChatCompletionUserMessageParam,
 )
+from pydantic import JsonValue
 from typing_extensions import override
 
-from inspect_ai._util.constants import
+from inspect_ai._util.constants import (
+    BASE_64_DATA_REMOVED,
+    DEFAULT_MAX_RETRIES,
+    DEFAULT_MAX_TOKENS,
+)
 from inspect_ai._util.content import Content
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
@@ -48,6 +54,7 @@ from .util import (
     environment_prerequisite_error,
     model_base_url,
 )
+from .util.tracker import HttpxTimeTracker
 
 GROQ_API_KEY = "GROQ_API_KEY"
 
@@ -87,6 +94,9 @@ class GroqAPI(ModelAPI):
             http_client=httpx.AsyncClient(limits=httpx.Limits(max_connections=None)),
         )
 
+        # create time tracker
+        self._time_tracker = HttpxTimeTracker(self.client._client)
+
     @override
     async def close(self) -> None:
         await self.client.close()
@@ -98,6 +108,21 @@ class GroqAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> tuple[ModelOutput, ModelCall]:
+        # allocate request_id (so we can see it from ModelCall)
+        request_id = self._time_tracker.start_request()
+
+        # setup request and response for ModelCall
+        request: dict[str, Any] = {}
+        response: dict[str, Any] = {}
+
+        def model_call() -> ModelCall:
+            return ModelCall.create(
+                request=request,
+                response=response,
+                filter=model_call_filter,
+                time=self._time_tracker.end_request(request_id),
+            )
+
         messages = await as_groq_chat_messages(input)
 
         params = self.completion_params(config)
@@ -109,51 +134,52 @@ class GroqAPI(ModelAPI):
         if config.parallel_tool_calls is not None:
             params["parallel_tool_calls"] = config.parallel_tool_calls
 
-
+        request = dict(
             messages=messages,
             model=self.model_name,
+            extra_headers={HttpxTimeTracker.REQUEST_ID_HEADER: request_id},
             **params,
         )
 
+        completion: ChatCompletion = await self.client.chat.completions.create(
+            **request,
+        )
+
+        response = completion.model_dump()
+
         # extract metadata
         metadata: dict[str, Any] = {
-            "id":
-            "system_fingerprint":
-            "created":
+            "id": completion.id,
+            "system_fingerprint": completion.system_fingerprint,
+            "created": completion.created,
         }
-        if
+        if completion.usage:
             metadata = metadata | {
-                "queue_time":
-                "prompt_time":
-                "completion_time":
-                "total_time":
+                "queue_time": completion.usage.queue_time,
+                "prompt_time": completion.usage.prompt_time,
+                "completion_time": completion.usage.completion_time,
+                "total_time": completion.usage.total_time,
            }
 
         # extract output
-        choices = self._chat_choices_from_response(
+        choices = self._chat_choices_from_response(completion, tools)
         output = ModelOutput(
-            model=
+            model=completion.model,
            choices=choices,
            usage=(
                ModelUsage(
-                    input_tokens=
-                    output_tokens=
-                    total_tokens=
+                    input_tokens=completion.usage.prompt_tokens,
+                    output_tokens=completion.usage.completion_tokens,
+                    total_tokens=completion.usage.total_tokens,
                )
-                if
+                if completion.usage
                else None
            ),
            metadata=metadata,
        )
 
-        # record call
-        call = ModelCall.create(
-            request=dict(messages=messages, model=self.model_name, **params),
-            response=response.model_dump(),
-        )
-
         # return
-        return output,
+        return output, model_call()
 
     def completion_params(self, config: GenerateConfig) -> Dict[str, Any]:
         params: dict[str, Any] = {}
@@ -307,3 +333,11 @@ def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
         tool_calls=chat_tool_calls(message, tools),
         reasoning=reasoning,
     )
+
+
+def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
+    # remove base64 encoded images
+    if key == "image_url" and isinstance(value, dict):
+        value = copy(value)
+        value.update(url=BASE_64_DATA_REMOVED)
+    return value
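As an aside, the model_call_filter added above is passed to ModelCall.create so that recorded calls do not retain base64 image payloads. Below is a standalone sketch (not from the package) of its effect; BASE_64_DATA_REMOVED is a stand-in for the inspect_ai._util.constants value, whose exact string is not shown in this diff.

from copy import copy
from typing import Any

BASE_64_DATA_REMOVED = "<base64-data-removed>"  # placeholder for the package constant


def model_call_filter(key: str | None, value: Any) -> Any:
    # replace the url of any image_url dict before the call is logged
    if key == "image_url" and isinstance(value, dict):
        value = copy(value)
        value.update(url=BASE_64_DATA_REMOVED)
    return value


print(model_call_filter("image_url", {"url": "data:image/png;base64,iVBORw0KGgo..."}))
# -> {'url': '<base64-data-removed>'}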
inspect_ai/model/_providers/hf.py
CHANGED
@@ -4,6 +4,7 @@ import functools
 import gc
 import json
 import os
+import time
 from dataclasses import dataclass
 from queue import Empty, Queue
 from threading import Thread
@@ -220,6 +221,7 @@ class HuggingFaceAPI(ModelAPI):
                 output_tokens=response.output_tokens,
                 total_tokens=response.total_tokens,
             ),
+            time=response.time,
         )
 
     @override
@@ -377,6 +379,7 @@ class GenerateOutput:
     output_tokens: int
     total_tokens: int
     logprobs: torch.Tensor | None
+    time: float
 
 
 @dataclass
@@ -432,6 +435,7 @@ def process_batches() -> None:
 
         try:
             # capture the generator and decoder functions
+            start_time = time.monotonic()
             first_input = inputs[0][0]
             device = first_input.device
             tokenizer = first_input.tokenizer
@@ -467,6 +471,7 @@ def process_batches() -> None:
             outputs = decoder(sequences=generated_tokens)
 
             # call back futures
+            total_time = time.monotonic() - start_time
             for i, output in enumerate(outputs):
                 future = inputs[i][1]
                 input_tokens = input_ids.size(dim=1)
@@ -483,6 +488,7 @@ def process_batches() -> None:
                         output_tokens=output_tokens,
                         total_tokens=input_tokens + output_tokens,
                         logprobs=logprobs[i] if logprobs is not None else None,
+                        time=total_time,
                     ),
                 )
 
inspect_ai/model/_providers/mistral.py
CHANGED
@@ -61,6 +61,7 @@ from .._model_output import (
     StopReason,
 )
 from .util import environment_prerequisite_error, model_base_url
+from .util.tracker import HttpxTimeTracker
 
 AZURE_MISTRAL_API_KEY = "AZURE_MISTRAL_API_KEY"
 AZUREAI_MISTRAL_API_KEY = "AZUREAI_MISTRAL_API_KEY"
@@ -111,16 +112,12 @@ class MistralAPI(ModelAPI):
         if base_url:
             model_args["server_url"] = base_url
 
-
-        self.client = Mistral(
-            api_key=self.api_key,
-            timeout_ms=(config.timeout if config.timeout else DEFAULT_TIMEOUT) * 1000,
-            **model_args,
-        )
+        self.model_args = model_args
 
     @override
     async def close(self) -> None:
-
+        # client is created and destroyed in generate
+        pass
 
     async def generate(
         self,
@@ -129,51 +126,83 @@ class MistralAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
-        #
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-        # send request
-        try:
-            response = await self.client.chat.complete_async(**request)
-        except SDKError as ex:
-            if ex.status_code == 400:
-                return self.handle_bad_request(ex), mistral_model_call(request, None)
-            else:
-                raise ex
-
-        if response is None:
-            raise RuntimeError("Mistral model did not return a response from generate.")
-
-        # return model output (w/ tool calls if they exist)
-        choices = completion_choices_from_response(response, tools)
-        return ModelOutput(
-            model=response.model,
-            choices=choices,
-            usage=ModelUsage(
-                input_tokens=response.usage.prompt_tokens,
-                output_tokens=(
-                    response.usage.completion_tokens
-                    if response.usage.completion_tokens
-                    else response.usage.total_tokens - response.usage.prompt_tokens
+        # create client
+        with Mistral(
+            api_key=self.api_key,
+            timeout_ms=(config.timeout if config.timeout else DEFAULT_TIMEOUT) * 1000,
+            **self.model_args,
+        ) as client:
+            # create time tracker
+            time_tracker = HttpxTimeTracker(client.sdk_configuration.async_client)
+
+            # build request
+            request_id = time_tracker.start_request()
+            request: dict[str, Any] = dict(
+                model=self.model_name,
+                messages=await mistral_chat_messages(input),
+                tools=mistral_chat_tools(tools) if len(tools) > 0 else None,
+                tool_choice=(
+                    mistral_chat_tool_choice(tool_choice) if len(tools) > 0 else None
                 ),
-
-            )
-
+                http_headers={HttpxTimeTracker.REQUEST_ID_HEADER: request_id},
+            )
+            if config.temperature is not None:
+                request["temperature"] = config.temperature
+            if config.top_p is not None:
+                request["top_p"] = config.top_p
+            if config.max_tokens is not None:
+                request["max_tokens"] = config.max_tokens
+            if config.seed is not None:
+                request["random_seed"] = config.seed
+
+            # prepare response for inclusion in model call
+            response: dict[str, Any] = {}
+
+            def model_call() -> ModelCall:
+                req = request.copy()
+                req.update(
+                    messages=[message.model_dump() for message in req["messages"]]
+                )
+                if req.get("tools", None) is not None:
+                    req["tools"] = [tool.model_dump() for tool in req["tools"]]
+
+                return ModelCall.create(
+                    request=req,
+                    response=response,
+                    time=time_tracker.end_request(request_id),
+                )
+
+            # send request
+            try:
+                completion = await client.chat.complete_async(**request)
+                response = completion.model_dump()
+            except SDKError as ex:
+                if ex.status_code == 400:
+                    return self.handle_bad_request(ex), model_call()
+                else:
+                    raise ex
+
+            if completion is None:
+                raise RuntimeError(
+                    "Mistral model did not return a response from generate."
+                )
+
+            # return model output (w/ tool calls if they exist)
+            choices = completion_choices_from_response(completion, tools)
+            return ModelOutput(
+                model=completion.model,
+                choices=choices,
+                usage=ModelUsage(
+                    input_tokens=completion.usage.prompt_tokens,
+                    output_tokens=(
+                        completion.usage.completion_tokens
+                        if completion.usage.completion_tokens
+                        else completion.usage.total_tokens
+                        - completion.usage.prompt_tokens
+                    ),
+                    total_tokens=completion.usage.total_tokens,
+                ),
+            ), model_call()
 
     @override
     def is_rate_limit(self, ex: BaseException) -> bool:
@@ -205,7 +234,7 @@ def mistral_model_call(
     request.update(messages=[message.model_dump() for message in request["messages"]])
     if request.get("tools", None) is not None:
        request["tools"] = [tool.model_dump() for tool in request["tools"]]
-    return ModelCall(
+    return ModelCall.create(
        request=request, response=response.model_dump() if response else {}
    )
 
inspect_ai/model/_providers/openai.py
CHANGED
@@ -21,6 +21,7 @@ from inspect_ai._util.constants import DEFAULT_MAX_RETRIES
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
 from inspect_ai.model._openai import chat_choices_from_openai
+from inspect_ai.model._providers.util.tracker import HttpxTimeTracker
 from inspect_ai.tool import ToolChoice, ToolInfo
 
 from .._chat_message import ChatMessage
@@ -137,6 +138,9 @@ class OpenAIAPI(ModelAPI):
                 **model_args,
             )
 
+        # create time tracker
+        self._time_tracker = HttpxTimeTracker(self.client._client)
+
     def is_azure(self) -> bool:
         return self.service == "azure"
 
@@ -172,6 +176,9 @@ class OpenAIAPI(ModelAPI):
                 **self.completion_params(config, False),
             )
 
+        # allocate request_id (so we can see it from ModelCall)
+        request_id = self._time_tracker.start_request()
+
         # setup request and response for ModelCall
         request: dict[str, Any] = {}
         response: dict[str, Any] = {}
@@ -181,6 +188,7 @@ class OpenAIAPI(ModelAPI):
                 request=request,
                 response=response,
                 filter=image_url_filter,
+                time=self._time_tracker.end_request(request_id),
             )
 
         # unlike text models, vision models require a max_tokens (and set it to a very low
@@ -199,6 +207,7 @@ class OpenAIAPI(ModelAPI):
             tool_choice=openai_chat_tool_choice(tool_choice)
             if len(tools) > 0
             else NOT_GIVEN,
+            extra_headers={HttpxTimeTracker.REQUEST_ID_HEADER: request_id},
             **self.completion_params(config, len(tools) > 0),
         )
 
inspect_ai/model/_providers/providers.py
CHANGED
@@ -93,8 +93,8 @@ def vertex() -> type[ModelAPI]:
 @modelapi(name="google")
 def google() -> type[ModelAPI]:
     FEATURE = "Google API"
-    PACKAGE = "google-
-    MIN_VERSION = "
+    PACKAGE = "google-genai"
+    MIN_VERSION = "1.2.0"
 
     # workaround log spam
     # https://github.com/ray-project/ray/issues/24917
@@ -102,7 +102,7 @@ def google() -> type[ModelAPI]:
 
     # verify we have the package
     try:
-        import google.
+        import google.genai  # type: ignore  # noqa: F401
     except ImportError:
         raise pip_dependency_error(FEATURE, [PACKAGE])
 
@@ -110,9 +110,9 @@ def google() -> type[ModelAPI]:
     verify_required_version(FEATURE, PACKAGE, MIN_VERSION)
 
     # in the clear
-    from .google import
+    from .google import GoogleGenAIAPI
 
-    return
+    return GoogleGenAIAPI
 
 
 @modelapi(name="hf")
@@ -148,7 +148,7 @@ def cf() -> type[ModelAPI]:
 def mistral() -> type[ModelAPI]:
     FEATURE = "Mistral API"
     PACKAGE = "mistralai"
-    MIN_VERSION = "1.
+    MIN_VERSION = "1.5.0"
 
     # verify we have the package
     try:
inspect_ai/model/_providers/util/tracker.py
ADDED
@@ -0,0 +1,92 @@
+import re
+import time
+from typing import Any, cast
+
+import httpx
+from shortuuid import uuid
+
+
+class HttpTimeTracker:
+    def __init__(self) -> None:
+        # track request start times
+        self._requests: dict[str, float] = {}
+
+    def start_request(self) -> str:
+        request_id = uuid()
+        self._requests[request_id] = time.monotonic()
+        return request_id
+
+    def end_request(self, request_id: str) -> float:
+        # read the request time if (if available) and purge from dict
+        request_time = self._requests.pop(request_id, None)
+        if request_time is None:
+            raise RuntimeError(f"request_id not registered: {request_id}")
+
+        # return elapsed time
+        return time.monotonic() - request_time
+
+    def update_request_time(self, request_id: str) -> None:
+        request_time = self._requests.get(request_id, None)
+        if not request_time:
+            raise RuntimeError(f"No request registered for request_id: {request_id}")
+
+        # update the request time
+        self._requests[request_id] = time.monotonic()
+
+
+class BotoTimeTracker(HttpTimeTracker):
+    def __init__(self, session: Any) -> None:
+        from aiobotocore.session import AioSession
+
+        super().__init__()
+
+        # register hook
+        session = cast(AioSession, session._session)
+        session.register(
+            "before-send.bedrock-runtime.Converse", self.converse_before_send
+        )
+
+    def converse_before_send(self, **kwargs: Any) -> None:
+        user_agent = kwargs["request"].headers["User-Agent"].decode()
+        match = re.search(rf"{self.USER_AGENT_PREFIX}(\w+)", user_agent)
+        if match:
+            request_id = match.group(1)
+            self.update_request_time(request_id)
+
+    def user_agent_extra(self, request_id: str) -> str:
+        return f"{self.USER_AGENT_PREFIX}{request_id}"
+
+    USER_AGENT_PREFIX = "ins/rid#"
+
+
+class HttpxTimeTracker(HttpTimeTracker):
+    """Class which tracks the duration of successful (200 status) http requests.
+
+    A special header is injected into requests which is then read from
+    an httpx 'request' event hook -- this creates a record of when the request
+    started. Note that with retries a single request id could be started
+    several times; our request hook makes sure we always track the time of
+    the last request.
+
+    To determine the total time, we also install an httpx response hook. In
+    this hook we look for 200 responses which have a registered request id.
+    When we find one, we update the end time of the request.
+
+    There is an 'end_request()' method which gets the total requeset time
+    for a request_id and then purges the request_id from our tracking (so
+    the dict doesn't grow unbounded)
+    """
+
+    REQUEST_ID_HEADER = "x-irid"
+
+    def __init__(self, client: httpx.AsyncClient):
+        super().__init__()
+
+        # install httpx request hook
+        client.event_hooks["request"].append(self.request_hook)
+
+    async def request_hook(self, request: httpx.Request) -> None:
+        # update the last request time for this request id (as there could be retries)
+        request_id = request.headers.get(self.REQUEST_ID_HEADER, None)
+        if request_id:
+            self.update_request_time(request_id)
inspect_ai/model/_providers/vllm.py
CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import functools
 import gc
 import os
+import time
 from dataclasses import dataclass
 from queue import Empty, Queue
 from threading import Thread
@@ -48,7 +49,8 @@ class GenerateOutput:
     output_tokens: int
     total_tokens: int
     stop_reason: StopReason
-    logprobs: Logprobs | None
+    logprobs: Logprobs | None
+    time: float
 
 
 class VLLMAPI(ModelAPI):
@@ -258,6 +260,7 @@ class VLLMAPI(ModelAPI):
         ]
 
         # TODO: what's the best way to calculate token usage for num_choices > 1
+        total_time = responses[0].time
         input_tokens = responses[0].input_tokens
         output_tokens = sum(response.output_tokens for response in responses)
         total_tokens = input_tokens + output_tokens
@@ -270,6 +273,7 @@ class VLLMAPI(ModelAPI):
                 output_tokens=output_tokens,
                 total_tokens=total_tokens,
             ),
+            time=total_time,
         )
 
 
@@ -356,7 +360,7 @@ def get_stop_reason(finish_reason: str | None) -> StopReason:
 
 
 def post_process_output(
-    output: RequestOutput, i: int, num_top_logprobs: int | None
+    output: RequestOutput, i: int, num_top_logprobs: int | None, total_time: float
 ) -> GenerateOutput:
     completion = output.outputs[i]
     output_text: str = completion.text
@@ -377,14 +381,15 @@ def post_process_output(
         total_tokens=total_tokens,
         stop_reason=get_stop_reason(completion.finish_reason),
         logprobs=extract_logprobs(completion, num_top_logprobs),
+        time=total_time,
     )
 
 
 def post_process_outputs(
-    output: RequestOutput, num_top_logprobs: int | None
+    output: RequestOutput, num_top_logprobs: int | None, total_time: float
 ) -> list[GenerateOutput]:
     return [
-        post_process_output(output, i, num_top_logprobs)
+        post_process_output(output, i, num_top_logprobs, total_time)
         for i in range(len(output.outputs))
     ]
 
@@ -412,6 +417,7 @@ def process_batches() -> None:
             continue
 
         try:
+            start_time = time.monotonic()
             first_input = inputs[0][0]
             generator = first_input.generator
             num_top_logprobs = first_input.num_top_logprobs
@@ -419,6 +425,7 @@ def process_batches() -> None:
             # generate
             outputs = generator([input[0].input for input in inputs])
 
+            total_time = time.monotonic() - start_time
             for i, output in enumerate(outputs):
                 future = inputs[i][1]
 
@@ -426,7 +433,8 @@ def process_batches() -> None:
                 # down to this point, so we can mark the future as done in a thread safe manner.
                 # see: https://docs.python.org/3/library/asyncio-dev.html#concurrency-and-multithreading
                 loop.call_soon_threadsafe(
-                    future.set_result,
+                    future.set_result,
+                    post_process_outputs(output, num_top_logprobs, total_time),
                 )
 
         except Exception as e:
inspect_ai/solver/_basic_agent.py
CHANGED
@@ -24,7 +24,7 @@ logger = getLogger(__name__)
 
 DEFAULT_SYSTEM_MESSAGE = """
 You are a helpful assistant attempting to submit the correct answer. You have
-several functions available to help with finding the answer. Each message
+several functions available to help with finding the answer. Each message
 may perform one function call. You will see the result of the function right
 after sending the message. If you need to perform multiple actions, you can
 always send more messages with subsequent function calls. Do some reasoning
@@ -206,13 +206,11 @@ def basic_agent(
             # exit if we are at max_attempts
             attempts += 1
             if attempts >= max_attempts:
-                state.completed = True
                 break
 
             # exit if the submission is successful
             answer_scores = await score(state)
             if score_value_fn(answer_scores[0].value) == 1.0:
-                state.completed = True
                 break
 
             # otherwise notify the model that it was incorrect and continue
inspect_ai/solver/_bridge/patch.py
CHANGED
@@ -72,8 +72,6 @@ def init_openai_request_patch() -> None:
             _patch_enabled.get()
             # completions request
             and options.url == "/chat/completions"
-            # call to openai not another service (e.g. TogetherAI)
-            and self.base_url == "https://api.openai.com/v1/"
         ):
             # must also be an explicit request for an inspect model
             json_data = cast(dict[str, Any], options.json_data)
inspect_ai/solver/_limit.py
CHANGED
@@ -7,15 +7,15 @@ class SampleLimitExceededError(Exception):
     """Exception raised when a sample limit is exceeded.
 
     Args:
-        type
-        value
-        limit
+        type: Type of limit exceeded.
+        value: Value compared to
+        limit: Limit applied.
         message (str | None): Optional. Human readable message.
     """
 
     def __init__(
         self,
-        type: Literal["message", "time", "token", "operator", "custom"],
+        type: Literal["message", "time", "working", "token", "operator", "custom"],
         *,
         value: int,
         limit: int,