inspect-ai 0.3.88__py3-none-any.whl → 0.3.89__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (86)
  1. inspect_ai/_cli/eval.py +16 -0
  2. inspect_ai/_cli/score.py +1 -12
  3. inspect_ai/_cli/util.py +4 -2
  4. inspect_ai/_display/core/footer.py +2 -2
  5. inspect_ai/_display/plain/display.py +2 -2
  6. inspect_ai/_eval/context.py +7 -1
  7. inspect_ai/_eval/eval.py +51 -27
  8. inspect_ai/_eval/evalset.py +27 -10
  9. inspect_ai/_eval/loader.py +7 -8
  10. inspect_ai/_eval/run.py +23 -31
  11. inspect_ai/_eval/score.py +18 -1
  12. inspect_ai/_eval/task/log.py +5 -13
  13. inspect_ai/_eval/task/resolved.py +1 -0
  14. inspect_ai/_eval/task/run.py +231 -244
  15. inspect_ai/_eval/task/task.py +25 -2
  16. inspect_ai/_eval/task/util.py +1 -8
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/json.py +8 -3
  19. inspect_ai/_util/registry.py +30 -13
  20. inspect_ai/_view/www/App.css +5 -0
  21. inspect_ai/_view/www/dist/assets/index.css +55 -18
  22. inspect_ai/_view/www/dist/assets/index.js +550 -458
  23. inspect_ai/_view/www/log-schema.json +66 -0
  24. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
  25. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
  26. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
  27. inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
  28. inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
  29. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
  30. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
  31. inspect_ai/_view/www/src/types/log.d.ts +24 -6
  32. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
  33. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
  34. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
  35. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
  36. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
  37. inspect_ai/agent/_agent.py +12 -0
  38. inspect_ai/agent/_as_tool.py +1 -1
  39. inspect_ai/agent/_bridge/bridge.py +9 -2
  40. inspect_ai/agent/_react.py +142 -74
  41. inspect_ai/agent/_run.py +13 -2
  42. inspect_ai/agent/_types.py +6 -0
  43. inspect_ai/approval/_apply.py +6 -7
  44. inspect_ai/approval/_approver.py +3 -3
  45. inspect_ai/approval/_auto.py +2 -2
  46. inspect_ai/approval/_call.py +20 -4
  47. inspect_ai/approval/_human/approver.py +3 -3
  48. inspect_ai/approval/_human/manager.py +2 -2
  49. inspect_ai/approval/_human/panel.py +3 -3
  50. inspect_ai/approval/_policy.py +3 -3
  51. inspect_ai/log/__init__.py +2 -0
  52. inspect_ai/log/_log.py +23 -2
  53. inspect_ai/log/_model.py +58 -0
  54. inspect_ai/log/_recorders/file.py +14 -3
  55. inspect_ai/log/_transcript.py +3 -0
  56. inspect_ai/model/__init__.py +2 -0
  57. inspect_ai/model/_call_tools.py +4 -1
  58. inspect_ai/model/_model.py +49 -3
  59. inspect_ai/model/_openai.py +151 -21
  60. inspect_ai/model/_providers/anthropic.py +20 -12
  61. inspect_ai/model/_providers/bedrock.py +3 -3
  62. inspect_ai/model/_providers/cloudflare.py +29 -108
  63. inspect_ai/model/_providers/google.py +21 -10
  64. inspect_ai/model/_providers/grok.py +23 -17
  65. inspect_ai/model/_providers/groq.py +61 -37
  66. inspect_ai/model/_providers/llama_cpp_python.py +8 -9
  67. inspect_ai/model/_providers/mistral.py +8 -3
  68. inspect_ai/model/_providers/ollama.py +8 -9
  69. inspect_ai/model/_providers/openai.py +53 -157
  70. inspect_ai/model/_providers/openai_compatible.py +195 -0
  71. inspect_ai/model/_providers/openrouter.py +4 -15
  72. inspect_ai/model/_providers/providers.py +11 -0
  73. inspect_ai/model/_providers/together.py +25 -23
  74. inspect_ai/model/_trim.py +83 -0
  75. inspect_ai/solver/_plan.py +5 -3
  76. inspect_ai/tool/_tool_def.py +8 -2
  77. inspect_ai/util/__init__.py +3 -0
  78. inspect_ai/util/_concurrency.py +15 -2
  79. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
  80. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +84 -79
  81. inspect_ai/_eval/task/rundir.py +0 -78
  82. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  83. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
  84. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
  85. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
  86. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
inspect_ai/model/_providers/cloudflare.py

@@ -1,33 +1,24 @@
 import os
 from typing import Any
 
-import httpx
+from openai import APIStatusError
 from typing_extensions import override
 
 from inspect_ai._util.constants import DEFAULT_MAX_TOKENS
-from inspect_ai.tool import ToolChoice, ToolInfo
+from inspect_ai.model._model_output import ModelOutput
+from inspect_ai.model._providers.openai_compatible import OpenAICompatibleAPI
 
-from ...model import ChatMessage, GenerateConfig, ModelAPI, ModelOutput
-from .._model_call import ModelCall
-from .._model_output import ChatCompletionChoice
-from .util import (
-    ChatAPIHandler,
-    Llama31Handler,
-    chat_api_input,
-    chat_api_request,
-    environment_prerequisite_error,
-    model_base_url,
-    should_retry_chat_api_error,
-)
-from .util.hooks import HttpxHooks
+from ...model import GenerateConfig
+from .util import environment_prerequisite_error
 
 # https://developers.cloudflare.com/workers-ai/models/#text-generation
+# https://developers.cloudflare.com/workers-ai/configuration/open-ai-compatibility/
 
-
+CLOUDFLARE_API_KEY = "CLOUDFLARE_API_KEY"
 CLOUDFLARE_API_TOKEN = "CLOUDFLARE_API_TOKEN"
 
 
-class CloudFlareAPI(ModelAPI):
+class CloudFlareAPI(OpenAICompatibleAPI):
     def __init__(
         self,
         model_name: str,
@@ -36,98 +27,34 @@ class CloudFlareAPI(ModelAPI):
         config: GenerateConfig = GenerateConfig(),
         **model_args: Any,
     ):
+        # migrate formerly used CLOUDFLARE_API_TOKEN if no other key is specified
+        if api_key is None and CLOUDFLARE_API_KEY not in os.environ:
+            api_key = os.environ.get(CLOUDFLARE_API_TOKEN, None)
+
+        # account id used for limits and forming base url
+        self.account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID", None)
+        if not self.account_id:
+            raise environment_prerequisite_error("CloudFlare", "CLOUDFLARE_ACCOUNT_ID")
+
         super().__init__(
-            model_name=model_name,
+            model_name=f"@cf/{model_name}",
             base_url=base_url,
             api_key=api_key,
-            api_key_vars=[CLOUDFLARE_API_TOKEN],
             config=config,
+            service="CloudFlare",
+            service_base_url=f"https://api.cloudflare.com/client/v4/accounts/{self.account_id}/ai/v1",
+            **model_args,
         )
-        self.account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID")
-        if not self.account_id:
-            raise environment_prerequisite_error("CloudFlare", "CLOUDFLARE_ACCOUNT_ID")
-        if not self.api_key:
-            self.api_key = os.getenv(CLOUDFLARE_API_TOKEN)
-            if not self.api_key:
-                raise environment_prerequisite_error("CloudFlare", CLOUDFLARE_API_TOKEN)
-        self.client = httpx.AsyncClient()
-        self._http_hooks = HttpxHooks(self.client)
-        base_url = model_base_url(base_url, "CLOUDFLARE_BASE_URL")
-        self.base_url = (
-            base_url if base_url else "https://api.cloudflare.com/client/v4/accounts"
-        )
-        self.model_args = model_args
 
     @override
-    async def aclose(self) -> None:
-        await self.client.aclose()
-
-    async def generate(
-        self,
-        input: list[ChatMessage],
-        tools: list[ToolInfo],
-        tool_choice: ToolChoice,
-        config: GenerateConfig,
-    ) -> tuple[ModelOutput, ModelCall]:
-        # chat url
-        chat_url = f"{self.base_url}/{self.account_id}/ai/run/@cf"
-
-        # chat api input
-        json: dict[str, Any] = dict(**self.model_args)
-        if config.max_tokens is not None:
-            json["max_tokens"] = config.max_tokens
-        json["messages"] = chat_api_input(input, tools, self.chat_api_handler())
-
-        # request_id
-        request_id = self._http_hooks.start_request()
-
-        # setup response
-        response: dict[str, Any] = {}
-
-        def model_call() -> ModelCall:
-            return ModelCall.create(
-                request=json,
-                response=response,
-                time=self._http_hooks.end_request(request_id),
-            )
-
-        # make the call
-        response = await chat_api_request(
-            self.client,
-            model_name=self.model_name,
-            url=f"{chat_url}/{self.model_name}",
-            headers={
-                "Authorization": f"Bearer {self.api_key}",
-                HttpxHooks.REQUEST_ID_HEADER: request_id,
-            },
-            json=json,
-        )
-
-        # handle response
-        if response["success"]:
-            # extract output
-            content = response["result"]["response"]
-            output = ModelOutput(
-                model=self.model_name,
-                choices=[
-                    ChatCompletionChoice(
-                        message=self.chat_api_handler().parse_assistant_response(
-                            content, tools
-                        ),
-                        stop_reason="stop",
-                    )
-                ],
-            )
-
-            # return
-            return output, model_call()
-        else:
-            error = str(response.get("errors", "Unknown"))
-            raise RuntimeError(f"Error calling {self.model_name}: {error}")
-
-    @override
-    def should_retry(self, ex: Exception) -> bool:
-        return should_retry_chat_api_error(ex)
+    def handle_bad_request(self, ex: APIStatusError) -> ModelOutput | Exception:
+        if ex.status_code == 403:
+            content = str(ex)
+            if "context window limit" in content:
+                return ModelOutput.from_content(
+                    self.model_name, content=content, stop_reason="model_length"
+                )
+        return ex
 
     # cloudflare enforces rate limits by model for each account
     @override
@@ -138,9 +65,3 @@ class CloudFlareAPI(ModelAPI):
     @override
     def max_tokens(self) -> int:
         return DEFAULT_MAX_TOKENS
-
-    def chat_api_handler(self) -> ChatAPIHandler:
-        if "llama" in self.model_name.lower():
-            return Llama31Handler(self.model_name)
-        else:
-            return ChatAPIHandler(self.model_name)
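
The rewrite removes the hand-rolled httpx transport entirely: the provider now talks to Cloudflare's OpenAI-compatible endpoint through the shared base class, prefixes model names with @cf/, and maps 403 "context window limit" errors onto stop_reason="model_length". The remaining Cloudflare-specific wrinkle is key migration. Below is a minimal standalone sketch of the resulting key-resolution order; it assumes the OpenAICompatibleAPI base class falls back to the CLOUDFLARE_API_KEY environment variable when no explicit key is passed, which this diff does not show:

import os

def resolve_cloudflare_key(api_key: str | None) -> str | None:
    # an explicitly passed key always wins
    if api_key is not None:
        return api_key
    # otherwise the new CLOUDFLARE_API_KEY variable takes precedence
    # (assumed: resolved by the OpenAICompatibleAPI base class)
    if "CLOUDFLARE_API_KEY" in os.environ:
        return os.environ["CLOUDFLARE_API_KEY"]
    # finally, migrate the legacy CLOUDFLARE_API_TOKEN if present
    return os.environ.get("CLOUDFLARE_API_TOKEN")
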
inspect_ai/model/_providers/google.py

@@ -127,7 +127,6 @@ class GoogleGenAIAPI(ModelAPI):
         parts = model_name.split("/")
         if len(parts) > 1:
             self.service: str | None = parts[0]
-            model_name = "/".join(parts[1:])
         else:
             self.service = None
 
@@ -245,14 +244,14 @@
 
         try:
             response = await client.aio.models.generate_content(
-                model=self.model_name,
+                model=self.service_model_name(),
                 contents=gemini_contents,
                 config=parameters,
            )
        except ClientError as ex:
            return self.handle_client_error(ex), model_call()
 
-        model_name = response.model_version or self.model_name
+        model_name = response.model_version or self.service_model_name()
         output = ModelOutput(
             model=model_name,
             choices=completion_choices_from_candidates(model_name, response),
@@ -261,6 +260,10 @@
 
         return output, model_call()
 
+    def service_model_name(self) -> str:
+        """Model name without any service prefix."""
+        return self.model_name.replace(f"{self.service}/", "", 1)
+
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, APIError) and ex.code is not None:
@@ -270,8 +273,8 @@
 
     @override
     def connection_key(self) -> str:
-        """Scope for enforcing max_connections (could also use endpoint)."""
-        return self.model_name
+        """Scope for enforcing max_connections."""
+        return str(self.api_key)
 
     def handle_client_error(self, ex: ClientError) -> ModelOutput | Exception:
         if (
@@ -283,7 +286,9 @@
             )
         ):
             return ModelOutput.from_content(
-                self.model_name, content=ex.message, stop_reason="model_length"
+                self.service_model_name(),
+                content=ex.message,
+                stop_reason="model_length",
             )
         else:
             raise ex
@@ -644,10 +649,16 @@ def completion_choices_from_candidates(
             )
         ]
     else:
-        raise RuntimeError(
-            "Google response includes no completion candidates and no block reason: "
-            + f"{response.model_dump_json(indent=2)}"
-        )
+        return [
+            ChatCompletionChoice(
+                message=ChatMessageAssistant(
+                    content=NO_CONTENT,
+                    model=model,
+                    source="generate",
+                ),
+                stop_reason="stop",
+            )
+        ]
 
 
 def split_reasoning(content: str) -> tuple[str | None, str]:
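
Note that __init__ no longer strips the service prefix from model_name, so self.model_name keeps the full prefixed form; the new service_model_name() helper removes the prefix only where the bare model id is needed (the Mistral provider below gains the identical helper). A short sketch of the behavior, using a hypothetical prefixed name:

# hypothetical prefixed model name
model_name = "vertex/gemini-2.0-flash"

parts = model_name.split("/")
service = parts[0] if len(parts) > 1 else None  # -> "vertex"

# the prefix is stripped on demand, not at construction time
bare = model_name.replace(f"{service}/", "", 1)
assert bare == "gemini-2.0-flash"
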
inspect_ai/model/_providers/grok.py

@@ -1,15 +1,12 @@
-import os
+from openai import APIStatusError
 
-from inspect_ai.model._providers.util import model_base_url
-from inspect_ai.model._providers.util.util import environment_prerequisite_error
+from inspect_ai.model._model_output import ModelOutput
 
 from .._generate_config import GenerateConfig
-from .openai import OpenAIAPI
+from .openai_compatible import OpenAICompatibleAPI
 
-GROK_API_KEY = "GROK_API_KEY"
 
-
-class GrokAPI(OpenAIAPI):
+class GrokAPI(OpenAICompatibleAPI):
     def __init__(
         self,
         model_name: str,
@@ -17,19 +14,28 @@ class GrokAPI(OpenAIAPI):
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
     ) -> None:
-        # resolve base url
-        base_url = model_base_url(base_url, "GROK_BASE_URL")
-        base_url = base_url or "https://api.x.ai/v1"
-
-        # resolve api key
-        api_key = api_key or os.environ.get(GROK_API_KEY, None)
-        if api_key is None:
-            raise environment_prerequisite_error("Grok", GROK_API_KEY)
-
-        # call super
         super().__init__(
             model_name=model_name,
             base_url=base_url,
             api_key=api_key,
             config=config,
+            service="Grok",
+            service_base_url="https://api.x.ai/v1",
         )
+
+    def handle_bad_request(self, ex: APIStatusError) -> ModelOutput | Exception:
+        if ex.status_code == 400:
+            # extract message
+            if isinstance(ex.body, dict) and "message" in ex.body.keys():
+                content = str(ex.body.get("message"))
+            else:
+                content = ex.message
+
+            if "prompt length" in content:
+                return ModelOutput.from_content(
+                    model=self.model_name, content=content, stop_reason="model_length"
                )
+            else:
+                return ex
+        else:
+            return ex
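
handle_bad_request() converts xAI 400 responses whose message mentions "prompt length" into a ModelOutput with stop_reason="model_length"; anything else is handed back as the original exception. A standalone sketch of that classification (the error bodies are hypothetical, shaped like the dict the handler checks for):

def classify_grok_400(body: object, message: str) -> str:
    # mirror of the message extraction in handle_bad_request() above
    if isinstance(body, dict) and "message" in body:
        content = str(body.get("message"))
    else:
        content = message
    return "model_length" if "prompt length" in content else "reraise"

assert classify_grok_400({"message": "maximum prompt length exceeded"}, "") == "model_length"
assert classify_grok_400({"message": "invalid tool choice"}, "") == "reraise"
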
inspect_ai/model/_providers/groq.py

@@ -102,7 +102,7 @@ class GroqAPI(ModelAPI):
         tools: list[ToolInfo],
         tool_choice: ToolChoice,
         config: GenerateConfig,
-    ) -> tuple[ModelOutput, ModelCall]:
+    ) -> tuple[ModelOutput | Exception, ModelCall]:
         # allocate request_id (so we can see it from ModelCall)
         request_id = self._http_hooks.start_request()
 
@@ -136,45 +136,48 @@
             **params,
         )
 
-        completion: ChatCompletion = await self.client.chat.completions.create(
-            **request,
-        )
+        try:
+            completion: ChatCompletion = await self.client.chat.completions.create(
+                **request,
+            )
 
-        response = completion.model_dump()
-
-        # extract metadata
-        metadata: dict[str, Any] = {
-            "id": completion.id,
-            "system_fingerprint": completion.system_fingerprint,
-            "created": completion.created,
-        }
-        if completion.usage:
-            metadata = metadata | {
-                "queue_time": completion.usage.queue_time,
-                "prompt_time": completion.usage.prompt_time,
-                "completion_time": completion.usage.completion_time,
-                "total_time": completion.usage.total_time,
-            }
+            response = completion.model_dump()
 
-        # extract output
-        choices = self._chat_choices_from_response(completion, tools)
-        output = ModelOutput(
-            model=completion.model,
-            choices=choices,
-            usage=(
-                ModelUsage(
-                    input_tokens=completion.usage.prompt_tokens,
-                    output_tokens=completion.usage.completion_tokens,
-                    total_tokens=completion.usage.total_tokens,
-                )
-                if completion.usage
-                else None
-            ),
-            metadata=metadata,
-        )
+            # extract metadata
+            metadata: dict[str, Any] = {
+                "id": completion.id,
+                "system_fingerprint": completion.system_fingerprint,
+                "created": completion.created,
+            }
+            if completion.usage:
+                metadata = metadata | {
+                    "queue_time": completion.usage.queue_time,
+                    "prompt_time": completion.usage.prompt_time,
+                    "completion_time": completion.usage.completion_time,
+                    "total_time": completion.usage.total_time,
+                }
+
+            # extract output
+            choices = self._chat_choices_from_response(completion, tools)
+            output = ModelOutput(
+                model=completion.model,
+                choices=choices,
+                usage=(
+                    ModelUsage(
+                        input_tokens=completion.usage.prompt_tokens,
+                        output_tokens=completion.usage.completion_tokens,
+                        total_tokens=completion.usage.total_tokens,
+                    )
+                    if completion.usage
+                    else None
+                ),
+                metadata=metadata,
+            )
 
-        # return
-        return output, model_call()
+            # return
+            return output, model_call()
+        except APIStatusError as ex:
+            return self.handle_bad_request(ex), model_call()
 
     def completion_params(self, config: GenerateConfig) -> Dict[str, Any]:
         params: dict[str, Any] = {}
@@ -234,6 +237,27 @@
     def max_tokens(self) -> Optional[int]:
         return DEFAULT_MAX_TOKENS
 
+    def handle_bad_request(self, ex: APIStatusError) -> ModelOutput | Exception:
+        if ex.status_code == 400:
+            # extract code and message
+            content = ex.message
+            code = ""
+            if isinstance(ex.body, dict) and isinstance(
+                ex.body.get("error", None), dict
+            ):
+                error = ex.body.get("error", {})
+                content = str(error.get("message", content))
+                code = error.get("code", code)
+
+            if code == "context_length_exceeded":
+                return ModelOutput.from_content(
+                    model=self.model_name,
+                    content=content,
+                    stop_reason="model_length",
+                )
+
+        return ex
+
 
 async def as_groq_chat_messages(
     messages: list[ChatMessage],
inspect_ai/model/_providers/llama_cpp_python.py

@@ -1,10 +1,8 @@
-from inspect_ai.model._providers.util import model_base_url
-
 from .._generate_config import GenerateConfig
-from .openai import OpenAIAPI
+from .openai_compatible import OpenAICompatibleAPI
 
 
-class LlamaCppPythonAPI(OpenAIAPI):
+class LlamaCppPythonAPI(OpenAICompatibleAPI):
     def __init__(
         self,
         model_name: str,
@@ -12,10 +10,11 @@ class LlamaCppPythonAPI(OpenAIAPI):
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
     ) -> None:
-        base_url = model_base_url(base_url, "LLAMA_CPP_PYTHON_BASE_URL")
-        base_url = base_url if base_url else "http://localhost:8000/v1"
-        if not api_key:
-            api_key = "llama-cpp-python"
         super().__init__(
-            model_name=model_name, base_url=base_url, api_key=api_key, config=config
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key or "llama-cpp-python",
+            config=config,
+            service="llama_cpp_python",
+            service_base_url="http://localhost:8000/v1",
         )
inspect_ai/model/_providers/mistral.py

@@ -86,7 +86,6 @@ class MistralAPI(ModelAPI):
         parts = model_name.split("/")
         if len(parts) > 1:
             self.service: str | None = parts[0]
-            model_name = "/".join(parts[1:])
         else:
             self.service = None
 
@@ -150,7 +149,7 @@
         # build request
         request_id = http_hooks.start_request()
         request: dict[str, Any] = dict(
-            model=self.model_name,
+            model=self.service_model_name(),
            messages=await mistral_chat_messages(input),
            tools=mistral_chat_tools(tools) if len(tools) > 0 else None,
            tool_choice=(
@@ -228,6 +227,10 @@
             ),
         ), model_call()
 
+    def service_model_name(self) -> str:
+        """Model name without any service prefix."""
+        return self.model_name.replace(f"{self.service}/", "", 1)
+
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, SDKError):
@@ -246,7 +249,9 @@
             content = body.get("message", ex.body)
             if "maximum context length" in ex.body:
                 return ModelOutput.from_content(
-                    model=self.model_name, content=content, stop_reason="model_length"
+                    model=self.service_model_name(),
+                    content=content,
+                    stop_reason="model_length",
                 )
             else:
                 return ex
inspect_ai/model/_providers/ollama.py

@@ -1,10 +1,8 @@
-from inspect_ai.model._providers.util import model_base_url
-
 from .._generate_config import GenerateConfig
-from .openai import OpenAIAPI
+from .openai_compatible import OpenAICompatibleAPI
 
 
-class OllamaAPI(OpenAIAPI):
+class OllamaAPI(OpenAICompatibleAPI):
     def __init__(
         self,
         model_name: str,
@@ -12,10 +10,11 @@ class OllamaAPI(OpenAIAPI):
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
     ) -> None:
-        base_url = model_base_url(base_url, "OLLAMA_BASE_URL")
-        base_url = base_url if base_url else "http://localhost:11434/v1"
-        if not api_key:
-            api_key = "ollama"
         super().__init__(
-            model_name=model_name, base_url=base_url, api_key=api_key, config=config
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key or "ollama",
+            config=config,
+            service="Ollama",
+            service_base_url="http://localhost:11434/v1",
         )
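
Taken together, the grok.py, llama_cpp_python.py, ollama.py, and cloudflare.py changes reduce each provider to a thin subclass of the new OpenAICompatibleAPI (openai_compatible.py, +195 lines, not shown in this excerpt). A sketch of a provider following the same pattern, using only the constructor parameters and the handle_bad_request() hook visible in the diffs above; the service name, base URL, and error marker are hypothetical, and how the base class derives environment variable names from service is not shown in these diffs:

from openai import APIStatusError

from inspect_ai.model import GenerateConfig, ModelOutput
from inspect_ai.model._providers.openai_compatible import OpenAICompatibleAPI


class ExampleAPI(OpenAICompatibleAPI):  # hypothetical provider
    def __init__(
        self,
        model_name: str,
        base_url: str | None = None,
        api_key: str | None = None,
        config: GenerateConfig = GenerateConfig(),
    ) -> None:
        super().__init__(
            model_name=model_name,
            base_url=base_url,
            api_key=api_key,
            config=config,
            service="Example",  # hypothetical
            service_base_url="https://api.example.com/v1",  # hypothetical
        )

    # optional hook: map provider-specific 4xx errors onto ModelOutput
    def handle_bad_request(self, ex: APIStatusError) -> ModelOutput | Exception:
        if ex.status_code == 400 and "context length" in str(ex):  # hypothetical marker
            return ModelOutput.from_content(
                model=self.model_name, content=str(ex), stop_reason="model_length"
            )
        return ex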