inspect-ai 0.3.88__py3-none-any.whl → 0.3.90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -256
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +71 -36
- inspect_ai/_view/www/dist/assets/index.js +573 -475
- inspect_ai/_view/www/log-schema.json +66 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -6
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +0 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +24 -6
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -7
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +4 -1
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +20 -12
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/RECORD +88 -83
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/WHEEL +1 -1
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/top_level.txt +0 -0
inspect_ai/log/_model.py
ADDED
@@ -0,0 +1,58 @@
+from inspect import isgenerator
+from typing import Any, Iterator
+
+from inspect_ai.log._log import EvalModelConfig
+from inspect_ai.model._model import Model, get_model
+
+
+def model_roles_to_model_roles_config(
+    model_roles: dict[str, Model] | None,
+) -> dict[str, EvalModelConfig] | None:
+    if model_roles is not None:
+        return {k: model_to_model_config(v) for k, v in model_roles.items()}
+    else:
+        return None
+
+
+def model_roles_config_to_model_roles(
+    model_config: dict[str, EvalModelConfig] | None,
+) -> dict[str, Model] | None:
+    if model_config is not None:
+        return {k: model_config_to_model(v) for k, v in model_config.items()}
+    else:
+        return None
+
+
+def model_to_model_config(model: Model) -> EvalModelConfig:
+    return EvalModelConfig(
+        model=str(model),
+        config=model.config,
+        base_url=model.api.base_url,
+        args=model_args_for_log(model.model_args),
+    )
+
+
+def model_config_to_model(model_config: EvalModelConfig) -> Model:
+    return get_model(
+        model=model_config.model,
+        config=model_config.config,
+        base_url=model_config.base_url,
+        memoize=False,
+        **model_config.args,
+    )
+
+
+def model_args_for_log(model_args: dict[str, Any]) -> dict[str, Any]:
+    # redact authentication oriented model_args
+    model_args = model_args.copy()
+    if "api_key" in model_args:
+        del model_args["api_key"]
+    model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}
+
+    # don't try to serialise generators
+    model_args = {
+        k: v
+        for k, v in model_args.items()
+        if not isgenerator(v) and not isinstance(v, Iterator)
+    }
+    return model_args
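The new module above converts between live Model objects and the EvalModelConfig records stored in eval logs, redacting credential-style args before serialisation. A minimal sketch of the round trip it enables (the model name below is illustrative, and these helpers live in a private module):

    from inspect_ai.log._model import model_config_to_model, model_to_model_config
    from inspect_ai.model import get_model

    model = get_model("openai/gpt-4o")        # example model
    config = model_to_model_config(model)     # EvalModelConfig with redacted/serialisable args
    restored = model_config_to_model(config)  # re-created via get_model(memoize=False)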
inspect_ai/log/_recorders/file.py
CHANGED
@@ -1,8 +1,10 @@
+import os
 from logging import getLogger
 from typing import Any

 from typing_extensions import override

+from inspect_ai._util.constants import MODEL_NONE
 from inspect_ai._util.file import filesystem
 from inspect_ai._util.registry import registry_unqualified_name

@@ -71,9 +73,18 @@ class FileRecorder(Recorder):
             return s.replace("_", "-").replace("/", "-").replace(":", "-")

         # remove package from task name
-        task = registry_unqualified_name(eval.task)
-
-
+        task = registry_unqualified_name(eval.task)  # noqa: F841
+
+        # derive log file pattern
+        log_file_pattern = os.getenv("INSPECT_EVAL_LOG_FILE_PATTERN", "{task}_{id}")
+
+        # compute and return log file name
+        log_file_name = f"{clean(eval.created)}_" + log_file_pattern
+        log_file_name = log_file_name.replace("{task}", clean(task))
+        log_file_name = log_file_name.replace("{id}", clean(eval.task_id))
+        model = clean(eval.model) if eval.model != MODEL_NONE else ""
+        log_file_name = log_file_name.replace("{model}", model)
+        return log_file_name

     def _log_file_path(self, eval: EvalSpec) -> str:
         return f"{self.log_dir}{self.fs.sep}{self._log_file_key(eval)}{self.suffix}"
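This change makes log file names templatable via the INSPECT_EVAL_LOG_FILE_PATTERN environment variable, with {task}, {id}, and {model} placeholders and a default of "{task}_{id}". A sketch of opting in to a pattern that also includes the model (the resulting name is illustrative):

    import os

    os.environ["INSPECT_EVAL_LOG_FILE_PATTERN"] = "{task}_{model}_{id}"
    # FileRecorder then builds names like "<created>_my-task_openai-gpt-4o_<task-id>"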
inspect_ai/log/_transcript.py
CHANGED
inspect_ai/model/__init__.py
CHANGED
@@ -47,6 +47,7 @@ from ._model_output import (
 )
 from ._providers.providers import *
 from ._registry import modelapi
+from ._trim import trim_messages

 __all__ = [
     "GenerateConfig",
@@ -80,6 +81,7 @@ __all__ = [
     "call_tools",
     "execute_tools",
     "ExecuteToolsResult",
+    "trim_messages",
     "cache_clear",
     "cache_list_expired",
     "cache_path",
inspect_ai/model/_call_tools.py
CHANGED
@@ -264,6 +264,7 @@ async def execute_tools(
             tuple[ExecuteToolsResult, ToolEvent, Exception | None]
         ]()

+        result_exception = None
         async with anyio.create_task_group() as tg:
             tg.start_soon(call_tool_task, call, messages, send_stream)
             event._set_cancel_fn(tg.cancel_scope.cancel)
@@ -348,7 +349,9 @@ async def call_tool(
     # if we have a tool approver, apply it now
     from inspect_ai.approval._apply import apply_tool_approval

-    approved, approval = await apply_tool_approval(
+    approved, approval = await apply_tool_approval(
+        message, call, tool_def.viewer, conversation
+    )
     if not approved:
         if approval and approval.decision == "terminate":
             from inspect_ai.solver._limit import SampleLimitExceededError
inspect_ai/model/_model.py
CHANGED
@@ -270,6 +270,7 @@ class Model:
         self.api = api
         self.config = config
         self.model_args = model_args
+        self._role: str | None = None

         # state indicating whether our lifetime is bound by a context manager
         self._context_bound = False
@@ -311,6 +312,14 @@
         """Model name."""
         return self.api.model_name

+    @property
+    def role(self) -> str | None:
+        """Model role."""
+        return self._role
+
+    def _set_role(self, role: str) -> None:
+        self._role = role
+
     def __str__(self) -> str:
         return f"{ModelName(self)}"

@@ -716,7 +725,7 @@
             )
             model_name = ModelName(self)
             async with concurrency(
-                name=
+                name=str(model_name),
                 concurrency=max_connections,
                 key=f"Model{self.api.connection_key()}",
             ):
@@ -738,6 +747,7 @@
         model = str(self)
         event = ModelEvent(
             model=model,
+            role=self.role,
             input=input,
             tools=tools,
             tool_choice=tool_choice,
@@ -828,6 +838,9 @@ class ModelName:

 def get_model(
     model: str | Model | None = None,
+    *,
+    role: str | None = None,
+    default: str | Model | None = None,
     config: GenerateConfig = GenerateConfig(),
     base_url: str | None = None,
     api_key: str | None = None,
@@ -858,6 +871,11 @@ def get_model(
            if `None` is passed then the model currently being
            evaluated is returned (or if there is no evaluation
            then the model referred to by `INSPECT_EVAL_MODEL`).
+        role: Optional named role for model (e.g. for roles specified
+            at the task or eval level). Provide a `default` as a fallback
+            in the case where the `role` hasn't been externally specified.
+        default: Optional. Fallback model in case the specified
+            `model` or `role` is not found.
         config: Configuration for model.
         base_url: Optional. Alternate base URL for model.
         api_key: Optional. API key for model.
@@ -878,6 +896,22 @@ def get_model(
     if model == "none":
         model = "none/none"

+    # resolve model role
+    if role is not None:
+        model_for_role = model_roles().get(role, None)
+        if model_for_role is not None:
+            return model_for_role
+
+    # if a default was specified then use it as the model if
+    # no model was passed
+    if model is None:
+        if isinstance(default, Model):
+            if role is not None:
+                default._set_role(role)
+            return default
+        else:
+            model = default
+
     # now try finding an 'ambient' model (active or env var)
     if model is None:
         # return active_model if there is one
@@ -901,6 +935,7 @@ def get_model(
     if memoize:
         model_cache_key = (
             model
+            + str(role)
             + config.model_dump_json(exclude_none=True)
             + str(base_url)
             + str(api_key)
@@ -941,10 +976,11 @@ def get_model(
            **model_args,
        )
        m = Model(modelapi_instance, config, model_args)
+        if role is not None:
+            m._set_role(role)
        if memoize:
            _models[model_cache_key] = m
        return m
-
    else:
        from_api = f" from {api_name}" if api_name else ""
        raise ValueError(f"Model name {model}{from_api} not recognized.")
@@ -1353,10 +1389,20 @@ def active_model() -> Model | None:
     return active_model_context_var.get(None)


-
+def init_model_roles(roles: dict[str, Model]) -> None:
+    _model_roles.set(roles)
+
+
+def model_roles() -> dict[str, Model]:
+    return _model_roles.get()
+
+
 active_model_context_var: ContextVar[Model | None] = ContextVar("active_model")

+_model_roles: ContextVar[dict[str, Model]] = ContextVar("model_roles", default={})
+

+# shared contexts for asyncio tasks
 def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:
     from inspect_ai.log._samples import (
         active_sample_message_limit,
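get_model() now accepts keyword-only role and default arguments: a role is resolved against the mapping installed via init_model_roles(), falling back to default when no role binding exists. A usage sketch (role and model names are illustrative):

    from inspect_ai.model import get_model

    # returns the model bound to the "grader" role if the eval defined one,
    # otherwise falls back to the default (which is tagged with the role for logging)
    grader = get_model(role="grader", default="openai/gpt-4o-mini")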
inspect_ai/model/_openai.py
CHANGED
@@ -1,9 +1,18 @@
 import json
 import re
+import socket
 from copy import copy
-from typing import Literal
-
-
+from typing import Any, Literal
+
+import httpx
+from openai import (
+    DEFAULT_CONNECTION_LIMITS,
+    DEFAULT_TIMEOUT,
+    APIStatusError,
+    APITimeoutError,
+    OpenAIError,
+    RateLimitError,
+)
 from openai.types.chat import (
     ChatCompletion,
     ChatCompletionAssistantMessageParam,
@@ -38,9 +47,11 @@ from inspect_ai._util.content import (
     ContentReasoning,
     ContentText,
 )
+from inspect_ai._util.http import is_retryable_http_status
 from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_http_url
 from inspect_ai.model._call_tools import parse_tool_call
+from inspect_ai.model._generate_config import GenerateConfig
 from inspect_ai.model._model_output import ChatCompletionChoice, Logprobs
 from inspect_ai.model._reasoning import parse_content_with_reasoning
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
@@ -146,24 +157,20 @@ async def openai_chat_completion_part(


 async def openai_chat_message(
-    message: ChatMessage,
+    message: ChatMessage, system_role: Literal["user", "system", "developer"] = "system"
 ) -> ChatCompletionMessageParam:
     if message.role == "system":
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            return ChatCompletionSystemMessageParam(
-                role=message.role, content=message.text
-            )
+        match system_role:
+            case "user":
+                return ChatCompletionUserMessageParam(role="user", content=message.text)
+            case "system":
+                return ChatCompletionSystemMessageParam(
+                    role=message.role, content=message.text
+                )
+            case "developer":
+                return ChatCompletionDeveloperMessageParam(
+                    role="developer", content=message.text
+                )
     elif message.role == "user":
         return ChatCompletionUserMessageParam(
             role=message.role,
@@ -202,9 +209,54 @@ async def openai_chat_message(


 async def openai_chat_messages(
-    messages: list[ChatMessage],
+    messages: list[ChatMessage],
+    system_role: Literal["user", "system", "developer"] = "system",
 ) -> list[ChatCompletionMessageParam]:
-    return [await openai_chat_message(message,
+    return [await openai_chat_message(message, system_role) for message in messages]
+
+
+def openai_completion_params(
+    model: str, config: GenerateConfig, tools: bool
+) -> dict[str, Any]:
+    params: dict[str, Any] = dict(model=model)
+    if config.max_tokens is not None:
+        params["max_tokens"] = config.max_tokens
+    if config.frequency_penalty is not None:
+        params["frequency_penalty"] = config.frequency_penalty
+    if config.stop_seqs is not None:
+        params["stop"] = config.stop_seqs
+    if config.presence_penalty is not None:
+        params["presence_penalty"] = config.presence_penalty
+    if config.logit_bias is not None:
+        params["logit_bias"] = config.logit_bias
+    if config.seed is not None:
+        params["seed"] = config.seed
+    if config.temperature is not None:
+        params["temperature"] = config.temperature
+    if config.top_p is not None:
+        params["top_p"] = config.top_p
+    if config.num_choices is not None:
+        params["n"] = config.num_choices
+    if config.logprobs is not None:
+        params["logprobs"] = config.logprobs
+    if config.top_logprobs is not None:
+        params["top_logprobs"] = config.top_logprobs
+    if tools and config.parallel_tool_calls is not None:
+        params["parallel_tool_calls"] = config.parallel_tool_calls
+    if config.reasoning_effort is not None:
+        params["reasoning_effort"] = config.reasoning_effort
+    if config.response_schema is not None:
+        params["response_format"] = dict(
+            type="json_schema",
+            json_schema=dict(
+                name=config.response_schema.name,
+                schema=config.response_schema.json_schema.model_dump(exclude_none=True),
+                description=config.response_schema.description,
+                strict=config.response_schema.strict,
+            ),
+        )
+
+    return params


 def openai_assistant_content(message: ChatMessageAssistant) -> str:
@@ -496,6 +548,35 @@ def chat_message_assistant_from_openai(
     )


+def model_output_from_openai(
+    completion: ChatCompletion,
+    choices: list[ChatCompletionChoice],
+) -> ModelOutput:
+    return ModelOutput(
+        model=completion.model,
+        choices=choices,
+        usage=(
+            ModelUsage(
+                input_tokens=completion.usage.prompt_tokens,
+                output_tokens=completion.usage.completion_tokens,
+                input_tokens_cache_read=(
+                    completion.usage.prompt_tokens_details.cached_tokens
+                    if completion.usage.prompt_tokens_details is not None
+                    else None  # openai only have cache read stats/pricing.
+                ),
+                reasoning_tokens=(
+                    completion.usage.completion_tokens_details.reasoning_tokens
+                    if completion.usage.completion_tokens_details is not None
+                    else None
+                ),
+                total_tokens=completion.usage.total_tokens,
+            )
+            if completion.usage
+            else None
+        ),
+    )
+
+
 def chat_choices_from_openai(
     response: ChatCompletion, tools: list[ToolInfo]
 ) -> list[ChatCompletionChoice]:
@@ -517,6 +598,19 @@ def chat_choices_from_openai(
     ]


+def openai_should_retry(ex: Exception) -> bool:
+    if isinstance(ex, RateLimitError):
+        return True
+    elif isinstance(ex, APIStatusError):
+        return is_retryable_http_status(ex.status_code)
+    elif isinstance(ex, OpenAIResponseError):
+        return ex.code in ["rate_limit_exceeded", "server_error"]
+    elif isinstance(ex, APITimeoutError):
+        return True
+    else:
+        return False
+
+
 def openai_handle_bad_request(
     model_name: str, e: APIStatusError
 ) -> ModelOutput | Exception:
@@ -559,3 +653,39 @@ def openai_media_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
         value = copy(value)
         value.update(data=BASE_64_DATA_REMOVED)
     return value
+
+
+class OpenAIAsyncHttpxClient(httpx.AsyncClient):
+    """Custom async client that deals better with long running Async requests.
+
+    Based on Anthropic DefaultAsyncHttpClient implementation that they
+    released along with Claude 3.7 as well as the OpenAI DefaultAsyncHttpxClient
+
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        # This is based on the openai DefaultAsyncHttpxClient:
+        # https://github.com/openai/openai-python/commit/347363ed67a6a1611346427bb9ebe4becce53f7e
+        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)
+        kwargs.setdefault("limits", DEFAULT_CONNECTION_LIMITS)
+        kwargs.setdefault("follow_redirects", True)
+
+        # This is based on the anthrpopic changes for claude 3.7:
+        # https://github.com/anthropics/anthropic-sdk-python/commit/c5387e69e799f14e44006ea4e54fdf32f2f74393#diff-3acba71f89118b06b03f2ba9f782c49ceed5bb9f68d62727d929f1841b61d12bR1387-R1403

+        # set socket options to deal with long running reasoning requests
+        socket_options = [
+            (socket.SOL_SOCKET, socket.SO_KEEPALIVE, True),
+            (socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 60),
+            (socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 5),
+        ]
+        TCP_KEEPIDLE = getattr(socket, "TCP_KEEPIDLE", None)
+        if TCP_KEEPIDLE is not None:
+            socket_options.append((socket.IPPROTO_TCP, TCP_KEEPIDLE, 60))
+
+        kwargs["transport"] = httpx.AsyncHTTPTransport(
+            limits=DEFAULT_CONNECTION_LIMITS,
+            socket_options=socket_options,
+        )
+
+        super().__init__(**kwargs)
inspect_ai/model/_providers/anthropic.py
CHANGED
@@ -82,7 +82,6 @@ class AnthropicAPI(ModelAPI):
         parts = model_name.split("/")
         if len(parts) > 1:
             self.service: str | None = parts[0]
-            model_name = "/".join(parts[1:])
         else:
             self.service = None

@@ -237,7 +236,7 @@ class AnthropicAPI(ModelAPI):

             # extract output
             output = await model_output_from_message(
-                self.client, self.
+                self.client, self.service_model_name(), message, tools
             )

             # return output and call
@@ -249,7 +248,7 @@ class AnthropicAPI(ModelAPI):
         except APIStatusError as ex:
             if ex.status_code == 413:
                 return ModelOutput.from_content(
-                    model=self.
+                    model=self.service_model_name(),
                     content=ex.message,
                     stop_reason="model_length",
                     error=ex.message,
@@ -261,7 +260,7 @@ class AnthropicAPI(ModelAPI):
         self, config: GenerateConfig
     ) -> tuple[dict[str, Any], dict[str, str], list[str]]:
         max_tokens = cast(int, config.max_tokens)
-        params = dict(model=self.
+        params = dict(model=self.service_model_name(), max_tokens=max_tokens)
         headers: dict[str, str] = {}
         betas: list[str] = []
         # some params not compatible with thinking models
@@ -311,18 +310,22 @@ class AnthropicAPI(ModelAPI):
         return not self.is_claude_3() and not self.is_claude_3_5()

     def is_claude_3(self) -> bool:
-        return re.search(r"claude-3-[a-zA-Z]", self.
+        return re.search(r"claude-3-[a-zA-Z]", self.service_model_name()) is not None

     def is_claude_3_5(self) -> bool:
-        return "claude-3-5-" in self.
+        return "claude-3-5-" in self.service_model_name()

     def is_claude_3_7(self) -> bool:
-        return "claude-3-7-" in self.
+        return "claude-3-7-" in self.service_model_name()

     @override
     def connection_key(self) -> str:
         return str(self.api_key)

+    def service_model_name(self) -> str:
+        """Model name without any service prefix."""
+        return self.model_name.replace(f"{self.service}/", "", 1)
+
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, APIStatusError):
@@ -371,7 +374,11 @@ class AnthropicAPI(ModelAPI):
         # NOTE: Using case insensitive matching because the Anthropic Bedrock API seems to capitalize the work 'input' in its error message, other times it doesn't.
         if any(
             message in error.lower()
-            for message in [
+            for message in [
+                "prompt is too long",
+                "input is too long",
+                "input length and `max_tokens` exceed context limit",
+            ]
         ):
             if (
                 isinstance(ex.body, dict)
@@ -392,7 +399,7 @@ class AnthropicAPI(ModelAPI):

         if content and stop_reason:
             return ModelOutput.from_content(
-                model=self.
+                model=self.service_model_name(),
                 content=content,
                 stop_reason=stop_reason,
                 error=error,
@@ -440,10 +447,11 @@ class AnthropicAPI(ModelAPI):

         # only certain claude models qualify
         if cache_prompt:
+            model_name = self.service_model_name()
             if (
-                "claude-3-sonnet" in
-                or "claude-2" in
-                or "claude-instant" in
+                "claude-3-sonnet" in model_name
+                or "claude-2" in model_name
+                or "claude-instant" in model_name
             ):
                 cache_prompt = False

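AnthropicAPI.__init__ now keeps the full model name (service prefix included) and the new service_model_name() strips the prefix wherever the bare model id is needed. A minimal sketch of the behaviour it implements (the model string is an example, assuming a bedrock/ service prefix):

    model_name = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"  # example value
    parts = model_name.split("/")
    service = parts[0] if len(parts) > 1 else None
    # what service_model_name() computes:
    print(model_name.replace(f"{service}/", "", 1))
    # -> "anthropic.claude-3-5-sonnet-20240620-v1:0"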
inspect_ai/model/_providers/bedrock.py
CHANGED
@@ -368,7 +368,7 @@ class BedrockAPI(ModelAPI):
             toolConfig=tool_config,
         )

-        def model_call(response: dict[str, Any]
+        def model_call(response: dict[str, Any] = {}) -> ModelCall:
            return ModelCall.create(
                request=replace_bytes_with_placeholder(
                    request.model_dump(exclude_none=True)
@@ -388,14 +388,14 @@ class BedrockAPI(ModelAPI):
            # Look for an explicit validation exception
            if ex.response["Error"]["Code"] == "ValidationException":
                response = ex.response["Error"]["Message"]
-                if "
+                if "too many input tokens" in response.lower():
                    return ModelOutput.from_content(
                        model=self.model_name,
                        content=response,
                        stop_reason="model_length",
                    )
                else:
-                    return ex, model_call(
+                    return ex, model_call()
            else:
                raise ex
