inspect-ai 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +3 -1
- inspect_ai/_cli/eval.py +15 -2
- inspect_ai/_display/core/active.py +4 -1
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +0 -5
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +78 -11
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/score.py +1 -0
- inspect_ai/_eval/task/results.py +50 -22
- inspect_ai/_eval/task/run.py +41 -7
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25344 -1849
- inspect_ai/_view/www/log-schema.json +32 -2
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +8 -10
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +75 -2
- inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +24 -12
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +13 -2
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/Json.mjs +12 -6
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_samples.py +16 -0
- inspect_ai/log/_transcript.py +4 -1
- inspect_ai/model/_call_tools.py +4 -0
- inspect_ai/model/_conversation.py +20 -8
- inspect_ai/model/_generate_config.py +10 -4
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +7 -2
- inspect_ai/model/_providers/anthropic.py +100 -44
- inspect_ai/model/_providers/azureai.py +20 -20
- inspect_ai/model/_providers/bedrock.py +37 -40
- inspect_ai/model/_providers/google.py +46 -54
- inspect_ai/model/_providers/mistral.py +11 -11
- inspect_ai/model/_providers/openai.py +15 -16
- inspect_ai/model/_providers/openai_o1.py +9 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +8 -8
- inspect_ai/model/_providers/vertex.py +1 -4
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +2 -2
- inspect_ai/solver/__init__.py +2 -5
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +12 -1
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/docker/docker.py +64 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/environment.py +14 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +126 -98
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.58.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
inspect_ai/model/_providers/google.py
CHANGED
@@ -11,7 +11,6 @@ import proto  # type: ignore
 from google.ai.generativelanguage import (
     Blob,
     Candidate,
-    File,
     FunctionCall,
     FunctionCallingConfig,
     FunctionDeclaration,
@@ -29,29 +28,29 @@ from google.api_core.exceptions import (
     TooManyRequests,
 )
 from google.api_core.retry.retry_base import if_transient_error
-from google.generativeai import (
-
-
-
-    get_file,
-    upload_file,
-)
-from google.generativeai.types import (  # type: ignore
-    AsyncGenerateContentResponse,
+from google.generativeai.client import configure
+from google.generativeai.files import get_file, upload_file
+from google.generativeai.generative_models import GenerativeModel
+from google.generativeai.types import (
     ContentDict,
-
-    HarmCategory,
+    GenerationConfig,
     PartDict,
     PartType,
-    SafetySettingDict,
     Tool,
 )
+from google.generativeai.types.file_types import File
+from google.generativeai.types.generation_types import AsyncGenerateContentResponse
+from google.generativeai.types.safety_types import (
+    EasySafetySettingDict,
+    HarmBlockThreshold,
+    HarmCategory,
+)
 from google.protobuf.json_format import MessageToDict, ParseDict
 from google.protobuf.struct_pb2 import Struct
 from pydantic import JsonValue
 from typing_extensions import override
 
-from inspect_ai._util.constants import BASE_64_DATA_REMOVED
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
@@ -89,7 +88,7 @@ logger = getLogger(__name__)
 
 SAFETY_SETTINGS = "safety_settings"
 
-DEFAULT_SAFETY_SETTINGS:
+DEFAULT_SAFETY_SETTINGS: EasySafetySettingDict = {
     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
@@ -141,7 +140,7 @@ class GoogleAPI(ModelAPI):
         tools: list[ToolInfo],
         tool_choice: ToolChoice,
         config: GenerateConfig,
-    ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
+    ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         parameters = GenerationConfig(
             temperature=config.temperature,
             top_p=config.top_p,
@@ -149,11 +148,8 @@ class GoogleAPI(ModelAPI):
             max_output_tokens=config.max_tokens,
             stop_sequences=config.stop_seqs,
             candidate_count=config.num_choices,
-            seed=config.seed,
             presence_penalty=config.presence_penalty,
             frequency_penalty=config.frequency_penalty,
-            response_logprobs=config.logprobs,
-            logprobs=config.top_logprobs,
         )
 
         # google-native messages
@@ -176,18 +172,15 @@ class GoogleAPI(ModelAPI):
             response=response,
         )
 
-        # cast to AsyncGenerateContentResponse since we passed stream=False
         try:
-            response =
-
-
-
-
-
-                tools=gemini_tools,
-                tool_config=gemini_tool_config,
-            ),
+            response = await self.model.generate_content_async(
+                contents=contents,
+                safety_settings=self.safety_settings,
+                generation_config=parameters,
+                tools=gemini_tools,
+                tool_config=gemini_tool_config,
             )
+
 
         except InvalidArgument as ex:
             return self.handle_invalid_argument(ex), model_call()
@@ -205,15 +198,13 @@ class GoogleAPI(ModelAPI):
         # return
         return output, model_call()
 
-    def handle_invalid_argument(self, ex: InvalidArgument) -> ModelOutput:
+    def handle_invalid_argument(self, ex: InvalidArgument) -> ModelOutput | Exception:
         if "size exceeds the limit" in ex.message.lower():
             return ModelOutput.from_content(
                 model=self.model_name, content=ex.message, stop_reason="model_length"
             )
         else:
-            return ModelOutput.from_content(
-                model=self.model_name, content=ex.message, stop_reason="unknown"
-            )
+            return ex
 
     @override
     def is_rate_limit(self, ex: BaseException) -> bool:
@@ -231,7 +222,7 @@ class GoogleAPI(ModelAPI):
 def build_model_call(
     contents: list[ContentDict],
     generation_config: GenerationConfig,
-    safety_settings:
+    safety_settings: EasySafetySettingDict,
     tools: list[Tool] | None,
     tool_config: ToolConfig | None,
     response: AsyncGenerateContentResponse | None,
@@ -248,7 +239,7 @@ def build_model_call(
             if tool_config is not None
             else None,
         ),
-        response=response.to_dict() if response is not None else {},
+        response=response.to_dict() if response is not None else {},  # type: ignore[no-untyped-call]
         filter=model_call_filter,
     )
@@ -269,12 +260,12 @@ def model_call_content(content: ContentDict) -> ContentDict:
 
 def model_call_part(part: PartType) -> PartType:
     if isinstance(part, proto.Message):
-        return MessageToDict(part._pb)
+        return cast(PartDict, MessageToDict(part._pb))
     elif isinstance(part, dict):
         part = part.copy()
         keys = list(part.keys())
         for key in keys:
-            part[key] = model_call_part(part[key])
+            part[key] = model_call_part(part[key])  # type: ignore[literal-required]
         return part
     else:
         return part
@@ -316,9 +307,6 @@ def consective_tool_message_reducer(
     return messages
 
 
-NO_CONTENT = "(no content)"
-
-
 async def content_dict(
     message: ChatMessageUser | ChatMessageAssistant | ChatMessageTool,
 ) -> ContentDict:
@@ -326,13 +314,13 @@ async def content_dict(
         return ContentDict(
             role="user",
             parts=(
-                [
+                [message.content or NO_CONTENT]
                 if isinstance(message.content, str)
                 else [await content_part(content) for content in message.content]
             ),
         )
     elif isinstance(message, ChatMessageAssistant):
-        content_parts: list[
+        content_parts: list[PartType] = []
         # tool call parts
         if message.tool_calls is not None:
             content_parts.extend(
@@ -383,9 +371,9 @@ def dict_to_struct(x: dict[str, Any]) -> Struct:
 
 async def content_part(content: Content | str) -> PartType:
     if isinstance(content, str):
-        return
+        return content or NO_CONTENT
     elif isinstance(content, ContentText):
-        return
+        return content.text or NO_CONTENT
     else:
         return await chat_content_to_part(content)
 
@@ -404,7 +392,9 @@ def prepend_system_messages(
     messages: list[ContentDict], system_messages: list[ChatMessageSystem]
 ) -> None:
     # create system_parts
-    system_parts = [
+    system_parts: list[PartType] = [
+        Part(text=message.content) for message in system_messages
+    ]
 
     # we want the system messages to be prepended to the first user message
     # (if there is no first user message then prepend one)
@@ -476,6 +466,8 @@ def schema_from_param(param: ToolParam | ToolParams, nullable: bool = False) ->
             return schema_from_param(param.anyOf[0], nullable=True)
         else:
             return Schema(type=Type.TYPE_UNSPECIFIED)
+    elif param.enum:
+        return Schema(type=Type.STRING, format="enum", enum=param.enum)
     else:
         return Schema(type=Type.TYPE_UNSPECIFIED)
@@ -600,14 +592,14 @@ def gapi_should_retry(ex: BaseException) -> bool:
 
 def parse_safety_settings(
     safety_settings: Any,
-) ->
+) -> EasySafetySettingDict:
     # ensure we have a dict
     if isinstance(safety_settings, str):
         safety_settings = json.loads(safety_settings)
     if not isinstance(safety_settings, dict):
         raise ValueError(f"{SAFETY_SETTINGS} must be dictionary.")
 
-    parsed_settings:
+    parsed_settings: EasySafetySettingDict = {}
     for key, value in safety_settings.items():
         if isinstance(key, str):
             key = str_to_harm_category(key)
@@ -623,23 +615,23 @@ def parse_safety_settings(
     return parsed_settings
 
 
-def str_to_harm_category(category: str) ->
+def str_to_harm_category(category: str) -> int:
     category = category.upper()
     if "HARASSMENT" in category:
-        return HarmCategory.HARM_CATEGORY_HARASSMENT
+        return cast(int, HarmCategory.HARM_CATEGORY_HARASSMENT)
     elif "HATE_SPEECH" in category:
-        return HarmCategory.HARM_CATEGORY_HATE_SPEECH
+        return cast(int, HarmCategory.HARM_CATEGORY_HATE_SPEECH)
     elif "SEXUALLY_EXPLICIT" in category:
-        return HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT
+        return cast(int, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT)
     elif "DANGEROUS_CONTENT" in category:
-        return HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT
+        return cast(int, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT)
     else:
         # NOTE: Although there is an "UNSPECIFIED" category, in the
         # documentation, the API does not accept it.
         raise ValueError(f"Unknown HarmCategory: {category}")
 
 
-def str_to_harm_block_threshold(threshold: str) ->
+def str_to_harm_block_threshold(threshold: str) -> int:
     threshold = threshold.upper()
     if "LOW" in threshold:
         return HarmBlockThreshold.BLOCK_LOW_AND_ABOVE
@@ -673,7 +665,7 @@ async def file_for_content(content: ContentAudio | ContentVideo) -> File:
     uploaded_file = files_db.get(content_sha256)
     if uploaded_file:
         try:
-            upload =
+            upload = get_file(uploaded_file)
             if upload.state.name == "ACTIVE":
                 trace(f"Using uploaded file: {uploaded_file}")
                 return upload
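
The `schema_from_param` change above adds first-class handling for enumerated tool parameters, declaring them to Gemini as string schemas with enum format. A minimal sketch of the resulting proto (values invented for illustration; only `Schema` and `Type` from `google.ai.generativelanguage`, already imported by this module, are assumed):

from google.ai.generativelanguage import Schema, Type

# A hypothetical tool parameter {"type": "string", "enum": ["red", "green", "blue"]}
# now maps to a string schema carrying the enum values:
colour_schema = Schema(type=Type.STRING, format="enum", enum=["red", "green", "blue"])
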
inspect_ai/model/_providers/mistral.py
CHANGED
@@ -40,6 +40,7 @@ from typing_extensions import override
 # https://github.com/mistralai/client-python/blob/main/MIGRATION.md
 from inspect_ai._util.constants import (
     DEFAULT_TIMEOUT,
+    NO_CONTENT,
 )
 from inspect_ai._util.content import Content, ContentImage, ContentText
 from inspect_ai._util.images import file_as_data_uri
@@ -122,7 +123,7 @@ class MistralAPI(ModelAPI):
         tools: list[ToolInfo],
         tool_choice: ToolChoice,
         config: GenerateConfig,
-    ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
+    ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # build request
         request: dict[str, Any] = dict(
             model=self.model_name,
@@ -146,7 +147,7 @@ class MistralAPI(ModelAPI):
             response = await self.client.chat.complete_async(**request)
         except SDKError as ex:
             if ex.status_code == 400:
-                return self.handle_bad_request(ex)
+                return self.handle_bad_request(ex), mistral_model_call(request, None)
             else:
                 raise ex
 
@@ -181,25 +182,27 @@ class MistralAPI(ModelAPI):
     def connection_key(self) -> str:
         return str(self.api_key)
 
-    def handle_bad_request(self, ex: SDKError) -> ModelOutput:
+    def handle_bad_request(self, ex: SDKError) -> ModelOutput | Exception:
+        body = json.loads(ex.body)
+        content = body.get("message", ex.body)
         if "maximum context length" in ex.body:
-            body = json.loads(ex.body)
-            content = body.get("message", ex.body)
             return ModelOutput.from_content(
                 model=self.model_name, content=content, stop_reason="model_length"
             )
         else:
-
+            return ex
 
 
 def mistral_model_call(
-    request: dict[str, Any], response: MistralChatCompletionResponse
+    request: dict[str, Any], response: MistralChatCompletionResponse | None
 ) -> ModelCall:
     request = request.copy()
     request.update(messages=[message.model_dump() for message in request["messages"]])
     if request.get("tools", None) is not None:
         request["tools"] = [tool.model_dump() for tool in request["tools"]]
-    return ModelCall(
+    return ModelCall(
+        request=request, response=response.model_dump() if response else {}
+    )
 
 
 def mistral_chat_tools(tools: list[ToolInfo]) -> list[MistralTool]:
@@ -326,9 +329,6 @@ async def mistral_chat_message(
     )
 
 
-NO_CONTENT = "(no content)"
-
-
 async def mistral_message_content(
     content: str | list[Content],
 ) -> str | list[ContentChunk]:
inspect_ai/model/_providers/openai.py
CHANGED
@@ -166,7 +166,7 @@ class OpenAIAPI(ModelAPI):
         tools: list[ToolInfo],
         tool_choice: ToolChoice,
         config: GenerateConfig,
-    ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
+    ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # short-circuit to call o1- models that are text only
         if self.is_o1_preview() or self.is_o1_mini():
             return await generate_o1(
@@ -307,27 +307,26 @@ class OpenAIAPI(ModelAPI):
         return params
 
     # convert some well known bad request errors into ModelOutput
-    def handle_bad_request(self, e: BadRequestError) -> ModelOutput:
-
-
-
-
-
-        content = e.message
+    def handle_bad_request(self, e: BadRequestError) -> ModelOutput | Exception:
+        # extract message
+        if isinstance(e.body, dict) and "message" in e.body.keys():
+            content = str(e.body.get("message"))
+        else:
+            content = e.message
 
-
-
-
-
-
-
-        stop_reason = "unknown"
+        # narrow stop_reason
+        stop_reason: StopReason | None = None
+        if e.code == "context_length_exceeded":
+            stop_reason = "model_length"
+        elif e.code == "invalid_prompt":
+            stop_reason = "content_filter"
 
+        if stop_reason:
             return ModelOutput.from_content(
                 model=self.model_name, content=content, stop_reason=stop_reason
             )
         else:
-
+            return e
 
 
 async def as_openai_chat_messages(
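
Note the pattern recurring across the providers in this release: `handle_bad_request` now returns `ModelOutput | Exception` instead of mapping unrecognised errors to `stop_reason="unknown"`. A hedged sketch of the new contract from a caller's perspective (the `resolve` helper is illustrative, not part of the package):

from inspect_ai.model import ModelOutput

def resolve(result: ModelOutput | Exception) -> ModelOutput:
    # Recognised refusals (context length, content filter) come back as a
    # ModelOutput with a narrowed stop_reason; anything else is the original
    # exception, which callers can now surface instead of swallowing.
    if isinstance(result, Exception):
        raise result
    return result
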
inspect_ai/model/_providers/openai_o1.py
CHANGED
@@ -44,7 +44,7 @@ async def generate_o1(
     input: list[ChatMessage],
     tools: list[ToolInfo],
     **params: Any,
-) -> ModelOutput | tuple[ModelOutput, ModelCall]:
+) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
     # create chatapi handler
     handler = O1PreviewChatAPIHandler()
 
@@ -82,17 +82,18 @@ async def generate_o1(
     ), model_call()
 
 
-def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput:
+def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput | Exception:
     if ex.code == "context_length_exceeded":
-        stop_reason: StopReason = "model_length"
+        stop_reason: StopReason | None = "model_length"
     elif ex.code == "invalid_prompt":
         stop_reason = "content_filter"
-    else:
-        stop_reason = "unknown"
 
-
-
-
+    if stop_reason:
+        return ModelOutput.from_content(
+            model=model, content=str(ex), stop_reason=stop_reason
+        )
+    else:
+        return ex
 
 
 def chat_messages(
inspect_ai/model/_providers/providers.py
CHANGED
@@ -94,7 +94,7 @@ def vertex() -> type[ModelAPI]:
 def google() -> type[ModelAPI]:
     FEATURE = "Google API"
     PACKAGE = "google-generativeai"
-    MIN_VERSION = "0.8.
+    MIN_VERSION = "0.8.4"
 
     # workaround log spam
     # https://github.com/ray-project/ray/issues/24917
inspect_ai/model/_providers/together.py
CHANGED
@@ -103,18 +103,18 @@ class TogetherAIAPI(OpenAIAPI):
         return DEFAULT_MAX_TOKENS
 
     @override
-    def handle_bad_request(self, ex: BadRequestError) -> ModelOutput:
-
-
-
-
-
-
+    def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | Exception:
+        response = ex.response.json()
+        if "error" in response and "message" in response.get("error"):
+            content = response.get("error").get("message")
+        else:
+            content = str(response)
+        if "max_new_tokens" in ex.message:
             return ModelOutput.from_content(
                 model=self.model_name, content=content, stop_reason="model_length"
             )
         else:
-
+            return ex
 
     # Together has a slightly different logprobs structure to OpenAI, so we need to remap it.
     def _chat_choices_from_response(
inspect_ai/model/_providers/vertex.py
CHANGED
@@ -23,7 +23,7 @@ from vertexai.generative_models import (  # type: ignore
 )
 from vertexai.generative_models import Content as VertexContent
 
-from inspect_ai._util.constants import BASE_64_DATA_REMOVED
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
@@ -250,9 +250,6 @@ def consective_tool_message_reducer(
     return messages
 
 
-NO_CONTENT = "(no content)"
-
-
 async def content_dict(
     message: ChatMessageUser | ChatMessageAssistant | ChatMessageTool,
 ) -> VertexContent:
inspect_ai/scorer/_scorer.py
CHANGED
@@ -151,8 +151,8 @@ def scorer_metrics(
     return cast(list[Metric | dict[str, list[Metric]]], metrics_raw)
 
 
-def unique_scorer_name(scorer: Scorer, already_used_names: list[str]) -> str:
-    base_name = registry_unqualified_name(scorer)
+def unique_scorer_name(scorer: Scorer | str, already_used_names: list[str]) -> str:
+    base_name = scorer if isinstance(scorer, str) else registry_unqualified_name(scorer)
     scorer_name = base_name
     count = 1
     while scorer_name in already_used_names:
inspect_ai/solver/__init__.py
CHANGED
@@ -7,11 +7,7 @@ from ._fork import fork
 from ._human_agent.agent import human_agent
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
-from ._prompt import (
-    chain_of_thought,
-    prompt_template,
-    system_message,
-)
+from ._prompt import chain_of_thought, prompt_template, system_message, user_message
 from ._solver import Generate, Solver, SolverSpec, generate, solver
 from ._task_state import Choice, Choices, TaskState
 from ._use_tools import use_tools
@@ -26,6 +22,7 @@ __all__ = [
     "chain_of_thought",
     "multiple_choice",
     "system_message",
+    "user_message",
     "self_critique",
     "use_tools",
     "plan",
inspect_ai/solver/_prompt.py
CHANGED
@@ -2,6 +2,7 @@ from typing import Any
 
 from inspect_ai._util.dict import omit
 from inspect_ai.model import ChatMessageSystem
+from inspect_ai.model._chat_message import ChatMessageUser
 from inspect_ai.util import resource
 
 from ._solver import Generate, Solver, solver
@@ -15,7 +16,8 @@ def prompt_template(template: str, **params: Any) -> Solver:
 
     Prompt template containing a `{prompt}` placeholder and any
     number of additional `params`. All values contained in sample
-    `metadata` are also automatically included in the
+    `metadata` and `store` are also automatically included in the
+    `params`.
 
     Args:
       template: (str): Template for prompt.
@@ -29,7 +31,7 @@ def prompt_template(template: str, **params: Any) -> Solver:
 
     async def solve(state: TaskState, generate: Generate) -> TaskState:
         prompt = state.user_prompt
-        kwargs = omit(state.metadata, ["prompt"]) | params
+        kwargs = omit(state.metadata | state.store._data, ["prompt"]) | params
         prompt.text = prompt_template.format(prompt=prompt.text, **kwargs)
         return state
 
@@ -41,8 +43,9 @@ def system_message(template: str, **params: Any) -> Solver:
     """Solver which inserts a system message into the conversation.
 
     System message template containing any number of optional `params`.
-    for substitution
-
+    for substitution using the `str.format()` method. All values
+    contained in sample `metadata` and `store` are also automatically
+    included in the `params`.
 
     The new message will go after other system messages (if there
     are none it will be inserted at the beginning of the conversation).
@@ -58,7 +61,7 @@ def system_message(template: str, **params: Any) -> Solver:
     content = resource(template)
 
     async def solve(state: TaskState, generate: Generate) -> TaskState:
-        kwargs = state.metadata | params
+        kwargs = state.metadata | state.store._data | params
         append_system_message(
             state.messages, ChatMessageSystem(content=content.format(**kwargs))
         )
@@ -67,6 +70,33 @@ def system_message(template: str, **params: Any) -> Solver:
     return solve
 
 
+@solver
+def user_message(template: str, **params: Any) -> Solver:
+    """Solver which inserts a user message into the conversation.
+
+    User message template containing any number of optional `params`.
+    for substitution using the `str.format()` method. All values
+    contained in sample `metadata` and `store` are also automatically
+    included in the `params`.
+
+    Args:
+      template (str): Template for user message.
+      **params (dict[str,Any]): Parameters to fill into the template.
+
+    Returns:
+      A solver that inserts the parameterised user message.
+    """
+    # read template
+    content = resource(template)
+
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        kwargs = state.metadata | state.store._data | params
+        state.messages.append(ChatMessageUser(content=content.format(**kwargs)))
+        return state
+
+    return solve
+
+
 DEFAULT_COT_TEMPLATE = r"""
 {prompt}
 
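
The new `user_message()` solver mirrors `system_message()`: templates are filled from `params` plus sample `metadata` and the store. A minimal usage sketch (the task, sample, and `{topic}` key are invented for illustration):

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate, system_message, user_message

@task
def example():
    return Task(
        dataset=[Sample(input="Summarise the report.", metadata={"topic": "safety"})],
        solver=[
            system_message("You are a careful analyst."),
            # {topic} is resolved from sample metadata (or the store) at run time
            user_message("Pay particular attention to {topic}."),
            generate(),
        ],
    )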