PyPI - inspect-ai - Versions diffs - 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl - Mend

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (370)

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -29,9 +29,9 @@ from .._openai import (
     OpenAIAsyncHttpxClient,
     is_computer_use_preview,
     is_gpt,
-    is_o1_mini,
-    is_o1_preview,
-    is_o1_pro,
+    is_o1,
+    is_o1_early,
+    is_o3_mini,
     is_o_series,
     model_output_from_openai,
     openai_chat_messages,
@@ -62,6 +62,9 @@ class OpenAIAPI(ModelAPI):
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
         responses_api: bool | None = None,
+        responses_store: Literal["auto"] | bool = "auto",
+        service_tier: str | None = None,
+        client_timeout: float | None = None,
         **model_args: Any,
     ) -> None:
         # extract azure service prefix from model name (other providers
@@ -82,9 +85,25 @@ class OpenAIAPI(ModelAPI):
             config=config,
         )
-        # note whether we are forcing the responses_api
-        self.responses_api = (
-            responses_api or self.is_o1_pro() or self.is_computer_use_preview()
+        # is this a model we use responses api by default for?
+        responses_model = (
+            self.is_o_series() and not self.is_o1_early()
+        ) or self.is_computer_use_preview()
+        # resolve whether we are forcing the responses api
+        self.responses_api = responses_api or responses_model
+        # resolve whether we are using the responses store
+        self.responses_store = (
+            responses_store if isinstance(responses_store, bool) else responses_model
+        )
+        # set service tier if specified
+        self.service_tier = service_tier
+        # bump up default client timeout to 15 minutes for service_tier=="flex"
+        self.client_timeout = client_timeout or (
+            900.0 if self.service_tier == "flex" else None
         )
         # resolve api_key
@@ -140,6 +159,7 @@ class OpenAIAPI(ModelAPI):
                 api_version=api_version,
                 azure_endpoint=base_url,
                 http_client=http_client,
+                timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
                 **model_args,
             )
         else:
@@ -147,6 +167,7 @@ class OpenAIAPI(ModelAPI):
                 api_key=self.api_key,
                 base_url=model_base_url(base_url, "OPENAI_BASE_URL"),
                 http_client=http_client,
+                timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
                 **model_args,
             )
@@ -159,14 +180,14 @@ class OpenAIAPI(ModelAPI):
     def is_o_series(self) -> bool:
         return is_o_series(self.service_model_name())
-    def is_o1_pro(self) -> bool:
-        return is_o1_pro(self.service_model_name())
+    def is_o1(self) -> bool:
+        return is_o1(self.service_model_name())
-    def is_o1_mini(self) -> bool:
-        return is_o1_mini(self.service_model_name())
+    def is_o1_early(self) -> bool:
+        return is_o1_early(self.service_model_name())
-    def is_o1_preview(self) -> bool:
-        return is_o1_preview(self.service_model_name())
+    def is_o3_mini(self) -> bool:
+        return is_o3_mini(self.service_model_name())
     def is_computer_use_preview(self) -> bool:
         return is_computer_use_preview(self.service_model_name())
@@ -184,8 +205,18 @@ class OpenAIAPI(ModelAPI):
     @override
     def tool_result_images(self) -> bool:
-        # o1-pro, o1, and computer_use_preview support image inputs (but we're not strictly supporting o1)
-        return self.is_o1_pro() or self.is_computer_use_preview()
+        # o1-pro, o1, and computer_use_preview support image inputs
+        if self.is_computer_use_preview():
+            return True
+        elif self.is_o_series():
+            if self.is_o1_early():
+                return False
+            elif self.is_o3_mini():
+                return False
+            else:
+                return True
+        else:
+            return False
     @override
     def disable_computer_screenshot_truncation(self) -> bool:
@@ -203,7 +234,7 @@ class OpenAIAPI(ModelAPI):
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # short-circuit to call o1- models that are text only
-        if self.is_o1_preview() or self.is_o1_mini():
+        if self.is_o1_early():
             return await generate_o1(
                 client=self.client,
                 input=input,
@@ -219,6 +250,8 @@ class OpenAIAPI(ModelAPI):
                 tools=tools,
                 tool_choice=tool_choice,
                 config=config,
+                service_tier=self.service_tier,
+                store=self.responses_store,
             )
         # allocate request_id (so we can see it from ModelCall)
@@ -248,7 +281,7 @@ class OpenAIAPI(ModelAPI):
         # determine system role
         # o1-mini does not support developer or system messages
         # (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
-        if self.is_o1_mini():
+        if self.is_o1_early():
             system_role: Literal["user", "system", "developer"] = "user"
         # other o-series models use 'developer' rather than 'system' messages
         # https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
@@ -309,6 +342,10 @@ class OpenAIAPI(ModelAPI):
         # first call the default processing
         params = openai_completion_params(self.service_model_name(), config, tools)
+        # add service_tier if specified
+        if self.service_tier is not None:
+            params["service_tier"] = self.service_tier
         # now tailor to current model
         if config.max_tokens is not None:
             if self.is_o_series():
@@ -329,7 +366,7 @@ class OpenAIAPI(ModelAPI):
         # remove reasoning_effort if not supported
         if "reasoning_effort" in params.keys() and (
-            self.is_gpt() or self.is_o1_mini() or self.is_o1_preview()
+            self.is_gpt() or self.is_o1_early()
         ):
             del params["reasoning_effort"]

inspect_ai/model/_providers/openai_o1.py CHANGED Viewed

@@ -212,7 +212,7 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
         prompt that asks the model to use the <tool_call>...</tool_call> syntax)
         """
         # extract tool calls
-        tool_call_regex = rf"<{TOOL_CALL}>((?:.|\n)*?)</{TOOL_CALL}>"
+        tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
         tool_calls_content: list[str] = re.findall(tool_call_regex, response)
         # if there are tool calls proceed with parsing

inspect_ai/model/_providers/openai_responses.py CHANGED Viewed

@@ -15,9 +15,7 @@ from .._model_output import ModelOutput, ModelUsage
 from .._openai import (
     OpenAIResponseError,
     is_computer_use_preview,
-    is_gpt,
-    is_o1_mini,
-    is_o1_preview,
+    is_o1_early,
     is_o_series,
     openai_handle_bad_request,
     openai_media_filter,
@@ -41,6 +39,8 @@ async def generate_responses(
     tools: list[ToolInfo],
     tool_choice: ToolChoice,
     config: GenerateConfig,
+    service_tier: str | None,
+    store: bool,
 ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
     # allocate request_id (so we can see it from ModelCall)
     request_id = http_hooks.start_request()
@@ -61,14 +61,20 @@ async def generate_responses(
     # prepare request (we do this so we can log the ModelCall)
     tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
     request = dict(
-        input=await openai_responses_inputs(input, model_name),
+        input=await openai_responses_inputs(input, model_name, store),
         tools=tool_params,
         tool_choice=openai_responses_tool_choice(tool_choice, tool_params)
         if isinstance(tool_params, list) and tool_choice != "auto"
         else NOT_GIVEN,
         truncation="auto" if is_computer_use_preview(model_name) else NOT_GIVEN,
         extra_headers={HttpxHooks.REQUEST_ID_HEADER: request_id},
-        **completion_params_responses(model_name, config, len(tools) > 0),
+        **completion_params_responses(
+            model_name,
+            config=config,
+            service_tier=service_tier,
+            tools=len(tools) > 0,
+            store=store,
+        ),
     )
     try:
@@ -110,7 +116,12 @@ async def generate_responses(
 def completion_params_responses(
-    model_name: str, config: GenerateConfig, tools: bool
+    model_name: str,
+    *,
+    config: GenerateConfig,
+    service_tier: str | None,
+    tools: bool,
+    store: bool,
 ) -> dict[str, Any]:
     # TODO: we'll need a computer_use_preview bool for the 'include'
     # and 'reasoning' parameters
@@ -120,9 +131,9 @@ def completion_params_responses(
             f"OpenAI Responses API does not support the '{param}' parameter.",
         )
-    params: dict[str, Any] = dict(
-        model=model_name, store=is_computer_use_preview(model_name)
-    )
+    params: dict[str, Any] = dict(model=model_name, store=store)
+    if service_tier is not None:
+        params["service_tier"] = service_tier
     if config.max_tokens is not None:
         params["max_output_tokens"] = config.max_tokens
     if config.frequency_penalty is not None:
@@ -153,13 +164,14 @@ def completion_params_responses(
         unsupported_warning("top_logprobs")
     if tools and config.parallel_tool_calls is not None and not is_o_series(model_name):
         params["parallel_tool_calls"] = config.parallel_tool_calls
-    if (
-        config.reasoning_effort is not None
-        and not is_gpt(model_name)
-        and not is_o1_mini(model_name)
-        and not is_o1_preview(model_name)
-    ):
-        params["reasoning"] = dict(effort=config.reasoning_effort)
+    if is_o_series(model_name) and not is_o1_early(model_name):
+        reasoning: dict[str, str] = {}
+        if config.reasoning_effort is not None:
+            reasoning["effort"] = config.reasoning_effort
+        if config.reasoning_summary is not None:
+            reasoning["summary"] = config.reasoning_summary
+        if len(reasoning) > 0:
+            params["reasoning"] = reasoning
     if config.response_schema is not None:
         params["text"] = dict(
             format=ResponseFormatTextJSONSchemaConfigParam(

inspect_ai/model/_providers/openrouter.py CHANGED Viewed

@@ -111,6 +111,20 @@ class OpenRouterAPI(OpenAICompatibleAPI):
         # default params
         params = super().completion_params(config, tools)
+        # remove reasoning_effort it is exists
+        if "reasoning_effort" in params:
+            del params["reasoning_effort"]
+        # provide openrouter standard reasoning options
+        # https://openrouter.ai/docs/use-cases/reasoning-tokens
+        if config.reasoning_effort is not None or config.reasoning_tokens is not None:
+            reasoning: dict[str, str | int] = dict()
+            if config.reasoning_effort is not None:
+                reasoning["effort"] = config.reasoning_effort
+            if config.reasoning_tokens is not None:
+                reasoning["max_tokens"] = config.reasoning_tokens
+            params["reasoning"] = reasoning
         # pass args if specifed
         EXTRA_BODY = "extra_body"
         if self.models or self.provider or self.transforms:

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -105,7 +105,7 @@ def vertex() -> type[ModelAPI]:
 def google() -> type[ModelAPI]:
     FEATURE = "Google API"
     PACKAGE = "google-genai"
-    MIN_VERSION = "1.8.0"
+    MIN_VERSION = "1.12.1"
     # verify we have the package
     try:
@@ -267,7 +267,7 @@ def none() -> type[ModelAPI]:
 def validate_openai_client(feature: str) -> None:
     FEATURE = feature
     PACKAGE = "openai"
-    MIN_VERSION = "1.69.0"
+    MIN_VERSION = "1.75.0"
     # verify we have the package
     try:

inspect_ai/model/_providers/util/chatapi.py CHANGED Viewed

@@ -100,10 +100,20 @@ async def chat_api_request(
 # look at its `__cause__`. we've observed Cloudflare giving transient 500
 # status as well as a ReadTimeout, so we count these as rate limit errors
 def should_retry_chat_api_error(ex: BaseException) -> bool:
-    return isinstance(ex, RetryError) and (
-        (
-            isinstance(ex.__cause__, httpx.HTTPStatusError)
-            and is_retryable_http_status(ex.__cause__.response.status_code)
-        )
-        or isinstance(ex.__cause__, httpx.ReadTimeout)
-    )
+    # not a tenacity RetryError
+    if not isinstance(ex, RetryError):
+        return False
+    cause = ex.__cause__
+    if cause is None:
+        raise RuntimeError(f"Tenacity RetryError with no __cause__: {ex}")
+    if isinstance(cause, httpx.HTTPStatusError):
+        if is_retryable_http_status(cause.response.status_code):
+            return True
+    if httpx_should_retry(cause):
+        return True
+    return False

inspect_ai/model/_providers/vllm.py CHANGED Viewed

@@ -104,7 +104,7 @@ class VLLMAPI(ModelAPI):
         # set which GPUs are available to use
         if device is not None:
-            os.environ["CUDA_VISIBLE_DEVICES"] = str(device)
+            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(device)
         # tell vllm how many GPUs to use
         if "tensor_parallel_size" not in model_args:

inspect_ai/scorer/_metric.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import (
     Callable,
     ParamSpec,
     Protocol,
+    Type,
     Union,
     cast,
     overload,
@@ -24,6 +25,7 @@ from inspect_ai._util.registry import (
     registry_params,
     registry_tag,
 )
+from inspect_ai.dataset._dataset import MT, metadata_as
 logger = getLogger(__name__)
@@ -121,6 +123,20 @@ class SampleScore(BaseModel):
     sample_metadata: dict[str, Any] | None = Field(default=None)
     """Metadata from the sample"""
+    def sample_metadata_as(self, metadata_cls: Type[MT]) -> MT | None:
+        """Pydantic model interface to sample metadata.
+        Args:
+          metadata_cls: Pydantic model type
+        Returns:
+          BaseModel: Instance of metadata_cls bound to sample metadata.
+        """
+        if self.sample_metadata is not None:
+            return metadata_as(self.sample_metadata, metadata_cls)
+        else:
+            return None
     scorer: str | None = Field(default=None)
     """Registry name of scorer that created this score."""
@@ -265,7 +281,7 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
     Returns:
         Metric with registry info attribute
     """
-    return cast(Metric, registry_create("metric", name, **kwargs))
+    return registry_create("metric", name, **kwargs)
 def to_metric_specs(

inspect_ai/scorer/_model.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import re
 from functools import partial
-from typing import Callable
+from typing import Any, Callable
+from inspect_ai._util.content import Content, ContentText
 from inspect_ai._util.dict import omit
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.list import remove_last_match_and_after
@@ -13,6 +14,7 @@ from inspect_ai.model._chat_message import (
     ChatMessageUser,
 )
 from inspect_ai.model._model import Model, get_model
+from inspect_ai.model._model_output import ModelOutput
 from inspect_ai.solver._task_state import TaskState
 from inspect_ai.util import resource
@@ -166,16 +168,17 @@ def _model_graded_qa_single(
             question = state.input_text
         # format the scoring template
-        score_prompt = grading_template.format(
+        scoring_prompt = model_scoring_prompt(
+            template=grading_template,
             question=question,
-            answer=state.output.completion,
+            output=state.output,
             criterion=target.text,
             instructions=instructions,
-            **metadata,
+            metadata=metadata,
         )
         # query the model for the score
-        result = await model.generate(score_prompt)
+        result = await model.generate([scoring_prompt])
         # extract the grade
         match = re.search(grade_pattern or DEFAULT_GRADE_PATTERN, result.completion)
@@ -186,7 +189,7 @@ def _model_graded_qa_single(
                 explanation=result.completion,
                 metadata=dict(
                     grading=[
-                        ChatMessageUser(content=score_prompt),
+                        scoring_prompt,
                         result.message,
                     ]
                 ),
@@ -300,3 +303,45 @@ def chat_history(state: TaskState) -> str:
                 )
     return "\n\n".join(history)
+def model_scoring_prompt(
+    *,
+    template: str,
+    question: str,
+    output: ModelOutput,
+    criterion: str,
+    instructions: str,
+    metadata: dict[str, Any],
+) -> ChatMessageUser:
+    # we need to remove media objects from output and reference them as attachements in the answer
+    answer = output.completion
+    media: list[Content] = (
+        [
+            content
+            for content in output.message.content
+            if content.type in ["image", "audio", "video"]
+        ]
+        if len(output.choices) > 0 and isinstance(output.message.content, list)
+        else []
+    )
+    if len(media) > 0:
+        if len(answer) > 0:
+            answer = f"{answer} (see also attached media)"
+        else:
+            answer = "See attached media"
+    # format the prompt
+    prompt = template.format(
+        question=question,
+        answer=answer,
+        criterion=criterion,
+        instructions=instructions,
+        **metadata,
+    )
+    # return with media if necessary
+    if len(media) > 0:
+        return ChatMessageUser(content=[ContentText(text=prompt)] + media)
+    else:
+        return ChatMessageUser(content=prompt)

inspect_ai/scorer/_scorer.py CHANGED Viewed

@@ -117,7 +117,7 @@ def scorer_create(name: str, **kwargs: Any) -> Scorer:
     Returns:
         Scorer with registry info attribute
     """
-    return cast(Scorer, registry_create("scorer", name, **kwargs))
+    return registry_create("scorer", name, **kwargs)
 def scorer(

inspect_ai/solver/_human_agent.py CHANGED Viewed

@@ -13,6 +13,7 @@ def human_agent(
     answer: bool | str = True,
     intermediate_scoring: bool = False,
     record_session: bool = True,
+    user: str | None = None,
 ) -> Solver:
     """Human solver for agentic tasks that run in a Linux environment.
@@ -32,6 +33,7 @@ def human_agent(
           that the answer matches the expected format.
        intermediate_scoring: Allow the human agent to check their score while working.
        record_session: Record all user commands and outputs in the sandbox bash session.
+       user: User to login as. Defaults to the sandbox environment's default user.
     Returns:
        Solver: Human agent solver.
@@ -48,5 +50,6 @@ def human_agent(
             answer=answer,
             intermediate_scoring=intermediate_scoring,
             record_session=record_session,
+            user=user,
         )
     )

inspect_ai/solver/_plan.py CHANGED Viewed

@@ -230,4 +230,4 @@ def plan_create(name: str, **kwargs: Any) -> Plan:
     Returns:
         Plan with registry info attribute
     """
-    return cast(Plan, registry_create("plan", name, **kwargs))  # type: ignore[arg-type]
+    return registry_create("plan", name, **kwargs)

inspect_ai/solver/_solver.py CHANGED Viewed

@@ -136,7 +136,7 @@ def solver_create(name: str, **kwargs: Any) -> Solver:
     Returns:
         Solver with registry info attribute
     """
-    return cast(Solver, registry_create("solver", name, **kwargs))
+    return registry_create("solver", name, **kwargs)
 SolverType: TypeAlias = Solver | Agent

inspect_ai/solver/_use_tools.py CHANGED Viewed

@@ -1,4 +1,7 @@
+from typing import Sequence
 from inspect_ai.tool import Tool, ToolChoice
+from inspect_ai.tool._tool import ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from ._solver import Generate, Solver, solver
@@ -7,7 +10,7 @@ from ._task_state import TaskState
 @solver
 def use_tools(
-    *tools: Tool | list[Tool],
+    *tools: Tool | ToolDef | ToolSource | Sequence[Tool | ToolDef | ToolSource],
     tool_choice: ToolChoice | None = "auto",
     append: bool = False,
 ) -> Solver:
@@ -34,17 +37,20 @@ def use_tools(
         tools_update: list[Tool] = []
         # add tool function to take care of tool/tool_def
-        def add_tool(tool: Tool | ToolDef) -> None:
-            if isinstance(tool, ToolDef):
-                tool = tool.as_tool()
-            tools_update.append(tool)
+        async def add_tools(tool: Tool | ToolDef | ToolSource) -> None:
+            if isinstance(tool, ToolSource):
+                tools_update.extend(await tool.tools())
+            else:
+                if isinstance(tool, ToolDef):
+                    tool = tool.as_tool()
+                tools_update.append(tool)
         for tool in tools:
-            if isinstance(tool, list):
+            if isinstance(tool, Sequence):
                 for t in tool:
-                    add_tool(t)
+                    await add_tools(t)
             else:
-                add_tool(tool)
+                await add_tools(tool)
         if len(tools_update) > 0:
             if append:
                 existing_tools = state.tools

inspect_ai/tool/__init__.py CHANGED Viewed

@@ -8,7 +8,15 @@ from inspect_ai._util.content import (
 )
 from inspect_ai._util.deprecation import relocated_module_attribute
-from ._tool import Tool, ToolError, ToolResult, tool
+from ._mcp import (
+    MCPServer,
+    mcp_connection,
+    mcp_server_sandbox,
+    mcp_server_sse,
+    mcp_server_stdio,
+    mcp_tools,
+)
+from ._tool import Tool, ToolError, ToolResult, ToolSource, tool
 from ._tool_call import (
     ToolCall,
     ToolCallContent,
@@ -45,6 +53,13 @@ __all__ = [
     "ToolCallError",
     "ToolError",
     "ToolResult",
+    "ToolSource",
+    "mcp_tools",
+    "mcp_connection",
+    "mcp_server_stdio",
+    "mcp_server_sse",
+    "mcp_server_sandbox",
+    "MCPServer",
     "Content",
     "ContentAudio",
     "ContentImage",

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl

inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl