inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/model/_cache.py
CHANGED

@@ -58,22 +58,23 @@ def _parse_expiry(period: str) -> int:
 class CachePolicy:
     """The `CachePolicy` is used to define various criteria that impact how model calls are cached.

-    … (16 removed lines; their text is not preserved in this diff rendering)
+    `expiry`: Default "24h". The expiry time for the cache entry.
+      This is a string of the format "12h" for 12 hours or "1W" for a week,
+      etc. This is how long we will keep the cache entry, if we access it
+      after this point we'll clear it. Setting to `None` will cache
+      indefinitely.
+
+    `per_epoch`: Default True. By default we cache responses separately
+      for different epochs. The general use case is that if there are
+      multiple epochs, we should cache each response separately because
+      scorers will aggregate across epochs. However, sometimes a response
+      can be cached regardless of epoch if the call being made isn't under
+      test as part of the evaluation. If False, this option allows you to
+      bypass that and cache independently of the epoch.
+
+    `scopes`: A dictionary of additional metadata that should
+      be included in the cache key. This allows for more fine-grained
+      control over the cache key generation.
     """

     def __init__(
@@ -82,6 +83,14 @@ class CachePolicy:
         per_epoch: bool = True,
         scopes: dict[str, str] = {},
     ) -> None:
+        """Create a CachePolicy.
+
+        Args:
+            expiry: Expiry.
+            per_epoch: Per epoch
+            scopes: Scopes
+
+        """
         self.per_epoch = per_epoch
         self.scopes = scopes

@@ -236,7 +245,11 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:


 def cache_clear(model: str = "") -> bool:
-    """Clear the cache directory."""
+    """Clear the cache directory.
+
+    Args:
+        model: Model to clear cache for.
+    """
     try:
         path = cache_path(model)

@@ -252,6 +265,11 @@ def cache_clear(model: str = "") -> bool:


 def cache_path(model: str = "") -> Path:
+    """Path to cache directory.
+
+    Args:
+        model: Path to cache directory for specific model.
+    """
     env_cache_dir = os.environ.get("INSPECT_CACHE_DIR", None)
     if env_cache_dir:
         generate_cache = Path(env_cache_dir) / "generate"
@@ -320,9 +338,9 @@ def cache_size(
     will be calculated.

     Args:
-        subdirs
+        subdirs: List of folders to filter by, which are generally
            model names. Empty directories will be ignored.
-        files
+        files: List of files to filter by explicitly. Note that
            return value group these up by their parent directory

     Returns:
@@ -344,7 +362,7 @@ def cache_list_expired(filter_by: list[str] = []) -> list[Path]:
     """Returns a list of all the cached files that have passed their expiry time.

     Args:
-        filter_by
+        filter_by: Default []. List of model names to filter by. If
            an empty list, this will search the entire cache.
     """
     expired_cache_entries = []
@@ -384,7 +402,7 @@ def cache_prune(files: list[Path] = []) -> None:
     """Delete all expired cache entries.

     Args:
-        files
+        files: List of files to prune. If empty, this
           will search the entire cache.
     """
     if not files:
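The reworked `CachePolicy` docstring above documents `expiry`, `per_epoch`, and `scopes`. A minimal usage sketch against those signatures, assuming `CachePolicy`, `cache_path`, and `cache_clear` are importable from `inspect_ai.model` as in the released package (the model name is illustrative):

```python
from inspect_ai.model import CachePolicy, cache_clear, cache_path

# cache generate() responses for a week, keyed additionally by an experiment
# id; per_epoch=False shares entries across epochs (for calls not under test)
policy = CachePolicy(expiry="1W", per_epoch=False, scopes={"experiment": "exp-42"})

# inspect where entries for a model are stored, then clear them
print(cache_path("openai/gpt-4o-mini"))
cache_clear("openai/gpt-4o-mini")
```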
inspect_ai/model/_call_tools.py
CHANGED

@@ -187,7 +187,7 @@ async def call_tools(
         view=call.view,
         pending=True,
     )
-    event.set_task(task)
+    event._set_task(task)
     transcript()._event(event)

     # execute the tool call. if the operator cancelled the
@@ -227,7 +227,7 @@ async def call_tools(
     conversation_tool_mesage(tool_message)

     # update the event with the results
-    event.set_result(
+    event._set_result(
         result=result_event.result,
         truncated=result_event.truncated,
         error=result_event.error,
inspect_ai/model/_chat_message.py
CHANGED

@@ -13,8 +13,13 @@ logger = getLogger(__name__)


 class ChatMessageBase(BaseModel):
+    """Base class for chat messages."""
+
+    role: Literal["system", "user", "assistant", "tool"]
+    """Conversation role"""
+
     content: str | list[Content]
-    """Content (simple string or list of
+    """Content (simple string or list of content objects)"""

     source: Literal["input", "generate"] | None = Field(default=None)
     """Source of message."""
@@ -31,9 +36,6 @@ class ChatMessageBase(BaseModel):
         property returns either the plain str content, or if the
         content is a list of text and images, the text items
         concatenated together (separated by newline)
-
-        Returns: Text content of `ChatMessage` If this message does
-          not have text content then "" is returned.
         """
         if isinstance(self.content, str):
             return self.content
@@ -66,11 +68,15 @@ class ChatMessageBase(BaseModel):


 class ChatMessageSystem(ChatMessageBase):
+    """System chat message."""
+
     role: Literal["system"] = Field(default="system")
     """Conversation role."""


 class ChatMessageUser(ChatMessageBase):
+    """User chat message."""
+
     role: Literal["user"] = Field(default="user")
     """Conversation role."""

@@ -79,6 +85,8 @@ class ChatMessageUser(ChatMessageBase):


 class ChatMessageAssistant(ChatMessageBase):
+    """Assistant chat message."""
+
     role: Literal["assistant"] = Field(default="assistant")
     """Conversation role."""

@@ -112,6 +120,8 @@ class ChatMessageAssistant(ChatMessageBase):


 class ChatMessageTool(ChatMessageBase):
+    """Tool chat message."""
+
     role: Literal["tool"] = Field(default="tool")
     """Conversation role."""
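The message classes gaining docstrings here are the ones callers construct directly. A small sketch, assuming the standard `inspect_ai.model` exports:

```python
from inspect_ai.model import ChatMessageSystem, ChatMessageUser

messages = [
    ChatMessageSystem(content="You are a terse assistant."),
    ChatMessageUser(content="Name the largest planet."),
]
# .text flattens str-or-content-list content into plain text (see above)
print(messages[1].text)
```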
inspect_ai/model/_generate_config.py
CHANGED

@@ -80,7 +80,7 @@ class GenerateConfigArgs(TypedDict, total=False):


 class GenerateConfig(BaseModel):
-    """
+    """Model generation options."""

     max_retries: int | None = Field(default=None)
     """Maximum number of times to retry request (defaults to 5)."""
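For context, `GenerateConfig` is the options object the new one-line docstring describes; a sketch using a few fields defined in this file:

```python
from inspect_ai.model import GenerateConfig

# a few of the generation options this class defines
# (max_retries defaults to 5 per the field docstring)
config = GenerateConfig(max_retries=3, temperature=0.2, max_tokens=256)
```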
inspect_ai/model/_model.py
CHANGED

@@ -149,7 +149,11 @@ class ModelAPI(abc.ABC):
         return "default"

     def is_rate_limit(self, ex: BaseException) -> bool:
-        """Is this exception a rate limit error."""
+        """Is this exception a rate limit error.
+
+        Args:
+            ex: Exception to check for rate limit.
+        """
         return False

     def collapse_user_messages(self) -> bool:
@@ -176,12 +180,18 @@
 class Model:
     """Model interface."""

+    api: ModelAPI
+    """Model API."""
+
+    config: GenerateConfig
+    """Generation config."""
+
     def __init__(self, api: ModelAPI, config: GenerateConfig) -> None:
         """Create a model.

         Args:
-            api
-            config
+            api: Model API provider.
+            config: Model configuration.
         """
         self.api = api
         self.config = config
@@ -212,16 +222,12 @@
         """Generate output from the model.

         Args:
-            input (str
-              input (if a `str` is passed it is converted
+            input: Chat message input (if a `str` is passed it is converted
              to a `ChatMessageUser`).
-            tools
-
-
-
-            cache (bool | CachePolicy): Caching behavior for
-              generate responses (defaults to no caching).
-            config (GenerateConfig): Model configuration.
+            tools: Tools available for the model to call.
+            tool_choice: Directives to the model as to which tools to prefer.
+            config: Model configuration.
+            cache: Caching behavior for generate responses (defaults to no caching).

         Returns:
            ModelOutput
@@ -517,7 +523,8 @@ class Model:
     ) -> None:
         # trace
         if isinstance(result, ModelOutput):
-            conversation_assistant_message(input, result.choices[0].message)
+            if result.choices:
+                conversation_assistant_message(input, result.choices[0].message)
             event.output = result
         else:
             conversation_assistant_error(result)
@@ -550,7 +557,7 @@ class ModelName:
         """Create a ModelName.

         Args:
-            model:
+            model: Model to create name for.
         """
         if isinstance(model, str):
             (api, name) = self._parse_model(model)
@@ -596,16 +603,16 @@ def get_model(
     """Get an instance of a model.

     Args:
-        model
-
-
-
-
-        config
-        base_url
-        api_key
-        **model_args
-
+        model: Model specification.
+          If `Model` is passed it is returned unmodified,
+          if `None` is passed then the model currently being
+          evaluated is returned (or if there is no evaluation
+          then the model referred to by `INSPECT_EVAL_MODEL`).
+        config: Configuration for model.
+        base_url: Optional. Alternate base URL for model.
+        api_key: Optional. API key for model.
+        **model_args: Additional args to
+          pass to model constructor.

     Returns:
        Model instance.
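The rewritten `get_model()` and `Model.generate()` docstrings correspond to usage like the following sketch (the model name is illustrative, and `generate` is async so it runs under an event loop):

```python
import asyncio

from inspect_ai.model import CachePolicy, GenerateConfig, get_model

async def demo() -> None:
    # get_model(model, config=..., base_url=..., api_key=..., **model_args)
    model = get_model("openai/gpt-4o-mini", config=GenerateConfig(temperature=0.0))
    # generate(input, tools=..., tool_choice=..., config=..., cache=...)
    output = await model.generate("Say hello.", cache=CachePolicy(expiry="24h"))
    print(output.completion)

asyncio.run(demo())
```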
inspect_ai/model/_model_output.py
CHANGED

@@ -9,6 +9,8 @@ from ._chat_message import ChatMessageAssistant


 class ModelUsage(BaseModel):
+    """Token usage for completion."""
+
     input_tokens: int = Field(default=0)
     """Total input tokens used."""

@@ -73,6 +75,8 @@ class Logprobs(BaseModel):


 class ChatCompletionChoice(BaseModel):
+    """Choice generated for completion."""
+
     message: ChatMessageAssistant
     """Assistant message."""

@@ -96,6 +100,8 @@ class ChatCompletionChoice(BaseModel):


 class ModelOutput(BaseModel):
+    """Output from model generation."""
+
     model: str = Field(default_factory=str)
     """Model used for generation."""

@@ -155,7 +161,14 @@ class ModelOutput(BaseModel):
         stop_reason: StopReason = "stop",
         error: str | None = None,
     ) -> "ModelOutput":
-        """
+        """Create ModelOutput from simple text content.
+
+        Args:
+            model: Model name.
+            content: Text content from generation.
+            stop_reason: Stop reason for generation.
+            error: Error message.
+        """
         return ModelOutput(
             model=model,
             choices=[
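`ModelOutput.from_content()` now documents its arguments; a sketch of a call matching that signature (the `mockllm/model` name is illustrative):

```python
from inspect_ai.model import ModelOutput

output = ModelOutput.from_content(
    model="mockllm/model",  # illustrative model name
    content="The answer is 42.",
    stop_reason="stop",
)
print(output.choices[0].message.text)
```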
inspect_ai/model/_openai.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import re
 from typing import Literal

 from openai.types.chat import (
@@ -44,29 +45,13 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason


 def is_o_series(name: str) -> bool:
-    return
-
-
-def is_o1(name: str) -> bool:
-    return name.startswith("o1")
-
-
-def is_o3(name: str) -> bool:
-    return name.startswith("o3")
-
-
-def is_o1_full(name: str) -> bool:
-    return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
+    return bool(re.match(r"^o\d+", name))


 def is_o1_mini(name: str) -> bool:
     return name.startswith("o1-mini")


-def is_o3_mini(name: str) -> bool:
-    return name.startswith("o3-mini")
-
-
 def is_o1_preview(name: str) -> bool:
     return name.startswith("o1-preview")

@@ -132,10 +117,17 @@ async def openai_chat_message(
     message: ChatMessage, model: str
 ) -> ChatCompletionMessageParam:
     if message.role == "system":
-        if is_o_series(model):
+        # o1-mini does not support developer or system messages
+        # (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
+        if is_o1_mini(model):
+            return ChatCompletionUserMessageParam(role="user", content=message.text)
+        # other o-series models use 'developer' rather than 'system' messages
+        # https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
+        elif is_o_series(model):
             return ChatCompletionDeveloperMessageParam(
                 role="developer", content=message.text
             )
+        # gpt models use standard 'system' messages
         else:
             return ChatCompletionSystemMessageParam(
                 role=message.role, content=message.text
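The consolidated `is_o_series()` check replaces the per-model helpers with one anchored regex; a standalone demonstration of the pattern's behavior:

```python
import re

def is_o_series(name: str) -> bool:
    # anchored match: an "o" followed by one or more digits at the start
    return bool(re.match(r"^o\d+", name))

assert is_o_series("o1")
assert is_o_series("o1-mini")
assert is_o_series("o3-mini")
assert not is_o_series("gpt-4o")      # "o" is not at the start
assert not is_o_series("omni-model")  # no digit after the leading "o"
```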
inspect_ai/model/_providers/google.py
CHANGED

@@ -5,7 +5,7 @@ import json
 from copy import copy
 from io import BytesIO
 from logging import getLogger
-from typing import Any, cast
+from typing import Any, MutableSequence, cast

 import proto  # type: ignore
 from google.ai.generativelanguage import (
@@ -553,11 +553,15 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi


 def completion_choices_from_candidates(
-    candidates:
+    candidates: MutableSequence[Candidate],
 ) -> list[ChatCompletionChoice]:
-
-
-
+    if candidates:
+        candidates_list = sorted(candidates, key=lambda c: c.index)
+        return [
+            completion_choice_from_candidate(candidate) for candidate in candidates_list
+        ]
+    else:
+        return []


 # google doesn't export FinishReason (it's in a sub-namespace with a beta
inspect_ai/model/_providers/openai.py
CHANGED

@@ -36,10 +36,8 @@ from .._model_output import (
 )
 from .._openai import (
     is_gpt,
-    is_o1_full,
     is_o1_mini,
     is_o1_preview,
-    is_o3,
     is_o_series,
     openai_chat_messages,
     openai_chat_tool_choice,
@@ -145,15 +143,9 @@ class OpenAIAPI(ModelAPI):
     def is_o_series(self) -> bool:
         return is_o_series(self.model_name)

-    def is_o1_full(self) -> bool:
-        return is_o1_full(self.model_name)
-
     def is_o1_mini(self) -> bool:
         return is_o1_mini(self.model_name)

-    def is_o3(self) -> bool:
-        return is_o3(self.model_name)
-
     def is_o1_preview(self) -> bool:
         return is_o1_preview(self.model_name)

@@ -303,7 +295,11 @@ class OpenAIAPI(ModelAPI):
             params["top_logprobs"] = config.top_logprobs
         if tools and config.parallel_tool_calls is not None and not self.is_o_series():
             params["parallel_tool_calls"] = config.parallel_tool_calls
-        if
+        if (
+            config.reasoning_effort is not None
+            and not self.is_gpt()
+            and not self.is_o1_mini()
+        ):
             params["reasoning_effort"] = config.reasoning_effort

         return params
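The new guard forwards `reasoning_effort` only for o-series models other than o1-mini (and drops it for gpt models). A sketch of how that option is supplied from user code, assuming the `reasoning_effort` field on `GenerateConfig` (model name illustrative):

```python
from inspect_ai.model import GenerateConfig, get_model

# reasoning_effort is forwarded for o-series models other than o1-mini,
# per the guard above
model = get_model("openai/o3-mini", config=GenerateConfig(reasoning_effort="medium"))
```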
inspect_ai/scorer/__init__.py
CHANGED

@@ -10,6 +10,8 @@ from ._metric import (
     NOANSWER,
     PARTIAL,
     Metric,
+    MetricProtocol,
+    SampleScore,
     Score,
     Value,
     ValueToFloat,
@@ -18,7 +20,7 @@ from ._metric import (
 )
 from ._metrics.accuracy import accuracy
 from ._metrics.mean import mean
-from ._metrics.std import bootstrap_stderr, std, stderr
+from ._metrics.std import bootstrap_stderr, std, stderr, var
 from ._model import model_graded_fact, model_graded_qa
 from ._multi import multi_scorer
 from ._pattern import pattern
@@ -56,9 +58,12 @@ __all__ = [
     "std",
     "stderr",
     "mean",
+    "var",
     "Metric",
+    "MetricProtocol",
     "metric",
     "Score",
+    "SampleScore",
     "score",
     "Value",
     "ValueToFloat",
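The newly exported `var()` metric can be declared like any other metric; a toy scorer sketch, assuming the standard `@scorer` decorator API:

```python
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer, stderr, var
from inspect_ai.solver import TaskState

# declare the newly exported var() metric alongside accuracy and stderr
@scorer(metrics=[accuracy(), stderr(), var()])
def exact() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        correct = state.output.completion.strip() == target.text
        return Score(value=1.0 if correct else 0.0)

    return score
```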
inspect_ai/scorer/_answer.py
CHANGED

@@ -43,7 +43,7 @@ def answer(pattern: Literal["letter", "word", "line"]) -> Scorer:
     Note that you must specify a `type` for the answer scorer.

     Args:
-        pattern:
+        pattern: Type of answer
           to extract. "letter" is used with multiple choice and
           extracts a single letter; "word" will extract the next
           word (often used for yes/no answers); "line" will take
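Per the clarified `pattern` argument, a sketch of the three extraction modes:

```python
from inspect_ai.scorer import answer

letter = answer("letter")  # single letter, e.g. multiple choice
word = answer("word")      # next word, e.g. yes/no answers
line = answer("line")      # the answer line that follows
```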
inspect_ai/scorer/_classification.py
CHANGED

@@ -17,6 +17,10 @@ def f1(
     """Scorer which produces an F1 score

     Computes the `F1` score for the answer (which balances recall precision by taking the harmonic mean between recall and precision).
+
+    Args:
+        answer_fn: Custom function to extract the answer from the completion (defaults to using the completion).
+        stop_words: Stop words to include in answer tokenization.
     """

     async def score(state: TaskState, target: Target) -> Score:
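A sketch of calling `f1()` with the two documented arguments (the assumption here is that `stop_words` accepts a list of strings):

```python
from inspect_ai.scorer import f1

# answer_fn post-processes the completion before scoring; stop_words are
# applied during answer tokenization (both per the Args documented above)
qa_scorer = f1(answer_fn=None, stop_words=["a", "an", "the"])
```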
inspect_ai/scorer/_match.py
CHANGED

@@ -15,12 +15,11 @@ def match(
     """Scorer which matches text or a number.

     Args:
-        location
-           Location to match at. "any" matches anywhere in the
+        location: Location to match at. "any" matches anywhere in the
           output; "exact" requires the output be exactly
           equal to the target (module whitespace, etc.)
-        ignore_case
-        numeric
+        ignore_case: Do case insensitive comparison.
+        numeric: Is this a numeric match? (in this
           case different punctuation removal rules are
           used and numbers are normalized before comparison).
     """
@@ -42,7 +41,7 @@ def includes(ignore_case: bool = True) -> Scorer:
     """Check whether the specified text is included in the model output.

     Args:
-        ignore_case
+        ignore_case: Use a case insensitive comparison.

     """
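A sketch constructing the two scorers with the arguments documented above:

```python
from inspect_ai.scorer import includes, match

# numeric matching normalizes numbers before comparison
numeric_exact = match(location="exact", numeric=True)

# case-insensitive containment check
contains = includes(ignore_case=True)
```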