PyPI - inspect-ai - Versions diffs - 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl - Mend

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (430) hide show

inspect_ai/model/_providers/openai_o1.py CHANGED Viewed

@@ -27,11 +27,7 @@ from inspect_ai.tool import ToolCall, ToolInfo
 from .._call_tools import parse_tool_call, tool_parse_error_message
 from .._model_call import ModelCall
 from .._model_output import ModelUsage, StopReason, as_stop_reason
-from .._providers.util import (
-    ChatAPIHandler,
-    ChatAPIMessage,
-    chat_api_input,
-)
+from .._providers.util import ChatAPIHandler, ChatAPIMessage, chat_api_input
 logger = getLogger(__name__)
@@ -85,6 +81,8 @@ def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput | Excepti
         stop_reason: StopReason | None = "model_length"
     elif ex.code == "invalid_prompt":
         stop_reason = "content_filter"
+    else:
+        stop_reason = None
     if stop_reason:
         return ModelOutput.from_content(

inspect_ai/model/_providers/openrouter.py ADDED Viewed

@@ -0,0 +1,86 @@
+import os
+from typing import Any
+from typing_extensions import override
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai.model._providers.util import model_base_url
+from inspect_ai.model._providers.util.util import environment_prerequisite_error
+from .._generate_config import GenerateConfig
+from .openai import OpenAIAPI
+OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
+class OpenRouterAPI(OpenAIAPI):
+    """OpenAIAPI subclass that sends requests to OpenRouter.
+
+    Besides the regular OpenAIAPI arguments, three optional model args are
+    consumed by the constructor and forwarded in the request `extra_body`:
+    `models` (list), `provider` (dict) and `transforms` (list).
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: str | None = None,
+        api_key: str | None = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ) -> None:
+        """Resolve the API key and base URL, then initialise the parent class.
+
+        Args:
+            model_name: Model name passed through to OpenAIAPI.
+            base_url: Explicit endpoint; otherwise resolved via
+                `model_base_url(..., "OPENROUTER_BASE_URL")`, falling back to
+                "https://openrouter.ai/api/v1".
+            api_key: Explicit key; otherwise read from the
+                OPENROUTER_API_KEY environment variable.
+            config: Generation config passed through to OpenAIAPI.
+            **model_args: Extra args. `models`, `provider` and `transforms`
+                are removed here; the rest is passed to OpenAIAPI.
+
+        Raises:
+            PrerequisiteError: If `models` or `transforms` is not a list, or
+                `provider` is not a dict. The error built by
+                `environment_prerequisite_error` is raised when no API key
+                is available.
+        """
+        # api_key
+        if not api_key:
+            api_key = os.environ.get(OPENROUTER_API_KEY, None)
+            if not api_key:
+                raise environment_prerequisite_error("OpenRouter", OPENROUTER_API_KEY)
+        # base_url
+        base_url = model_base_url(base_url, "OPENROUTER_BASE_URL")
+        base_url = base_url if base_url else "https://openrouter.ai/api/v1"
+        # collect known model args that we forward to generate
+        def collect_model_arg(name: str) -> Any | None:
+            # remove the arg from model_args so it is not also passed to super()
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+        # models arg
+        self.models = collect_model_arg("models")
+        if self.models is not None:
+            if not isinstance(self.models, list):
+                raise PrerequisiteError("models must be a list of strings")
+        # providers arg
+        self.provider = collect_model_arg("provider")
+        if self.provider is not None:
+            if not isinstance(self.provider, dict):
+                raise PrerequisiteError("provider must be a dict")
+        # transforms arg
+        self.transforms = collect_model_arg("transforms")
+        if self.transforms is not None:
+            if not isinstance(self.transforms, list):
+                raise PrerequisiteError("transforms must be a list of strings")
+        # call super
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+            **model_args,
+        )
+    @override
+    def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, Any]:
+        """Return the parent's completion params plus OpenRouter routing args.
+
+        When any of `models`, `provider` or `transforms` is set (truthy), it
+        is added to the `extra_body` entry of the returned params, keeping
+        whatever `extra_body` the parent already produced.
+        """
+        # default params
+        params = super().completion_params(config, tools)
+        # pass args if specified
+        EXTRA_BODY = "extra_body"
+        if self.models or self.provider or self.transforms:
+            params[EXTRA_BODY] = params.get(EXTRA_BODY, {})
+            if self.models:
+                params[EXTRA_BODY]["models"] = self.models
+            if self.provider:
+                params[EXTRA_BODY]["provider"] = self.provider
+            if self.transforms:
+                # key must be spelled "transforms" to match the constructor arg
+                # and the OpenRouter request field
+                params[EXTRA_BODY]["transforms"] = self.transforms
+        return params

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -16,7 +16,7 @@ from .._registry import modelapi
 def groq() -> type[ModelAPI]:
     FEATURE = "Groq API"
     PACKAGE = "groq"
-    MIN_VERSION = "0.11.0"
+    MIN_VERSION = "0.16.0"
     # verify we have the package
     try:
@@ -198,6 +198,17 @@ def ollama() -> type[ModelAPI]:
     return OllamaAPI
+@modelapi(name="openrouter")
+def openrouter() -> type[ModelAPI]:
+    """Return the OpenRouterAPI class, registered under the name "openrouter"."""
+    # validate (runs before the provider module is imported;
+    # NOTE(review): presumably checks the openai client package, confirm)
+    validate_openai_client("OpenRouter API")
+    # in the clear: import inside the function so the provider module is
+    # only loaded once validation has passed
+    from .openrouter import OpenRouterAPI
+    return OpenRouterAPI
 @modelapi(name="llama-cpp-python")
 def llama_cpp_python() -> type[ModelAPI]:
     # validate

inspect_ai/model/_reasoning.py ADDED Viewed

@@ -0,0 +1,17 @@
+import re
+from typing import NamedTuple
+class ContentWithReasoning(NamedTuple):
+    """Content split into its reasoning part and the remaining text."""
+    # remaining text (everything after the closing </think> tag, stripped)
+    content: str
+    # text found between <think> and </think>, stripped
+    reasoning: str
+def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
+    """Split a leading `<think>...</think>` block from the rest of `content`.
+
+    Args:
+        content: Text that may begin (after optional whitespace) with a
+            `<think>` block.
+
+    Returns:
+        A ContentWithReasoning with both parts stripped of surrounding
+        whitespace, or None when `content` does not start with a
+        `<think>...</think>` block.
+    """
+    # re.match anchors at the start; the non-greedy group stops at the first
+    # </think>, and DOTALL lets both groups span multiple lines
+    match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
+    if match:
+        return ContentWithReasoning(
+            content=match.group(2).strip(), reasoning=match.group(1).strip()
+        )
+    else:
+        return None

inspect_ai/scorer/_answer.py CHANGED Viewed

@@ -8,7 +8,7 @@ from inspect_ai._util.pattern import (
 )
 from ._metrics import accuracy, stderr
-from ._pattern import pattern
+from ._pattern import pattern as make_pattern
 from ._scorer import Scorer, scorer
@@ -33,7 +33,7 @@ class AnswerPattern(str, Enum):
 @scorer(metrics=[accuracy(), stderr()])
-def answer(type: Literal["letter", "word", "line"]) -> Scorer:
+def answer(pattern: Literal["letter", "word", "line"]) -> Scorer:
     """Scorer for model output that preceded answers with ANSWER:.
     Some solvers including multiple_choice solicit answers from
@@ -43,7 +43,7 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
     Note that you must specify a `type` for the answer scorer.
     Args:
-      type: (Literal["letter", "word", "line"]): Type of answer
+      pattern: (Literal["letter", "word", "line"]): Type of answer
         to extract. "letter" is used with multiple choice and
         extracts a single letter; "word" will extract the next
         word (often used for yes/no answers); "line" will take
@@ -53,10 +53,10 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
         with a separate line at the end.
     """
-    match type:
+    match pattern:
         case "letter":
-            return pattern(AnswerPattern.LETTER)
+            return make_pattern(AnswerPattern.LETTER)
         case "word":
-            return pattern(AnswerPattern.WORD)
+            return make_pattern(AnswerPattern.WORD)
         case "line":
-            return pattern(AnswerPattern.LINE)
+            return make_pattern(AnswerPattern.LINE)

inspect_ai/scorer/_classification.py CHANGED Viewed

@@ -12,7 +12,7 @@ from ._target import Target
 @scorer(metrics=[mean(), stderr()])
 def f1(
-    answer_fn: Callable[[str], str] | None = None,
+    answer_fn: Callable[[str], str] | None = None, stop_words: list[str] | None = None
 ) -> Scorer:
     """Scorer which produces an F1 score
@@ -26,7 +26,7 @@ def f1(
         )
         targets = target.target
-        f1_score = max_f1_score(answer, targets)
+        f1_score = max_f1_score(answer, targets, stop_words=stop_words)
         return Score(
             value=f1_score,
             answer=answer,
@@ -53,12 +53,14 @@ def exact() -> Scorer:
     return score
-def max_f1_score(answer: str, targets: List[str]) -> float:
+def max_f1_score(
+    answer: str, targets: List[str], stop_words: list[str] | None = None
+) -> float:
     # Find the maximum F1 score for this answer
     max_f1 = 0.0
     for target in targets:
         if target[0].strip():
-            f1_score = compute_f1(answer, target)
+            f1_score = compute_f1(answer, target, stop_words)
             max_f1 = max(max_f1, f1_score)
     return round(max_f1, 2)
@@ -75,18 +77,16 @@ def max_exact_score(answer: str, targets: List[str]) -> float:
     return max_exact
-def compute_f1(answer: str, target: str) -> float:
+def compute_f1(answer: str, target: str, stop_words: list[str] | None = None) -> float:
     """Takes a predicted answer and a gold answer (that are both either a string or a list of strings), and returns exact match and the SQuAD F1 metric for the prediction."""
-    answer_words = _to_words(answer)
-    target_words = _to_words(target)
+    answer_words = _to_words(answer, stop_words)
+    target_words = _to_words(target, stop_words)
     return _f1(answer_words=answer_words, target_words=target_words)
-def _to_words(
-    answer: str,
-) -> set[str]:
-    normalized = _normalize(answer)
+def _to_words(answer: str, stop_words: list[str] | None = None) -> set[str]:
+    normalized = _normalize(answer, stop_words)
     token_bag = set(normalized.split())
     return token_bag
@@ -147,16 +147,32 @@ def _tokenize(text: str) -> List[str]:
     return re.split(" |-", text)
-def _normalize(answer: str) -> str:
+def _normalize(text: str, stop_words: list[str] | None = None) -> str:
     """Normalize text to remove extraneous characters and words."""
     tokens = []
-    tokenized_answer = _tokenize(answer)
+    tokenized_answer = _tokenize(text)
+    # Process stop words, if present
+    if stop_words is not None:
+        folded_stop_words = [_normalize_token(word) for word in stop_words]
+    else:
+        folded_stop_words = []
+    # Now process the text
     for token in tokenized_answer:
-        token = _remove_punc(token.casefold())
-        token = _normalize_number(token)
-        token = _remove_articles(token)
-        token = _normalize_whitespace(token)
-        tokens.append(token)
+        token = _normalize_token(token)
+        if folded_stop_words is None or token not in folded_stop_words:
+            tokens.append(token)
+    # re-join the tokens into a normalized string
     tokens = [token for token in tokens if token.strip()]
     normalized = " ".join(tokens).strip()
     return normalized
+def _normalize_token(token: str) -> str:
+    """Normalize a single token.
+
+    Applies, in order: casefold, `_remove_punc`, `_normalize_number`,
+    `_remove_articles` and `_normalize_whitespace`, and returns the result.
+    """
+    token = _remove_punc(token.casefold())
+    token = _normalize_number(token)
+    token = _remove_articles(token)
+    token = _normalize_whitespace(token)
+    return token

inspect_ai/scorer/_common.py CHANGED Viewed

@@ -25,19 +25,13 @@ def str_match_scorer(match: Callable[[str, str], tuple[str, bool]]) -> Scorer:
         for value in target:
             answer, matched = match(state.output.completion, value)
             if matched:
-                explanation = (
-                    state.output.completion
-                    if state.output.completion != answer
-                    else None
-                )
                 return Score(
                     value=CORRECT, answer=answer, explanation=state.output.completion
                 )
-        explanation = (
-            state.output.completion if state.output.completion != answer else None
+        return Score(
+            value=INCORRECT, answer=answer, explanation=state.output.completion
         )
-        return Score(value=INCORRECT, answer=answer, explanation=explanation)
     return score

inspect_ai/solver/_basic_agent.py CHANGED Viewed

@@ -1,8 +1,9 @@
 from logging import getLogger
-from typing import Callable, cast
+from typing import Awaitable, Callable, cast
 from typing_extensions import TypedDict, Unpack
+from inspect_ai._util._async import is_callable_coroutine
 from inspect_ai.model._cache import CachePolicy
 from inspect_ai.model._call_tools import call_tools
 from inspect_ai.model._chat_message import ChatMessageTool, ChatMessageUser
@@ -58,7 +59,9 @@ def basic_agent(
     max_tool_output: int | None = None,
     score_value: ValueToFloat | None = None,
     incorrect_message: str
-    | Callable[[TaskState, list[Score]], str] = DEFAULT_INCORRECT_MESSAGE,
+    | Callable[
+        [TaskState, list[Score]], str | Awaitable[str]
+    ] = DEFAULT_INCORRECT_MESSAGE,
     continue_message: str = DEFAULT_CONTINUE_MESSAGE,
     submit_name: str = DEFAULT_SUBMIT_NAME,
     submit_description: str = DEFAULT_SUBMIT_DESCRIPTION,
@@ -93,8 +96,9 @@ def basic_agent(
           Defaults to max_tool_output from active GenerateConfig.
        score_value (ValueToFloat): Function used to extract float from scores (defaults
          to standard value_to_float())
-       incorrect_message (str | Callable[[TaskState, list[Score]], str]): User message reply for an
-         incorrect submission from the model. Alternatively, a function which returns a message.
+       incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
+         User message reply for an incorrect submission from the model. Alternatively,
+         a function which returns a message (function may optionally be async)
        continue_message (str): User message to urge the model to continue when it
          doesn't make a tool call.
        submit_name (str): Name for tool used to make submissions
@@ -216,11 +220,17 @@ def basic_agent(
                             # otherwise notify the model that it was incorrect and continue
                             else:
-                                response_message = (
-                                    incorrect_message(state, answer_scores)
-                                    if callable(incorrect_message)
-                                    else incorrect_message
-                                )
+                                if is_callable_coroutine(incorrect_message):
+                                    response_message: str = await incorrect_message(
+                                        state, answer_scores
+                                    )  # type: ignore[misc,operator]
+                                elif callable(incorrect_message):
+                                    response_message = cast(
+                                        str, incorrect_message(state, answer_scores)
+                                    )
+                                else:
+                                    response_message = incorrect_message
                                 state.messages.append(
                                     ChatMessageUser(content=response_message)
                                 )

inspect_ai/solver/_multiple_choice.py CHANGED Viewed

@@ -1,13 +1,19 @@
+import logging
 import re
 from enum import Enum
 from random import Random
-from typing import Match
+from typing import Match, TypedDict
+from typing_extensions import Unpack
+from inspect_ai._util.logger import warn_once
 from inspect_ai.util import resource
 from ._solver import Generate, Solver, solver
 from ._task_state import Choices, TaskState
+logger = logging.getLogger(__name__)
 SINGLE_ANSWER_TEMPLATE = r"""
 Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
@@ -201,13 +207,17 @@ class MultipleChoiceTemplate(str, Enum):
     MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
+class DeprecatedArgs(TypedDict, total=False):
+    """Keyword arguments still accepted by `multiple_choice` via `**kwargs`.
+
+    `total=False` makes every key optional.
+    """
+    # deprecated: a truthy value triggers a one-time deprecation warning
+    shuffle: bool | Random
 @solver
 def multiple_choice(
     *,
     template: str | None = None,
     cot: bool = False,
     multiple_correct: bool = False,
-    shuffle: bool | Random = False,
+    **kwargs: Unpack[DeprecatedArgs],
 ) -> Solver:
     """Multiple choice question solver.
@@ -223,10 +233,7 @@ def multiple_choice(
     ### Shuffling
-    If the choices are shuffled, we will unshuffle them in the message history
-    after the model has been called, essentially rewriting history. It is
-    something to be aware of if writing custom scorers or solvers that interact
-    with this scorer.
+    You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
     Args:
       template (str | None): Template to use for the multiple choice question.
@@ -243,10 +250,18 @@ def multiple_choice(
         squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
         as `False` if there's exactly one correct answer from the choices
         available. NOTE: this has no effect if you provide a custom template.
-      shuffle (bool | Random): Default `False`. Whether to shuffle the choices
-        in the multiple.  Passing a `Random` instance will use that for shuffling,
-        if `True` a new `Random` instance will be created.
+      **kwargs (Any): Deprecated arguments for backward compatibility.
     """
+    shuffle: bool | Random = False
+    if "shuffle" in kwargs:
+        shuffle = kwargs["shuffle"]
+        if shuffle:
+            warn_once(
+                logger,
+                "The multiple choice shuffle parameter is deprecated. Please shuffle choices at the time your dataset is read by using the shuffle_choices method/parameter of the datasets API.",
+            )
     if template and not valid_template(template):
         raise ValueError(
             "The template must contain '{question}' and '{choices}' placeholders for string substitution."

inspect_ai/tool/__init__.py CHANGED Viewed

@@ -21,12 +21,14 @@ from ._tool_def import ToolDef
 from ._tool_info import ToolInfo
 from ._tool_params import ToolParam, ToolParams
 from ._tool_with import tool_with
+from ._tools._computer import computer
 from ._tools._execute import bash, python
 from ._tools._web_browser import web_browser
 from ._tools._web_search import web_search
 __all__ = [
     "bash",
+    "computer",
     "python",
     "web_browser",
     "web_search",

inspect_ai/tool/{beta → _tools}/_computer/_computer.py RENAMED Viewed

@@ -2,10 +2,7 @@ from typing import Awaitable, Callable
 from inspect_ai._util.content import Content, ContentImage, ContentText
 from inspect_ai.tool import Tool, ToolResult, tool
-from inspect_ai.tool._tool import (
-    TOOL_INIT_MODEL_INPUT,
-    ToolParsingError,
-)
+from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
 from inspect_ai.tool._tool_call import ToolCallModelInput
 from . import _common as common
@@ -84,7 +81,7 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
             if coordinate is not None:
                 raise ToolParsingError(f"coordinate is not accepted for {action}")
             if not isinstance(text, str):
-                raise ToolParsingError(output=f"{text} must be a string")
+                raise ToolParsingError(f"{text} must be a string")
             if action == "key":
                 return await common.press_key(text, timeout=timeout)

inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile RENAMED Viewed

@@ -60,6 +60,10 @@ RUN apt-get install -y \
 # configure noVNC
 RUN ln -s /usr/share/novnc/vnc.html /usr/share/novnc/index.html
+# configure python alias
+RUN ln -s /usr/bin/python3 /usr/bin/python
 # We copy requirements.txt by itself so that changes to the scripts will be in a later layer
 # and we only pip install if requirements.txt changes
 COPY tool/requirements.txt /opt/inspect/tool/requirements.txt

inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb ADDED Viewed

Binary file

inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json ADDED Viewed

@@ -0,0 +1,3 @@
+{
+  "security.workspace.trust.enabled": false
+}

inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml ADDED Viewed

@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<channel name="xfce4-panel" version="1.0">
+  <property name="configver" type="int" value="2"/>
+  <property name="panels" type="array">
+    <value type="int" value="1"/>
+    <property name="dark-mode" type="bool" value="true"/>
+    <property name="panel-1" type="empty">
+      <property name="position" type="string" value="p=6;x=0;y=0"/>
+      <property name="length" type="uint" value="100"/>
+      <property name="position-locked" type="bool" value="true"/>
+      <property name="icon-size" type="uint" value="16"/>
+      <property name="size" type="uint" value="26"/>
+      <property name="plugin-ids" type="array">
+        <value type="int" value="1"/>
+        <value type="int" value="2"/>
+        <value type="int" value="3"/>
+        <value type="int" value="4"/>
+        <value type="int" value="5"/>
+        <value type="int" value="6"/>
+        <value type="int" value="8"/>
+        <value type="int" value="10"/>
+        <value type="int" value="11"/>
+        <value type="int" value="12"/>
+        <value type="int" value="13"/>
+        <value type="int" value="14"/>
+      </property>
+    </property>
+  </property>
+  <property name="plugins" type="empty">
+    <property name="plugin-1" type="string" value="applicationsmenu"/>
+    <property name="plugin-2" type="string" value="tasklist">
+      <property name="grouping" type="uint" value="1"/>
+    </property>
+    <property name="plugin-3" type="string" value="separator">
+      <property name="expand" type="bool" value="true"/>
+      <property name="style" type="uint" value="0"/>
+    </property>
+    <property name="plugin-4" type="string" value="pager"/>
+    <property name="plugin-5" type="string" value="separator">
+      <property name="style" type="uint" value="0"/>
+    </property>
+    <property name="plugin-6" type="string" value="systray">
+      <property name="square-icons" type="bool" value="true"/>
+    </property>
+    <property name="plugin-8" type="string" value="pulseaudio">
+      <property name="enable-keyboard-shortcuts" type="bool" value="true"/>
+      <property name="show-notifications" type="bool" value="true"/>
+    </property>
+    <property name="plugin-9" type="string" value="power-manager-plugin"/>
+    <property name="plugin-10" type="string" value="notification-plugin"/>
+    <property name="plugin-11" type="string" value="separator">
+      <property name="style" type="uint" value="0"/>
+    </property>
+    <property name="plugin-12" type="string" value="clock"/>
+    <property name="plugin-13" type="string" value="separator">
+      <property name="style" type="uint" value="0"/>
+    </property>
+    <property name="plugin-14" type="string" value="actions"/>
+  </property>
+</channel>

inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop ADDED Viewed

@@ -0,0 +1,10 @@
+[Desktop Entry]
+Version=1.0
+Type=Application
+Name=Terminal
+Comment=Open Terminal
+Exec=/usr/bin/exo-open --launch TerminalEmulator
+Icon=utilities-terminal
+Path=
+Terminal=false
+StartupNotify=false

inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py ADDED Viewed

File without changes

inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py RENAMED Viewed

@@ -138,7 +138,7 @@ class X11Client:
             if coordinate is not None:
                 raise ToolError(f"coordinate is not accepted for {action}")
             if not isinstance(text, str):
-                raise ToolError(output=f"{text} must be a string")
+                raise ToolError(f"{text} must be a string")
             if action == "key":
                 return await self.shell(

inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt ADDED Viewed

File without changes

inspect_ai/tool/_tools/_execute.py CHANGED Viewed

@@ -74,8 +74,14 @@ def python(timeout: int | None = None, user: str | None = None) -> Tool:
         """
         Use the python function to execute Python code.
-        The python function will only return you the stdout of the script,
-        so make sure to use print to see the output.
+        The Python tool executes single-run Python scripts. Important notes:
+        1. Each execution is independent - no state is preserved between runs
+        2. You must explicitly use print() statements to see any output
+        3. Simply writing expressions (like in notebooks) will not display results
+        4. The script cannot accept interactive input during execution
+        5. Return statements alone won't produce visible output
+        6. All variables and imports are cleared between executions
+        7. Standard output (via print()) is the only way to see results
         Args:
           code (str): The python code to execute.

inspect_ai/tool/beta.py ADDED Viewed

@@ -0,0 +1,3 @@
+from inspect_ai._util.deprecation import relocated_module_attribute
+# NOTE(review): presumably keeps `inspect_ai.tool.beta.computer` working by
+# pointing it at `inspect_ai.tool.computer`; "0.3.62" and "0.4" look like the
+# deprecated-since and removal versions. Confirm against
+# relocated_module_attribute.
+relocated_module_attribute("computer", "inspect_ai.tool.computer", "0.3.62", "0.4")

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl

inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl