inspect-ai 0.3.49__py3-none-any.whl → 0.3.50__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/info.py +2 -2
- inspect_ai/_cli/log.py +2 -2
- inspect_ai/_cli/score.py +2 -2
- inspect_ai/_display/core/display.py +19 -0
- inspect_ai/_display/core/panel.py +37 -7
- inspect_ai/_display/core/progress.py +29 -2
- inspect_ai/_display/core/results.py +79 -40
- inspect_ai/_display/core/textual.py +21 -0
- inspect_ai/_display/rich/display.py +28 -8
- inspect_ai/_display/textual/app.py +107 -1
- inspect_ai/_display/textual/display.py +1 -1
- inspect_ai/_display/textual/widgets/samples.py +132 -91
- inspect_ai/_display/textual/widgets/task_detail.py +232 -0
- inspect_ai/_display/textual/widgets/tasks.py +74 -6
- inspect_ai/_display/textual/widgets/toggle.py +32 -0
- inspect_ai/_eval/context.py +2 -0
- inspect_ai/_eval/eval.py +4 -3
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/run.py +35 -2
- inspect_ai/_eval/task/log.py +13 -11
- inspect_ai/_eval/task/results.py +12 -3
- inspect_ai/_eval/task/run.py +139 -36
- inspect_ai/_eval/task/sandbox.py +2 -1
- inspect_ai/_util/_async.py +30 -1
- inspect_ai/_util/file.py +31 -4
- inspect_ai/_util/html.py +3 -0
- inspect_ai/_util/logger.py +6 -5
- inspect_ai/_util/platform.py +5 -6
- inspect_ai/_util/registry.py +1 -1
- inspect_ai/_view/server.py +9 -9
- inspect_ai/_view/www/App.css +2 -2
- inspect_ai/_view/www/dist/assets/index.css +2 -2
- inspect_ai/_view/www/dist/assets/index.js +352 -294
- inspect_ai/_view/www/log-schema.json +13 -0
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +16 -13
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
- inspect_ai/_view/www/src/types/log.d.ts +2 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
- inspect_ai/_view/www/yarn.lock +9 -4
- inspect_ai/approval/__init__.py +1 -1
- inspect_ai/approval/_human/approver.py +35 -0
- inspect_ai/approval/_human/console.py +62 -0
- inspect_ai/approval/_human/manager.py +108 -0
- inspect_ai/approval/_human/panel.py +233 -0
- inspect_ai/approval/_human/util.py +51 -0
- inspect_ai/dataset/_sources/hf.py +2 -2
- inspect_ai/dataset/_sources/util.py +1 -1
- inspect_ai/log/_file.py +106 -36
- inspect_ai/log/_recorders/eval.py +226 -158
- inspect_ai/log/_recorders/file.py +9 -6
- inspect_ai/log/_recorders/json.py +35 -12
- inspect_ai/log/_recorders/recorder.py +15 -15
- inspect_ai/log/_samples.py +52 -0
- inspect_ai/model/_model.py +14 -0
- inspect_ai/model/_model_output.py +4 -0
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/hf.py +106 -4
- inspect_ai/model/_providers/util/__init__.py +2 -0
- inspect_ai/model/_providers/util/hf_handler.py +200 -0
- inspect_ai/scorer/_common.py +1 -1
- inspect_ai/solver/_plan.py +0 -8
- inspect_ai/solver/_task_state.py +18 -1
- inspect_ai/solver/_use_tools.py +9 -1
- inspect_ai/tool/_tool_def.py +2 -2
- inspect_ai/tool/_tool_info.py +14 -2
- inspect_ai/tool/_tool_params.py +2 -1
- inspect_ai/tool/_tools/_execute.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
- inspect_ai/util/__init__.py +5 -6
- inspect_ai/util/_panel.py +91 -0
- inspect_ai/util/_sandbox/__init__.py +2 -6
- inspect_ai/util/_sandbox/context.py +4 -3
- inspect_ai/util/_sandbox/docker/compose.py +12 -2
- inspect_ai/util/_sandbox/docker/docker.py +19 -9
- inspect_ai/util/_sandbox/docker/util.py +10 -2
- inspect_ai/util/_sandbox/environment.py +47 -41
- inspect_ai/util/_sandbox/local.py +15 -10
- inspect_ai/util/_subprocess.py +43 -3
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/RECORD +90 -82
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/approval/_human.py +0 -123
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.50.dist-info}/top_level.txt +0 -0
inspect_ai/log/_recorders/recorder.py
CHANGED
@@ -14,20 +14,27 @@ from inspect_ai.log._log import (
 
 
 class Recorder(abc.ABC):
+    @classmethod
+    @abc.abstractmethod
+    def handles_location(cls, location: str) -> bool: ...
+
+    @abc.abstractmethod
+    def default_log_buffer(self) -> int: ...
+
     @abc.abstractmethod
-    def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...
+    async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...
 
     @abc.abstractmethod
-    def log_start(self, eval: EvalSpec, plan: EvalPlan) -> None: ...
+    async def log_start(self, eval: EvalSpec, plan: EvalPlan) -> None: ...
 
     @abc.abstractmethod
-    def log_sample(self, eval: EvalSpec, sample: EvalSample) -> None: ...
+    async def log_sample(self, eval: EvalSpec, sample: EvalSample) -> None: ...
 
     @abc.abstractmethod
-    def flush(self, eval: EvalSpec) -> None: ...
+    async def flush(self, eval: EvalSpec) -> None: ...
 
     @abc.abstractmethod
-    def log_finish(
+    async def log_finish(
         self,
         eval: EvalSpec,
         status: Literal["success", "cancelled", "error"],
@@ -37,23 +44,16 @@ class Recorder(abc.ABC):
         error: EvalError | None = None,
     ) -> EvalLog: ...
 
-    @abc.abstractmethod
-    def default_log_buffer(self) -> int: ...
-
-    @classmethod
-    @abc.abstractmethod
-    def handles_location(cls, location: str) -> bool: ...
-
     @classmethod
     @abc.abstractmethod
-    def read_log(cls, location: str, header_only: bool = False) -> EvalLog: ...
+    async def read_log(cls, location: str, header_only: bool = False) -> EvalLog: ...
 
     @classmethod
     @abc.abstractmethod
-    def read_log_sample(
+    async def read_log_sample(
         cls, location: str, id: str | int, epoch: int = 1
     ) -> EvalSample: ...
 
     @classmethod
     @abc.abstractmethod
-    def write_log(cls, location: str, log: EvalLog) -> None: ...
+    async def write_log(cls, location: str, log: EvalLog) -> None: ...
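The practical upshot of this change is that every logging method on Recorder is now a coroutine: implementations declare them with async def and callers await them, while handles_location and default_log_buffer remain synchronous. A minimal, hypothetical sketch of the new shape (InMemoryRecorder is illustrative only, not part of the package):

import asyncio


class InMemoryRecorder:
    """Toy recorder mirroring the async method shapes introduced above."""

    def __init__(self) -> None:
        self.events: list[tuple[str, object]] = []

    @classmethod
    def handles_location(cls, location: str) -> bool:
        # still synchronous in 0.3.50
        return location.startswith("memory://")

    def default_log_buffer(self) -> int:
        return 10

    async def log_init(self, eval: object, location: str | None = None) -> str:
        self.events.append(("init", eval))
        return location or "memory://log"

    async def log_start(self, eval: object, plan: object) -> None:
        self.events.append(("start", plan))

    async def flush(self, eval: object) -> None:
        self.events.append(("flush", eval))


async def main() -> None:
    recorder = InMemoryRecorder()
    location = await recorder.log_init(eval={"task": "demo"})
    await recorder.log_start(eval={"task": "demo"}, plan=None)
    await recorder.flush(eval={"task": "demo"})
    print(location, recorder.events)


asyncio.run(main())

The bundled eval and json recorders were reworked along the same lines in this release (the +226/-158 and +35/-12 entries in the file list above).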
inspect_ai/log/_samples.py
CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import contextlib
+from contextvars import ContextVar
 from datetime import datetime
 from typing import AsyncGenerator, Literal
 
@@ -15,10 +16,14 @@ from ._transcript import Transcript
 class ActiveSample:
     def __init__(
         self,
+        *,
         task: str,
         model: str,
         sample: Sample,
         epoch: int,
+        message_limit: int | None,
+        token_limit: int | None,
+        time_limit: int | None,
         fails_on_error: bool,
         transcript: Transcript,
         sandboxes: dict[str, SandboxConnection],
@@ -30,7 +35,12 @@ class ActiveSample:
         self.model = model
         self.sample = sample
         self.epoch = epoch
+        self.message_limit = message_limit
+        self.token_limit = token_limit
+        self.time_limit = time_limit
         self.fails_on_error = fails_on_error
+        self.total_messages = 0
+        self.total_tokens = 0
         self.transcript = transcript
         self.sandboxes = sandboxes
         self._sample_task = asyncio.current_task()
@@ -59,10 +69,14 @@ def init_active_samples() -> None:
 
 
 @contextlib.asynccontextmanager
 async def active_sample(
+    *,
     task: str,
     model: str,
     sample: Sample,
     epoch: int,
+    message_limit: int | None,
+    token_limit: int | None,
+    time_limit: int | None,
     fails_on_error: bool,
     transcript: Transcript,
 ) -> AsyncGenerator[ActiveSample, None]:
@@ -72,17 +86,55 @@ async def active_sample(
         model=model,
         sample=sample,
         epoch=epoch,
+        message_limit=message_limit,
+        token_limit=token_limit,
+        time_limit=time_limit,
         sandboxes=await sandbox_connections(),
         fails_on_error=fails_on_error,
         transcript=transcript,
     )
 
     _active_samples.append(active)
+    _sample_active.set(active)
     try:
         yield active
     finally:
         active.completed = datetime.now().timestamp()
         _active_samples.remove(active)
+        _sample_active.set(None)
+
+
+def sample_active() -> ActiveSample | None:
+    return _sample_active.get(None)
+
+
+def set_active_sample_token_limit(token_limit: int | None) -> None:
+    active = sample_active()
+    if active:
+        active.token_limit = token_limit
+
+
+def set_active_sample_total_tokens(total_tokens: int) -> None:
+    active = sample_active()
+    if active:
+        active.total_tokens = total_tokens
+
+
+def set_active_sample_message_limit(message_limit: int | None) -> None:
+    active = sample_active()
+    if active:
+        active.message_limit = message_limit
+
+
+def set_active_sample_total_messages(total_messages: int) -> None:
+    active = sample_active()
+    if active:
+        active.total_messages = total_messages
+
+
+_sample_active: ContextVar[ActiveSample | None] = ContextVar(
+    "_sample_active", default=None
+)
 
 
 def active_samples() -> list[ActiveSample]:
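For orientation, the additions above follow the standard ContextVar pattern: a running sample registers itself as the active sample for the current task, and the set_active_sample_*() helpers quietly no-op when nothing is active. A self-contained sketch of the same idea with hypothetical names (DemoSample is not package code):

from contextvars import ContextVar
from dataclasses import dataclass


@dataclass
class DemoSample:
    token_limit: int | None = None
    total_tokens: int = 0


_current: ContextVar[DemoSample | None] = ContextVar("_current", default=None)


def set_total_tokens(total: int) -> None:
    # mirrors set_active_sample_total_tokens(): a silent no-op when no
    # sample is active in this context
    sample = _current.get(None)
    if sample:
        sample.total_tokens = total


sample = DemoSample(token_limit=1000)
_current.set(sample)
set_total_tokens(250)
print(sample.total_tokens)  # 250

This is what lets record_model_usage() in _model.py (below) push token totals into the active sample without threading them through the call stack.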
inspect_ai/model/_model.py
CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import time
 from contextvars import ContextVar
 from copy import deepcopy
 from typing import Any, Callable, Literal, Type, cast
@@ -355,12 +356,14 @@ class Model:
 
         generate_id = uuid()
         logger.debug(f"model generate {generate_id} ({str(self)})")
+        time_start = time.perf_counter()
         result = await self.api.generate(
             input=input,
             tools=tools,
             tool_choice=tool_choice,
             config=config,
         )
+        time_elapsed = time.perf_counter() - time_start
         logger.debug(f"model generate {generate_id} (completed)")
         if isinstance(result, tuple):
             output, call = result
@@ -368,12 +371,18 @@
             output = result
             call = None
 
+        # update output with time elapsed
+        output.time = time_elapsed
+
         # complete the transcript event
         complete(output, call)
 
         # record usage
         if output.usage:
+            # record usage
             record_model_usage(f"{self}", output.usage)
+
+            # send telemetry if its hooked up
             await send_telemetry(
                 "model_usage",
                 json.dumps(dict(model=str(self), usage=output.usage.model_dump())),
@@ -762,6 +771,11 @@ def record_model_usage(model: str, usage: ModelUsage) -> None:
     set_model_usage(model, usage, sample_model_usage_context_var.get(None))
     set_model_usage(model, usage, model_usage_context_var.get(None))
 
+    # update active sample
+    from inspect_ai.log._samples import set_active_sample_total_tokens
+
+    set_active_sample_total_tokens(sample_total_tokens())
+
 
 def set_model_usage(
     model: str, usage: ModelUsage, model_usage: dict[str, ModelUsage] | None
inspect_ai/model/_model_output.py
CHANGED
@@ -100,7 +100,11 @@ class ModelOutput(BaseModel):
     usage: ModelUsage | None = Field(default=None)
     """Model token usage"""
 
+    time: float | None = Field(default=None)
+    """Time elapsed (in seconds) for call to generate."""
+
     metadata: dict[str, Any] | None = Field(default=None)
+    """Additional metadata associated with model output."""
 
     error: str | None = Field(default=None)
     """Error message in the case of content moderation refusals."""
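Taken together with the _model.py change above, every ModelOutput now records the wall-clock duration of its generate call in the new time field (None for outputs written by older versions). A small, hypothetical helper showing one way the field might be consumed:

def tokens_per_second(completion_tokens: int, time: float | None) -> float | None:
    # `time` is None for outputs produced before 0.3.50 or read from old logs
    if time is None or time <= 0:
        return None
    return completion_tokens / time


print(tokens_per_second(128, 3.2))   # 40.0
print(tokens_per_second(128, None))  # None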
inspect_ai/model/_providers/azureai.py
CHANGED
@@ -362,7 +362,7 @@ def chat_completion_assistant_message(
         return handler.parse_assistant_response(response.content, tools)
     else:
         return ChatMessageAssistant(
-            content=response.content,
+            content=response.content or "",
             tool_calls=[
                 chat_completion_tool_call(call, tools) for call in response.tool_calls
             ]
inspect_ai/model/_providers/hf.py
CHANGED
@@ -1,5 +1,7 @@
 import asyncio
+import copy
 import functools
+import json
 import os
 from dataclasses import dataclass
 from queue import Empty, Queue
@@ -18,6 +20,7 @@ from transformers import ( # type: ignore
 from typing_extensions import override
 
 from inspect_ai._util.constants import DEFAULT_MAX_TOKENS
+from inspect_ai._util.content import ContentText
 from inspect_ai.tool import ToolChoice, ToolInfo
 
 from .._chat_message import ChatMessage, ChatMessageAssistant
@@ -31,7 +34,7 @@ from .._model_output import (
     ModelUsage,
     TopLogprob,
 )
-from .util import
+from .util import ChatAPIHandler, HFHandler
 
 HF_TOKEN = "HF_TOKEN"
 
@@ -71,6 +74,9 @@ class HuggingFaceAPI(ModelAPI):
         tokenizer_path = collect_model_arg("tokenizer_path")
         self.batch_size = collect_model_arg("batch_size")
         self.chat_template = collect_model_arg("chat_template")
+        self.tokenizer_call_args = collect_model_arg("tokenizer_call_args")
+        if self.tokenizer_call_args is None:
+            self.tokenizer_call_args = {}
 
         # device
         if device:
@@ -113,11 +119,22 @@ class HuggingFaceAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> ModelOutput:
+        # create handler
+        handler: ChatAPIHandler | None = (
+            HFHandler(self.model_name) if len(tools) > 0 else None
+        )
+
         # create chat
         chat = self.hf_chat(input, tools)
 
+        assert isinstance(self.tokenizer_call_args, dict)
         # prepare tokenizer
-        tokenizer = functools.partial(
+        tokenizer = functools.partial(
+            self.tokenizer,
+            return_tensors="pt",
+            padding=True,
+            **self.tokenizer_call_args,
+        )
 
         # prepare generator
         kwargs: dict[str, Any] = dict(do_sample=True)
@@ -172,6 +189,15 @@ class HuggingFaceAPI(ModelAPI):
             ),
         )
 
+        choice = ChatCompletionChoice(
+            message=chat_completion_assistant_message(
+                response, tools, handler, self.model_name
+            ),
+            logprobs=(
+                Logprobs(content=final_logprobs) if final_logprobs is not None else None
+            ),
+        )
+
         # return output
         return ModelOutput(
             model=self.model_name,
@@ -199,18 +225,94 @@
 
     def hf_chat(self, messages: list[ChatMessage], tools: list[ToolInfo]) -> str:
         # convert to hf format
-
+        tools_list = []
+        hf_messages = copy.deepcopy(messages)
+        if len(tools) > 0:
+            tools_list = [
+                json.loads(tool.model_dump_json(exclude_none=True, indent=2))
+                for tool in tools
+            ]
+            if "mistral" in self.model_name.lower():
+                hf_messages = shorten_tool_id(hf_messages)
+                tools_list = tools_to_mistral_format(tools_list)
+            elif "qwen" in self.model_name.lower():
+                hf_messages = inspect_tools_to_string(hf_messages)
+
         # apply chat template
         chat = self.tokenizer.apply_chat_template(
             hf_messages,
             add_generation_prompt=True,
             tokenize=False,
-
+            tools=tools_list if len(tools_list) > 0 else None,
         )
         # return
         return cast(str, chat)
 
 
+def shorten_tool_id(messages: list[ChatMessage]) -> list[ChatMessage]:
+    """Shorten the tool_call_id in the messages to the last 9 characters for Mistral."""
+    for i, message in enumerate(messages):
+        if message.role == "tool":
+            # Trim tool_call_id in tool messages
+            if message.tool_call_id is not None:
+                message.tool_call_id = message.tool_call_id[-9:]
+        elif message.role == "assistant" and hasattr(message, "tool_calls"):
+            # Trim tool_call IDs inside tool_calls for assistant messages
+            for tool_call in message.tool_calls or []:
+                tool_call.id = tool_call.id[-9:]
+    return messages
+
+
+def tools_to_mistral_format(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert tools to the format required for Mistral."""
+    mistral_tools = []
+    for tool in tools:
+        mistral_tools.append(
+            {
+                "function": {
+                    "name": tool["name"],
+                    "description": tool["description"],
+                    "parameters": {
+                        "type": tool["parameters"]["type"],
+                        "properties": tool["parameters"]["properties"],
+                        "required": tool["parameters"]["required"],
+                    },
+                }
+            }
+        )
+    return mistral_tools
+
+
+def inspect_tools_to_string(messages: list[ChatMessage]) -> list[ChatMessage]:
+    """Convert tools to a string for Qwen."""
+    for message in messages:
+        if message.role == "assistant":
+            # check if the message contains a tool call
+            tool_content = ""
+            if message.tool_calls:
+                for tool_call in message.tool_calls:
+                    tool_content += f'\n```json\n{{"name": "{tool_call.function}", "arguments": {json.dumps(tool_call.arguments)}}}\n```'
+            # remove the tool call from the message
+            message.tool_calls = None
+            if isinstance(message.content, str):
+                message.content += tool_content
+            else:
+                message.content.append(ContentText(text=tool_content))
+    return messages
+
+
+def chat_completion_assistant_message(
+    response: Any,
+    tools: list[ToolInfo],
+    handler: ChatAPIHandler | None,
+    model_name: str,
+) -> ChatMessageAssistant:
+    if handler:
+        return handler.parse_assistant_response(response.output, tools)
+    else:
+        return ChatMessageAssistant(content=response.output, source="generate")
+
+
 def set_random_seeds(seed: int | None = None) -> None:
     if seed is None:
         seed = np.random.default_rng().integers(2**32 - 1)
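The new tokenizer_call_args model argument (read via collect_model_arg, the same channel as batch_size and chat_template) is merged into the tokenizer invocation alongside the provider defaults return_tensors="pt" and padding=True. A self-contained sketch of that merge, using a stand-in tokenizer rather than a real transformers one:

import functools


def fake_tokenizer(text: str, **kwargs: object) -> dict[str, object]:
    # stand-in for self.tokenizer; just echoes what it was called with
    return {"text": text, **kwargs}


tokenizer_call_args = {"truncation": True, "max_length": 4096}
tokenize = functools.partial(
    fake_tokenizer,
    return_tensors="pt",
    padding=True,
    **tokenizer_call_args,
)
print(tokenize("hello world"))
# {'text': 'hello world', 'return_tensors': 'pt', 'padding': True,
#  'truncation': True, 'max_length': 4096}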
inspect_ai/model/_providers/util/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from .chatapi import (
     chat_api_request,
     is_chat_api_rate_limit,
 )
+from .hf_handler import HFHandler
 from .llama31 import Llama31Handler
 from .util import (
     as_stop_reason,
@@ -26,4 +27,5 @@ __all__ = [
     "ChatAPIHandler",
     "ChatAPIMessage",
     "Llama31Handler",
+    "HFHandler",
 ]
inspect_ai/model/_providers/util/hf_handler.py
ADDED
@@ -0,0 +1,200 @@
+import json
+import re
+from logging import getLogger
+
+from shortuuid import uuid
+from typing_extensions import override
+
+from inspect_ai.tool._tool_call import ToolCall
+from inspect_ai.tool._tool_info import ToolInfo
+
+from ..._chat_message import ChatMessageAssistant
+from .chatapi import ChatAPIHandler
+from .util import parse_tool_call, tool_parse_error_message
+
+logger = getLogger(__name__)
+
+
+# Hugging Face handler currently supports LLama, Mistral and Qwen models, but will
+# work with any model that uses the same tool calling conventions
+
+
+class HFHandler(ChatAPIHandler):
+    def __init__(self, model_name: str) -> None:
+        self.model_name = model_name
+
+    @override
+    def parse_assistant_response(
+        self, response: str, tools: list[ToolInfo]
+    ) -> ChatMessageAssistant:
+        """Parse content and tool calls from a model response.
+
+        This method has an interdependency with `input_with_tools()` (as that is the
+        prompt that asks the model to use the <tool_call>...</tool_call> syntax)
+        """
+        # extract tool calls
+        content, tool_calls_content = model_specific_tool_parse(
+            response, self.model_name
+        )
+        # if there are tool calls proceed with parsing
+        if len(tool_calls_content) > 0:
+            # parse each tool call (if there are parsing error that occur
+            # this will be reported in the `parse_error` field of the ToolCall
+            # and ultimately reported back to the model)
+            tool_calls = [
+                parse_tool_call_content(content, tools)
+                for content in tool_calls_content
+            ]
+
+            # return the message
+            return ChatMessageAssistant(
+                content=content,
+                tool_calls=tool_calls,
+                source="generate",
+            )
+
+        # otherwise this is just an ordinary assistant message
+        else:
+            return ChatMessageAssistant(
+                content=filter_assistant_header(response), source="generate"
+            )
+
+
+def parse_tool_call_content(content: str, tools: list[ToolInfo]) -> ToolCall:
+    """Attempt to parse content from inside <tool_call> tags.
+
+    Content inside a <tool_call> should be a JSON dictionary with `name` and
+    `arguments` (which in turn should be a `dict[str,Any]` but in some cases
+    we've seen models pass `str`). This function attempts to extract this from
+    the passed tcontentext. A `ToolCall` is returned for all cases (if the
+    parsing fails then it will have a `parse_error`, which will be subsequently
+    reported to the model.
+    """
+    try:
+        # parse raw JSON
+        tool_call_data = json.loads(content)
+        if "parameters" in tool_call_data:
+            tool_call_data["arguments"] = tool_call_data.pop("parameters")
+
+        # if its not a dict then report error
+        if not isinstance(tool_call_data, dict):
+            raise ValueError("The provided arguments are not a JSON dictionary.")
+
+        # see if we can get the fields (if not report error)
+        name = tool_call_data.get("name", None)
+        arguments = tool_call_data.get("arguments", None)
+        if not name or not arguments:
+            raise ValueError(
+                "Required 'name' and 'arguments' not provided in JSON dictionary."
+            )
+
+        # now perform the parse (we need to call thi function because it includes
+        # the special handling to for mapping arguments that are a plain `str`
+        # to the first parameter of the function)
+        unique_id = f"{name}_{uuid()}"
+        return parse_tool_call(unique_id, name, json.dumps(arguments), tools)
+
+    except Exception as ex:
+        # buld error message
+        parse_error = tool_parse_error_message(content, ex)
+
+        # log it to 'info'
+        logger.info(parse_error)
+
+        # notify model
+        return ToolCall(
+            id="unknown",
+            function="unknown",
+            arguments={},
+            type="function",
+            parse_error=parse_error,
+        )
+
+
+def model_specific_tool_parse(response: str, model_name: str) -> tuple[str, list[str]]:
+    model_name = model_name.lower()
+
+    if "llama" in model_name:
+        if "name" in response and ("parameters" in response or "arguments" in response):
+            function_calls, content = json_extract_raw(response)
+        else:
+            content = response
+            function_calls = []
+    elif "mistral" in model_name:
+        if "name" in response and "arguments" in response:
+            content = ""
+            function_calls = [json.dumps(tool) for tool in json.loads(response)]
+        else:
+            content = response
+            function_calls = []
+    elif "qwen" in model_name and "coder" in model_name:
+        if "name" in response and "arguments" in response:
+            function_calls, content = json_extract(response)
+        else:
+            content = response
+            function_calls = []
+    elif "qwen" in model_name and "instruct" in model_name:
+        if "name" in response and "arguments" in response:
+            function_calls, content = xml_extract(response, "tool_call")
+        else:
+            content = response
+            function_calls = []
+    else:
+        try:
+            function_calls, content = parse_unknown_tool_calls(response)
+        except Exception:
+            raise ValueError(
+                f"Unsupported model: {model_name}. No tool parsing implemented. Check if any of the current parsings work with your tool calling conventions and add the model name to the correct elif block."
+            )
+    return content, function_calls
+
+
+def json_extract(raw_string: str) -> tuple[list[str], str]:
+    """Extract tools in form ```json{...}``` and return the remaining content."""
+    function_calls = re.findall(r"```json\s*(\{.*?\})\s*```", raw_string, re.DOTALL)
+
+    remaining_content = re.sub(
+        r"```json\s*\{.*?\}\s*```", "", raw_string, flags=re.DOTALL
+    ).strip()
+
+    return function_calls, remaining_content
+
+
+def json_extract_raw(raw_string: str) -> tuple[list[str], str]:
+    """Extract tools in form `{...}` and return the remaining content."""
+    # Regex to extract sequences starting with '{' and ending with '}}'
+    json_like_regex = r"\{.*?\}\}"
+    function_calls = re.findall(json_like_regex, raw_string)
+    remaining_content = re.sub(json_like_regex, "", raw_string).strip()
+
+    return function_calls, remaining_content
+
+
+def xml_extract(raw_string: str, tag: str) -> tuple[list[str], str]:
+    """Extract tools in form <tag>{...}</tag> and return the remaining content."""
+    tool_call_regex = rf"<{tag}>((?:.|\n)*?)</{tag}>"
+    function_calls = re.findall(tool_call_regex, raw_string)
+    tool_call_content_regex = rf"<{tag}>(?:.|\n)*?</{tag}>"
+    other_content = re.split(tool_call_content_regex, raw_string, flags=re.DOTALL)
+    other_content = [
+        str(content).strip() for content in other_content if str(content).strip()
+    ]
+    content = "\n\n".join(other_content)
+    return function_calls, content
+
+
+def parse_unknown_tool_calls(response: str) -> tuple[list[str], str]:
+    if "```json" in response:
+        return json_extract(response)
+    elif "<tool_call>" in response:
+        return xml_extract(response, "tool_call")
+    elif "<function>" in response:
+        return xml_extract(response, "function")
+    elif "{" in response and "}}" in response:
+        return json_extract_raw(response)
+    else:
+        return [], response
+
+
+def filter_assistant_header(message: str) -> str:
+    return re.sub(r"<\|start_header_id\|>assistant<\|end_header_id\|>", "", message)
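Since hf_handler.py is a brand-new module, here is a quick illustration of its extraction helpers. It imports from a private path, so treat it as exploratory only; the module layout may change in later releases:

from inspect_ai.model._providers.util.hf_handler import json_extract, xml_extract

response = (
    "I'll check the weather.\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Berlin"}}</tool_call>'
)
calls, content = xml_extract(response, "tool_call")
print(calls)    # ['{"name": "get_weather", "arguments": {"city": "Berlin"}}']
print(content)  # I'll check the weather.

fenced = 'Sure:\n```json\n{"name": "ls", "arguments": {"path": "/"}}\n```'
calls, content = json_extract(fenced)
print(calls)    # ['{"name": "ls", "arguments": {"path": "/"}}']
print(content)  # Sure: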
inspect_ai/scorer/_common.py
CHANGED
inspect_ai/solver/_plan.py
CHANGED
@@ -57,7 +57,6 @@ class Plan(Solver):
 
         self.finish = finish
         self.cleanup = cleanup
-        self.progress: Callable[[], None] = lambda: None
         self._name = name
 
         if not internal:
@@ -106,14 +105,8 @@
                 state = await solver(state, generate)
                 st.complete(state)
 
-            # tick progress
-            self.progress()
-
             # check for completed
             if state.completed:
-                # tick rest of progress
-                for _ in range(index + 1, len(self.steps)):
-                    self.progress()
                 # exit loop
                 break
 
@@ -122,7 +115,6 @@
         with solver_transcript(self.finish, state) as st:
             state = await self.finish(state, generate)
             st.complete(state)
-            self.progress()
 
         # mark completed
         state.completed = True