inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (180)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_display/textual/app.py +14 -3
  3. inspect_ai/_display/textual/display.py +4 -0
  4. inspect_ai/_display/textual/widgets/samples.py +9 -3
  5. inspect_ai/_display/textual/widgets/task_detail.py +3 -4
  6. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  7. inspect_ai/_display/textual/widgets/vscode.py +48 -0
  8. inspect_ai/_eval/eval.py +36 -24
  9. inspect_ai/_eval/evalset.py +17 -18
  10. inspect_ai/_eval/loader.py +34 -11
  11. inspect_ai/_eval/run.py +8 -13
  12. inspect_ai/_eval/score.py +13 -3
  13. inspect_ai/_eval/task/generate.py +8 -9
  14. inspect_ai/_eval/task/log.py +2 -0
  15. inspect_ai/_eval/task/task.py +23 -9
  16. inspect_ai/_util/file.py +13 -0
  17. inspect_ai/_util/json.py +2 -1
  18. inspect_ai/_util/registry.py +1 -0
  19. inspect_ai/_util/vscode.py +37 -0
  20. inspect_ai/_view/www/App.css +6 -0
  21. inspect_ai/_view/www/dist/assets/index.css +304 -128
  22. inspect_ai/_view/www/dist/assets/index.js +47495 -27519
  23. inspect_ai/_view/www/log-schema.json +124 -31
  24. inspect_ai/_view/www/package.json +3 -0
  25. inspect_ai/_view/www/src/App.tsx +12 -0
  26. inspect_ai/_view/www/src/appearance/icons.ts +1 -0
  27. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  28. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  29. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  30. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
  31. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
  32. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  33. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  34. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  35. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  36. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  37. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
  38. inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
  39. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
  40. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
  41. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  42. inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
  43. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
  44. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
  45. inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
  46. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
  47. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
  48. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  49. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  50. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  51. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
  52. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
  53. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
  54. inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
  55. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
  56. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
  57. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  58. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  59. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  60. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  61. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
  62. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
  64. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
  65. inspect_ai/_view/www/src/state/hooks.ts +5 -3
  66. inspect_ai/_view/www/src/state/logPolling.ts +5 -1
  67. inspect_ai/_view/www/src/state/logSlice.ts +10 -0
  68. inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
  69. inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
  70. inspect_ai/_view/www/src/types/log.d.ts +34 -26
  71. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  72. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  73. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
  74. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
  75. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
  76. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  77. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  78. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
  79. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  80. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
  81. inspect_ai/_view/www/yarn.lock +94 -1
  82. inspect_ai/agent/__init__.py +36 -0
  83. inspect_ai/agent/_agent.py +268 -0
  84. inspect_ai/agent/_as_solver.py +72 -0
  85. inspect_ai/agent/_as_tool.py +122 -0
  86. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  87. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  88. inspect_ai/agent/_filter.py +46 -0
  89. inspect_ai/agent/_handoff.py +93 -0
  90. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  91. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  92. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  93. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  94. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  95. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  96. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  97. inspect_ai/agent/_react.py +241 -0
  98. inspect_ai/agent/_run.py +36 -0
  99. inspect_ai/agent/_types.py +81 -0
  100. inspect_ai/log/_log.py +11 -2
  101. inspect_ai/log/_transcript.py +13 -9
  102. inspect_ai/model/__init__.py +7 -1
  103. inspect_ai/model/_call_tools.py +256 -52
  104. inspect_ai/model/_chat_message.py +7 -4
  105. inspect_ai/model/_conversation.py +13 -62
  106. inspect_ai/model/_display.py +85 -0
  107. inspect_ai/model/_model.py +113 -14
  108. inspect_ai/model/_model_output.py +14 -9
  109. inspect_ai/model/_openai.py +16 -4
  110. inspect_ai/model/_openai_computer_use.py +162 -0
  111. inspect_ai/model/_openai_responses.py +319 -165
  112. inspect_ai/model/_providers/anthropic.py +20 -21
  113. inspect_ai/model/_providers/azureai.py +24 -13
  114. inspect_ai/model/_providers/bedrock.py +1 -7
  115. inspect_ai/model/_providers/cloudflare.py +3 -3
  116. inspect_ai/model/_providers/goodfire.py +2 -6
  117. inspect_ai/model/_providers/google.py +11 -10
  118. inspect_ai/model/_providers/groq.py +6 -3
  119. inspect_ai/model/_providers/hf.py +7 -3
  120. inspect_ai/model/_providers/mistral.py +7 -10
  121. inspect_ai/model/_providers/openai.py +47 -17
  122. inspect_ai/model/_providers/openai_o1.py +11 -4
  123. inspect_ai/model/_providers/openai_responses.py +12 -14
  124. inspect_ai/model/_providers/providers.py +2 -2
  125. inspect_ai/model/_providers/together.py +12 -2
  126. inspect_ai/model/_providers/util/chatapi.py +7 -2
  127. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  128. inspect_ai/model/_providers/util/llama31.py +4 -2
  129. inspect_ai/model/_providers/vertex.py +11 -9
  130. inspect_ai/model/_providers/vllm.py +4 -4
  131. inspect_ai/scorer/__init__.py +2 -0
  132. inspect_ai/scorer/_metrics/__init__.py +2 -0
  133. inspect_ai/scorer/_metrics/grouped.py +84 -0
  134. inspect_ai/scorer/_score.py +26 -6
  135. inspect_ai/solver/__init__.py +2 -2
  136. inspect_ai/solver/_basic_agent.py +22 -9
  137. inspect_ai/solver/_bridge.py +31 -0
  138. inspect_ai/solver/_chain.py +20 -12
  139. inspect_ai/solver/_fork.py +5 -1
  140. inspect_ai/solver/_human_agent.py +52 -0
  141. inspect_ai/solver/_prompt.py +3 -1
  142. inspect_ai/solver/_run.py +59 -0
  143. inspect_ai/solver/_solver.py +14 -4
  144. inspect_ai/solver/_task_state.py +5 -3
  145. inspect_ai/tool/_tool_call.py +15 -8
  146. inspect_ai/tool/_tool_def.py +17 -12
  147. inspect_ai/tool/_tool_support_helpers.py +2 -2
  148. inspect_ai/tool/_tool_with.py +14 -11
  149. inspect_ai/tool/_tools/_bash_session.py +11 -2
  150. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  151. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  152. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  153. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  154. inspect_ai/tool/_tools/_think.py +1 -1
  155. inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
  156. inspect_ai/util/__init__.py +2 -0
  157. inspect_ai/util/_anyio.py +27 -0
  158. inspect_ai/util/_sandbox/__init__.py +2 -1
  159. inspect_ai/util/_sandbox/context.py +32 -7
  160. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  161. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  162. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  163. inspect_ai/util/_store_model.py +30 -7
  164. inspect_ai/util/_subprocess.py +13 -3
  165. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/METADATA +1 -1
  166. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/RECORD +179 -153
  167. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
  168. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  169. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  170. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  171. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  172. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  173. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  174. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  175. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  176. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  177. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/WHEEL +0 -0
  178. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/entry_points.txt +0 -0
  179. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/licenses/LICENSE +0 -0
  180. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/top_level.txt +0 -0
inspect_ai/model/_providers/openai_responses.py

@@ -1,11 +1,7 @@
 from logging import getLogger
 from typing import Any

-from openai import (
-    AsyncAzureOpenAI,
-    AsyncOpenAI,
-    BadRequestError,
-)
+from openai import AsyncAzureOpenAI, AsyncOpenAI, BadRequestError
 from openai._types import NOT_GIVEN
 from openai.types.responses import Response, ResponseFormatTextJSONSchemaConfigParam

@@ -15,12 +11,10 @@ from inspect_ai.tool import ToolChoice, ToolInfo
 from .._chat_message import ChatMessage
 from .._generate_config import GenerateConfig
 from .._model_call import ModelCall
-from .._model_output import (
-    ModelOutput,
-    ModelUsage,
-)
+from .._model_output import ModelOutput, ModelUsage
 from .._openai import (
     OpenAIResponseError,
+    is_computer_use_preview,
     is_gpt,
     is_o1_mini,
     is_o1_preview,
@@ -65,12 +59,14 @@ async def generate_responses(
     )

     # prepare request (we do this so we can log the ModelCall)
+    tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
     request = dict(
         input=await openai_responses_inputs(input, model_name),
-        tools=openai_responses_tools(tools) if len(tools) > 0 else NOT_GIVEN,
-        tool_choice=openai_responses_tool_choice(tool_choice)
-        if len(tools) > 0
+        tools=tool_params,
+        tool_choice=openai_responses_tool_choice(tool_choice, tool_params)
+        if isinstance(tool_params, list) and tool_choice != "auto"
         else NOT_GIVEN,
+        truncation="auto" if is_computer_use_preview(model_name) else NOT_GIVEN,
         extra_headers={HttpxHooks.REQUEST_ID_HEADER: request_id},
         **completion_params_responses(model_name, config, len(tools) > 0),
     )
@@ -89,7 +85,7 @@ async def generate_responses(
     response = model_response.model_dump()

     # parse out choices
-    choices = openai_responses_chat_choices(model_response, tools)
+    choices = openai_responses_chat_choices(model_name, model_response, tools)

     # return output and call
     return ModelOutput(
@@ -124,7 +120,9 @@ def completion_params_responses(
             f"OpenAI Responses API does not support the '{param}' parameter.",
         )

-    params: dict[str, Any] = dict(model=model_name, store=False)
+    params: dict[str, Any] = dict(
+        model=model_name, store=is_computer_use_preview(model_name)
+    )
     if config.max_tokens is not None:
         params["max_output_tokens"] = config.max_tokens
     if config.frequency_penalty is not None:
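For orientation, a minimal sketch of exercising the new computer-use path (hedged: the computer tool wiring is omitted, and the model name assumes access to OpenAI's computer-use-preview; per the hunks above, requests for that model now carry truncation="auto" and store=True automatically):

import asyncio

from inspect_ai.model import get_model

async def main() -> None:
    # for this model the Responses API request now includes
    # truncation="auto" and store=True (see is_computer_use_preview above)
    model = get_model("openai/computer-use-preview")
    output = await model.generate("Describe what you see on screen.")
    print(output.completion)

asyncio.run(main())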
inspect_ai/model/_providers/providers.py

@@ -48,7 +48,7 @@ def openai() -> type[ModelAPI]:
 def anthropic() -> type[ModelAPI]:
     FEATURE = "Anthropic API"
     PACKAGE = "anthropic"
-    MIN_VERSION = "0.47.1"
+    MIN_VERSION = "0.49.0"

     # verify we have the package
     try:
@@ -278,7 +278,7 @@ def goodfire() -> type[ModelAPI]:
 def validate_openai_client(feature: str) -> None:
     FEATURE = feature
     PACKAGE = "openai"
-    MIN_VERSION = "1.68.0"
+    MIN_VERSION = "1.69.0"

     # verify we have the package
     try:
inspect_ai/model/_providers/together.py

@@ -68,7 +68,9 @@ def chat_choices_from_response_together(
         logprobs_models.append(Logprobs(content=logprobs_sequence))
     return [
         ChatCompletionChoice(
-            message=chat_message_assistant_from_openai(choice.message, tools),
+            message=chat_message_assistant_from_openai(
+                response.model, choice.message, tools
+            ),
             stop_reason=as_stop_reason(choice.finish_reason),
             logprobs=logprobs,
         )
@@ -116,6 +118,14 @@ class TogetherAIAPI(OpenAIAPI):
         else:
             return ex

+    @override
+    def set_logprobs_params(
+        self, params: dict[str, Any], config: GenerateConfig
+    ) -> dict[str, Any]:
+        if config.logprobs is True:
+            params["logprobs"] = 1
+        return params
+
     # Together has a slightly different logprobs structure to OpenAI, so we need to remap it.
     def _chat_choices_from_response(
         self, response: ChatCompletion, tools: list[ToolInfo]
@@ -228,7 +238,7 @@ class TogetherRESTAPI(ModelAPI):
         return DEFAULT_MAX_TOKENS

     def chat_api_handler(self) -> ChatAPIHandler:
-        return ChatAPIHandler()
+        return ChatAPIHandler(self.model_name)


 def together_choices(
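A hedged usage sketch of the logprobs remapping above: when logprobs=True is requested, set_logprobs_params now sends Together's integer form (logprobs=1). The model name is illustrative:

import asyncio

from inspect_ai.model import GenerateConfig, get_model

async def main() -> None:
    model = get_model("together/meta-llama/Llama-3.1-8B-Instruct")
    # config.logprobs is True -> the provider emits logprobs=1
    output = await model.generate("Hello", config=GenerateConfig(logprobs=True))
    print(output.choices[0].logprobs)

asyncio.run(main())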
inspect_ai/model/_providers/util/chatapi.py

@@ -23,6 +23,9 @@ ChatAPIMessage = dict[Literal["role", "content"], str]


 class ChatAPIHandler:
+    def __init__(self, model: str) -> None:
+        self.model = model
+
     def input_with_tools(
         self, input: list[ChatMessage], tools: list[ToolInfo]
     ) -> list[ChatMessage]:
@@ -31,7 +34,9 @@ class ChatAPIHandler:
     def parse_assistant_response(
         self, response: str, tools: list[ToolInfo]
     ) -> ChatMessageAssistant:
-        return ChatMessageAssistant(content=response)
+        return ChatMessageAssistant(
+            content=response, model=self.model, source="generate"
+        )

     def assistant_message(self, message: ChatMessageAssistant) -> ChatAPIMessage:
         return {"role": "assistant", "content": message.text}
@@ -48,7 +53,7 @@ class ChatAPIHandler:
 def chat_api_input(
     input: list[ChatMessage],
     tools: list[ToolInfo],
-    handler: ChatAPIHandler = ChatAPIHandler(),
+    handler: ChatAPIHandler,
 ) -> list[ChatAPIMessage]:
     # add tools to input
     if len(tools) > 0:
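Since chat_api_input no longer defaults its handler argument, callers must construct a ChatAPIHandler with the model name, which the handler now threads into parsed assistant messages. A sketch against these internal modules (import paths assumed from the hunks above and from vllm.py below, which imports both names from .util):

from inspect_ai.model import ChatMessageUser
from inspect_ai.model._providers.util import ChatAPIHandler, chat_api_input

# the handler now carries the model name so parse_assistant_response()
# can stamp model= onto the ChatMessageAssistant it returns
handler = ChatAPIHandler("my-model")
messages = chat_api_input([ChatMessageUser(content="Hello")], [], handler)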
inspect_ai/model/_providers/util/hf_handler.py

@@ -50,13 +50,16 @@ class HFHandler(ChatAPIHandler):
             return ChatMessageAssistant(
                 content=content,
                 tool_calls=tool_calls,
+                model=self.model_name,
                 source="generate",
             )

         # otherwise this is just an ordinary assistant message
         else:
             return ChatMessageAssistant(
-                content=filter_assistant_header(response), source="generate"
+                content=filter_assistant_header(response),
+                model=self.model_name,
+                source="generate",
             )


@@ -106,7 +109,6 @@ def parse_tool_call_content(content: str, tools: list[ToolInfo]) -> ToolCall:
         id="unknown",
         function="unknown",
         arguments={},
-        type="function",
         parse_error=parse_error,
     )
inspect_ai/model/_providers/util/llama31.py

@@ -106,13 +106,16 @@ class Llama31Handler(ChatAPIHandler):
             return ChatMessageAssistant(
                 content=filter_assistant_header(content),
                 tool_calls=tool_calls,
+                model=self.model,
                 source="generate",
             )

         # otherwise this is just an ordinary assistant message
         else:
             return ChatMessageAssistant(
-                content=filter_assistant_header(response), source="generate"
+                content=filter_assistant_header(response),
+                model=self.model,
+                source="generate",
             )

     @override
@@ -184,7 +187,6 @@ def parse_tool_call_content(content: str, tools: list[ToolInfo]) -> ToolCall:
         id="unknown",
         function="unknown",
         arguments={},
-        type="function",
         parse_error=parse_error,
     )
inspect_ai/model/_providers/vertex.py

@@ -116,11 +116,6 @@ class VertexAPI(ModelAPI):

         self.model = GenerativeModel(model_name)

-    @override
-    async def close(self) -> None:
-        # GenerativeModel uses a cached/shared client so there is no 'close'
-        pass
-
     async def generate(
         self,
         input: list[ChatMessage],
@@ -155,7 +150,9 @@ class VertexAPI(ModelAPI):
         # capture output
         output = ModelOutput(
             model=self.model_name,
-            choices=completion_choices_from_candidates(response.candidates),
+            choices=completion_choices_from_candidates(
+                self.model_name, response.candidates
+            ),
             usage=ModelUsage(
                 input_tokens=response.usage_metadata.prompt_token_count,
                 output_tokens=response.usage_metadata.candidates_token_count,
@@ -377,7 +374,9 @@ def chat_tools(tools: list[ToolInfo]) -> list[Tool]:
     return [Tool(function_declarations=declarations)]


-def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoice:
+def completion_choice_from_candidate(
+    model: str, candidate: Candidate
+) -> ChatCompletionChoice:
     # check for completion text
     content = " ".join(
         [
@@ -394,7 +393,6 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoice:
             function_call = MessageToDict(getattr(part.function_call, "_pb"))
             tool_calls.append(
                 ToolCall(
-                    type="function",
                     id=function_call["name"],
                     function=function_call["name"],
                     arguments=function_call["args"],
@@ -408,6 +406,7 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoice:
         message=ChatMessageAssistant(
             content=content,
             tool_calls=tool_calls if len(tool_calls) > 0 else None,
+            model=model,
             source="generate",
         ),
         stop_reason=stop_reason,
@@ -435,11 +434,14 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoice:


 def completion_choices_from_candidates(
+    model: str,
     candidates: list[Candidate],
 ) -> list[ChatCompletionChoice]:
     candidates = copy(candidates)
     candidates.sort(key=lambda c: c.index)
-    return [completion_choice_from_candidate(candidate) for candidate in candidates]
+    return [
+        completion_choice_from_candidate(model, candidate) for candidate in candidates
+    ]


 def candidate_stop_reason(finish_reason: FinishReason) -> StopReason:
inspect_ai/model/_providers/vllm.py

@@ -28,7 +28,7 @@ from .._model_output import (
     StopReason,
     TopLogprob,
 )
-from .util import chat_api_input
+from .util import ChatAPIHandler, chat_api_input

 DEFAULT_START_TOKEN = "<|im_start|>"
 DEFAULT_END_TOKEN = "<|im_end|>"
@@ -137,7 +137,7 @@ class VLLMAPI(ModelAPI):
         self.tokenizer = self.model.get_tokenizer()

     @override
-    async def close(self) -> None:
+    def close(self) -> None:
         self.tokenizer = None
         self.model = None
         gc.collect()
@@ -148,7 +148,7 @@ class VLLMAPI(ModelAPI):
         # handle system message and consecutive user messages
         messages = simple_input_messages(messages)
         # convert to chat template input format
-        chat_messages = chat_api_input(messages, tools)
+        chat_messages = chat_api_input(messages, tools, ChatAPIHandler(self.model_name))
         # apply chat template
         chat = self.tokenizer.apply_chat_template(
             chat_messages,
@@ -253,7 +253,7 @@ class VLLMAPI(ModelAPI):
         choices = [
             ChatCompletionChoice(
                 message=ChatMessageAssistant(
-                    content=response.output, source="generate"
+                    content=response.output, model=self.model_name, source="generate"
                 ),
                 stop_reason=response.stop_reason,
                 logprobs=response.logprobs,
inspect_ai/scorer/__init__.py

@@ -19,6 +19,7 @@ from ._metric import (
     value_to_float,
 )
 from ._metrics.accuracy import accuracy
+from ._metrics.grouped import grouped
 from ._metrics.mean import mean
 from ._metrics.std import bootstrap_stderr, std, stderr, var
 from ._model import model_graded_fact, model_graded_qa
@@ -58,6 +59,7 @@ __all__ = [
     "std",
     "stderr",
     "mean",
+    "grouped",
     "var",
     "Metric",
     "MetricProtocol",
inspect_ai/scorer/_metrics/__init__.py

@@ -1,10 +1,12 @@
 from .accuracy import accuracy
+from .grouped import grouped
 from .mean import mean
 from .std import bootstrap_stderr, std, stderr, var

 __all__ = [
     "accuracy",
     "mean",
+    "grouped",
     "bootstrap_stderr",
     "std",
     "stderr",
inspect_ai/scorer/_metrics/grouped.py (new file)

@@ -0,0 +1,84 @@
+from typing import Literal, cast
+
+import numpy as np
+
+from inspect_ai.scorer._metric import (
+    Metric,
+    MetricProtocol,
+    SampleScore,
+    Value,
+    ValueToFloat,
+    metric,
+    value_to_float,
+)
+
+
+@metric
+def grouped(
+    metric: Metric,
+    group_key: str,
+    *,
+    all: Literal["samples", "groups"] | Literal[False] = "samples",
+    all_label: str = "all",
+    value_to_float: ValueToFloat = value_to_float(),
+) -> Metric:
+    """
+    Creates a grouped metric that applies the given metric to subgroups of samples.
+
+    Args:
+        metric: The metric to apply to each group of samples.
+        group_key: The metadata key used to group samples. Each sample must have this key in its metadata.
+        all: How to compute the "all" aggregate score:
+            - "samples": Apply the metric to all samples regardless of groups
+            - "groups": Calculate the mean of all group scores
+            - False: Don't calculate an aggregate score
+        all_label: The label for the "all" key in the returned dictionary.
+        value_to_float: Function to convert metric values to floats, used when all="groups".
+
+    Returns:
+        A new metric function that returns a dictionary mapping group names to their scores,
+        with an optional "all" key for the aggregate score.
+    """
+
+    def grouped_metric(scores: list[SampleScore]) -> Value:
+        # Satisfy the type checker that the metric is a MetricProtocol
+        metric_protocol = cast(MetricProtocol, metric)
+
+        # Slice the scores into groups
+        scores_dict: dict[str, list[SampleScore]] = {}
+        for sample_score in scores:
+            if (
+                sample_score.sample_metadata is None
+                or group_key not in sample_score.sample_metadata
+            ):
+                raise ValueError(
+                    f"Sample {sample_score.sample_id} has no {group_key} metadata. To compute a grouped metric each sample metadata must have a value for '{group_key}'"
+                )
+            group_name = str(sample_score.sample_metadata.get(group_key))
+            if group_name not in scores_dict:
+                scores_dict[group_name] = []
+            scores_dict[group_name].append(sample_score)
+
+        # Compute the per group metric
+        grouped_scores = {
+            group_name: metric_protocol(values)
+            for group_name, values in scores_dict.items()
+        }
+
+        if not all:
+            return cast(Value, grouped_scores)
+        else:
+            # Compute the all metric
+            all_group_metric = None
+            if all == "samples":
+                # samples means apply the metric to all samples
+                all_group_metric = metric_protocol(scores)
+            elif all == "groups":
+                # groups means the overall score is the mean of all the group scores
+                all_group_metric = np.mean(
+                    [value_to_float(val) for val in grouped_scores.values()]
+                ).item()
+
+            return cast(Value, {**grouped_scores, all_label: all_group_metric})
+
+    return grouped_metric
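A usage sketch for the new grouped metric (the task, dataset, and "category" metadata key are illustrative; metrics= on Task overrides the scorer's default metrics):

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import accuracy, exact, grouped

@task
def math_groups() -> Task:
    return Task(
        dataset=[
            # every sample must carry the metadata key that grouped() splits on
            Sample(input="1+1", target="2", metadata={"category": "arithmetic"}),
            Sample(input="Solve x+1=3 for x", target="2", metadata={"category": "algebra"}),
        ],
        scorer=exact(),
        # reports accuracy per category plus an "all" entry over all samples
        metrics=[grouped(accuracy(), "category")],
    )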
inspect_ai/scorer/_score.py

@@ -1,30 +1,50 @@
 from contextvars import ContextVar
+from copy import copy

-from inspect_ai.solver._task_state import TaskState
+from inspect_ai.model._conversation import ModelConversation
+from inspect_ai.solver._task_state import TaskState, sample_state

 from ._metric import Score
 from ._scorer import Scorer
 from ._target import Target


-async def score(state: TaskState) -> list[Score]:
-    """Score a TaskState.
+async def score(conversation: ModelConversation) -> list[Score]:
+    """Score a model conversation.

-    Score a task state from within a solver.
+    Score a model conversation (you may pass `TaskState` or `AgentState`
+    as the value for `conversation`)

     Args:
-        state (TaskState): `TaskState` to submit for scoring
+        conversation: Conversation to submit for scoring.
+            Note that both `TaskState` and `AgentState` can be passed
+            as the `conversation` parameter.

     Returns:
         List of scores (one for each task scorer)

     Raises:
-        RuntimerError: If called from outside a task or within
+        RuntimeError: If called from outside a task or within
         a task that does not have a scorer.

     """
     from inspect_ai.log._transcript import ScoreEvent, transcript

+    # get TaskState (if the `conversation` is a `TaskState` use it directly,
+    # otherwise synthesize one)
+    if isinstance(conversation, TaskState):
+        state = conversation
+    else:
+        current_state = sample_state()
+        if current_state is None:
+            raise RuntimeError(
+                "The score() function can only be called while executing a task"
+            )
+        state = copy(current_state)
+        state.messages = conversation.messages
+        state.output = conversation.output
+
+    # get current scorers and target
     scorers = _scorers.get(None)
     target = _target.get(None)
     if scorers is None or target is None:
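A sketch of calling the updated score() from a solver; because a TaskState is passed directly, no synthetic state is needed (the solver itself is illustrative):

from inspect_ai.scorer import score
from inspect_ai.solver import Generate, Solver, TaskState, solver

@solver
def generate_and_score() -> Solver:
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        state = await generate(state)
        # score() now accepts any ModelConversation: a TaskState here, or
        # an AgentState (for which it synthesizes a TaskState as above)
        scores = await score(state)
        state.metadata["interim_scores"] = [s.value for s in scores]
        return state

    return solve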
inspect_ai/solver/__init__.py

@@ -1,11 +1,11 @@
 from inspect_ai._util.deprecation import relocated_module_attribute

 from ._basic_agent import basic_agent
-from ._bridge.bridge import bridge
+from ._bridge import bridge
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
-from ._human_agent.agent import human_agent
+from ._human_agent import human_agent
 from ._limit import SampleLimitExceededError
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
inspect_ai/solver/_basic_agent.py

@@ -5,8 +5,8 @@ from typing_extensions import TypedDict, Unpack

 from inspect_ai._util._async import is_callable_coroutine
 from inspect_ai.model._cache import CachePolicy
-from inspect_ai.model._call_tools import call_tools
-from inspect_ai.model._chat_message import ChatMessageTool, ChatMessageUser
+from inspect_ai.model._call_tools import execute_tools
+from inspect_ai.model._chat_message import ChatMessage, ChatMessageTool, ChatMessageUser
 from inspect_ai.model._model import get_model
 from inspect_ai.scorer._metric import Score, ValueToFloat, value_to_float
 from inspect_ai.scorer._score import score
@@ -65,6 +65,7 @@ def basic_agent(
     continue_message: str = DEFAULT_CONTINUE_MESSAGE,
     submit_name: str = DEFAULT_SUBMIT_NAME,
     submit_description: str = DEFAULT_SUBMIT_DESCRIPTION,
+    submit_append: bool = False,
     **kwargs: Unpack[BasicAgentDeprecatedArgs],
 ) -> Solver:
     """Basic ReAct agent.
@@ -102,6 +103,9 @@ def basic_agent(
             (defaults to 'submit')
         submit_description: Description of submit tool (defaults to
             'Submit an answer for evaluation')
+        submit_append: Append the submit tool output to the model completion
+            text (defaults to `False`, which means the submission overwrites
+            the model completion).
         **kwargs: Deprecated arguments for backward compatibility.

     Returns:
@@ -149,9 +153,14 @@ def basic_agent(
         return solve

     # helper to extract a submitted answer
-    def submission(tool_results: list[ChatMessageTool]) -> str | None:
+    def submission(tool_results: list[ChatMessage]) -> str | None:
         return next(
-            (result.text for result in tool_results if result.function == submit_name),
+            (
+                result.text
+                for result in tool_results
+                if isinstance(result, ChatMessageTool)
+                and result.function == submit_name
+            ),
             None,
         )
@@ -189,9 +198,9 @@ def basic_agent(

            # resolve tools calls (if any)
            if state.output.message.tool_calls:
-                # call tool functions
-                tool_results = await call_tools(
-                    state.output.message,
+                # execute tool functions
+                tool_results, _ = await execute_tools(
+                    [state.output.message],
                     state.tools,
                     max_output=max_tool_output,
                 )
@@ -200,8 +209,12 @@ def basic_agent(
                # was an answer submitted?
                answer = submission(tool_results)
                if answer:
-                    # set the output to the answer for scoring
-                    state.output.completion = answer
+                    if submit_append:
+                        state.output.completion = (
+                            f"{state.output.completion}\n\n{answer}".strip()
+                        )
+                    else:
+                        state.output.completion = answer

                # exit if we are at max_attempts
                attempts += 1
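A sketch showing the new submit_append flag (the tool and system prompt are illustrative); with it set, the submitted answer is appended to the model's completion rather than replacing it:

from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash

agent = basic_agent(
    init=system_message("Solve the task, then call submit() with your answer."),
    tools=[bash()],
    # keep the model's final reasoning and append the submission after it
    submit_append=True,
)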
inspect_ai/solver/_bridge.py (new file)

@@ -0,0 +1,31 @@
+from logging import getLogger
+from typing import Any, Awaitable, Callable
+
+from inspect_ai._util.logger import warn_once
+from inspect_ai.agent._as_solver import as_solver
+
+from ._solver import Solver, solver
+
+logger = getLogger(__name__)
+
+
+@solver
+def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
+    """Bridge an external agent into an Inspect Solver.
+
+    See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
+
+    Args:
+        agent: Callable which takes a sample `dict` and returns a result `dict`.
+
+    Returns:
+        Standard Inspect solver.
+    """
+    from inspect_ai.agent._bridge.bridge import bridge as agent_bridge
+
+    warn_once(
+        logger,
+        "The bridge solver is deprecated. Please use the bridge agent from the agents module instead.",
+    )
+
+    return as_solver(agent_bridge(agent))
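The deprecation path, sketched (assuming bridge and as_solver are exported from inspect_ai.agent, as the new module layout suggests; the agent body is a placeholder):

from typing import Any

from inspect_ai.agent import as_solver, bridge

async def my_agent(sample: dict[str, Any]) -> dict[str, Any]:
    # external agents receive an OpenAI-format sample dict and
    # return a result dict with an "output" string
    return {"output": "hello from the external agent"}

# preferred going forward: the agent-module bridge, adapted back to a
# Solver only where a Solver is actually required
my_solver = as_solver(bridge(my_agent))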
inspect_ai/solver/_chain.py

@@ -1,14 +1,19 @@
-from typing import Sequence, overload
+from typing import Sequence, cast, overload

 from typing_extensions import override

+from inspect_ai.agent._agent import Agent, is_agent
+from inspect_ai.agent._as_solver import as_solver
+
 from ._solver import Generate, Solver, solver
 from ._task_state import TaskState


 @solver
-def chain(*solvers: Solver | list[Solver]) -> Solver:
-    """Compose a solver from multiple other solvers.
+def chain(
+    *solvers: Solver | Agent | list[Solver] | list[Solver | Agent],
+) -> Solver:
+    """Compose a solver from multiple other solvers and/or agents.

     Solvers are executed in turn, and a solver step event
     is added to the transcript for each. If a solver returns
@@ -16,10 +21,10 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
     early.

     Args:
-        *solvers: One or more solvers or lists of solvers to chain together.
+        *solvers: One or more solvers or agents to chain together.

     Returns:
-        Solver that executes the passed solvers as a chain.
+        Solver that executes the passed solvers and agents as a chain.
     """
     # flatten lists and chains
     all_solvers: list[Solver] = []
@@ -29,17 +34,20 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
     return Chain(all_solvers)


-def unroll(solver: Solver | list[Solver]) -> list[Solver]:
-    if isinstance(solver, Solver):
-        if isinstance(solver, Chain):
-            return unroll(solver._solvers)
-        else:
-            return [solver]
-    else:
+def unroll(
+    solver: Solver | Agent | list[Solver] | list[Solver | Agent],
+) -> list[Solver]:
+    if isinstance(solver, list):
         unrolled: list[Solver] = []
         for s in solver:
             unrolled.extend(unroll(s))
         return unrolled
+    elif is_agent(solver):
+        return [as_solver(solver)]
+    elif isinstance(solver, Chain):
+        return unroll(solver._solvers)
+    else:
+        return [cast(Solver, solver)]


 class Chain(Sequence[Solver], Solver):
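An illustrative composition using the widened chain() signature (assuming react() from the new agent module is usable with its defaults; see inspect_ai/agent/_react.py in the file list above):

from inspect_ai.agent import react
from inspect_ai.solver import chain, prompt_template

# chain() now flattens solvers and agents together: unroll() wraps
# each Agent with as_solver() so the Chain stays a list of Solvers
plan = chain(
    prompt_template("{prompt}\n\nWork step by step."),
    react(),
)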
inspect_ai/solver/_fork.py

@@ -52,7 +52,7 @@ async def fork(

 async def solver_subtask(state: TaskState, solver: Solver) -> TaskState:
     # get the generate function for the current task
-    generate = _generate.get(None)
+    generate = task_generate()
     if generate is None:
         raise RuntimeError("Called fork() outside of a running task.")

@@ -88,4 +88,8 @@ def set_task_generate(generate: Generate) -> None:
     _generate.set(generate)


+def task_generate() -> Generate | None:
+    return _generate.get(None)
+
+
 _generate: ContextVar[Generate] = ContextVar("_generate")