inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -244
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +55 -18
- inspect_ai/_view/www/dist/assets/index.js +550 -458
- inspect_ai/_view/www/log-schema.json +84 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +150 -129
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -9
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +15 -2
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +25 -14
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_call.py +3 -0
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
inspect_ai/model/_providers/together.py
CHANGED
@@ -10,7 +10,6 @@ from openai.types.chat import (
 from typing_extensions import override
 
 from inspect_ai._util.constants import DEFAULT_MAX_TOKENS
-from inspect_ai.model._providers.util.chatapi import ChatAPIHandler
 from inspect_ai.tool._tool_choice import ToolChoice
 from inspect_ai.tool._tool_info import ToolInfo
 
@@ -27,16 +26,14 @@ from .._model_output import (
     as_stop_reason,
 )
 from .._openai import chat_message_assistant_from_openai
-from .openai import (
-    OpenAIAPI,
-)
+from .openai_compatible import OpenAICompatibleAPI
 from .util import (
     chat_api_input,
     chat_api_request,
-    environment_prerequisite_error,
     model_base_url,
     should_retry_chat_api_error,
 )
+from .util.chatapi import ChatAPIHandler
 
 
 def chat_choices_from_response_together(
@@ -78,10 +75,7 @@ def chat_choices_from_response_together(
     ]
 
 
-
-
-
-class TogetherAIAPI(OpenAIAPI):
+class TogetherAIAPI(OpenAICompatibleAPI):
     def __init__(
         self,
         model_name: str,
@@ -89,14 +83,13 @@ class TogetherAIAPI(OpenAIAPI):
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
     ) -> None:
-        if not api_key:
-            api_key = os.environ.get(TOGETHER_API_KEY, None)
-            if not api_key:
-                raise environment_prerequisite_error("TogetherAI", TOGETHER_API_KEY)
-        base_url = model_base_url(base_url, "TOGETHER_BASE_URL")
-        base_url = base_url if base_url else "https://api.together.xyz/v1"
         super().__init__(
-            model_name=model_name,
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+            service="Together",
+            service_base_url="https://api.together.xyz/v1",
         )
 
     # Together uses a default of 512 so we bump it up
@@ -119,22 +112,31 @@ class TogetherAIAPI(OpenAIAPI):
         return ex
 
     @override
-    def
-
-
-        if config.logprobs is True:
+    def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, Any]:
+        params = super().completion_params(config, tools)
+        if "logprobs" in params:
             params["logprobs"] = 1
+        if "top_logprobs" in params:
+            del params["top_logprobs"]
+
+        # together requires temperature with num_choices
+        if config.num_choices is not None and config.temperature is None:
+            params["temperature"] = 1
+
         return params
 
     # Together has a slightly different logprobs structure to OpenAI, so we need to remap it.
-    def chat_choices_from_response(
-        self, response: ChatCompletion, tools: list[ToolInfo]
+    @override
+    def chat_choices_from_completion(
+        self, completion: ChatCompletion, tools: list[ToolInfo]
     ) -> list[ChatCompletionChoice]:
-        return chat_choices_from_response_together(response, tools)
+        return chat_choices_from_response_together(completion, tools)
 
 
 # Implementation of REST client for Together (currently not used)
+TOGETHER_API_KEY = "TOGETHER_API_KEY"
+
 
 class TogetherRESTAPI(ModelAPI):
     def __init__(
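The rewrite above is the template for this release's provider cleanup: several providers in the file list (cloudflare.py, openai.py, openrouter.py) shed similar key-and-URL plumbing, which now lives in the new openai_compatible.py module (+195 lines). A minimal sketch of the pattern, assuming only the constructor arguments visible in the TogetherAIAPI diff above; the provider name, endpoint URL, and any environment-variable convention are illustrative, not confirmed by this diff:

```python
# Hypothetical provider following the TogetherAIAPI pattern above.
# Assumes OpenAICompatibleAPI resolves a missing api_key/base_url from the
# service name (the removed code read TOGETHER_API_KEY / TOGETHER_BASE_URL,
# so an "Example" service would presumably use EXAMPLE_API_KEY, etc.).
from inspect_ai.model import GenerateConfig
from inspect_ai.model._providers.openai_compatible import OpenAICompatibleAPI


class ExampleAPI(OpenAICompatibleAPI):  # illustrative name
    def __init__(
        self,
        model_name: str,
        base_url: str | None = None,
        api_key: str | None = None,
        config: GenerateConfig = GenerateConfig(),
    ) -> None:
        super().__init__(
            model_name=model_name,
            base_url=base_url,
            api_key=api_key,
            config=config,
            service="Example",  # hypothetical service name
            service_base_url="https://api.example.com/v1",  # hypothetical endpoint
        )
```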
inspect_ai/model/_trim.py
ADDED
@@ -0,0 +1,83 @@
+from dataclasses import dataclass, field
+
+from ._chat_message import ChatMessage
+
+
+def trim_messages(
+    messages: list[ChatMessage], preserve: float = 0.7
+) -> list[ChatMessage]:
+    """Trim message list to fit within model context.
+
+    Trim the list of messages by:
+      - Retaining all system messages.
+      - Retaining the 'input' messages from the sample.
+      - Preserving a proportion of the remaining messages (`preserve=0.7` by default).
+      - Ensuring that all assistant tool calls have corresponding tool messages.
+
+    Args:
+       messages: List of messages to trim.
+       preserve: Ratio of converation messages to preserve
+          (defaults to 0.7)
+
+    Returns:
+       Trimmed messages.
+    """
+    # validate preserve
+    if not 0 <= preserve <= 1:
+        raise ValueError(f"preserve must be in range [0,1], got {preserve}")
+
+    # partition messages
+    partitioned = _partition_messages(messages)
+
+    # slice messages from the beginning of the conversation as-per preserve
+    start_idx = int(len(partitioned.conversation) * (1 - preserve))
+    preserved_messages = partitioned.conversation[start_idx:]
+
+    # one last step: many model apis require tool messages to have a parent assistant
+    # message with a corresponding tool_call_id. to ensure this, we build the
+    # final list of conversation messages by filtering out tool messages for which
+    # we haven't seen a corresponding assistant message with their id
+    conversation_messages: list[ChatMessage] = []
+    active_tool_ids = set()
+    for message in preserved_messages:
+        if message.role == "assistant":
+            active_tool_ids = {tc.id for tc in (message.tool_calls or [])}
+            conversation_messages.append(message)
+        elif message.role == "tool" and message.tool_call_id in active_tool_ids:
+            conversation_messages.append(message)
+        elif message.role == "user":
+            active_tool_ids = set()
+            conversation_messages.append(message)
+
+    # return trimmed messages
+    return partitioned.system + partitioned.input + conversation_messages
+
+
+@dataclass
+class PartitionedMessages:
+    system: list[ChatMessage] = field(default_factory=list)
+    input: list[ChatMessage] = field(default_factory=list)
+    conversation: list[ChatMessage] = field(default_factory=list)
+
+
+def _partition_messages(messages: list[ChatMessage]) -> PartitionedMessages:
+    # first pass at partitioning
+    partitioned = PartitionedMessages()
+    for message in messages:
+        if message.role == "system":
+            partitioned.system.append(message)
+        elif message.source == "input":
+            partitioned.input.append(message)
+        else:
+            partitioned.conversation.append(message)
+
+    # if there are no input messages then take up to the first user message
+    if len(partitioned.input) == 0:
+        while partitioned.conversation:
+            message = partitioned.conversation.pop(0)
+            partitioned.input.append(message)
+            if message.role == "user":
+                break
+
+    # all done!
+    return partitioned
inspect_ai/solver/_plan.py
CHANGED
@@ -164,7 +164,7 @@ def plan(*plan: PlanType | None, name: str | None = None, **attribs: Any) -> Any
         plan_type,
         plan,
         RegistryInfo(
-            type="plan",
+            type="plan",  # type: ignore[arg-type]
             name=plan_name,
             metadata=dict(attribs=attribs, params=params),
         ),
@@ -212,7 +212,9 @@ def plan_register(
     registry_add(
         plan,
         RegistryInfo(
-            type="plan",
+            type="plan",  # type: ignore[arg-type]
+            name=name,
+            metadata=dict(attribs=attribs, params=params),
         ),
     )
     return plan
@@ -228,4 +230,4 @@ def plan_create(name: str, **kwargs: Any) -> Plan:
     Returns:
        Plan with registry info attribute
     """
-    return cast(Plan, registry_create("plan", name, **kwargs))
+    return cast(Plan, registry_create("plan", name, **kwargs))  # type: ignore[arg-type]
inspect_ai/tool/_tool_call.py
CHANGED
inspect_ai/tool/_tool_def.py
CHANGED
@@ -234,9 +234,15 @@ def validate_tool_parameters(tool_name: str, parameters: dict[str, ToolParam]) -
     # validate that we have types/descriptions for paramters
     for param_name, param in parameters.items():
 
-        def raise_not_provided_error(context: str) -> None:
+        def raise_not_provided_error(
+            context: str,
+            # Use the default value trick to avoid Python's late binding of
+            # closures issue.
+            # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
+            bound_name: str = param_name,
+        ) -> None:
             raise ValueError(
-                f"{context} provided for parameter '{param_name}' of function '{tool_name}'."
+                f"{context} provided for parameter '{bound_name}' of function '{tool_name}'."
             )
 
         if param.type is None and not param.anyOf and not param.enum:
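The `bound_name: str = param_name` default introduced above is the standard workaround for the late-binding behavior the new comment links to: a closure defined in a loop captures the loop variable itself, not its value at definition time. A self-contained illustration (plain Python, not inspect_ai code):

```python
# Closures capture variables, not values: every lambda below sees the
# final value of i once the loop has finished.
late = [lambda: i for i in range(3)]
print([f() for f in late])  # [2, 2, 2]

# A default argument is evaluated at definition time, binding the current
# value of i. This is the same trick raise_not_provided_error uses with
# bound_name=param_name.
early = [lambda bound_i=i: bound_i for i in range(3)]
print([f() for f in early])  # [0, 1, 2]
```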
inspect_ai/util/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from inspect_ai._util.registry import RegistryType, registry_create
 from inspect_ai._util.trace import trace_action, trace_message
 
 from ._concurrency import concurrency
@@ -64,4 +65,6 @@
     "throttle",
     "trace_action",
     "trace_message",
+    "RegistryType",
+    "registry_create",
 ]
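This makes `registry_create` (the helper that `plan_create` wraps in the `_plan.py` hunk above) part of the public `inspect_ai.util` surface. A hedged sketch of a call; the `"solver"` registry type and `"my_solver"` name are assumptions for illustration (note that `"plan"` now requires `type: ignore[arg-type]` above, suggesting it is no longer a member of the `RegistryType` literal):

```python
from inspect_ai.util import RegistryType, registry_create

# instantiate a registered object by type and name, mirroring the
# registry_create("plan", name, **kwargs) call in plan_create above
kind: RegistryType = "solver"  # assumed to be a valid RegistryType member
solver = registry_create(kind, "my_solver")  # "my_solver" is a hypothetical registered solver
```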
inspect_ai/util/_concurrency.py
CHANGED
@@ -56,10 +56,23 @@ async def concurrency(
         yield
 
 
-def
+def concurrency_status_display() -> dict[str, tuple[int, int]]:
     status: dict[str, tuple[int, int]] = {}
+    names = [c.name for c in _concurrency_semaphores.values()]
     for c in _concurrency_semaphores.values():
-
+        # compute name for status display. some resources (e.g. models) use
+        # a / prefix. if there are no duplicates of a given prefix then shorten
+        # it to be only the prefix (e.g. 'openai' rather than 'openai/gpt-4o')
+        prefix = c.name.split("/")[0]
+        prefix_count = sum([1 for name in names if name.startswith(prefix + "/")])
+        if prefix_count == 1:
+            name = prefix
+        else:
+            name = c.name
+
+        # status display entry
+        status[name] = (c.concurrency - c.semaphore.value, c.concurrency)
+
     return status
 
 
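The name-shortening rule above can be tried in isolation: a `/`-prefixed resource name collapses to its prefix only when no other resource shares that prefix. A standalone rendition (plain Python, not the inspect_ai internals):

```python
# Standalone copy of the display-name logic from concurrency_status_display.
def display_names(names: list[str]) -> list[str]:
    shortened = []
    for name in names:
        prefix = name.split("/")[0]
        # count resources sharing this '/' prefix
        prefix_count = sum(1 for n in names if n.startswith(prefix + "/"))
        shortened.append(prefix if prefix_count == 1 else name)
    return shortened

print(display_names(["openai/gpt-4o", "subprocesses"]))
# ['openai', 'subprocesses']
print(display_names(["openai/gpt-4o", "openai/gpt-4o-mini"]))
# ['openai/gpt-4o', 'openai/gpt-4o-mini']
```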