PyPI - inspect-ai - Versions diffs - 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl - Mend

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118)

inspect_ai/_cli/eval.py +31 -0
inspect_ai/_eval/eval.py +19 -2
inspect_ai/_eval/evalset.py +4 -1
inspect_ai/_eval/run.py +41 -0
inspect_ai/_eval/task/generate.py +38 -44
inspect_ai/_eval/task/log.py +26 -28
inspect_ai/_eval/task/run.py +13 -20
inspect_ai/_util/local_server.py +368 -0
inspect_ai/_util/working.py +10 -4
inspect_ai/_view/www/dist/assets/index.css +159 -146
inspect_ai/_view/www/dist/assets/index.js +1020 -1061
inspect_ai/_view/www/log-schema.json +4 -3
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/@types/log.d.ts +3 -2
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
inspect_ai/_view/www/src/components/Card.css +0 -1
inspect_ai/_view/www/src/constants.ts +2 -0
inspect_ai/_view/www/src/utils/numeric.ts +17 -0
inspect_ai/agent/_agent.py +3 -3
inspect_ai/agent/_as_solver.py +20 -12
inspect_ai/agent/_as_tool.py +15 -3
inspect_ai/agent/_handoff.py +8 -1
inspect_ai/agent/_run.py +11 -3
inspect_ai/log/__init__.py +4 -0
inspect_ai/log/_file.py +56 -0
inspect_ai/log/_log.py +99 -0
inspect_ai/log/_recorders/__init__.py +2 -0
inspect_ai/log/_recorders/buffer/database.py +12 -11
inspect_ai/log/_recorders/buffer/filestore.py +2 -2
inspect_ai/log/_recorders/buffer/types.py +2 -2
inspect_ai/log/_recorders/eval.py +20 -65
inspect_ai/log/_recorders/file.py +28 -6
inspect_ai/log/_recorders/recorder.py +7 -0
inspect_ai/log/_recorders/types.py +1 -23
inspect_ai/log/_samples.py +0 -8
inspect_ai/log/_transcript.py +7 -1
inspect_ai/log/_util.py +52 -0
inspect_ai/model/__init__.py +5 -1
inspect_ai/model/_call_tools.py +32 -12
inspect_ai/model/_generate_config.py +14 -8
inspect_ai/model/_model.py +21 -48
inspect_ai/model/_model_output.py +25 -0
inspect_ai/model/_openai.py +2 -0
inspect_ai/model/_openai_responses.py +13 -1
inspect_ai/model/_providers/anthropic.py +13 -23
inspect_ai/model/_providers/openai_o1.py +8 -2
inspect_ai/model/_providers/providers.py +18 -4
inspect_ai/model/_providers/sglang.py +241 -0
inspect_ai/model/_providers/vllm.py +207 -400
inspect_ai/solver/__init__.py +7 -2
inspect_ai/solver/_basic_agent.py +3 -10
inspect_ai/solver/_task_state.py +26 -88
inspect_ai/tool/_json_rpc_helpers.py +45 -17
inspect_ai/tool/_mcp/_mcp.py +2 -0
inspect_ai/tool/_mcp/_sandbox.py +8 -2
inspect_ai/tool/_mcp/server.py +3 -1
inspect_ai/tool/_tool_call.py +4 -1
inspect_ai/tool/_tool_support_helpers.py +51 -12
inspect_ai/tool/_tools/_bash_session.py +190 -68
inspect_ai/tool/_tools/_computer/_computer.py +25 -1
inspect_ai/tool/_tools/_text_editor.py +4 -3
inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
inspect_ai/util/__init__.py +12 -0
inspect_ai/util/_limit.py +393 -0
inspect_ai/util/_limited_conversation.py +57 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
inspect_ai/solver/_limit.py +0 -39
inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_computer/test_args.py +0 -151
/inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0

inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx CHANGED Viewed

@@ -1,14 +1,10 @@
 import { EvalPlan, EvalScore, EvalSpec, Params2 } from "../../@types/log";
-import { toTitleCase } from "../../utils/format";
-import { ghCommitUrl } from "../../utils/git";
-import { MetaDataView } from "../content/MetaDataView";
 import { DatasetDetailView } from "./DatasetDetailView";
 import { ScorerDetailView } from "./ScorerDetailView";
 import { SolversDetailView } from "./SolverDetailView";
 import clsx from "clsx";
 import { FC, ReactNode } from "react";
-import { kModelNone } from "../../constants";
 import styles from "./PlanDetailView.module.css";
 interface PlanDetailViewProps {
@@ -26,71 +22,7 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
     return null;
   }
-  // Add configuration
-  const config: Record<string, unknown> = {};
-  Object.entries(evaluation?.config || {}).forEach((entry) => {
-    const key = entry[0];
-    const value = entry[1];
-    config[key] = value;
-  });
   const steps = plan?.steps;
-  const metadata = evaluation?.metadata;
-  const revision = evaluation?.revision;
-  const packages = evaluation?.packages;
-  const model_args = evaluation?.model_args;
-  const task_args = evaluation?.task_args;
-  const generate_config = plan?.config;
-  const taskInformation: Record<string, unknown> = {
-    ["Task ID"]: evaluation?.task_id,
-    ["Run ID"]: evaluation?.run_id,
-  };
-  if (revision) {
-    taskInformation[
-      `${revision.type ? `${toTitleCase(revision.type)} ` : ""}Revision`
-    ] = {
-      _html: (
-        <a href={ghCommitUrl(revision.origin, revision.commit)}>
-          {revision.commit}
-        </a>
-      ),
-    };
-  }
-  if (packages) {
-    const names = Object.keys(packages).map((key) => {
-      return `${key} ${packages[key]}`;
-    });
-    if (names.length === 1) {
-      taskInformation["Inspect"] = names[0];
-    } else {
-      taskInformation["Inspect"] = names;
-    }
-  }
-  if (evaluation.tags) {
-    taskInformation["Tags"] = evaluation.tags.join(", ");
-  }
-  if (evaluation?.model && evaluation.model !== kModelNone) {
-    config["model"] = evaluation.model;
-  }
-  if (evaluation?.model_base_url) {
-    config["model_base_url"] = evaluation.model_base_url;
-  }
-  if (evaluation?.sandbox) {
-    if (Array.isArray(evaluation?.sandbox)) {
-      config["sandbox"] = evaluation.sandbox[0];
-      if (evaluation.sandbox[1]) {
-        config["sandbox_config"] = evaluation.sandbox[1];
-      }
-    } else {
-      config["sandbox"] = evaluation?.sandbox.type;
-      config["sandbox_config"] = evaluation?.sandbox.config;
-    }
-  }
   const taskColumns: {
     title: string;
@@ -148,117 +80,12 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
     }
   }
-  // Compute the column style for the remaining (either 1 or 2 columns wide)
-  const metadataColumns: {
-    title: string;
-    className: string;
-    contents: ReactNode;
-  }[] = [];
-  const cols = colCount(
-    metadataColumns,
-    task_args,
-    model_args,
-    config,
-    metadata,
-  );
-  metadataColumns.push({
-    title: "Task Information",
-    className: cols === 1 ? styles.oneCol : styles.twoCol,
-    contents: (
-      <MetaDataView
-        key={`plan-md-task`}
-        className={"text-size-small"}
-        entries={taskInformation}
-        tableOptions="sm"
-      />
-    ),
-  });
-  if (task_args && Object.keys(task_args).length > 0) {
-    metadataColumns.push({
-      title: "Task Args",
-      className: cols === 1 ? styles.oneCol : styles.twoCol,
-      contents: (
-        <MetaDataView
-          key={`plan-md-task-args`}
-          className={"text-size-small"}
-          entries={task_args as Record<string, unknown>}
-          tableOptions="sm"
-        />
-      ),
-    });
-  }
-  if (model_args && Object.keys(model_args).length > 0) {
-    metadataColumns.push({
-      title: "Model Args",
-      className: cols === 1 ? styles.oneCol : styles.twoCol,
-      contents: (
-        <MetaDataView
-          key={`plan-md-model-args`}
-          className={"text-size-small"}
-          entries={model_args as Record<string, unknown>}
-          tableOptions="sm"
-        />
-      ),
-    });
-  }
-  if (config && Object.keys(config).length > 0) {
-    metadataColumns.push({
-      title: "Configuration",
-      className: cols === 1 ? styles.oneCol : styles.twoCol,
-      contents: (
-        <MetaDataView
-          key={`plan-md-config`}
-          className={"text-size-small"}
-          entries={config}
-          tableOptions="sm"
-        />
-      ),
-    });
-  }
-  if (generate_config && Object.keys(generate_config).length > 0) {
-    const generate_record: Record<string, unknown> = Object.fromEntries(
-      Object.entries(generate_config),
-    );
-    metadataColumns.push({
-      title: "Generate Config",
-      className: cols === 1 ? styles.oneCol : styles.twoCol,
-      contents: (
-        <MetaDataView
-          key={`plan-md-generate-config`}
-          className={"text-size-small"}
-          entries={generate_record}
-          tableOptions="sm"
-        />
-      ),
-    });
-  }
-  if (metadata && Object.keys(metadata).length > 0) {
-    metadataColumns.push({
-      title: "Metadata",
-      className: cols === 1 ? styles.oneCol : styles.twoCol,
-      contents: (
-        <MetaDataView
-          key={`plan-md-metadata`}
-          className={"text-size-small"}
-          entries={metadata}
-          tableOptions="sm"
-        />
-      ),
-    });
-  }
   return (
     <div className={styles.container}>
       <div
         className={styles.grid}
         style={{
-          gridTemplateColumns: `repeat(${taskColumns.length}, auto)`,
+          gridTemplateColumns: `repeat(${taskColumns.length}, fit-content(50%))`,
         }}
       >
         {taskColumns.map((col) => {
@@ -273,34 +100,10 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
           );
         })}
       </div>
-      <div className={clsx(styles.row)}>
-        {metadataColumns.map((col) => {
-          return (
-            <PlanColumn
-              title={col.title}
-              className={col.className}
-              key={`plan-col-${col.title}`}
-            >
-              {col.contents}
-            </PlanColumn>
-          );
-        })}
-      </div>
     </div>
   );
 };
-const colCount = (...other: unknown[]) => {
-  let count = 0;
-  for (const o in other) {
-    if (o && Object.keys(o).length > 0) {
-      count++;
-    }
-  }
-  return count;
-};
 interface PlanColumnProps {
   title: string;
   className: string | string[];

inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx CHANGED Viewed

@@ -1,6 +1,7 @@
 import { Value2 } from "../../../../@types/log";
 import { kScoreTypeNumeric } from "../../../../constants";
 import { formatDecimalNoTrailingZeroes } from "../../../../utils/format";
+import { compareWithNan } from "../../../../utils/numeric";
 import { ScoreDescriptor } from "../types";
 export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
@@ -14,7 +15,7 @@ export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
     max: Math.max(...onlyNumeric),
     compare: (a, b) => {
       if (typeof a.value === "number" && typeof b.value === "number") {
-        return a.value - b.value;
+        return compareWithNan(a.value, b.value);
       } else {
         console.warn("Comparing non-numerics using a numeric score descriptor");
         return 0;

inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx CHANGED Viewed

@@ -6,6 +6,7 @@ import styles from "./ModelUsagePanel.module.css";
 interface ModelUsageProps {
   usage: ModelUsage1;
+  className?: string | string[];
 }
 interface ModelUsageRow {
@@ -19,7 +20,7 @@ interface ModelUsageRow {
 /**
  * Renders the ModelUsagePanel component.
  */
-export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
+export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage, className }) => {
   if (!usage) {
     return null;
   }
@@ -84,7 +85,7 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
   });
   return (
-    <div className={clsx("text-size-small", styles.wrapper)}>
+    <div className={clsx("text-size-small", styles.wrapper, className)}>
       {rows.map((row, idx) => {
         if (row.label === "---") {
           return (

inspect_ai/_view/www/src/app/usage/TokenTable.module.css CHANGED Viewed

@@ -1,6 +1,5 @@
 .table {
   width: 100%;
-  margin-top: 0.7rem;
 }
 .tableTokens {
@@ -15,3 +14,7 @@
 .model {
   padding-right: 1em;
 }
+.cellContents {
+  padding-bottom: 1em;
+}

inspect_ai/_view/www/src/app/usage/TokenTable.tsx CHANGED Viewed

@@ -79,10 +79,10 @@ export const TokenRow: FC<TokenRowProps> = ({ model, usage }) => {
   return (
     <tr>
       <td>
-        <div className={styles.model}>{model}</div>
+        <div className={clsx(styles.model, styles.cellContents)}>{model}</div>
       </td>
       <td>
-        <ModelUsagePanel usage={usage} />
+        <ModelUsagePanel usage={usage} className={clsx(styles.cellContents)} />
       </td>
     </tr>
   );

inspect_ai/_view/www/src/app/usage/UsageCard.module.css CHANGED Viewed

@@ -3,13 +3,18 @@
   padding-bottom: 1em;
   margin-left: 0.5em;
   display: flex;
+  flex-wrap: wrap;
+  gap: 1em;
 }
 .col1 {
-  flex: 1 1 40%;
-  margin-right: 1em;
+  flex: 0 1 auto;
+  min-width: 200px;
+  width: fit-content;
 }
 .col2 {
-  flex: 1 1 60%;
+  flex: 1 1 auto;
+  min-width: 300px;
+  width: fit-content;
 }

inspect_ai/_view/www/src/app/usage/UsageCard.tsx CHANGED Viewed

@@ -1,10 +1,5 @@
-import clsx from "clsx";
 import { EvalStats } from "../../@types/log";
-import { FontSize } from "../../app/appearance/fonts";
-import { ApplicationIcons } from "../../app/appearance/icons";
-import { MetaDataView } from "../../app/content/MetaDataView";
 import { Card, CardBody, CardHeader } from "../../components/Card";
-import { formatDuration } from "../../utils/format";
 import { ModelTokenTable } from "./ModelTokenTable";
 import { FC } from "react";
@@ -24,40 +19,11 @@ export const UsageCard: FC<UsageCardProps> = ({ stats }) => {
     return null;
   }
-  const totalDuration = formatDuration(
-    new Date(stats.started_at),
-    new Date(stats.completed_at),
-  );
-  const usageMetadataStyle = {
-    fontSize: FontSize.smaller,
-  };
   return (
     <Card>
-      <CardHeader icon={ApplicationIcons.usage} label="Usage" />
+      <CardHeader label="Usage" />
       <CardBody id={kUsageCardBodyId}>
         <div className={styles.wrapper}>
-          <div className={styles.col1}>
-            <div
-              className={clsx(
-                "text-size-smaller",
-                "text-style-label",
-                "text-style-secondary",
-              )}
-            >
-              Duration
-            </div>
-            <MetaDataView
-              entries={{
-                ["Start"]: new Date(stats.started_at).toLocaleString(),
-                ["End"]: new Date(stats.completed_at).toLocaleString(),
-                ["Duration"]: totalDuration,
-              }}
-              tableOptions="borderless,sm"
-              style={usageMetadataStyle}
-            />
-          </div>
           <div className={styles.col2}>
             <ModelTokenTable model_usage={stats.model_usage} />
           </div>

inspect_ai/_view/www/src/components/Card.css CHANGED Viewed

@@ -23,7 +23,6 @@
   background-color: var(--bs-light-bg-subtle);
   border: solid 1px var(--bs-light-border-subtle);
   border-radius: var(--bs-border-radius);
-  margin-bottom: 1.5em;
 }
 .card-collaping-header {

inspect_ai/_view/www/src/constants.ts CHANGED Viewed

@@ -5,6 +5,8 @@ export const kModelNone = "none/none";
 export const kLogViewSamplesTabId = "samples";
 export const kLogViewJsonTabId = "json";
 export const kLogViewInfoTabId = "info";
+export const kLogViewModelsTabId = "models";
+export const kLogViewTaskTabId = "task";
 // Sample tab constants
 export const kSampleMessagesTabId = `messages`;

inspect_ai/_view/www/src/utils/numeric.ts ADDED Viewed

@@ -0,0 +1,17 @@
+export function compareWithNan(a: number, b: number): number {
+  const aIsNaN = Number.isNaN(a);
+  const bIsNaN = Number.isNaN(b);
+  if (aIsNaN && bIsNaN) {
+    return 0;
+  }
+  if (aIsNaN) {
+    return 1;
+  }
+  if (bIsNaN) {
+    return -1;
+  }
+  return a - b;
+}

inspect_ai/agent/_agent.py CHANGED Viewed

@@ -27,13 +27,14 @@ from inspect_ai.model._chat_message import (
     ChatMessageAssistant,
 )
 from inspect_ai.model._model_output import ChatCompletionChoice, ModelOutput
+from inspect_ai.util._limited_conversation import ChatMessageList
 class AgentState:
     """Agent state."""
     def __init__(self, *, messages: list[ChatMessage]) -> None:
-        self._messages = messages
+        self._messages: list[ChatMessage] = ChatMessageList(messages)
         self._output: ModelOutput | None = None
     @property
@@ -43,8 +44,7 @@ class AgentState:
     @messages.setter
     def messages(self, messages: list[ChatMessage]) -> None:
-        """Set the conversation history."""
-        self._messages = messages
+        self._messages = ChatMessageList(messages)
     @property
     def output(self) -> ModelOutput:

inspect_ai/agent/_as_solver.py CHANGED Viewed

@@ -2,6 +2,8 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
+from inspect_ai.util._limit import Limit, apply_limits
 if TYPE_CHECKING:
     from inspect_ai.solver._solver import Solver
@@ -14,7 +16,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
 from ._agent import Agent, AgentState
-def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
+def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
     """Convert an agent to a solver.
     Note that agents used as solvers will only receive their first parameter
@@ -23,6 +25,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
     Args:
        agent: Agent to convert.
+       limits: List of limits to apply to the agent. Should a limit
+          be exceeded, the Sample ends and proceeds to scoring.
        **agent_kwargs: Arguments to curry to Agent function (required
           if the agent has parameters without default values).
@@ -52,17 +56,21 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
     @solver(name=agent_name)
     def agent_to_solver() -> Solver:
         async def solve(state: TaskState, generate: Generate) -> TaskState:
-            # run agent
-            agent_state = await agent(
-                AgentState(messages=state.messages), **agent_kwargs
-            )
-            # update messages
-            state.messages = agent_state.messages
-            # update output if its not empty
-            if agent_state.output:
-                state.output = agent_state.output
+            agent_state = AgentState(messages=state.messages)
+            try:
+                # run the agent with limits
+                with apply_limits(limits):
+                    agent_state = await agent(agent_state, **agent_kwargs)
+            # if an exception occurs, we still want to update the TaskState with the
+            # AgentState's messages + output so that it appears in the log and is scored
+            finally:
+                # update messages
+                state.messages = agent_state.messages
+                # update output if its not empty
+                if agent_state.output:
+                    state.output = agent_state.output
             return state

inspect_ai/agent/_as_tool.py CHANGED Viewed

@@ -10,12 +10,18 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
 from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
 from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
 from inspect_ai.tool._tool_params import ToolParam
+from inspect_ai.util._limit import Limit, apply_limits
 from ._agent import AGENT_DESCRIPTION, Agent, AgentState
 @tool
-def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -> Tool:
+def as_tool(
+    agent: Agent,
+    description: str | None = None,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
+) -> Tool:
     """Convert an agent to a tool.
     By default the model will see all of the agent's arguments as
@@ -27,6 +33,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
     Args:
        agent: Agent to convert.
        description: Tool description (defaults to agent description)
+       limits: List of limits to apply to the agent. Should a limit
+          be exceeded, the tool call ends and returns an error
+          explaining that a limit was exceeded.
        **agent_kwargs: Arguments to curry to Agent function (arguments
           provided here will not be presented to the model as part
           of the tool interface).
@@ -41,9 +50,12 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         )
     async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
-        # prepare state and call agent
+        # prepare state
         state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
-        state = await agent(state, *args, **(agent_kwargs | kwargs))
+        # run the agent with limits
+        with apply_limits(limits):
+            state = await agent(state, *args, **(agent_kwargs | kwargs))
         # find assistant message to read content from (prefer output)
         if not state.output.empty:

inspect_ai/agent/_handoff.py CHANGED Viewed

@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
 from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
+from inspect_ai.util._limit import Limit
 from ._agent import Agent
 from ._as_tool import agent_tool_info
@@ -21,6 +22,7 @@ def handoff(
     input_filter: MessageFilter | None = None,
     output_filter: MessageFilter | None = None,
     tool_name: str | None = None,
+    limits: list[Limit] = [],
     **agent_kwargs: Any,
 ) -> Tool:
     """Create a tool that enables models to handoff to agents.
@@ -35,6 +37,9 @@ def handoff(
             Use the built-in `last_message` filter to return only the last message
             or alternatively specify a custom `MessageFilter` function.
         tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
+        limits: List of limits to apply to the agent. Should a limit be exceeded,
+            the agent stops and a user message is appended explaining that a limit was
+            exceeded.
         **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
             will not be presented to the model as part of the tool interface).
@@ -52,7 +57,7 @@ def handoff(
     tool_info = agent_tool_info(agent, description, **agent_kwargs)
     # AgentTool calls will be intercepted by execute_tools
-    agent_tool = AgentTool(agent, input_filter, output_filter, **agent_kwargs)
+    agent_tool = AgentTool(agent, input_filter, output_filter, limits, **agent_kwargs)
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
     set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
     set_tool_description(
@@ -72,11 +77,13 @@ class AgentTool(Tool):
         agent: Agent,
         input_filter: MessageFilter | None = None,
         output_filter: MessageFilter | None = None,
+        limits: list[Limit] = [],
         **kwargs: Any,
     ):
         self.agent = agent
         self.input_filter = input_filter
         self.output_filter = output_filter
+        self.limits = limits
         self.kwargs = kwargs
     @property

inspect_ai/agent/_run.py CHANGED Viewed

@@ -2,12 +2,16 @@ from copy import copy
 from typing import Any
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.util._limit import Limit, apply_limits
 from ._agent import Agent, AgentState
 async def run(
-    agent: Agent, input: str | list[ChatMessage] | AgentState, **agent_kwargs: Any
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
 ) -> AgentState:
     """Run an agent.
@@ -17,6 +21,9 @@ async def run(
     Args:
         agent: Agent to run.
         input: Agent input (string, list of messages, or an `AgentState`).
+        limits: List of limits to apply to the agent. Should a limit be
+            exceeded, a LimitExceededError is raised which the caller may
+            handle as appropriate.
         **agent_kwargs: Additional arguments to pass to agent.
     Returns:
@@ -43,5 +50,6 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)
-    # run the agent
-    return await agent(state, **agent_kwargs)
+    # run the agent with limits
+    with apply_limits(limits):
+        return await agent(state, **agent_kwargs)

inspect_ai/log/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ from ._file import (
     read_eval_log,
     read_eval_log_async,
     read_eval_log_sample,
+    read_eval_log_sample_summaries,
     read_eval_log_samples,
     write_eval_log,
     write_eval_log_async,
@@ -28,6 +29,7 @@ from ._log import (
     EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
+    EvalSampleSummary,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -70,6 +72,7 @@ __all__ = [
     "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
+    "EvalSampleSummary",
     "EvalScore",
     "EvalSpec",
     "EvalStats",
@@ -100,6 +103,7 @@ __all__ = [
     "read_eval_log_async",
     "read_eval_log_sample",
     "read_eval_log_samples",
+    "read_eval_log_sample_summaries",
     "condense_sample",
     "resolve_sample_attachments",
     "write_eval_log",

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl