inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. inspect_ai/_cli/eval.py +31 -0
  2. inspect_ai/_eval/eval.py +19 -2
  3. inspect_ai/_eval/evalset.py +4 -1
  4. inspect_ai/_eval/run.py +41 -0
  5. inspect_ai/_eval/task/generate.py +38 -44
  6. inspect_ai/_eval/task/log.py +26 -28
  7. inspect_ai/_eval/task/run.py +13 -20
  8. inspect_ai/_util/local_server.py +368 -0
  9. inspect_ai/_util/working.py +10 -4
  10. inspect_ai/_view/www/dist/assets/index.css +159 -146
  11. inspect_ai/_view/www/dist/assets/index.js +1020 -1061
  12. inspect_ai/_view/www/log-schema.json +4 -3
  13. inspect_ai/_view/www/package.json +1 -1
  14. inspect_ai/_view/www/src/@types/log.d.ts +3 -2
  15. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  16. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  17. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  18. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  19. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  20. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  21. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  22. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  23. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  24. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  25. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  26. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  27. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  28. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  29. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  30. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  31. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  32. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  33. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  34. inspect_ai/_view/www/src/components/Card.css +0 -1
  35. inspect_ai/_view/www/src/constants.ts +2 -0
  36. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  37. inspect_ai/agent/_agent.py +3 -3
  38. inspect_ai/agent/_as_solver.py +20 -12
  39. inspect_ai/agent/_as_tool.py +15 -3
  40. inspect_ai/agent/_handoff.py +8 -1
  41. inspect_ai/agent/_run.py +11 -3
  42. inspect_ai/log/__init__.py +4 -0
  43. inspect_ai/log/_file.py +56 -0
  44. inspect_ai/log/_log.py +99 -0
  45. inspect_ai/log/_recorders/__init__.py +2 -0
  46. inspect_ai/log/_recorders/buffer/database.py +12 -11
  47. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  48. inspect_ai/log/_recorders/buffer/types.py +2 -2
  49. inspect_ai/log/_recorders/eval.py +20 -65
  50. inspect_ai/log/_recorders/file.py +28 -6
  51. inspect_ai/log/_recorders/recorder.py +7 -0
  52. inspect_ai/log/_recorders/types.py +1 -23
  53. inspect_ai/log/_samples.py +0 -8
  54. inspect_ai/log/_transcript.py +7 -1
  55. inspect_ai/log/_util.py +52 -0
  56. inspect_ai/model/__init__.py +5 -1
  57. inspect_ai/model/_call_tools.py +32 -12
  58. inspect_ai/model/_generate_config.py +14 -8
  59. inspect_ai/model/_model.py +21 -48
  60. inspect_ai/model/_model_output.py +25 -0
  61. inspect_ai/model/_openai.py +2 -0
  62. inspect_ai/model/_openai_responses.py +13 -1
  63. inspect_ai/model/_providers/anthropic.py +13 -23
  64. inspect_ai/model/_providers/openai_o1.py +8 -2
  65. inspect_ai/model/_providers/providers.py +18 -4
  66. inspect_ai/model/_providers/sglang.py +241 -0
  67. inspect_ai/model/_providers/vllm.py +207 -400
  68. inspect_ai/solver/__init__.py +7 -2
  69. inspect_ai/solver/_basic_agent.py +3 -10
  70. inspect_ai/solver/_task_state.py +26 -88
  71. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  72. inspect_ai/tool/_mcp/_mcp.py +2 -0
  73. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  74. inspect_ai/tool/_mcp/server.py +3 -1
  75. inspect_ai/tool/_tool_call.py +4 -1
  76. inspect_ai/tool/_tool_support_helpers.py +51 -12
  77. inspect_ai/tool/_tools/_bash_session.py +190 -68
  78. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  79. inspect_ai/tool/_tools/_text_editor.py +4 -3
  80. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  81. inspect_ai/util/__init__.py +12 -0
  82. inspect_ai/util/_limit.py +393 -0
  83. inspect_ai/util/_limited_conversation.py +57 -0
  84. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
  85. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
  86. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
  87. inspect_ai/solver/_limit.py +0 -39
  88. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  89. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  90. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  91. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  92. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  93. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  94. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  95. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  96. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  97. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  98. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  99. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  100. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  101. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  102. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  103. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  104. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  105. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  106. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  107. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  108. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  109. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  110. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  111. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  112. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  113. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  114. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  115. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  116. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
  117. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
  118. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,10 @@
1
1
  import { EvalPlan, EvalScore, EvalSpec, Params2 } from "../../@types/log";
2
- import { toTitleCase } from "../../utils/format";
3
- import { ghCommitUrl } from "../../utils/git";
4
- import { MetaDataView } from "../content/MetaDataView";
5
2
  import { DatasetDetailView } from "./DatasetDetailView";
6
3
  import { ScorerDetailView } from "./ScorerDetailView";
7
4
  import { SolversDetailView } from "./SolverDetailView";
8
5
 
9
6
  import clsx from "clsx";
10
7
  import { FC, ReactNode } from "react";
11
- import { kModelNone } from "../../constants";
12
8
  import styles from "./PlanDetailView.module.css";
13
9
 
14
10
  interface PlanDetailViewProps {
@@ -26,71 +22,7 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
26
22
  return null;
27
23
  }
28
24
 
29
- // Add configuration
30
- const config: Record<string, unknown> = {};
31
- Object.entries(evaluation?.config || {}).forEach((entry) => {
32
- const key = entry[0];
33
- const value = entry[1];
34
- config[key] = value;
35
- });
36
-
37
25
  const steps = plan?.steps;
38
- const metadata = evaluation?.metadata;
39
- const revision = evaluation?.revision;
40
- const packages = evaluation?.packages;
41
- const model_args = evaluation?.model_args;
42
- const task_args = evaluation?.task_args;
43
- const generate_config = plan?.config;
44
-
45
- const taskInformation: Record<string, unknown> = {
46
- ["Task ID"]: evaluation?.task_id,
47
- ["Run ID"]: evaluation?.run_id,
48
- };
49
- if (revision) {
50
- taskInformation[
51
- `${revision.type ? `${toTitleCase(revision.type)} ` : ""}Revision`
52
- ] = {
53
- _html: (
54
- <a href={ghCommitUrl(revision.origin, revision.commit)}>
55
- {revision.commit}
56
- </a>
57
- ),
58
- };
59
- }
60
- if (packages) {
61
- const names = Object.keys(packages).map((key) => {
62
- return `${key} ${packages[key]}`;
63
- });
64
-
65
- if (names.length === 1) {
66
- taskInformation["Inspect"] = names[0];
67
- } else {
68
- taskInformation["Inspect"] = names;
69
- }
70
- }
71
- if (evaluation.tags) {
72
- taskInformation["Tags"] = evaluation.tags.join(", ");
73
- }
74
-
75
- if (evaluation?.model && evaluation.model !== kModelNone) {
76
- config["model"] = evaluation.model;
77
- }
78
-
79
- if (evaluation?.model_base_url) {
80
- config["model_base_url"] = evaluation.model_base_url;
81
- }
82
-
83
- if (evaluation?.sandbox) {
84
- if (Array.isArray(evaluation?.sandbox)) {
85
- config["sandbox"] = evaluation.sandbox[0];
86
- if (evaluation.sandbox[1]) {
87
- config["sandbox_config"] = evaluation.sandbox[1];
88
- }
89
- } else {
90
- config["sandbox"] = evaluation?.sandbox.type;
91
- config["sandbox_config"] = evaluation?.sandbox.config;
92
- }
93
- }
94
26
 
95
27
  const taskColumns: {
96
28
  title: string;
@@ -148,117 +80,12 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
148
80
  }
149
81
  }
150
82
 
151
- // Compute the column style for the remaining (either 1 or 2 columns wide)
152
- const metadataColumns: {
153
- title: string;
154
- className: string;
155
- contents: ReactNode;
156
- }[] = [];
157
- const cols = colCount(
158
- metadataColumns,
159
- task_args,
160
- model_args,
161
- config,
162
- metadata,
163
- );
164
-
165
- metadataColumns.push({
166
- title: "Task Information",
167
- className: cols === 1 ? styles.oneCol : styles.twoCol,
168
- contents: (
169
- <MetaDataView
170
- key={`plan-md-task`}
171
- className={"text-size-small"}
172
- entries={taskInformation}
173
- tableOptions="sm"
174
- />
175
- ),
176
- });
177
-
178
- if (task_args && Object.keys(task_args).length > 0) {
179
- metadataColumns.push({
180
- title: "Task Args",
181
- className: cols === 1 ? styles.oneCol : styles.twoCol,
182
- contents: (
183
- <MetaDataView
184
- key={`plan-md-task-args`}
185
- className={"text-size-small"}
186
- entries={task_args as Record<string, unknown>}
187
- tableOptions="sm"
188
- />
189
- ),
190
- });
191
- }
192
- if (model_args && Object.keys(model_args).length > 0) {
193
- metadataColumns.push({
194
- title: "Model Args",
195
- className: cols === 1 ? styles.oneCol : styles.twoCol,
196
- contents: (
197
- <MetaDataView
198
- key={`plan-md-model-args`}
199
- className={"text-size-small"}
200
- entries={model_args as Record<string, unknown>}
201
- tableOptions="sm"
202
- />
203
- ),
204
- });
205
- }
206
-
207
- if (config && Object.keys(config).length > 0) {
208
- metadataColumns.push({
209
- title: "Configuration",
210
- className: cols === 1 ? styles.oneCol : styles.twoCol,
211
- contents: (
212
- <MetaDataView
213
- key={`plan-md-config`}
214
- className={"text-size-small"}
215
- entries={config}
216
- tableOptions="sm"
217
- />
218
- ),
219
- });
220
- }
221
-
222
- if (generate_config && Object.keys(generate_config).length > 0) {
223
- const generate_record: Record<string, unknown> = Object.fromEntries(
224
- Object.entries(generate_config),
225
- );
226
-
227
- metadataColumns.push({
228
- title: "Generate Config",
229
- className: cols === 1 ? styles.oneCol : styles.twoCol,
230
- contents: (
231
- <MetaDataView
232
- key={`plan-md-generate-config`}
233
- className={"text-size-small"}
234
- entries={generate_record}
235
- tableOptions="sm"
236
- />
237
- ),
238
- });
239
- }
240
-
241
- if (metadata && Object.keys(metadata).length > 0) {
242
- metadataColumns.push({
243
- title: "Metadata",
244
- className: cols === 1 ? styles.oneCol : styles.twoCol,
245
- contents: (
246
- <MetaDataView
247
- key={`plan-md-metadata`}
248
- className={"text-size-small"}
249
- entries={metadata}
250
- tableOptions="sm"
251
- />
252
- ),
253
- });
254
- }
255
-
256
83
  return (
257
84
  <div className={styles.container}>
258
85
  <div
259
86
  className={styles.grid}
260
87
  style={{
261
- gridTemplateColumns: `repeat(${taskColumns.length}, auto)`,
88
+ gridTemplateColumns: `repeat(${taskColumns.length}, fit-content(50%))`,
262
89
  }}
263
90
  >
264
91
  {taskColumns.map((col) => {
@@ -273,34 +100,10 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
273
100
  );
274
101
  })}
275
102
  </div>
276
-
277
- <div className={clsx(styles.row)}>
278
- {metadataColumns.map((col) => {
279
- return (
280
- <PlanColumn
281
- title={col.title}
282
- className={col.className}
283
- key={`plan-col-${col.title}`}
284
- >
285
- {col.contents}
286
- </PlanColumn>
287
- );
288
- })}
289
- </div>
290
103
  </div>
291
104
  );
292
105
  };
293
106
 
294
- const colCount = (...other: unknown[]) => {
295
- let count = 0;
296
- for (const o in other) {
297
- if (o && Object.keys(o).length > 0) {
298
- count++;
299
- }
300
- }
301
- return count;
302
- };
303
-
304
107
  interface PlanColumnProps {
305
108
  title: string;
306
109
  className: string | string[];
@@ -1,6 +1,7 @@
1
1
  import { Value2 } from "../../../../@types/log";
2
2
  import { kScoreTypeNumeric } from "../../../../constants";
3
3
  import { formatDecimalNoTrailingZeroes } from "../../../../utils/format";
4
+ import { compareWithNan } from "../../../../utils/numeric";
4
5
  import { ScoreDescriptor } from "../types";
5
6
 
6
7
  export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
@@ -14,7 +15,7 @@ export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
14
15
  max: Math.max(...onlyNumeric),
15
16
  compare: (a, b) => {
16
17
  if (typeof a.value === "number" && typeof b.value === "number") {
17
- return a.value - b.value;
18
+ return compareWithNan(a.value, b.value);
18
19
  } else {
19
20
  console.warn("Comparing non-numerics using a numeric score descriptor");
20
21
  return 0;
@@ -6,6 +6,7 @@ import styles from "./ModelUsagePanel.module.css";
6
6
 
7
7
  interface ModelUsageProps {
8
8
  usage: ModelUsage1;
9
+ className?: string | string[];
9
10
  }
10
11
 
11
12
  interface ModelUsageRow {
@@ -19,7 +20,7 @@ interface ModelUsageRow {
19
20
  /**
20
21
  * Renders the ModelUsagePanel component.
21
22
  */
22
- export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
23
+ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage, className }) => {
23
24
  if (!usage) {
24
25
  return null;
25
26
  }
@@ -84,7 +85,7 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
84
85
  });
85
86
 
86
87
  return (
87
- <div className={clsx("text-size-small", styles.wrapper)}>
88
+ <div className={clsx("text-size-small", styles.wrapper, className)}>
88
89
  {rows.map((row, idx) => {
89
90
  if (row.label === "---") {
90
91
  return (
@@ -1,6 +1,5 @@
1
1
  .table {
2
2
  width: 100%;
3
- margin-top: 0.7rem;
4
3
  }
5
4
 
6
5
  .tableTokens {
@@ -15,3 +14,7 @@
15
14
  .model {
16
15
  padding-right: 1em;
17
16
  }
17
+
18
+ .cellContents {
19
+ padding-bottom: 1em;
20
+ }
@@ -79,10 +79,10 @@ export const TokenRow: FC<TokenRowProps> = ({ model, usage }) => {
79
79
  return (
80
80
  <tr>
81
81
  <td>
82
- <div className={styles.model}>{model}</div>
82
+ <div className={clsx(styles.model, styles.cellContents)}>{model}</div>
83
83
  </td>
84
84
  <td>
85
- <ModelUsagePanel usage={usage} />
85
+ <ModelUsagePanel usage={usage} className={clsx(styles.cellContents)} />
86
86
  </td>
87
87
  </tr>
88
88
  );
@@ -3,13 +3,18 @@
3
3
  padding-bottom: 1em;
4
4
  margin-left: 0.5em;
5
5
  display: flex;
6
+ flex-wrap: wrap;
7
+ gap: 1em;
6
8
  }
7
9
 
8
10
  .col1 {
9
- flex: 1 1 40%;
10
- margin-right: 1em;
11
+ flex: 0 1 auto;
12
+ min-width: 200px;
13
+ width: fit-content;
11
14
  }
12
15
 
13
16
  .col2 {
14
- flex: 1 1 60%;
17
+ flex: 1 1 auto;
18
+ min-width: 300px;
19
+ width: fit-content;
15
20
  }
@@ -1,10 +1,5 @@
1
- import clsx from "clsx";
2
1
  import { EvalStats } from "../../@types/log";
3
- import { FontSize } from "../../app/appearance/fonts";
4
- import { ApplicationIcons } from "../../app/appearance/icons";
5
- import { MetaDataView } from "../../app/content/MetaDataView";
6
2
  import { Card, CardBody, CardHeader } from "../../components/Card";
7
- import { formatDuration } from "../../utils/format";
8
3
  import { ModelTokenTable } from "./ModelTokenTable";
9
4
 
10
5
  import { FC } from "react";
@@ -24,40 +19,11 @@ export const UsageCard: FC<UsageCardProps> = ({ stats }) => {
24
19
  return null;
25
20
  }
26
21
 
27
- const totalDuration = formatDuration(
28
- new Date(stats.started_at),
29
- new Date(stats.completed_at),
30
- );
31
- const usageMetadataStyle = {
32
- fontSize: FontSize.smaller,
33
- };
34
-
35
22
  return (
36
23
  <Card>
37
- <CardHeader icon={ApplicationIcons.usage} label="Usage" />
24
+ <CardHeader label="Usage" />
38
25
  <CardBody id={kUsageCardBodyId}>
39
26
  <div className={styles.wrapper}>
40
- <div className={styles.col1}>
41
- <div
42
- className={clsx(
43
- "text-size-smaller",
44
- "text-style-label",
45
- "text-style-secondary",
46
- )}
47
- >
48
- Duration
49
- </div>
50
- <MetaDataView
51
- entries={{
52
- ["Start"]: new Date(stats.started_at).toLocaleString(),
53
- ["End"]: new Date(stats.completed_at).toLocaleString(),
54
- ["Duration"]: totalDuration,
55
- }}
56
- tableOptions="borderless,sm"
57
- style={usageMetadataStyle}
58
- />
59
- </div>
60
-
61
27
  <div className={styles.col2}>
62
28
  <ModelTokenTable model_usage={stats.model_usage} />
63
29
  </div>
@@ -23,7 +23,6 @@
23
23
  background-color: var(--bs-light-bg-subtle);
24
24
  border: solid 1px var(--bs-light-border-subtle);
25
25
  border-radius: var(--bs-border-radius);
26
- margin-bottom: 1.5em;
27
26
  }
28
27
 
29
28
  .card-collaping-header {
@@ -5,6 +5,8 @@ export const kModelNone = "none/none";
5
5
  export const kLogViewSamplesTabId = "samples";
6
6
  export const kLogViewJsonTabId = "json";
7
7
  export const kLogViewInfoTabId = "info";
8
+ export const kLogViewModelsTabId = "models";
9
+ export const kLogViewTaskTabId = "task";
8
10
 
9
11
  // Sample tab constants
10
12
  export const kSampleMessagesTabId = `messages`;
@@ -0,0 +1,17 @@
1
+ export function compareWithNan(a: number, b: number): number {
2
+ const aIsNaN = Number.isNaN(a);
3
+ const bIsNaN = Number.isNaN(b);
4
+
5
+ if (aIsNaN && bIsNaN) {
6
+ return 0;
7
+ }
8
+
9
+ if (aIsNaN) {
10
+ return 1;
11
+ }
12
+ if (bIsNaN) {
13
+ return -1;
14
+ }
15
+
16
+ return a - b;
17
+ }
@@ -27,13 +27,14 @@ from inspect_ai.model._chat_message import (
27
27
  ChatMessageAssistant,
28
28
  )
29
29
  from inspect_ai.model._model_output import ChatCompletionChoice, ModelOutput
30
+ from inspect_ai.util._limited_conversation import ChatMessageList
30
31
 
31
32
 
32
33
  class AgentState:
33
34
  """Agent state."""
34
35
 
35
36
  def __init__(self, *, messages: list[ChatMessage]) -> None:
36
- self._messages = messages
37
+ self._messages: list[ChatMessage] = ChatMessageList(messages)
37
38
  self._output: ModelOutput | None = None
38
39
 
39
40
  @property
@@ -43,8 +44,7 @@ class AgentState:
43
44
 
44
45
  @messages.setter
45
46
  def messages(self, messages: list[ChatMessage]) -> None:
46
- """Set the conversation history."""
47
- self._messages = messages
47
+ self._messages = ChatMessageList(messages)
48
48
 
49
49
  @property
50
50
  def output(self) -> ModelOutput:
@@ -2,6 +2,8 @@ from __future__ import annotations
2
2
 
3
3
  from typing import TYPE_CHECKING, Any
4
4
 
5
+ from inspect_ai.util._limit import Limit, apply_limits
6
+
5
7
  if TYPE_CHECKING:
6
8
  from inspect_ai.solver._solver import Solver
7
9
 
@@ -14,7 +16,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
14
16
  from ._agent import Agent, AgentState
15
17
 
16
18
 
17
- def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
19
+ def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
18
20
  """Convert an agent to a solver.
19
21
 
20
22
  Note that agents used as solvers will only receive their first parameter
@@ -23,6 +25,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
23
25
 
24
26
  Args:
25
27
  agent: Agent to convert.
28
+ limits: List of limits to apply to the agent. Should a limit
29
+ be exceeded, the Sample ends and proceeds to scoring.
26
30
  **agent_kwargs: Arguments to curry to Agent function (required
27
31
  if the agent has parameters without default values).
28
32
 
@@ -52,17 +56,21 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
52
56
  @solver(name=agent_name)
53
57
  def agent_to_solver() -> Solver:
54
58
  async def solve(state: TaskState, generate: Generate) -> TaskState:
55
- # run agent
56
- agent_state = await agent(
57
- AgentState(messages=state.messages), **agent_kwargs
58
- )
59
-
60
- # update messages
61
- state.messages = agent_state.messages
62
-
63
- # update output if its not empty
64
- if agent_state.output:
65
- state.output = agent_state.output
59
+ agent_state = AgentState(messages=state.messages)
60
+
61
+ try:
62
+ # run the agent with limits
63
+ with apply_limits(limits):
64
+ agent_state = await agent(agent_state, **agent_kwargs)
65
+ # if an exception occurs, we still want to update the TaskState with the
66
+ # AgentState's messages + output so that it appears in the log and is scored
67
+ finally:
68
+ # update messages
69
+ state.messages = agent_state.messages
70
+
71
+ # update output if its not empty
72
+ if agent_state.output:
73
+ state.output = agent_state.output
66
74
 
67
75
  return state
68
76
 
@@ -10,12 +10,18 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
10
10
  from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
11
11
  from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
12
12
  from inspect_ai.tool._tool_params import ToolParam
13
+ from inspect_ai.util._limit import Limit, apply_limits
13
14
 
14
15
  from ._agent import AGENT_DESCRIPTION, Agent, AgentState
15
16
 
16
17
 
17
18
  @tool
18
- def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -> Tool:
19
+ def as_tool(
20
+ agent: Agent,
21
+ description: str | None = None,
22
+ limits: list[Limit] = [],
23
+ **agent_kwargs: Any,
24
+ ) -> Tool:
19
25
  """Convert an agent to a tool.
20
26
 
21
27
  By default the model will see all of the agent's arguments as
@@ -27,6 +33,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
27
33
  Args:
28
34
  agent: Agent to convert.
29
35
  description: Tool description (defaults to agent description)
36
+ limits: List of limits to apply to the agent. Should a limit
37
+ be exceeded, the tool call ends and returns an error
38
+ explaining that a limit was exceeded.
30
39
  **agent_kwargs: Arguments to curry to Agent function (arguments
31
40
  provided here will not be presented to the model as part
32
41
  of the tool interface).
@@ -41,9 +50,12 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
41
50
  )
42
51
 
43
52
  async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
44
- # prepare state and call agent
53
+ # prepare state
45
54
  state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
46
- state = await agent(state, *args, **(agent_kwargs | kwargs))
55
+
56
+ # run the agent with limits
57
+ with apply_limits(limits):
58
+ state = await agent(state, *args, **(agent_kwargs | kwargs))
47
59
 
48
60
  # find assistant message to read content from (prefer output)
49
61
  if not state.output.empty:
@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
9
9
  from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
10
10
  from inspect_ai.tool._tool_def import ToolDef
11
11
  from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
12
+ from inspect_ai.util._limit import Limit
12
13
 
13
14
  from ._agent import Agent
14
15
  from ._as_tool import agent_tool_info
@@ -21,6 +22,7 @@ def handoff(
21
22
  input_filter: MessageFilter | None = None,
22
23
  output_filter: MessageFilter | None = None,
23
24
  tool_name: str | None = None,
25
+ limits: list[Limit] = [],
24
26
  **agent_kwargs: Any,
25
27
  ) -> Tool:
26
28
  """Create a tool that enables models to handoff to agents.
@@ -35,6 +37,9 @@ def handoff(
35
37
  Use the built-in `last_message` filter to return only the last message
36
38
  or alternatively specify a custom `MessageFilter` function.
37
39
  tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
40
+ limits: List of limits to apply to the agent. Should a limit be exceeded,
41
+ the agent stops and a user message is appended explaining that a limit was
42
+ exceeded.
38
43
  **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
39
44
  will not be presented to the model as part of the tool interface).
40
45
 
@@ -52,7 +57,7 @@ def handoff(
52
57
  tool_info = agent_tool_info(agent, description, **agent_kwargs)
53
58
 
54
59
  # AgentTool calls will be intercepted by execute_tools
55
- agent_tool = AgentTool(agent, input_filter, output_filter, **agent_kwargs)
60
+ agent_tool = AgentTool(agent, input_filter, output_filter, limits, **agent_kwargs)
56
61
  tool_name = tool_name or f"transfer_to_{tool_info.name}"
57
62
  set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
58
63
  set_tool_description(
@@ -72,11 +77,13 @@ class AgentTool(Tool):
72
77
  agent: Agent,
73
78
  input_filter: MessageFilter | None = None,
74
79
  output_filter: MessageFilter | None = None,
80
+ limits: list[Limit] = [],
75
81
  **kwargs: Any,
76
82
  ):
77
83
  self.agent = agent
78
84
  self.input_filter = input_filter
79
85
  self.output_filter = output_filter
86
+ self.limits = limits
80
87
  self.kwargs = kwargs
81
88
 
82
89
  @property
inspect_ai/agent/_run.py CHANGED
@@ -2,12 +2,16 @@ from copy import copy
2
2
  from typing import Any
3
3
 
4
4
  from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
5
+ from inspect_ai.util._limit import Limit, apply_limits
5
6
 
6
7
  from ._agent import Agent, AgentState
7
8
 
8
9
 
9
10
  async def run(
10
- agent: Agent, input: str | list[ChatMessage] | AgentState, **agent_kwargs: Any
11
+ agent: Agent,
12
+ input: str | list[ChatMessage] | AgentState,
13
+ limits: list[Limit] = [],
14
+ **agent_kwargs: Any,
11
15
  ) -> AgentState:
12
16
  """Run an agent.
13
17
 
@@ -17,6 +21,9 @@ async def run(
17
21
  Args:
18
22
  agent: Agent to run.
19
23
  input: Agent input (string, list of messages, or an `AgentState`).
24
+ limits: List of limits to apply to the agent. Should a limit be
25
+ exceeded, a LimitExceededError is raised which the caller may
26
+ handle as appropriate.
20
27
  **agent_kwargs: Additional arguments to pass to agent.
21
28
 
22
29
  Returns:
@@ -43,5 +50,6 @@ async def run(
43
50
  # create state
44
51
  state = AgentState(messages=input_messages)
45
52
 
46
- # run the agent
47
- return await agent(state, **agent_kwargs)
53
+ # run the agent with limits
54
+ with apply_limits(limits):
55
+ return await agent(state, **agent_kwargs)
@@ -9,6 +9,7 @@ from ._file import (
9
9
  read_eval_log,
10
10
  read_eval_log_async,
11
11
  read_eval_log_sample,
12
+ read_eval_log_sample_summaries,
12
13
  read_eval_log_samples,
13
14
  write_eval_log,
14
15
  write_eval_log_async,
@@ -28,6 +29,7 @@ from ._log import (
28
29
  EvalSampleLimit,
29
30
  EvalSampleReductions,
30
31
  EvalSampleScore,
32
+ EvalSampleSummary,
31
33
  EvalScore,
32
34
  EvalSpec,
33
35
  EvalStats,
@@ -70,6 +72,7 @@ __all__ = [
70
72
  "EvalSampleLimit",
71
73
  "EvalSampleScore",
72
74
  "EvalSampleReductions",
75
+ "EvalSampleSummary",
73
76
  "EvalScore",
74
77
  "EvalSpec",
75
78
  "EvalStats",
@@ -100,6 +103,7 @@ __all__ = [
100
103
  "read_eval_log_async",
101
104
  "read_eval_log_sample",
102
105
  "read_eval_log_samples",
106
+ "read_eval_log_sample_summaries",
103
107
  "condense_sample",
104
108
  "resolve_sample_attachments",
105
109
  "write_eval_log",