inspect-ai 0.3.97__py3-none-any.whl → 0.3.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. inspect_ai/__init__.py +2 -0
  2. inspect_ai/_cli/log.py +1 -1
  3. inspect_ai/_display/textual/widgets/transcript.py +15 -3
  4. inspect_ai/_eval/run.py +18 -5
  5. inspect_ai/_eval/task/log.py +1 -1
  6. inspect_ai/_eval/task/task.py +1 -1
  7. inspect_ai/_util/_async.py +1 -1
  8. inspect_ai/_view/schema.py +1 -0
  9. inspect_ai/_view/view.py +14 -0
  10. inspect_ai/_view/www/dist/assets/index.css +10 -10
  11. inspect_ai/_view/www/dist/assets/index.js +10 -10
  12. inspect_ai/_view/www/log-schema.json +45 -5
  13. inspect_ai/_view/www/src/@types/log.d.ts +11 -2
  14. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +2 -1
  15. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +2 -2
  16. inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css +2 -2
  17. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -1
  18. inspect_ai/agent/_run.py +44 -8
  19. inspect_ai/dataset/_dataset.py +0 -1
  20. inspect_ai/log/_bundle.py +5 -3
  21. inspect_ai/log/_log.py +2 -2
  22. inspect_ai/model/_providers/anthropic.py +3 -6
  23. inspect_ai/model/_providers/google.py +6 -0
  24. inspect_ai/model/_providers/providers.py +1 -1
  25. inspect_ai/util/__init__.py +2 -0
  26. inspect_ai/util/_limit.py +160 -137
  27. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/METADATA +2 -2
  28. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/RECORD +32 -32
  29. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/WHEEL +1 -1
  30. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/entry_points.txt +0 -0
  31. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/licenses/LICENSE +0 -0
  32. {inspect_ai-0.3.97.dist-info → inspect_ai-0.3.99.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py CHANGED
@@ -10,6 +10,7 @@ from inspect_ai._eval.score import score, score_async
  from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
  from inspect_ai._eval.task.tasks import Tasks
  from inspect_ai._util.constants import PKG_NAME
+ from inspect_ai._view.view import view
  from inspect_ai.agent._human.agent import human_cli
  from inspect_ai.solver._human_agent import human_agent
 
@@ -32,4 +33,5 @@ __all__ = [
      "TaskInfo",
      "task",
      "task_with",
+     "view",
  ]
inspect_ai/_cli/log.py CHANGED
@@ -199,6 +199,6 @@ def view_resource(file: str) -> str:
 
 
  def view_type_resource(file: str) -> str:
-     resource = PKG_PATH / "_view" / "www" / "src" / "types" / file
+     resource = PKG_PATH / "_view" / "www" / "src" / "@types" / file
      with open(resource, "r", encoding="utf-8") as f:
          return f.read()
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -84,6 +84,7 @@ class TranscriptView(ScrollableContainer):
          scroll_to_end = (
              new_sample or abs(self.scroll_y - self.max_scroll_y) <= 20
          )
+
          async with self.batch():
              await self.remove_children()
              await self.mount_all(
@@ -100,9 +101,13 @@ class TranscriptView(ScrollableContainer):
          else:
              self._pending_sample = sample
 
-     def _widgets_for_events(self, events: Sequence[Event]) -> list[Widget]:
+     def _widgets_for_events(
+         self, events: Sequence[Event], limit: int = 10
+     ) -> list[Widget]:
          widgets: list[Widget] = []
-         for event in events:
+         widget_count = 0
+         # reverse the events so that the N most recent events are displayed
+         for event in events[::-1]:
              display = render_event(event)
              if display:
                  for d in display:
@@ -118,7 +123,14 @@ class TranscriptView(ScrollableContainer):
                          set_transcript_markdown_options(d.content)
                          widgets.append(Static(d.content, markup=False))
                          widgets.append(Static(Text(" ")))
-         return widgets
+                 widget_count += 1
+
+             # only render the N most recent events
+             if widget_count >= limit:
+                 break
+
+         # reverse the list since we added the events in reverse order
+         return widgets[::-1]
 
 
  class EventDisplay(NamedTuple):
inspect_ai/_eval/run.py CHANGED
@@ -298,10 +298,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
      # setup pending tasks, queue, and results
      pending_tasks = tasks.copy()
-     results: list[EvalLog] = []
+     results: list[tuple[int, EvalLog]] = []
      tasks_completed = 0
      total_tasks = len(tasks)
 
+     # Create a mapping from task to its original index
+     task_to_original_index = {id(task): i for i, task in enumerate(tasks)}
+
      # produce/consume tasks
      send_channel, receive_channel = anyio.create_memory_object_stream[TaskRunOptions](
          parallel * 2
@@ -322,7 +325,7 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
          # among those models, pick one with the least usage
          model = min(models_with_pending, key=lambda m: model_counts[m])
 
-         # now we know theres at least one pending task for this model so its safe to pick it
+         # now we know there's at least one pending task for this model so it's safe to pick it
          next_task = next(t for t in pending_tasks if str(t.model) == model)
          pending_tasks.remove(next_task)
          model_counts[str(next_task.model)] += 1
@@ -339,6 +342,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
          nonlocal tasks_completed
          async for task_options in receive_channel:
              result: EvalLog | None = None
+             # Get the original index of this task
+             original_index = task_to_original_index[id(task_options)]
 
              # run the task
              try:
@@ -354,11 +359,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
              # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
              def create_task_runner(
                  options: TaskRunOptions = task_options,
+                 idx: int = original_index,
              ) -> Callable[[], Awaitable[None]]:
                  async def run_task() -> None:
                      nonlocal result
                      result = await task_run(options)
-                     results.append(result)
+                     # Store result with its original index
+                     results.append((idx, result))
 
                  return run_task
 
@@ -426,7 +433,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
      clear_task_screen()
 
-     return results
+     # Sort results by original index and return just the values
+     return [r for _, r in sorted(results)]
 
 
  def resolve_task_sample_ids(
@@ -475,7 +483,12 @@ async def startup_sandbox_environments(
      sandboxenvs: Set[TaskSandboxEnvironment] = set()
      for task in tasks:
          # resolve each sample and add to sandboxenvs
-         dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
+         resolved_task_sample_ids = resolve_task_sample_ids(
+             task.task.name, config.sample_id
+         )
+         dataset = slice_dataset(
+             task.task.dataset, config.limit, resolved_task_sample_ids
+         )
          for sample in dataset:
              sandbox = await resolve_sandbox_for_task_and_sample(
                  eval_sandbox, task.task, sample
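
The ordering change above is an instance of a general pattern: tag each result with its submission index when work completes out of order, then sort before returning. A standalone sketch of that pattern (illustrative only, not the inspect_ai implementation):

    import anyio

    async def gather_in_order(funcs):
        """Run zero-arg async callables concurrently, return results in submission order."""
        # each result is stored as (original_index, value) so completion order doesn't matter
        results: list[tuple[int, object]] = []

        async def runner(idx: int, fn) -> None:
            results.append((idx, await fn()))

        async with anyio.create_task_group() as tg:
            for idx, fn in enumerate(funcs):
                tg.start_soon(runner, idx, fn)

        # restore submission order, then drop the index
        return [value for _, value in sorted(results)]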
inspect_ai/_eval/task/log.py CHANGED
@@ -56,7 +56,7 @@ class TaskLogger:
      def __init__(
          self,
          task_name: str,
-         task_version: int,
+         task_version: int | str,
          task_file: str | None,
          task_registry_name: str | None,
          task_id: str | None,
inspect_ai/_eval/task/task.py CHANGED
@@ -64,7 +64,7 @@ class Task:
          time_limit: int | None = None,
          working_limit: int | None = None,
          name: str | None = None,
-         version: int = 0,
+         version: int | str = 0,
          metadata: dict[str, Any] | None = None,
          **kwargs: Unpack[TaskDeprecatedArgs],
      ) -> None:
inspect_ai/_util/_async.py CHANGED
@@ -136,7 +136,7 @@ def current_async_backend() -> Literal["asyncio", "trio"] | None:
 
 
  def configured_async_backend() -> Literal["asyncio", "trio"]:
-     backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()
+     backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio"
      return _validate_backend(backend)
 
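The trailing or "asyncio" matters because os.environ.get only falls back to its default when the variable is absent, not when it is set to an empty string. A quick standalone illustration (not part of the package):

    import os

    os.environ["INSPECT_ASYNC_BACKEND"] = ""  # variable present but empty
    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()
    print(repr(backend))                # '' -- the default was not applied
    print(repr(backend or "asyncio"))   # 'asyncio' -- the explicit fallback catches it
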
inspect_ai/_view/schema.py CHANGED
@@ -30,6 +30,7 @@ def sync_view_schema() -> None:
          for key in defs.keys():
              defs[key] = schema_to_strict(defs[key])
          f.write(json.dumps(schema, indent=2))
+         f.write("\n")
 
      # generate types w/ json-schema-to-typescript
      subprocess.run(
inspect_ai/_view/view.py CHANGED
@@ -30,6 +30,20 @@ def view(
      log_level: str | None = None,
      fs_options: dict[str, Any] = {},
  ) -> None:
+     """Run the Inspect View server.
+
+     Args:
+         log_dir: Directory to view logs from.
+         recursive: Recursively list files in `log_dir`.
+         host: Tcp/ip host (defaults to "127.0.0.1").
+         port: Tcp/ip port (defaults to 7575).
+         authorization: Validate requests by checking for this authorization header.
+         log_level: Level for logging to the console: "debug", "http", "sandbox",
+             "info", "warning", "error", or "critical" (defaults to "warning").
+         fs_options: Additional arguments to pass through to the filesystem provider
+             (e.g. `S3FileSystem`). Use `{"anon": True }` if you are accessing a
+             public S3 bucket with no credentials.
+     """
      init_dotenv()
      init_logger(log_level)
 
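Since view is now also exported from the top-level package (see the __init__.py change above), the viewer can be launched programmatically. A minimal sketch, assuming logs have been written under a local ./logs directory:

    from inspect_ai import view

    # serve the Inspect View UI for ./logs on the default host/port (127.0.0.1:7575)
    view(log_dir="./logs", recursive=True)
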
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -17166,41 +17166,41 @@ thead th {
    flex-direction: column;
    padding-top: 0.1em;
  }
- ._container_8i3m0_1 {
+ ._container_181fj_1 {
    display: grid;
    grid-template-columns:
-     minmax(auto, 1fr) minmax(auto, 1fr) minmax(auto, 1fr)
-     2fr;
+     minmax(0, max-content) minmax(0, max-content) minmax(0, max-content)
+     5fr;
    column-gap: 0.75em;
  }
 
- ._container_8i3m0_1 ._cell_8i3m0_9 {
+ ._container_181fj_1 ._cell_181fj_9 {
    margin-bottom: 0.5em;
  }
 
- ._fullWidth_8i3m0_13 {
+ ._fullWidth_181fj_13 {
    grid-column: 1 / -1;
  }
 
- ._heading_8i3m0_17 {
+ ._heading_181fj_17 {
    font-weight: 600;
  }
 
- ._padded_8i3m0_21 {
+ ._padded_181fj_21 {
    padding-bottom: 3em;
  }
 
- ._separator_8i3m0_25 {
+ ._separator_181fj_25 {
    height: 1px;
    background-color: var(--bs-light-border-subtle);
  }
 
- ._separatorPadded_8i3m0_30 {
+ ._separatorPadded_181fj_30 {
    margin-top: 0.5em;
    margin-bottom: 0.5em;
  }
 
- ._headerSep_8i3m0_35 {
+ ._headerSep_181fj_35 {
    margin-top: 0.1em;
    margin-bottom: 0.2em;
  }
inspect_ai/_view/www/dist/assets/index.js CHANGED
@@ -39446,7 +39446,7 @@ Please change the parent <Route path="${parentPath}"> to <Route path="${parentPa
        const rendered = entry2.value.trim();
        if (options2.renderString === "markdown") {
          return {
-           rendered
+           rendered: /* @__PURE__ */ jsxRuntimeExports.jsx(MarkdownDiv, { markdown: rendered })
          };
        } else {
          return {
@@ -51898,12 +51898,12 @@ self.onmessage = function (e) {
      );
      return scorerDescriptor == null ? void 0 : scorerDescriptor.render(scoreData.value);
    };
-   const container$6 = "_container_8i3m0_1";
-   const cell$1 = "_cell_8i3m0_9";
-   const fullWidth = "_fullWidth_8i3m0_13";
-   const separator$2 = "_separator_8i3m0_25";
-   const separatorPadded = "_separatorPadded_8i3m0_30";
-   const headerSep = "_headerSep_8i3m0_35";
+   const container$6 = "_container_181fj_1";
+   const cell$1 = "_cell_181fj_9";
+   const fullWidth = "_fullWidth_181fj_13";
+   const separator$2 = "_separator_181fj_25";
+   const separatorPadded = "_separatorPadded_181fj_30";
+   const headerSep = "_headerSep_181fj_35";
    const styles$x = {
      container: container$6,
      cell: cell$1,
@@ -52473,7 +52473,7 @@ self.onmessage = function (e) {
      {
        output: event.error.traceback_ansi,
        style: {
-         fontSize: "clamp(0.5rem, calc(0.25em + 1vw), 0.8rem)",
+         fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
          margin: "0.5em 0"
        }
      }
@@ -61749,7 +61749,7 @@ ${events}
        output: sample2.error.traceback_ansi,
        className: clsx("text-size-small", styles$A.ansi),
        style: {
-         fontSize: "clamp(0.4rem, calc(0.15em + 1vw), 0.8rem)",
+         fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
          margin: "0.5em 0"
        }
      }
@@ -61764,7 +61764,7 @@ ${events}
        output: retry.traceback_ansi,
        className: clsx("text-size-small", styles$A.ansi),
        style: {
-         fontSize: "clamp(0.4rem, calc(0.15em + 1vw), 0.8rem)",
+         fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
          margin: "0.5em 0"
        }
      }
inspect_ai/_view/www/log-schema.json CHANGED
@@ -1136,6 +1136,18 @@
        "default": null,
        "title": "Log Samples"
      },
+     "log_realtime": {
+       "anyOf": [
+         {
+           "type": "boolean"
+         },
+         {
+           "type": "null"
+         }
+       ],
+       "default": null,
+       "title": "Log Realtime"
+     },
      "log_images": {
        "anyOf": [
          {
@@ -1205,6 +1217,7 @@
        "max_sandboxes",
        "sandbox_cleanup",
        "log_samples",
+       "log_realtime",
        "log_images",
        "log_buffer",
        "log_shared",
@@ -1502,7 +1515,8 @@
        "reasoning_tokens": null,
        "reasoning_summary": null,
        "reasoning_history": null,
-       "response_schema": null
+       "response_schema": null,
+       "extra_body": null
      }
    }
  },
@@ -1944,7 +1958,7 @@
      "additionalProperties": false
    },
    "EvalSampleLimit": {
-     "description": "Limit encontered by sample.",
+     "description": "Limit encountered by sample.",
      "properties": {
        "type": {
          "enum": [
@@ -2277,6 +2291,10 @@
    "EvalSpec": {
      "description": "Eval target and configuration.",
      "properties": {
+       "eval_id": {
+         "title": "Eval Id",
+         "type": "string"
+       },
        "run_id": {
          "title": "Run Id",
          "type": "string"
@@ -2294,9 +2312,16 @@
          "type": "string"
        },
        "task_version": {
+         "anyOf": [
+           {
+             "type": "integer"
+           },
+           {
+             "type": "string"
+           }
+         ],
          "default": 0,
-         "title": "Task Version",
-         "type": "integer"
+         "title": "Task Version"
        },
        "task_file": {
          "anyOf": [
@@ -2500,6 +2525,7 @@
        }
      },
      "required": [
+       "eval_id",
        "run_id",
        "created",
        "task",
@@ -2897,6 +2923,19 @@
          }
        ],
        "default": null
+       },
+       "extra_body": {
+         "anyOf": [
+           {
+             "additionalProperties": true,
+             "type": "object"
+           },
+           {
+             "type": "null"
+           }
+         ],
+         "default": null,
+         "title": "Extra Body"
        }
      },
      "title": "GenerateConfig",
@@ -2927,7 +2966,8 @@
      "reasoning_tokens",
      "reasoning_summary",
      "reasoning_history",
-     "response_schema"
+     "response_schema",
+     "extra_body"
    ],
    "additionalProperties": false
  },
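
The schema now records an extra_body field on GenerateConfig, an open object presumably forwarded as additional provider-specific request fields. A hedged sketch of setting it from Python; the service_tier value is purely illustrative:

    from inspect_ai.model import GenerateConfig

    config = GenerateConfig(
        temperature=0.2,
        # arbitrary extra fields carried alongside the generate request
        extra_body={"service_tier": "flex"},
    )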
inspect_ai/_view/www/src/@types/log.d.ts CHANGED
@@ -7,11 +7,12 @@
 
  export type Version = number;
  export type Status = "started" | "success" | "cancelled" | "error";
+ export type EvalId = string;
  export type RunId = string;
  export type Created = string;
  export type Task = string;
  export type TaskId = string;
- export type TaskVersion = number;
+ export type TaskVersion = number | string;
  export type TaskFile = string | null;
  export type TaskRegistryName = string | null;
  export type Solver = string | null;
@@ -68,6 +69,9 @@ export type Anyof = JSONSchema[] | null;
  export type Required = string[] | null;
  export type Description1 = string | null;
  export type Strict = boolean | null;
+ export type ExtraBody = {
+   [k: string]: unknown;
+ } | null;
  export type ModelBaseUrl = string | null;
  export type ModelRoles = {
    [k: string]: EvalModelConfig;
@@ -99,6 +103,7 @@ export type MaxSubprocesses = number | null;
  export type MaxSandboxes = number | null;
  export type SandboxCleanup = boolean | null;
  export type LogSamples = boolean | null;
+ export type LogRealtime = boolean | null;
  export type LogImages = boolean | null;
  export type LogBuffer = number | null;
  export type LogShared = number | null;
@@ -640,6 +645,7 @@ export interface EvalLog {
   * Eval target and configuration.
   */
  export interface EvalSpec {
+   eval_id: EvalId;
    run_id: RunId;
    created: Created;
    task: Task;
@@ -722,6 +728,7 @@ export interface GenerateConfig {
    reasoning_summary: ReasoningSummary;
    reasoning_history: ReasoningHistory;
    response_schema: ResponseSchema | null;
+   extra_body: ExtraBody;
  }
  /**
   * Schema for model response when using Structured Output.
@@ -786,6 +793,7 @@ export interface EvalConfig {
    max_sandboxes: MaxSandboxes;
    sandbox_cleanup: SandboxCleanup;
    log_samples: LogSamples;
+   log_realtime: LogRealtime;
    log_images: LogImages;
    log_buffer: LogBuffer;
    log_shared: LogShared;
@@ -888,6 +896,7 @@ export interface GenerateConfig1 {
    reasoning_summary: ReasoningSummary;
    reasoning_history: ReasoningHistory;
    response_schema: ResponseSchema | null;
+   extra_body: ExtraBody;
  }
  /**
   * Scoring results from evaluation.
@@ -1525,7 +1534,7 @@ export interface Attachments {
    [k: string]: string;
  }
  /**
-  * Limit encontered by sample.
+  * Limit encountered by sample.
   */
  export interface EvalSampleLimit {
    type: Type16;
inspect_ai/_view/www/src/app/content/RenderedContent.tsx CHANGED
@@ -9,6 +9,7 @@ import { MetaDataView } from "./MetaDataView";
  import clsx from "clsx";
  import { FC, Fragment, isValidElement, JSX, ReactNode } from "react";
  import JSONPanel from "../../components/JsonPanel";
+ import { MarkdownDiv } from "../../components/MarkdownDiv";
  import { isJson } from "../../utils/json";
  import styles from "./RenderedContent.module.css";
  import { Buckets, ContentRenderer, RenderOptions } from "./types";
@@ -142,7 +143,7 @@ const contentRenderers: Record<string, ContentRenderer> = {
      const rendered = entry.value.trim();
      if (options.renderString === "markdown") {
        return {
-         rendered: rendered,
+         rendered: <MarkdownDiv markdown={rendered} />,
        };
      } else {
        return {
inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx CHANGED
@@ -275,7 +275,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
          output={sample.error.traceback_ansi}
          className={clsx("text-size-small", styles.ansi)}
          style={{
-           fontSize: "clamp(0.4rem, calc(0.15em + 1vw), 0.8rem)",
+           fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
            margin: "0.5em 0",
          }}
        />
@@ -291,7 +291,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
          output={retry.traceback_ansi}
          className={clsx("text-size-small", styles.ansi)}
          style={{
-           fontSize: "clamp(0.4rem, calc(0.15em + 1vw), 0.8rem)",
+           fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
            margin: "0.5em 0",
          }}
        />
inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css CHANGED
@@ -1,8 +1,8 @@
  .container {
    display: grid;
    grid-template-columns:
-     minmax(auto, 1fr) minmax(auto, 1fr) minmax(auto, 1fr)
-     2fr;
+     minmax(0, max-content) minmax(0, max-content) minmax(0, max-content)
+     5fr;
    column-gap: 0.75em;
  }
 
inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx CHANGED
@@ -32,7 +32,7 @@ export const ErrorEventView: FC<ErrorEventViewProps> = ({
      <ANSIDisplay
        output={event.error.traceback_ansi}
        style={{
-         fontSize: "clamp(0.5rem, calc(0.25em + 1vw), 0.8rem)",
+         fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
          margin: "0.5em 0",
        }}
      />
inspect_ai/agent/_run.py CHANGED
@@ -1,20 +1,43 @@
  from copy import copy
- from typing import Any
+ from typing import Any, overload
 
  from inspect_ai._util.registry import registry_unqualified_name
  from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
- from inspect_ai.util._limit import Limit, apply_limits
+ from inspect_ai.util._limit import Limit, LimitExceededError, apply_limits
  from inspect_ai.util._span import span
 
  from ._agent import Agent, AgentState
 
 
+ @overload
  async def run(
      agent: Agent,
      input: str | list[ChatMessage] | AgentState,
      limits: list[Limit] = [],
+     *,
+     name: str | None = None,
      **agent_kwargs: Any,
- ) -> AgentState:
+ ) -> tuple[AgentState, LimitExceededError | None]: ...
+
+
+ @overload
+ async def run(
+     agent: Agent,
+     input: str | list[ChatMessage] | AgentState,
+     *,
+     name: str | None = None,
+     **agent_kwargs: Any,
+ ) -> AgentState: ...
+
+
+ async def run(
+     agent: Agent,
+     input: str | list[ChatMessage] | AgentState,
+     limits: list[Limit] = [],
+     *,
+     name: str | None = None,
+     **agent_kwargs: Any,
+ ) -> AgentState | tuple[AgentState, LimitExceededError | None]:
      """Run an agent.
 
      The input messages(s) will be copied prior to running so are
@@ -26,10 +49,16 @@ async def run(
          limits: List of limits to apply to the agent. Should a limit be
              exceeded, a LimitExceededError is raised which the caller may
              handle as appropriate.
+         name: Optional display name for the transcript entry. If not provided, the
+             agent's name as defined in the registry will be used.
          **agent_kwargs: Additional arguments to pass to agent.
 
      Returns:
-         AgentState: Messages and generated output.
+         AgentState: Messages and generated output. This is all that is returned if no
+             limits are supplied.
+         LimitExceededError | None: If a non-empty limits list is supplied, a tuple is
+             returned. If a limit was exceeded, the second value in the tuple is the
+             exception instance. If no limit was exceeded, the second element is None.
      """
      # copy input so we don't mutate it in place
      input = copy(input)
@@ -52,9 +81,16 @@ async def run(
      # create state
      state = AgentState(messages=input_messages)
 
-     # run the agent with limits
-     with apply_limits(limits):
+     # run the agent with limits, catching errors which are a direct result of our limits
+     with apply_limits(limits, catch_errors=True) as limit_scope:
          # run the agent
-         agent_name = registry_unqualified_name(agent)
+         agent_name = name or registry_unqualified_name(agent)
          async with span(name=agent_name, type="agent"):
-             return await agent(state, **agent_kwargs)
+             state = await agent(state, **agent_kwargs)
+         if limits:
+             return state, None
+         else:
+             return state
+
+     # execution reaches this point iff one of "our" limits was exceeded
+     return state, limit_scope.limit_error
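
With the new overloads, run returns a (state, error) tuple whenever a non-empty limits list is passed, instead of raising for limits it applied itself. A minimal usage sketch; my_agent and the 50-message cap are placeholders:

    from inspect_ai.agent import run
    from inspect_ai.util import message_limit

    async def run_capped(my_agent, prompt: str):
        # non-empty limits => (state, error) is returned rather than an exception raised
        state, limit_error = await run(my_agent, prompt, limits=[message_limit(50)])
        if limit_error is not None:
            print(f"agent stopped early: {limit_error}")
        return state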
inspect_ai/dataset/_dataset.py CHANGED
@@ -51,7 +51,6 @@ class Sample(BaseModel):
              or narrative text to be used by a model grader.
          id: Optional. Unique identifier for sample.
          metadata: Optional. Arbitrary metadata associated with the sample.
-         sandbox (SandboxEnvironmentType | None): Sandbox environment type (or optionally a str or tuple with a shorthand spec)
          sandbox: Optional. Sandbox specification for this sample.
          files: Optional. Files that go along with the sample (copied to
              SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
inspect_ai/log/_bundle.py CHANGED
@@ -146,7 +146,7 @@ def copy_log_files(
      log_fs = filesystem(log_dir, fs_options)
      if log_fs.exists(log_dir):
          eval_logs = log_files_from_ls(
-             log_fs.ls(log_dir, recursive=True), ["json", "eval"], True
+             log_fs.ls(log_dir, recursive=True), ["json", "eval"], False
          )
          if len(eval_logs) == 0:
              raise PrerequisiteError(
@@ -201,8 +201,10 @@ def move_output(
          output_fs.mkdir(dir_path)
          tick()
 
-     # Copy the files
-     for working_file in files:
+     # Copy the files, preserving relative mtime ordering
+     for _, working_file in sorted(
+         (os.stat(os.path.join(root, f)).st_mtime, f) for f in files
+     ):
          target_path = (
              os.path.join(relative_dir, working_file)
              if relative_dir != "."