inspect-ai 0.3.72__py3-none-any.whl → 0.3.73__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. inspect_ai/_cli/eval.py +14 -3
  2. inspect_ai/_cli/sandbox.py +3 -3
  3. inspect_ai/_cli/score.py +6 -4
  4. inspect_ai/_cli/trace.py +53 -6
  5. inspect_ai/_display/core/config.py +1 -1
  6. inspect_ai/_display/core/display.py +2 -1
  7. inspect_ai/_display/core/footer.py +6 -6
  8. inspect_ai/_display/plain/display.py +11 -6
  9. inspect_ai/_display/rich/display.py +23 -13
  10. inspect_ai/_display/textual/app.py +10 -9
  11. inspect_ai/_display/textual/display.py +2 -2
  12. inspect_ai/_display/textual/widgets/footer.py +4 -0
  13. inspect_ai/_display/textual/widgets/samples.py +14 -5
  14. inspect_ai/_eval/context.py +1 -2
  15. inspect_ai/_eval/eval.py +54 -41
  16. inspect_ai/_eval/loader.py +9 -2
  17. inspect_ai/_eval/run.py +148 -81
  18. inspect_ai/_eval/score.py +13 -8
  19. inspect_ai/_eval/task/images.py +31 -21
  20. inspect_ai/_eval/task/run.py +62 -59
  21. inspect_ai/_eval/task/rundir.py +16 -9
  22. inspect_ai/_eval/task/sandbox.py +7 -8
  23. inspect_ai/_eval/task/util.py +7 -0
  24. inspect_ai/_util/_async.py +118 -10
  25. inspect_ai/_util/constants.py +0 -2
  26. inspect_ai/_util/file.py +15 -29
  27. inspect_ai/_util/future.py +37 -0
  28. inspect_ai/_util/http.py +3 -99
  29. inspect_ai/_util/httpx.py +60 -0
  30. inspect_ai/_util/interrupt.py +2 -2
  31. inspect_ai/_util/json.py +5 -52
  32. inspect_ai/_util/logger.py +30 -86
  33. inspect_ai/_util/retry.py +10 -61
  34. inspect_ai/_util/trace.py +2 -2
  35. inspect_ai/_view/server.py +86 -3
  36. inspect_ai/_view/www/dist/assets/index.js +25837 -13269
  37. inspect_ai/_view/www/log-schema.json +253 -186
  38. inspect_ai/_view/www/package.json +2 -2
  39. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +8 -3
  40. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +2 -3
  41. inspect_ai/_view/www/src/types/log.d.ts +122 -94
  42. inspect_ai/approval/_human/manager.py +6 -10
  43. inspect_ai/approval/_human/panel.py +2 -2
  44. inspect_ai/dataset/_sources/util.py +7 -6
  45. inspect_ai/log/__init__.py +4 -0
  46. inspect_ai/log/_file.py +35 -61
  47. inspect_ai/log/_log.py +18 -1
  48. inspect_ai/log/_recorders/eval.py +14 -23
  49. inspect_ai/log/_recorders/json.py +3 -18
  50. inspect_ai/log/_samples.py +27 -2
  51. inspect_ai/log/_transcript.py +8 -8
  52. inspect_ai/model/__init__.py +2 -1
  53. inspect_ai/model/_call_tools.py +60 -40
  54. inspect_ai/model/_chat_message.py +3 -2
  55. inspect_ai/model/_generate_config.py +25 -0
  56. inspect_ai/model/_model.py +74 -36
  57. inspect_ai/model/_openai.py +9 -1
  58. inspect_ai/model/_providers/anthropic.py +24 -26
  59. inspect_ai/model/_providers/azureai.py +11 -9
  60. inspect_ai/model/_providers/bedrock.py +33 -24
  61. inspect_ai/model/_providers/cloudflare.py +8 -9
  62. inspect_ai/model/_providers/goodfire.py +7 -3
  63. inspect_ai/model/_providers/google.py +47 -13
  64. inspect_ai/model/_providers/groq.py +15 -15
  65. inspect_ai/model/_providers/hf.py +24 -17
  66. inspect_ai/model/_providers/mistral.py +36 -20
  67. inspect_ai/model/_providers/openai.py +30 -25
  68. inspect_ai/model/_providers/openai_o1.py +1 -1
  69. inspect_ai/model/_providers/providers.py +1 -1
  70. inspect_ai/model/_providers/together.py +3 -4
  71. inspect_ai/model/_providers/util/__init__.py +2 -2
  72. inspect_ai/model/_providers/util/chatapi.py +6 -19
  73. inspect_ai/model/_providers/util/hooks.py +165 -0
  74. inspect_ai/model/_providers/vertex.py +20 -3
  75. inspect_ai/model/_providers/vllm.py +16 -19
  76. inspect_ai/scorer/_multi.py +5 -2
  77. inspect_ai/solver/_bridge/patch.py +31 -1
  78. inspect_ai/solver/_fork.py +5 -3
  79. inspect_ai/solver/_human_agent/agent.py +3 -2
  80. inspect_ai/tool/__init__.py +8 -2
  81. inspect_ai/tool/_tool_info.py +4 -90
  82. inspect_ai/tool/_tool_params.py +4 -34
  83. inspect_ai/tool/_tools/_web_search.py +30 -24
  84. inspect_ai/util/__init__.py +4 -0
  85. inspect_ai/util/_concurrency.py +5 -6
  86. inspect_ai/util/_display.py +6 -0
  87. inspect_ai/util/_json.py +170 -0
  88. inspect_ai/util/_sandbox/docker/cleanup.py +13 -9
  89. inspect_ai/util/_sandbox/docker/docker.py +5 -0
  90. inspect_ai/util/_sandbox/environment.py +56 -9
  91. inspect_ai/util/_sandbox/service.py +12 -5
  92. inspect_ai/util/_subprocess.py +94 -113
  93. inspect_ai/util/_subtask.py +2 -4
  94. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/METADATA +6 -2
  95. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/RECORD +99 -99
  96. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/WHEEL +1 -1
  97. inspect_ai/_util/timeouts.py +0 -160
  98. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  99. inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
  100. inspect_ai/model/_providers/util/tracker.py +0 -92
  101. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/LICENSE +0 -0
  102. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/entry_points.txt +0 -0
  103. {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -11,12 +11,12 @@ from inspect_ai._util.constants import (
11
11
  DEFAULT_EPOCHS,
12
12
  DEFAULT_LOG_LEVEL_TRANSCRIPT,
13
13
  DEFAULT_MAX_CONNECTIONS,
14
- DEFAULT_MAX_RETRIES,
15
14
  )
16
15
  from inspect_ai._util.file import filesystem
17
16
  from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
18
17
  from inspect_ai.log._file import log_file_info
19
18
  from inspect_ai.model import GenerateConfigArgs
19
+ from inspect_ai.model._generate_config import ResponseSchema
20
20
  from inspect_ai.scorer._reducer import create_reducers
21
21
  from inspect_ai.solver._solver import SolverSpec
22
22
 
@@ -47,9 +47,9 @@ NO_SCORE_HELP = (
47
47
  NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
48
48
  MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
49
49
  MAX_RETRIES_HELP = (
50
- f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
50
+ "Maximum number of times to retry model API requests (defaults to unlimited)"
51
51
  )
52
- TIMEOUT_HELP = "Request timeout (in seconds)."
52
+ TIMEOUT_HELP = "Model API request timeout in seconds (defaults to no timeout)"
53
53
 
54
54
 
55
55
  def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
@@ -405,6 +405,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
405
405
  help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
406
406
  envvar="INSPECT_EVAL_REASONING_HISTORY",
407
407
  )
408
+ @click.option(
409
+ "--response-schema",
410
+ type=str,
411
+ help="JSON schema for desired response format (output should still be validated). OpenAI, Google, and Mistral only.",
412
+ envvar="INSPECT_EVAL_RESPONSE_SCHEMA",
413
+ )
408
414
  @click.option(
409
415
  "--log-format",
410
416
  type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -476,6 +482,7 @@ def eval_command(
476
482
  reasoning_effort: str | None,
477
483
  reasoning_tokens: int | None,
478
484
  reasoning_history: Literal["none", "all", "last", "auto"] | None,
485
+ response_schema: ResponseSchema | None,
479
486
  message_limit: int | None,
480
487
  token_limit: int | None,
481
488
  time_limit: int | None,
@@ -640,6 +647,7 @@ def eval_set_command(
640
647
  reasoning_effort: str | None,
641
648
  reasoning_tokens: int | None,
642
649
  reasoning_history: Literal["none", "all", "last", "auto"] | None,
650
+ response_schema: ResponseSchema | None,
643
651
  message_limit: int | None,
644
652
  token_limit: int | None,
645
653
  time_limit: int | None,
@@ -889,6 +897,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
889
897
  if key == "reasoning_history":
890
898
  if value is not False:
891
899
  value = None
900
+ if key == "response_schema":
901
+ if value is not None:
902
+ value = ResponseSchema.model_validate_json(value)
892
903
  config[key] = value # type: ignore
893
904
  return config
894
905
 
inspect_ai/_cli/sandbox.py CHANGED
@@ -1,7 +1,7 @@
1
- import asyncio
2
-
1
+ import anyio
3
2
  import click
4
3
 
4
+ from inspect_ai._util._async import configured_async_backend
5
5
  from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
6
6
 
7
7
 
@@ -27,4 +27,4 @@ def sandbox_cleanup(type: str, environment_id: str | None) -> None:
27
27
  """
28
28
  sandboxenv_type = registry_find_sandboxenv(type)
29
29
  cli_cleanup = getattr(sandboxenv_type, "cli_cleanup")
30
- asyncio.run(cli_cleanup(environment_id))
30
+ anyio.run(cli_cleanup, environment_id, backend=configured_async_backend())
inspect_ai/_cli/score.py CHANGED
@@ -1,6 +1,6 @@
1
- import asyncio
2
1
  import os
3
2
 
3
+ import anyio
4
4
  import click
5
5
  import rich
6
6
  from rich.panel import Panel
@@ -13,6 +13,7 @@ from inspect_ai._display import display
13
13
  from inspect_ai._display.core.rich import rich_theme
14
14
  from inspect_ai._eval.context import init_eval_context, init_task_context
15
15
  from inspect_ai._eval.score import ScoreAction, task_score
16
+ from inspect_ai._util._async import configured_async_backend
16
17
  from inspect_ai._util.file import basename, dirname, exists
17
18
  from inspect_ai.log._log import EvalLog
18
19
  from inspect_ai.log._recorders import create_recorder_for_location
@@ -64,8 +65,8 @@ def score_command(
64
65
  process_common_options(common)
65
66
 
66
67
  # score
67
- asyncio.run(
68
- score(
68
+ async def run_score() -> None:
69
+ return await score(
69
70
  log_dir=common["log_dir"],
70
71
  log_file=log_file,
71
72
  scorer=scorer,
@@ -74,7 +75,8 @@ def score_command(
74
75
  action=action,
75
76
  log_level=common["log_level"],
76
77
  )
77
- )
78
+
79
+ anyio.run(run_score, backend=configured_async_backend())
78
80
 
79
81
 
80
82
  async def score(
inspect_ai/_cli/trace.py CHANGED
@@ -15,6 +15,7 @@ from rich.table import Column, Table
15
15
  from inspect_ai._util.error import PrerequisiteError
16
16
  from inspect_ai._util.trace import (
17
17
  ActionTraceRecord,
18
+ TraceRecord,
18
19
  inspect_trace_dir,
19
20
  list_trace_files,
20
21
  read_trace_file,
@@ -84,6 +85,41 @@ def dump_command(trace_file: str | None, filter: str | None) -> None:
84
85
  )
85
86
 
86
87
 
88
+ @trace_command.command("http")
89
+ @click.argument("trace-file", type=str, required=False)
90
+ @click.option(
91
+ "--filter",
92
+ type=str,
93
+ help="Filter (applied to trace message field).",
94
+ )
95
+ @click.option(
96
+ "--failed",
97
+ type=bool,
98
+ is_flag=True,
99
+ default=False,
100
+ help="Show only failed HTTP requests (non-200 status)",
101
+ )
102
+ def http_command(trace_file: str | None, filter: str | None, failed: bool) -> None:
103
+ """View all HTTP requests in the trace log."""
104
+ _, traces = _read_traces(trace_file, "HTTP", filter)
105
+
106
+ last_timestamp = ""
107
+ table = Table(Column(), Column(), box=None)
108
+ for trace in traces:
109
+ if failed and "200 OK" in trace.message:
110
+ continue
111
+ timestamp = trace.timestamp.split(".")[0]
112
+ if timestamp == last_timestamp:
113
+ timestamp = ""
114
+ else:
115
+ last_timestamp = timestamp
116
+ timestamp = f"[{timestamp}]"
117
+ table.add_row(timestamp, trace.message)
118
+
119
+ if table.row_count > 0:
120
+ r_print(table)
121
+
122
+
87
123
  @trace_command.command("anomalies")
88
124
  @click.argument("trace-file", type=str, required=False)
89
125
  @click.option(
@@ -99,12 +135,7 @@ def dump_command(trace_file: str | None, filter: str | None) -> None:
99
135
  )
100
136
  def anomolies_command(trace_file: str | None, filter: str | None, all: bool) -> None:
101
137
  """Look for anomalies in a trace file (never completed or cancelled actions)."""
102
- trace_file_path = _resolve_trace_file_path(trace_file)
103
- traces = read_trace_file(trace_file_path)
104
-
105
- if filter:
106
- filter = filter.lower()
107
- traces = [trace for trace in traces if filter in trace.message.lower()]
138
+ trace_file_path, traces = _read_traces(trace_file, None, filter)
108
139
 
109
140
  # Track started actions
110
141
  running_actions: dict[str, ActionTraceRecord] = {}
@@ -199,6 +230,22 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
199
230
  print(console.export_text(styles=True).strip())
200
231
 
201
232
 
233
+ def _read_traces(
234
+ trace_file: str | None, level: str | None = None, filter: str | None = None
235
+ ) -> tuple[Path, list[TraceRecord]]:
236
+ trace_file_path = _resolve_trace_file_path(trace_file)
237
+ traces = read_trace_file(trace_file_path)
238
+
239
+ if level:
240
+ traces = [trace for trace in traces if trace.level == level]
241
+
242
+ if filter:
243
+ filter = filter.lower()
244
+ traces = [trace for trace in traces if filter in trace.message.lower()]
245
+
246
+ return (trace_file_path, traces)
247
+
248
+
202
249
  def _print_bucket(
203
250
  print_fn: Callable[[RenderableType], None],
204
251
  label: str,
inspect_ai/_display/core/config.py CHANGED
@@ -34,7 +34,7 @@ def task_config(
34
34
  value = value if isinstance(value, list) else [value]
35
35
  value = [str(v) for v in value]
36
36
  config_print.append(f"{name}: {','.join(value)}")
37
- elif name not in ["limit", "model"]:
37
+ elif name not in ["limit", "model", "response_schema"]:
38
38
  if isinstance(value, list):
39
39
  value = ",".join([str(v) for v in value])
40
40
  if isinstance(value, str):
inspect_ai/_display/core/display.py CHANGED
@@ -4,6 +4,7 @@ from types import TracebackType
4
4
  from typing import (
5
5
  Any,
6
6
  AsyncIterator,
7
+ Callable,
7
8
  Coroutine,
8
9
  Iterator,
9
10
  Protocol,
@@ -130,7 +131,7 @@ class Display(Protocol):
130
131
  @contextlib.contextmanager
131
132
  def progress(self, total: int) -> Iterator[Progress]: ...
132
133
 
133
- def run_task_app(self, main: Coroutine[Any, Any, TR]) -> TR: ...
134
+ def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR: ...
134
135
 
135
136
  @contextlib.contextmanager
136
137
  def suspend_task_app(self) -> Iterator[None]: ...
inspect_ai/_display/core/footer.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from rich.console import RenderableType
2
2
  from rich.text import Text
3
3
 
4
- from inspect_ai._util.logger import http_rate_limit_count
4
+ from inspect_ai._util.retry import http_retries_count
5
5
  from inspect_ai.util._concurrency import concurrency_status
6
6
  from inspect_ai.util._throttle import throttle
7
7
 
@@ -26,12 +26,12 @@ def task_resources() -> str:
26
26
 
27
27
 
28
28
  def task_counters(counters: dict[str, str]) -> str:
29
- return task_dict(counters | task_http_rate_limits())
29
+ return task_dict(counters | task_http_retries())
30
30
 
31
31
 
32
- def task_http_rate_limits() -> dict[str, str]:
33
- return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
32
+ def task_http_retries() -> dict[str, str]:
33
+ return {"HTTP retries": f"{http_retries_count():,}"}
34
34
 
35
35
 
36
- def task_http_rate_limits_str() -> str:
37
- return f"HTTP rate limits: {http_rate_limit_count():,}"
36
+ def task_http_retries_str() -> str:
37
+ return f"HTTP retries: {http_retries_count():,}"
inspect_ai/_display/plain/display.py CHANGED
@@ -1,10 +1,12 @@
1
- import asyncio
2
1
  import contextlib
3
- from typing import Any, AsyncIterator, Coroutine, Iterator
2
+ from typing import AsyncIterator, Callable, Coroutine, Iterator
4
3
 
4
+ import anyio
5
5
  import rich
6
6
 
7
7
  from inspect_ai._display.core.rich import rich_initialise
8
+ from inspect_ai._util._async import configured_async_backend, run_coroutine
9
+ from inspect_ai._util.platform import running_in_notebook
8
10
  from inspect_ai._util.text import truncate
9
11
  from inspect_ai._util.throttle import throttle
10
12
 
@@ -22,7 +24,7 @@ from ..core.display import (
22
24
  TaskSpec,
23
25
  TaskWithResult,
24
26
  )
25
- from ..core.footer import task_http_rate_limits_str
27
+ from ..core.footer import task_http_retries_str
26
28
  from ..core.panel import task_panel, task_targets
27
29
  from ..core.results import task_metric, tasks_results
28
30
 
@@ -41,8 +43,11 @@ class PlainDisplay(Display):
41
43
  def progress(self, total: int) -> Iterator[Progress]:
42
44
  yield PlainProgress(total)
43
45
 
44
- def run_task_app(self, main: Coroutine[Any, Any, TR]) -> TR:
45
- return asyncio.run(main)
46
+ def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
47
+ if running_in_notebook():
48
+ return run_coroutine(main())
49
+ else:
50
+ return anyio.run(main, backend=configured_async_backend())
46
51
 
47
52
  @contextlib.contextmanager
48
53
  def suspend_task_app(self) -> Iterator[None]:
@@ -182,7 +187,7 @@ class PlainTaskDisplay(TaskDisplay):
182
187
  status_parts.append(resources)
183
188
 
184
189
  # Add rate limits
185
- rate_limits = task_http_rate_limits_str()
190
+ rate_limits = task_http_retries_str()
186
191
  if rate_limits:
187
192
  status_parts.append(rate_limits)
188
193
 
inspect_ai/_display/rich/display.py CHANGED
@@ -1,8 +1,8 @@
1
- import asyncio
2
1
  import contextlib
3
2
  from dataclasses import dataclass
4
3
  from typing import Any, AsyncIterator, Callable, Coroutine, Iterator
5
4
 
5
+ import anyio
6
6
  import rich
7
7
  from rich.console import Console, Group, RenderableType
8
8
  from rich.live import Live
@@ -11,7 +11,9 @@ from rich.progress import Progress as RProgress
11
11
  from rich.table import Table
12
12
  from typing_extensions import override
13
13
 
14
+ from inspect_ai._util._async import configured_async_backend, run_coroutine
14
15
  from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
16
+ from inspect_ai._util.platform import running_in_notebook
15
17
  from inspect_ai.log._transcript import InputEvent, transcript
16
18
  from inspect_ai.util._display import display_type
17
19
  from inspect_ai.util._throttle import throttle
@@ -59,7 +61,6 @@ class RichDisplay(Display):
59
61
  self.progress_ui: RProgress | None = None
60
62
  self.parallel = False
61
63
  self.live: Live | None = None
62
- self.timer_handle: asyncio.TimerHandle | None = None
63
64
  self.counters: dict[str, str] = {}
64
65
  rich_initialise()
65
66
 
@@ -74,8 +75,11 @@ class RichDisplay(Display):
74
75
  yield RichProgress(total, progress)
75
76
 
76
77
  @override
77
- def run_task_app(self, main: Coroutine[Any, Any, TR]) -> TR:
78
- return asyncio.run(main)
78
+ def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
79
+ if running_in_notebook():
80
+ return run_coroutine(main())
81
+ else:
82
+ return anyio.run(main, backend=configured_async_backend())
79
83
 
80
84
  @override
81
85
  @contextlib.contextmanager
@@ -104,13 +108,15 @@ class RichDisplay(Display):
104
108
  with RichTaskScreen(live) as task_screen:
105
109
  self.live = live
106
110
 
107
- # enque a display update
108
- self.timer_handle = asyncio.get_event_loop().call_later(
109
- 1, self._update_display
110
- )
111
+ async with anyio.create_task_group() as tg:
112
+ # update display every second while running
113
+ tg.start_soon(self._update_display_loop)
111
114
 
112
- # yield
113
- yield task_screen
115
+ # let the task screen run
116
+ try:
117
+ yield task_screen
118
+ finally:
119
+ tg.cancel_scope.cancel()
114
120
 
115
121
  # render task results (re-enable live if necessary)
116
122
  if not live.is_started:
@@ -124,8 +130,6 @@ class RichDisplay(Display):
124
130
  self.progress_ui = None
125
131
  self.parallel = False
126
132
  self.live = None
127
- if self.timer_handle:
128
- self.timer_handle.cancel()
129
133
 
130
134
  @override
131
135
  @contextlib.contextmanager
@@ -161,7 +165,13 @@ class RichDisplay(Display):
161
165
  r = task_live_status(self.tasks, self.progress_ui, self.counters)
162
166
  self.live.update(r, refresh=True)
163
167
 
164
- self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)
168
+ async def _update_display_loop(self) -> None:
169
+ try:
170
+ while True:
171
+ await anyio.sleep(1)
172
+ self._update_display()
173
+ except Exception:
174
+ pass
165
175
 
166
176
  @override
167
177
  def display_counter(self, caption: str, value: str) -> None:
inspect_ai/_display/textual/app.py CHANGED
@@ -1,16 +1,18 @@
1
- import asyncio
2
1
  import contextlib
3
2
  from asyncio import CancelledError
4
3
  from typing import (
5
4
  Any,
6
5
  AsyncIterator,
6
+ Awaitable,
7
+ Callable,
7
8
  ClassVar,
8
- Coroutine,
9
9
  Generic,
10
10
  Iterator,
11
11
  cast,
12
12
  )
13
13
 
14
+ import anyio
15
+ import anyio.from_thread
14
16
  import rich
15
17
  from rich.console import Console
16
18
  from textual.app import App, ComposeResult
@@ -103,9 +105,8 @@ class TaskScreenApp(App[TR]):
103
105
  if focus and self.app._driver:
104
106
  textual_enable_mouse_support(self.app._driver)
105
107
 
106
- def run_app(self, main: Coroutine[Any, Any, TR]) -> TaskScreenResult[TR]:
107
- # create the worker
108
- self._worker = self.run_worker(main, start=False, exit_on_error=False)
108
+ def run_app(self, main: Callable[[], Awaitable[TR]]) -> TaskScreenResult[TR]:
109
+ self._worker = self.run_worker(main(), start=False, exit_on_error=False)
109
110
 
110
111
  # run the app
111
112
  self.run()
@@ -123,8 +124,8 @@ class TaskScreenApp(App[TR]):
123
124
 
124
125
  async def on_load(self) -> None:
125
126
  # events used to synchronise loading
126
- self._on_load_app = asyncio.Event()
127
- self._on_app_loaded = asyncio.Event()
127
+ self._on_load_app = anyio.Event()
128
+ self._on_app_loaded = anyio.Event()
128
129
 
129
130
  # run the workers
130
131
  self.workers.start_all()
@@ -136,7 +137,7 @@ class TaskScreenApp(App[TR]):
136
137
  while not self._on_load_app.is_set():
137
138
  if len(self.workers._workers) == 0:
138
139
  return
139
- await asyncio.sleep(0.1)
140
+ await anyio.sleep(0.1)
140
141
 
141
142
  @contextlib.contextmanager
142
143
  def suspend_app(self) -> Iterator[None]:
@@ -422,7 +423,7 @@ class TaskScreenApp(App[TR]):
422
423
  class TextualTaskScreen(TaskScreen, Generic[TR]):
423
424
  def __init__(self, app: TaskScreenApp[TR]) -> None:
424
425
  self.app = app
425
- self.lock = asyncio.Lock()
426
+ self.lock = anyio.Lock()
426
427
 
427
428
  def __exit__(self, *excinfo: Any) -> None:
428
429
  pass
inspect_ai/_display/textual/display.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import contextlib
2
- from typing import Any, AsyncIterator, Coroutine, Iterator
2
+ from typing import AsyncIterator, Callable, Coroutine, Iterator
3
3
 
4
4
  import rich
5
5
  from typing_extensions import override
@@ -30,7 +30,7 @@ class TextualDisplay(Display):
30
30
  yield RichProgress(total, progress)
31
31
 
32
32
  @override
33
- def run_task_app(self, main: Coroutine[Any, Any, TR]) -> TR:
33
+ def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
34
34
  # create and run the app
35
35
  self.app = TaskScreenApp[TR]()
36
36
  result = self.app.run_app(main)
inspect_ai/_display/textual/widgets/footer.py CHANGED
@@ -36,3 +36,7 @@ class AppFooter(Widget):
36
36
  def watch_right(self, new_right: RenderableType) -> None:
37
37
  footer_right = cast(Static, self.query_one("#footer-right"))
38
38
  footer_right.update(new_right)
39
+ if footer_right.tooltip is None:
40
+ footer_right.tooltip = (
41
+ "Execute 'inspect trace http' for a log of all HTTP requests."
42
+ )
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -506,6 +506,7 @@ class SampleToolbar(Horizontal):
506
506
  # track the sample
507
507
  self.sample = sample
508
508
 
509
+ status_group = self.query_one("#" + self.STATUS_GROUP)
509
510
  pending_status = self.query_one("#" + self.PENDING_STATUS)
510
511
  timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
511
512
  clock = self.query_one(Clock)
@@ -537,11 +538,19 @@ class SampleToolbar(Horizontal):
537
538
  pending_caption = cast(
538
539
  Static, self.query_one("#" + self.PENDING_CAPTION)
539
540
  )
540
- pending_caption_text = (
541
- "Generating..."
542
- if isinstance(last_event, ModelEvent)
543
- else "Executing..."
544
- )
541
+ if isinstance(last_event, ModelEvent):
542
+ # see if there are retries in play
543
+ if sample.retry_count > 0:
544
+ suffix = "retry" if sample.retry_count == 1 else "retries"
545
+ pending_caption_text = (
546
+ f"Generating ({sample.retry_count:,} {suffix})..."
547
+ )
548
+ else:
549
+ pending_caption_text = "Generating..."
550
+ else:
551
+ pending_caption_text = "Executing..."
552
+ status_group.styles.width = max(22, len(pending_caption_text))
553
+
545
554
  pending_caption.update(
546
555
  Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
547
556
  )
inspect_ai/_eval/context.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from inspect_ai._util.dotenv import init_dotenv
2
2
  from inspect_ai._util.hooks import init_hooks
3
- from inspect_ai._util.logger import init_http_rate_limit_count, init_logger
3
+ from inspect_ai._util.logger import init_logger
4
4
  from inspect_ai.approval._apply import have_tool_approval, init_tool_approval
5
5
  from inspect_ai.approval._human.manager import init_human_approval_manager
6
6
  from inspect_ai.approval._policy import ApprovalPolicy
@@ -20,7 +20,6 @@ def init_eval_context(
20
20
  init_logger(log_level, log_level_transcript)
21
21
  init_concurrency()
22
22
  init_max_subprocesses(max_subprocesses)
23
- init_http_rate_limit_count()
24
23
  init_hooks()
25
24
  init_active_samples()
26
25
  init_human_approval_manager()
inspect_ai/_eval/eval.py CHANGED
@@ -1,8 +1,12 @@
1
1
  import logging
2
2
  import os
3
+ import sys
3
4
  from pathlib import Path
4
5
  from typing import Any, Literal
5
6
 
7
+ if sys.version_info < (3, 11):
8
+ from exceptiongroup import ExceptionGroup
9
+
6
10
  from shortuuid import uuid
7
11
  from typing_extensions import Unpack
8
12
 
@@ -166,43 +170,51 @@ def eval(
166
170
  display, trace, max_tasks, max_samples, model
167
171
  )
168
172
 
169
- return task_display().run_task_app(
170
- main=eval_async(
171
- tasks=tasks,
172
- model=model,
173
- model_base_url=model_base_url,
174
- model_args=model_args,
175
- task_args=task_args,
176
- sandbox=sandbox,
177
- sandbox_cleanup=sandbox_cleanup,
178
- solver=solver,
179
- tags=tags,
180
- approval=approval,
181
- log_level=log_level,
182
- log_level_transcript=log_level_transcript,
183
- log_dir=log_dir,
184
- log_format=log_format,
185
- limit=limit,
186
- sample_id=sample_id,
187
- epochs=epochs,
188
- fail_on_error=fail_on_error,
189
- debug_errors=debug_errors,
190
- message_limit=message_limit,
191
- token_limit=token_limit,
192
- time_limit=time_limit,
193
- working_limit=working_limit,
194
- max_samples=max_samples,
195
- max_tasks=max_tasks,
196
- max_subprocesses=max_subprocesses,
197
- max_sandboxes=max_sandboxes,
198
- log_samples=log_samples,
199
- log_images=log_images,
200
- log_buffer=log_buffer,
201
- score=score,
202
- score_display=score_display,
203
- **kwargs,
204
- )
205
- )
173
+ async def run_task_app() -> list[EvalLog]:
174
+ try:
175
+ return await eval_async(
176
+ tasks=tasks,
177
+ model=model,
178
+ model_base_url=model_base_url,
179
+ model_args=model_args,
180
+ task_args=task_args,
181
+ sandbox=sandbox,
182
+ sandbox_cleanup=sandbox_cleanup,
183
+ solver=solver,
184
+ tags=tags,
185
+ approval=approval,
186
+ log_level=log_level,
187
+ log_level_transcript=log_level_transcript,
188
+ log_dir=log_dir,
189
+ log_format=log_format,
190
+ limit=limit,
191
+ sample_id=sample_id,
192
+ epochs=epochs,
193
+ fail_on_error=fail_on_error,
194
+ debug_errors=debug_errors,
195
+ message_limit=message_limit,
196
+ token_limit=token_limit,
197
+ time_limit=time_limit,
198
+ working_limit=working_limit,
199
+ max_samples=max_samples,
200
+ max_tasks=max_tasks,
201
+ max_subprocesses=max_subprocesses,
202
+ max_sandboxes=max_sandboxes,
203
+ log_samples=log_samples,
204
+ log_images=log_images,
205
+ log_buffer=log_buffer,
206
+ score=score,
207
+ score_display=score_display,
208
+ **kwargs,
209
+ )
210
+ # exceptions can escape when debug_errors is True and that's okay
211
+ except ExceptionGroup as ex:
212
+ if debug_errors:
213
+ raise ex.exceptions[0] from None
214
+ else:
215
+ raise
216
+
217
+ return task_display().run_task_app(run_task_app)
206
218
 
207
219
 
208
220
  # single call to eval_async at a time
@@ -556,8 +568,8 @@ def eval_retry(
556
568
  # resolve eval trace
557
569
  max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)
558
570
 
559
- return task_display().run_task_app(
560
- main=eval_retry_async(
571
+ async def run_task_app() -> list[EvalLog]:
572
+ return await eval_retry_async(
561
573
  tasks=tasks,
562
574
  log_level=log_level,
563
575
  log_level_transcript=log_level_transcript,
@@ -578,8 +590,9 @@ def eval_retry(
578
590
  max_retries=max_retries,
579
591
  timeout=timeout,
580
592
  max_connections=max_connections,
581
- ),
582
- )
593
+ )
594
+
595
+ return task_display().run_task_app(run_task_app)
583
596
 
584
597
 
585
598
  async def eval_retry_async(