inspect-ai 0.3.72__py3-none-any.whl → 0.3.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +14 -3
- inspect_ai/_cli/sandbox.py +3 -3
- inspect_ai/_cli/score.py +6 -4
- inspect_ai/_cli/trace.py +53 -6
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +2 -1
- inspect_ai/_display/core/footer.py +6 -6
- inspect_ai/_display/plain/display.py +11 -6
- inspect_ai/_display/rich/display.py +23 -13
- inspect_ai/_display/textual/app.py +10 -9
- inspect_ai/_display/textual/display.py +2 -2
- inspect_ai/_display/textual/widgets/footer.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +14 -5
- inspect_ai/_eval/context.py +1 -2
- inspect_ai/_eval/eval.py +54 -41
- inspect_ai/_eval/loader.py +9 -2
- inspect_ai/_eval/run.py +148 -81
- inspect_ai/_eval/score.py +13 -8
- inspect_ai/_eval/task/images.py +31 -21
- inspect_ai/_eval/task/run.py +62 -59
- inspect_ai/_eval/task/rundir.py +16 -9
- inspect_ai/_eval/task/sandbox.py +7 -8
- inspect_ai/_eval/task/util.py +7 -0
- inspect_ai/_util/_async.py +118 -10
- inspect_ai/_util/constants.py +0 -2
- inspect_ai/_util/file.py +15 -29
- inspect_ai/_util/future.py +37 -0
- inspect_ai/_util/http.py +3 -99
- inspect_ai/_util/httpx.py +60 -0
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/json.py +5 -52
- inspect_ai/_util/logger.py +30 -86
- inspect_ai/_util/retry.py +10 -61
- inspect_ai/_util/trace.py +2 -2
- inspect_ai/_view/server.py +86 -3
- inspect_ai/_view/www/dist/assets/index.js +25837 -13269
- inspect_ai/_view/www/log-schema.json +253 -186
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +8 -3
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +2 -3
- inspect_ai/_view/www/src/types/log.d.ts +122 -94
- inspect_ai/approval/_human/manager.py +6 -10
- inspect_ai/approval/_human/panel.py +2 -2
- inspect_ai/dataset/_sources/util.py +7 -6
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +35 -61
- inspect_ai/log/_log.py +18 -1
- inspect_ai/log/_recorders/eval.py +14 -23
- inspect_ai/log/_recorders/json.py +3 -18
- inspect_ai/log/_samples.py +27 -2
- inspect_ai/log/_transcript.py +8 -8
- inspect_ai/model/__init__.py +2 -1
- inspect_ai/model/_call_tools.py +60 -40
- inspect_ai/model/_chat_message.py +3 -2
- inspect_ai/model/_generate_config.py +25 -0
- inspect_ai/model/_model.py +74 -36
- inspect_ai/model/_openai.py +9 -1
- inspect_ai/model/_providers/anthropic.py +24 -26
- inspect_ai/model/_providers/azureai.py +11 -9
- inspect_ai/model/_providers/bedrock.py +33 -24
- inspect_ai/model/_providers/cloudflare.py +8 -9
- inspect_ai/model/_providers/goodfire.py +7 -3
- inspect_ai/model/_providers/google.py +47 -13
- inspect_ai/model/_providers/groq.py +15 -15
- inspect_ai/model/_providers/hf.py +24 -17
- inspect_ai/model/_providers/mistral.py +36 -20
- inspect_ai/model/_providers/openai.py +30 -25
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +3 -4
- inspect_ai/model/_providers/util/__init__.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +6 -19
- inspect_ai/model/_providers/util/hooks.py +165 -0
- inspect_ai/model/_providers/vertex.py +20 -3
- inspect_ai/model/_providers/vllm.py +16 -19
- inspect_ai/scorer/_multi.py +5 -2
- inspect_ai/solver/_bridge/patch.py +31 -1
- inspect_ai/solver/_fork.py +5 -3
- inspect_ai/solver/_human_agent/agent.py +3 -2
- inspect_ai/tool/__init__.py +8 -2
- inspect_ai/tool/_tool_info.py +4 -90
- inspect_ai/tool/_tool_params.py +4 -34
- inspect_ai/tool/_tools/_web_search.py +30 -24
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_concurrency.py +5 -6
- inspect_ai/util/_display.py +6 -0
- inspect_ai/util/_json.py +170 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +13 -9
- inspect_ai/util/_sandbox/docker/docker.py +5 -0
- inspect_ai/util/_sandbox/environment.py +56 -9
- inspect_ai/util/_sandbox/service.py +12 -5
- inspect_ai/util/_subprocess.py +94 -113
- inspect_ai/util/_subtask.py +2 -4
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/METADATA +6 -2
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/RECORD +99 -99
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/WHEEL +1 -1
- inspect_ai/_util/timeouts.py +0 -160
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/model/_providers/util/tracker.py +0 -92
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.72.dist-info → inspect_ai-0.3.73.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -11,12 +11,12 @@ from inspect_ai._util.constants import (
|
|
11
11
|
DEFAULT_EPOCHS,
|
12
12
|
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
13
13
|
DEFAULT_MAX_CONNECTIONS,
|
14
|
-
DEFAULT_MAX_RETRIES,
|
15
14
|
)
|
16
15
|
from inspect_ai._util.file import filesystem
|
17
16
|
from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
|
18
17
|
from inspect_ai.log._file import log_file_info
|
19
18
|
from inspect_ai.model import GenerateConfigArgs
|
19
|
+
from inspect_ai.model._generate_config import ResponseSchema
|
20
20
|
from inspect_ai.scorer._reducer import create_reducers
|
21
21
|
from inspect_ai.solver._solver import SolverSpec
|
22
22
|
|
@@ -47,9 +47,9 @@ NO_SCORE_HELP = (
|
|
47
47
|
NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
|
48
48
|
MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
|
49
49
|
MAX_RETRIES_HELP = (
|
50
|
-
|
50
|
+
"Maximum number of times to retry model API requests (defaults to unlimited)"
|
51
51
|
)
|
52
|
-
TIMEOUT_HELP = "
|
52
|
+
TIMEOUT_HELP = "Model API request timeout in seconds (defaults to no timeout)"
|
53
53
|
|
54
54
|
|
55
55
|
def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
@@ -405,6 +405,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
405
405
|
help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
|
406
406
|
envvar="INSPECT_EVAL_REASONING_HISTORY",
|
407
407
|
)
|
408
|
+
@click.option(
|
409
|
+
"--response-schema",
|
410
|
+
type=str,
|
411
|
+
help="JSON schema for desired response format (output should still be validated). OpenAI, Google, and Mistral only.",
|
412
|
+
envvar="INSPECT_EVAL_RESPONSE_SCHEMA",
|
413
|
+
)
|
408
414
|
@click.option(
|
409
415
|
"--log-format",
|
410
416
|
type=click.Choice(["eval", "json"], case_sensitive=False),
|
@@ -476,6 +482,7 @@ def eval_command(
|
|
476
482
|
reasoning_effort: str | None,
|
477
483
|
reasoning_tokens: int | None,
|
478
484
|
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
485
|
+
response_schema: ResponseSchema | None,
|
479
486
|
message_limit: int | None,
|
480
487
|
token_limit: int | None,
|
481
488
|
time_limit: int | None,
|
@@ -640,6 +647,7 @@ def eval_set_command(
|
|
640
647
|
reasoning_effort: str | None,
|
641
648
|
reasoning_tokens: int | None,
|
642
649
|
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
650
|
+
response_schema: ResponseSchema | None,
|
643
651
|
message_limit: int | None,
|
644
652
|
token_limit: int | None,
|
645
653
|
time_limit: int | None,
|
@@ -889,6 +897,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
|
|
889
897
|
if key == "reasoning_history":
|
890
898
|
if value is not False:
|
891
899
|
value = None
|
900
|
+
if key == "response_schema":
|
901
|
+
if value is not None:
|
902
|
+
value = ResponseSchema.model_validate_json(value)
|
892
903
|
config[key] = value # type: ignore
|
893
904
|
return config
|
894
905
|
|
inspect_ai/_cli/sandbox.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
import
|
2
|
-
|
1
|
+
import anyio
|
3
2
|
import click
|
4
3
|
|
4
|
+
from inspect_ai._util._async import configured_async_backend
|
5
5
|
from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
|
6
6
|
|
7
7
|
|
@@ -27,4 +27,4 @@ def sandbox_cleanup(type: str, environment_id: str | None) -> None:
|
|
27
27
|
"""
|
28
28
|
sandboxenv_type = registry_find_sandboxenv(type)
|
29
29
|
cli_cleanup = getattr(sandboxenv_type, "cli_cleanup")
|
30
|
-
|
30
|
+
anyio.run(cli_cleanup, environment_id, backend=configured_async_backend())
|
inspect_ai/_cli/score.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
import asyncio
|
2
1
|
import os
|
3
2
|
|
3
|
+
import anyio
|
4
4
|
import click
|
5
5
|
import rich
|
6
6
|
from rich.panel import Panel
|
@@ -13,6 +13,7 @@ from inspect_ai._display import display
|
|
13
13
|
from inspect_ai._display.core.rich import rich_theme
|
14
14
|
from inspect_ai._eval.context import init_eval_context, init_task_context
|
15
15
|
from inspect_ai._eval.score import ScoreAction, task_score
|
16
|
+
from inspect_ai._util._async import configured_async_backend
|
16
17
|
from inspect_ai._util.file import basename, dirname, exists
|
17
18
|
from inspect_ai.log._log import EvalLog
|
18
19
|
from inspect_ai.log._recorders import create_recorder_for_location
|
@@ -64,8 +65,8 @@ def score_command(
|
|
64
65
|
process_common_options(common)
|
65
66
|
|
66
67
|
# score
|
67
|
-
|
68
|
-
score(
|
68
|
+
async def run_score() -> None:
|
69
|
+
return await score(
|
69
70
|
log_dir=common["log_dir"],
|
70
71
|
log_file=log_file,
|
71
72
|
scorer=scorer,
|
@@ -74,7 +75,8 @@ def score_command(
|
|
74
75
|
action=action,
|
75
76
|
log_level=common["log_level"],
|
76
77
|
)
|
77
|
-
|
78
|
+
|
79
|
+
anyio.run(run_score, backend=configured_async_backend())
|
78
80
|
|
79
81
|
|
80
82
|
async def score(
|
inspect_ai/_cli/trace.py
CHANGED
@@ -15,6 +15,7 @@ from rich.table import Column, Table
|
|
15
15
|
from inspect_ai._util.error import PrerequisiteError
|
16
16
|
from inspect_ai._util.trace import (
|
17
17
|
ActionTraceRecord,
|
18
|
+
TraceRecord,
|
18
19
|
inspect_trace_dir,
|
19
20
|
list_trace_files,
|
20
21
|
read_trace_file,
|
@@ -84,6 +85,41 @@ def dump_command(trace_file: str | None, filter: str | None) -> None:
|
|
84
85
|
)
|
85
86
|
|
86
87
|
|
88
|
+
@trace_command.command("http")
|
89
|
+
@click.argument("trace-file", type=str, required=False)
|
90
|
+
@click.option(
|
91
|
+
"--filter",
|
92
|
+
type=str,
|
93
|
+
help="Filter (applied to trace message field).",
|
94
|
+
)
|
95
|
+
@click.option(
|
96
|
+
"--failed",
|
97
|
+
type=bool,
|
98
|
+
is_flag=True,
|
99
|
+
default=False,
|
100
|
+
help="Show only failed HTTP requests (non-200 status)",
|
101
|
+
)
|
102
|
+
def http_command(trace_file: str | None, filter: str | None, failed: bool) -> None:
|
103
|
+
"""View all HTTP requests in the trace log."""
|
104
|
+
_, traces = _read_traces(trace_file, "HTTP", filter)
|
105
|
+
|
106
|
+
last_timestamp = ""
|
107
|
+
table = Table(Column(), Column(), box=None)
|
108
|
+
for trace in traces:
|
109
|
+
if failed and "200 OK" in trace.message:
|
110
|
+
continue
|
111
|
+
timestamp = trace.timestamp.split(".")[0]
|
112
|
+
if timestamp == last_timestamp:
|
113
|
+
timestamp = ""
|
114
|
+
else:
|
115
|
+
last_timestamp = timestamp
|
116
|
+
timestamp = f"[{timestamp}]"
|
117
|
+
table.add_row(timestamp, trace.message)
|
118
|
+
|
119
|
+
if table.row_count > 0:
|
120
|
+
r_print(table)
|
121
|
+
|
122
|
+
|
87
123
|
@trace_command.command("anomalies")
|
88
124
|
@click.argument("trace-file", type=str, required=False)
|
89
125
|
@click.option(
|
@@ -99,12 +135,7 @@ def dump_command(trace_file: str | None, filter: str | None) -> None:
|
|
99
135
|
)
|
100
136
|
def anomolies_command(trace_file: str | None, filter: str | None, all: bool) -> None:
|
101
137
|
"""Look for anomalies in a trace file (never completed or cancelled actions)."""
|
102
|
-
trace_file_path =
|
103
|
-
traces = read_trace_file(trace_file_path)
|
104
|
-
|
105
|
-
if filter:
|
106
|
-
filter = filter.lower()
|
107
|
-
traces = [trace for trace in traces if filter in trace.message.lower()]
|
138
|
+
trace_file_path, traces = _read_traces(trace_file, None, filter)
|
108
139
|
|
109
140
|
# Track started actions
|
110
141
|
running_actions: dict[str, ActionTraceRecord] = {}
|
@@ -199,6 +230,22 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
|
|
199
230
|
print(console.export_text(styles=True).strip())
|
200
231
|
|
201
232
|
|
233
|
+
def _read_traces(
|
234
|
+
trace_file: str | None, level: str | None = None, filter: str | None = None
|
235
|
+
) -> tuple[Path, list[TraceRecord]]:
|
236
|
+
trace_file_path = _resolve_trace_file_path(trace_file)
|
237
|
+
traces = read_trace_file(trace_file_path)
|
238
|
+
|
239
|
+
if level:
|
240
|
+
traces = [trace for trace in traces if trace.level == level]
|
241
|
+
|
242
|
+
if filter:
|
243
|
+
filter = filter.lower()
|
244
|
+
traces = [trace for trace in traces if filter in trace.message.lower()]
|
245
|
+
|
246
|
+
return (trace_file_path, traces)
|
247
|
+
|
248
|
+
|
202
249
|
def _print_bucket(
|
203
250
|
print_fn: Callable[[RenderableType], None],
|
204
251
|
label: str,
|
@@ -34,7 +34,7 @@ def task_config(
|
|
34
34
|
value = value if isinstance(value, list) else [value]
|
35
35
|
value = [str(v) for v in value]
|
36
36
|
config_print.append(f"{name}: {','.join(value)}")
|
37
|
-
elif name not in ["limit", "model"]:
|
37
|
+
elif name not in ["limit", "model", "response_schema"]:
|
38
38
|
if isinstance(value, list):
|
39
39
|
value = ",".join([str(v) for v in value])
|
40
40
|
if isinstance(value, str):
|
@@ -4,6 +4,7 @@ from types import TracebackType
|
|
4
4
|
from typing import (
|
5
5
|
Any,
|
6
6
|
AsyncIterator,
|
7
|
+
Callable,
|
7
8
|
Coroutine,
|
8
9
|
Iterator,
|
9
10
|
Protocol,
|
@@ -130,7 +131,7 @@ class Display(Protocol):
|
|
130
131
|
@contextlib.contextmanager
|
131
132
|
def progress(self, total: int) -> Iterator[Progress]: ...
|
132
133
|
|
133
|
-
def run_task_app(self, main: Coroutine[
|
134
|
+
def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR: ...
|
134
135
|
|
135
136
|
@contextlib.contextmanager
|
136
137
|
def suspend_task_app(self) -> Iterator[None]: ...
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from rich.console import RenderableType
|
2
2
|
from rich.text import Text
|
3
3
|
|
4
|
-
from inspect_ai._util.
|
4
|
+
from inspect_ai._util.retry import http_retries_count
|
5
5
|
from inspect_ai.util._concurrency import concurrency_status
|
6
6
|
from inspect_ai.util._throttle import throttle
|
7
7
|
|
@@ -26,12 +26,12 @@ def task_resources() -> str:
|
|
26
26
|
|
27
27
|
|
28
28
|
def task_counters(counters: dict[str, str]) -> str:
|
29
|
-
return task_dict(counters |
|
29
|
+
return task_dict(counters | task_http_retries())
|
30
30
|
|
31
31
|
|
32
|
-
def
|
33
|
-
return {"HTTP
|
32
|
+
def task_http_retries() -> dict[str, str]:
|
33
|
+
return {"HTTP retries": f"{http_retries_count():,}"}
|
34
34
|
|
35
35
|
|
36
|
-
def
|
37
|
-
return f"HTTP
|
36
|
+
def task_http_retries_str() -> str:
|
37
|
+
return f"HTTP retries: {http_retries_count():,}"
|
@@ -1,10 +1,12 @@
|
|
1
|
-
import asyncio
|
2
1
|
import contextlib
|
3
|
-
from typing import
|
2
|
+
from typing import AsyncIterator, Callable, Coroutine, Iterator
|
4
3
|
|
4
|
+
import anyio
|
5
5
|
import rich
|
6
6
|
|
7
7
|
from inspect_ai._display.core.rich import rich_initialise
|
8
|
+
from inspect_ai._util._async import configured_async_backend, run_coroutine
|
9
|
+
from inspect_ai._util.platform import running_in_notebook
|
8
10
|
from inspect_ai._util.text import truncate
|
9
11
|
from inspect_ai._util.throttle import throttle
|
10
12
|
|
@@ -22,7 +24,7 @@ from ..core.display import (
|
|
22
24
|
TaskSpec,
|
23
25
|
TaskWithResult,
|
24
26
|
)
|
25
|
-
from ..core.footer import
|
27
|
+
from ..core.footer import task_http_retries_str
|
26
28
|
from ..core.panel import task_panel, task_targets
|
27
29
|
from ..core.results import task_metric, tasks_results
|
28
30
|
|
@@ -41,8 +43,11 @@ class PlainDisplay(Display):
|
|
41
43
|
def progress(self, total: int) -> Iterator[Progress]:
|
42
44
|
yield PlainProgress(total)
|
43
45
|
|
44
|
-
def run_task_app(self, main: Coroutine[
|
45
|
-
|
46
|
+
def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
|
47
|
+
if running_in_notebook():
|
48
|
+
return run_coroutine(main())
|
49
|
+
else:
|
50
|
+
return anyio.run(main, backend=configured_async_backend())
|
46
51
|
|
47
52
|
@contextlib.contextmanager
|
48
53
|
def suspend_task_app(self) -> Iterator[None]:
|
@@ -182,7 +187,7 @@ class PlainTaskDisplay(TaskDisplay):
|
|
182
187
|
status_parts.append(resources)
|
183
188
|
|
184
189
|
# Add rate limits
|
185
|
-
rate_limits =
|
190
|
+
rate_limits = task_http_retries_str()
|
186
191
|
if rate_limits:
|
187
192
|
status_parts.append(rate_limits)
|
188
193
|
|
@@ -1,8 +1,8 @@
|
|
1
|
-
import asyncio
|
2
1
|
import contextlib
|
3
2
|
from dataclasses import dataclass
|
4
3
|
from typing import Any, AsyncIterator, Callable, Coroutine, Iterator
|
5
4
|
|
5
|
+
import anyio
|
6
6
|
import rich
|
7
7
|
from rich.console import Console, Group, RenderableType
|
8
8
|
from rich.live import Live
|
@@ -11,7 +11,9 @@ from rich.progress import Progress as RProgress
|
|
11
11
|
from rich.table import Table
|
12
12
|
from typing_extensions import override
|
13
13
|
|
14
|
+
from inspect_ai._util._async import configured_async_backend, run_coroutine
|
14
15
|
from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
|
16
|
+
from inspect_ai._util.platform import running_in_notebook
|
15
17
|
from inspect_ai.log._transcript import InputEvent, transcript
|
16
18
|
from inspect_ai.util._display import display_type
|
17
19
|
from inspect_ai.util._throttle import throttle
|
@@ -59,7 +61,6 @@ class RichDisplay(Display):
|
|
59
61
|
self.progress_ui: RProgress | None = None
|
60
62
|
self.parallel = False
|
61
63
|
self.live: Live | None = None
|
62
|
-
self.timer_handle: asyncio.TimerHandle | None = None
|
63
64
|
self.counters: dict[str, str] = {}
|
64
65
|
rich_initialise()
|
65
66
|
|
@@ -74,8 +75,11 @@ class RichDisplay(Display):
|
|
74
75
|
yield RichProgress(total, progress)
|
75
76
|
|
76
77
|
@override
|
77
|
-
def run_task_app(self, main: Coroutine[
|
78
|
-
|
78
|
+
def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
|
79
|
+
if running_in_notebook():
|
80
|
+
return run_coroutine(main())
|
81
|
+
else:
|
82
|
+
return anyio.run(main, backend=configured_async_backend())
|
79
83
|
|
80
84
|
@override
|
81
85
|
@contextlib.contextmanager
|
@@ -104,13 +108,15 @@ class RichDisplay(Display):
|
|
104
108
|
with RichTaskScreen(live) as task_screen:
|
105
109
|
self.live = live
|
106
110
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
)
|
111
|
+
async with anyio.create_task_group() as tg:
|
112
|
+
# update display every second while running
|
113
|
+
tg.start_soon(self._update_display_loop)
|
111
114
|
|
112
|
-
|
113
|
-
|
115
|
+
# let the task screen run
|
116
|
+
try:
|
117
|
+
yield task_screen
|
118
|
+
finally:
|
119
|
+
tg.cancel_scope.cancel()
|
114
120
|
|
115
121
|
# render task results (re-enable live if necessary)
|
116
122
|
if not live.is_started:
|
@@ -124,8 +130,6 @@ class RichDisplay(Display):
|
|
124
130
|
self.progress_ui = None
|
125
131
|
self.parallel = False
|
126
132
|
self.live = None
|
127
|
-
if self.timer_handle:
|
128
|
-
self.timer_handle.cancel()
|
129
133
|
|
130
134
|
@override
|
131
135
|
@contextlib.contextmanager
|
@@ -161,7 +165,13 @@ class RichDisplay(Display):
|
|
161
165
|
r = task_live_status(self.tasks, self.progress_ui, self.counters)
|
162
166
|
self.live.update(r, refresh=True)
|
163
167
|
|
164
|
-
|
168
|
+
async def _update_display_loop(self) -> None:
|
169
|
+
try:
|
170
|
+
while True:
|
171
|
+
await anyio.sleep(1)
|
172
|
+
self._update_display()
|
173
|
+
except Exception:
|
174
|
+
pass
|
165
175
|
|
166
176
|
@override
|
167
177
|
def display_counter(self, caption: str, value: str) -> None:
|
@@ -1,16 +1,18 @@
|
|
1
|
-
import asyncio
|
2
1
|
import contextlib
|
3
2
|
from asyncio import CancelledError
|
4
3
|
from typing import (
|
5
4
|
Any,
|
6
5
|
AsyncIterator,
|
6
|
+
Awaitable,
|
7
|
+
Callable,
|
7
8
|
ClassVar,
|
8
|
-
Coroutine,
|
9
9
|
Generic,
|
10
10
|
Iterator,
|
11
11
|
cast,
|
12
12
|
)
|
13
13
|
|
14
|
+
import anyio
|
15
|
+
import anyio.from_thread
|
14
16
|
import rich
|
15
17
|
from rich.console import Console
|
16
18
|
from textual.app import App, ComposeResult
|
@@ -103,9 +105,8 @@ class TaskScreenApp(App[TR]):
|
|
103
105
|
if focus and self.app._driver:
|
104
106
|
textual_enable_mouse_support(self.app._driver)
|
105
107
|
|
106
|
-
def run_app(self, main:
|
107
|
-
|
108
|
-
self._worker = self.run_worker(main, start=False, exit_on_error=False)
|
108
|
+
def run_app(self, main: Callable[[], Awaitable[TR]]) -> TaskScreenResult[TR]:
|
109
|
+
self._worker = self.run_worker(main(), start=False, exit_on_error=False)
|
109
110
|
|
110
111
|
# run the app
|
111
112
|
self.run()
|
@@ -123,8 +124,8 @@ class TaskScreenApp(App[TR]):
|
|
123
124
|
|
124
125
|
async def on_load(self) -> None:
|
125
126
|
# events used to synchronise loading
|
126
|
-
self._on_load_app =
|
127
|
-
self._on_app_loaded =
|
127
|
+
self._on_load_app = anyio.Event()
|
128
|
+
self._on_app_loaded = anyio.Event()
|
128
129
|
|
129
130
|
# run the workers
|
130
131
|
self.workers.start_all()
|
@@ -136,7 +137,7 @@ class TaskScreenApp(App[TR]):
|
|
136
137
|
while not self._on_load_app.is_set():
|
137
138
|
if len(self.workers._workers) == 0:
|
138
139
|
return
|
139
|
-
await
|
140
|
+
await anyio.sleep(0.1)
|
140
141
|
|
141
142
|
@contextlib.contextmanager
|
142
143
|
def suspend_app(self) -> Iterator[None]:
|
@@ -422,7 +423,7 @@ class TaskScreenApp(App[TR]):
|
|
422
423
|
class TextualTaskScreen(TaskScreen, Generic[TR]):
|
423
424
|
def __init__(self, app: TaskScreenApp[TR]) -> None:
|
424
425
|
self.app = app
|
425
|
-
self.lock =
|
426
|
+
self.lock = anyio.Lock()
|
426
427
|
|
427
428
|
def __exit__(self, *excinfo: Any) -> None:
|
428
429
|
pass
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import contextlib
|
2
|
-
from typing import
|
2
|
+
from typing import AsyncIterator, Callable, Coroutine, Iterator
|
3
3
|
|
4
4
|
import rich
|
5
5
|
from typing_extensions import override
|
@@ -30,7 +30,7 @@ class TextualDisplay(Display):
|
|
30
30
|
yield RichProgress(total, progress)
|
31
31
|
|
32
32
|
@override
|
33
|
-
def run_task_app(self, main: Coroutine[
|
33
|
+
def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
|
34
34
|
# create and run the app
|
35
35
|
self.app = TaskScreenApp[TR]()
|
36
36
|
result = self.app.run_app(main)
|
@@ -36,3 +36,7 @@ class AppFooter(Widget):
|
|
36
36
|
def watch_right(self, new_right: RenderableType) -> None:
|
37
37
|
footer_right = cast(Static, self.query_one("#footer-right"))
|
38
38
|
footer_right.update(new_right)
|
39
|
+
if footer_right.tooltip is None:
|
40
|
+
footer_right.tooltip = (
|
41
|
+
"Execute 'inspect trace http' for a log of all HTTP requests."
|
42
|
+
)
|
@@ -506,6 +506,7 @@ class SampleToolbar(Horizontal):
|
|
506
506
|
# track the sample
|
507
507
|
self.sample = sample
|
508
508
|
|
509
|
+
status_group = self.query_one("#" + self.STATUS_GROUP)
|
509
510
|
pending_status = self.query_one("#" + self.PENDING_STATUS)
|
510
511
|
timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
|
511
512
|
clock = self.query_one(Clock)
|
@@ -537,11 +538,19 @@ class SampleToolbar(Horizontal):
|
|
537
538
|
pending_caption = cast(
|
538
539
|
Static, self.query_one("#" + self.PENDING_CAPTION)
|
539
540
|
)
|
540
|
-
|
541
|
-
|
542
|
-
if
|
543
|
-
|
544
|
-
|
541
|
+
if isinstance(last_event, ModelEvent):
|
542
|
+
# see if there are retries in play
|
543
|
+
if sample.retry_count > 0:
|
544
|
+
suffix = "retry" if sample.retry_count == 1 else "retries"
|
545
|
+
pending_caption_text = (
|
546
|
+
f"Generating ({sample.retry_count:,} {suffix})..."
|
547
|
+
)
|
548
|
+
else:
|
549
|
+
pending_caption_text = "Generating..."
|
550
|
+
else:
|
551
|
+
pending_caption_text = "Executing..."
|
552
|
+
status_group.styles.width = max(22, len(pending_caption_text))
|
553
|
+
|
545
554
|
pending_caption.update(
|
546
555
|
Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
|
547
556
|
)
|
inspect_ai/_eval/context.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from inspect_ai._util.dotenv import init_dotenv
|
2
2
|
from inspect_ai._util.hooks import init_hooks
|
3
|
-
from inspect_ai._util.logger import
|
3
|
+
from inspect_ai._util.logger import init_logger
|
4
4
|
from inspect_ai.approval._apply import have_tool_approval, init_tool_approval
|
5
5
|
from inspect_ai.approval._human.manager import init_human_approval_manager
|
6
6
|
from inspect_ai.approval._policy import ApprovalPolicy
|
@@ -20,7 +20,6 @@ def init_eval_context(
|
|
20
20
|
init_logger(log_level, log_level_transcript)
|
21
21
|
init_concurrency()
|
22
22
|
init_max_subprocesses(max_subprocesses)
|
23
|
-
init_http_rate_limit_count()
|
24
23
|
init_hooks()
|
25
24
|
init_active_samples()
|
26
25
|
init_human_approval_manager()
|
inspect_ai/_eval/eval.py
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
+
import sys
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, Literal
|
5
6
|
|
7
|
+
if sys.version_info < (3, 11):
|
8
|
+
from exceptiongroup import ExceptionGroup
|
9
|
+
|
6
10
|
from shortuuid import uuid
|
7
11
|
from typing_extensions import Unpack
|
8
12
|
|
@@ -166,43 +170,51 @@ def eval(
|
|
166
170
|
display, trace, max_tasks, max_samples, model
|
167
171
|
)
|
168
172
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
173
|
+
async def run_task_app() -> list[EvalLog]:
|
174
|
+
try:
|
175
|
+
return await eval_async(
|
176
|
+
tasks=tasks,
|
177
|
+
model=model,
|
178
|
+
model_base_url=model_base_url,
|
179
|
+
model_args=model_args,
|
180
|
+
task_args=task_args,
|
181
|
+
sandbox=sandbox,
|
182
|
+
sandbox_cleanup=sandbox_cleanup,
|
183
|
+
solver=solver,
|
184
|
+
tags=tags,
|
185
|
+
approval=approval,
|
186
|
+
log_level=log_level,
|
187
|
+
log_level_transcript=log_level_transcript,
|
188
|
+
log_dir=log_dir,
|
189
|
+
log_format=log_format,
|
190
|
+
limit=limit,
|
191
|
+
sample_id=sample_id,
|
192
|
+
epochs=epochs,
|
193
|
+
fail_on_error=fail_on_error,
|
194
|
+
debug_errors=debug_errors,
|
195
|
+
message_limit=message_limit,
|
196
|
+
token_limit=token_limit,
|
197
|
+
time_limit=time_limit,
|
198
|
+
working_limit=working_limit,
|
199
|
+
max_samples=max_samples,
|
200
|
+
max_tasks=max_tasks,
|
201
|
+
max_subprocesses=max_subprocesses,
|
202
|
+
max_sandboxes=max_sandboxes,
|
203
|
+
log_samples=log_samples,
|
204
|
+
log_images=log_images,
|
205
|
+
log_buffer=log_buffer,
|
206
|
+
score=score,
|
207
|
+
score_display=score_display,
|
208
|
+
**kwargs,
|
209
|
+
)
|
210
|
+
# exceptions can escape when debug_errors is True and that's okay
|
211
|
+
except ExceptionGroup as ex:
|
212
|
+
if debug_errors:
|
213
|
+
raise ex.exceptions[0] from None
|
214
|
+
else:
|
215
|
+
raise
|
216
|
+
|
217
|
+
return task_display().run_task_app(run_task_app)
|
206
218
|
|
207
219
|
|
208
220
|
# single call to eval_async at a time
|
@@ -556,8 +568,8 @@ def eval_retry(
|
|
556
568
|
# resolve eval trace
|
557
569
|
max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)
|
558
570
|
|
559
|
-
|
560
|
-
|
571
|
+
async def run_task_app() -> list[EvalLog]:
|
572
|
+
return await eval_retry_async(
|
561
573
|
tasks=tasks,
|
562
574
|
log_level=log_level,
|
563
575
|
log_level_transcript=log_level_transcript,
|
@@ -578,8 +590,9 @@ def eval_retry(
|
|
578
590
|
max_retries=max_retries,
|
579
591
|
timeout=timeout,
|
580
592
|
max_connections=max_connections,
|
581
|
-
)
|
582
|
-
|
593
|
+
)
|
594
|
+
|
595
|
+
return task_display().run_task_app(run_task_app)
|
583
596
|
|
584
597
|
|
585
598
|
async def eval_retry_async(
|