inspect-ai 0.3.102__py3-none-any.whl → 0.3.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +2 -1
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_display/core/active.py +3 -0
- inspect_ai/_display/core/config.py +1 -0
- inspect_ai/_display/core/panel.py +21 -13
- inspect_ai/_display/core/results.py +3 -7
- inspect_ai/_display/core/rich.py +3 -5
- inspect_ai/_display/log/__init__.py +0 -0
- inspect_ai/_display/log/display.py +173 -0
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_display/rich/display.py +2 -4
- inspect_ai/_display/textual/app.py +1 -6
- inspect_ai/_display/textual/widgets/task_detail.py +3 -14
- inspect_ai/_display/textual/widgets/tasks.py +1 -1
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/registry.py +6 -1
- inspect_ai/_eval/run.py +7 -1
- inspect_ai/_eval/task/constants.py +1 -0
- inspect_ai/_eval/task/log.py +5 -1
- inspect_ai/_eval/task/run.py +1 -1
- inspect_ai/_util/citation.py +88 -0
- inspect_ai/_util/content.py +24 -2
- inspect_ai/_util/json.py +17 -2
- inspect_ai/_util/registry.py +19 -4
- inspect_ai/_view/schema.py +0 -6
- inspect_ai/_view/www/dist/assets/index.css +82 -24
- inspect_ai/_view/www/dist/assets/index.js +10124 -9808
- inspect_ai/_view/www/log-schema.json +418 -1
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/@types/log.d.ts +140 -39
- inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
- inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
- inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
- inspect_ai/_view/www/src/tests/README.md +2 -2
- inspect_ai/_view/www/src/utils/git.ts +3 -1
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/agent/_handoff.py +3 -3
- inspect_ai/log/_condense.py +5 -0
- inspect_ai/log/_file.py +4 -1
- inspect_ai/log/_log.py +9 -4
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/json.py +5 -2
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/log/_util.py +2 -0
- inspect_ai/model/__init__.py +14 -0
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_openai_responses.py +80 -34
- inspect_ai/model/_providers/_anthropic_citations.py +158 -0
- inspect_ai/model/_providers/_google_citations.py +100 -0
- inspect_ai/model/_providers/anthropic.py +196 -34
- inspect_ai/model/_providers/google.py +94 -22
- inspect_ai/model/_providers/mistral.py +20 -7
- inspect_ai/model/_providers/openai.py +11 -10
- inspect_ai/model/_providers/openai_compatible.py +3 -2
- inspect_ai/model/_providers/openai_responses.py +2 -5
- inspect_ai/model/_providers/perplexity.py +123 -0
- inspect_ai/model/_providers/providers.py +13 -2
- inspect_ai/model/_providers/vertex.py +3 -0
- inspect_ai/model/_trim.py +5 -0
- inspect_ai/tool/__init__.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +5 -2
- inspect_ai/tool/_mcp/sampling.py +19 -3
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tool.py +10 -1
- inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
- inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
- inspect_ai/tool/_tools/_web_search/_google.py +22 -25
- inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
- inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
- inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
- inspect_ai/util/_display.py +11 -2
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_span.py +12 -1
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/RECORD +112 -88
- /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
- /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py
CHANGED
@@ -60,7 +60,8 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--display",
         type=click.Choice(
-            ["full", "conversation", "rich", "plain", "none"],
+            ["full", "conversation", "rich", "plain", "log", "none"],
+            case_sensitive=False,
         ),
         default=DEFAULT_DISPLAY,
         envvar="INSPECT_DISPLAY",
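
The new "log" display mode routes progress output through Python logging rather than the interactive UI, and display values are now matched case-insensitively. A minimal sketch of selecting it from Python, assuming a hypothetical registered task named "example_task" (the same mode can be chosen on the CLI with `inspect eval ... --display log`):

import os

from inspect_ai import eval

# Select the "log" display added in this release; because matching is now
# case-insensitive, "LOG" would also work.
os.environ["INSPECT_DISPLAY"] = "log"

eval("example_task", model="openai/gpt-4o")  # "example_task" is a placeholder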
inspect_ai/_cli/eval.py
CHANGED
@@ -641,7 +641,7 @@ def eval_command(
 @click.option(
     "--retry-connections",
     type=float,
-    help="Reduce max_connections at this rate with each retry (defaults to 0.
+    help="Reduce max_connections at this rate with each retry (defaults to 1.0, which results in no reduction).",
     envvar="INSPECT_EVAL_RETRY_CONNECTIONS",
 )
 @click.option(
@@ -966,6 +966,7 @@ def eval_exec(
         success, _ = eval_set(**params)
         return success
     else:
+        params["log_header_only"] = True  # cli invocation doesn't need full log
         eval(**params)
         return True

inspect_ai/_display/core/active.py
CHANGED
@@ -5,6 +5,7 @@ import rich

 from inspect_ai.util._display import display_type

+from ..log.display import LogDisplay
 from ..plain.display import PlainDisplay
 from ..rich.display import RichDisplay
 from ..textual.display import TextualDisplay
@@ -24,6 +25,8 @@ def display() -> Display:
         and not rich.get_console().is_jupyter
     ):
         _active_display = TextualDisplay()
+    elif display_type() == "log":
+        _active_display = LogDisplay()
     else:
         _active_display = RichDisplay()

inspect_ai/_display/core/config.py
CHANGED
@@ -30,6 +30,7 @@ def task_config(
     config = dict(profile.generate_config.model_dump(exclude_none=True)) | config
     if profile.tags:
         config["tags"] = ",".join(profile.tags)
+    config["dataset"] = profile.dataset
     config_print: list[str] = []
     for name, value in config.items():
         if name == "approval" and isinstance(value, dict):
inspect_ai/_display/core/panel.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Tuple

 import rich
-from rich.console import RenderableType
+from rich.console import Group, RenderableType
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
@@ -9,7 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai.util._display import
+from inspect_ai.util._display import display_type_plain

 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -27,7 +27,7 @@ def task_panel(
     log_location: str | None,
 ) -> RenderableType:
     # dispatch to plain handler if we are in plain mode
-    if
+    if display_type_plain():
         return task_panel_plain(
             profile, show_model, body, subtitle, footer, log_location
         )
@@ -89,23 +89,31 @@ def task_panel(
     log_location_relative = log_location

     root = Table.grid(expand=True)
-    root.add_column()
+    root.add_column(overflow="fold")
     root.add_row(table)
     root.add_row()
     root.add_row(
         f"[bold][{theme.light}]Log:[/{theme.light}][/bold] "
         + f"[{theme.link}]{log_location_relative}[/{theme.link}]"
     )
+    root.add_row()

-
-
-
-
-
-
-
-
-
+        panel = Panel(
+            task_panel_title(profile, show_model),
+            padding=(0, 0),
+            width=width,
+            height=3,
+            expand=True,
+        )
+        return Group(panel, root)
+    else:
+        return Panel(
+            root,
+            title=task_panel_title(profile, show_model),
+            title_align="left",
+            width=width,
+            expand=True,
+        )


 def task_panel_plain(
inspect_ai/_display/core/results.py
CHANGED
@@ -18,7 +18,7 @@ from .display import (
     TaskSuccess,
     TaskWithResult,
 )
-from .panel import task_panel
+from .panel import task_panel
 from .rich import rich_theme


@@ -41,8 +41,6 @@ def task_result_cancelled(
 ) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(cancelled.stats)

     # The panel
@@ -50,7 +48,7 @@ def task_result_cancelled(
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=
+        subtitle=config,
         footer=task_interrupted(profile, cancelled.samples_completed),
         log_location=profile.log_location,
     )
@@ -76,8 +74,6 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
 def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(success.stats)

     # the panel
@@ -85,7 +81,7 @@ def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> Renderabl
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=
+        subtitle=config,
         footer=task_results(profile, success),
         log_location=profile.log_location,
     )
inspect_ai/_display/core/rich.py
CHANGED
@@ -11,7 +11,7 @@ from typing_extensions import override

 from inspect_ai._util.platform import is_running_in_jupyterlab, is_running_in_vscode
 from inspect_ai._util.transcript import transcript_code_theme
-from inspect_ai.util._display import display_type
+from inspect_ai.util._display import display_type, display_type_plain


 def is_vscode_notebook(console: Console) -> bool:
@@ -20,15 +20,13 @@ def is_vscode_notebook(console: Console) -> bool:

 def rich_no_color() -> bool:
     return (
-
-        or not is_running_in_vscode()
-        or is_running_in_jupyterlab()
+        display_type_plain() or not is_running_in_vscode() or is_running_in_jupyterlab()
     )


 def rich_initialise() -> None:
     # reflect ansi prefs
-    if
+    if display_type_plain():
         rich.reconfigure(no_color=True, force_terminal=False, force_interactive=False)
     elif rich_no_color():
         rich.reconfigure(no_color=True)
inspect_ai/_display/log/__init__.py
ADDED
File without changes
inspect_ai/_display/log/display.py
ADDED
@@ -0,0 +1,173 @@
+import contextlib
+import logging
+from typing import AsyncIterator, Callable, Coroutine, Iterator
+
+import anyio
+from rich.console import Console
+
+from inspect_ai._util._async import configured_async_backend, run_coroutine
+from inspect_ai._util.platform import running_in_notebook
+
+from ...util import throttle
+from ...util._concurrency import concurrency_status_display
+from ..core.display import (
+    TR,
+    Display,
+    Progress,
+    TaskDisplay,
+    TaskDisplayMetric,
+    TaskProfile,
+    TaskResult,
+    TaskScreen,
+    TaskSpec,
+    TaskWithResult,
+)
+from ..core.footer import task_http_retries_str
+from ..core.results import task_metric, tasks_results
+
+
+class LogDisplay(Display):
+    def __init__(self) -> None:
+        self.total_tasks: int = 0
+        self.tasks: list[TaskWithResult] = []
+        self.parallel = False
+
+    def print(self, message: str) -> None:
+        logging.info(message, stacklevel=2)
+
+    @contextlib.contextmanager
+    def progress(self, total: int) -> Iterator[Progress]:
+        yield LogProgress(total)
+
+    def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
+        if running_in_notebook():
+            return run_coroutine(main())
+        else:
+            return anyio.run(main, backend=configured_async_backend())
+
+    @contextlib.contextmanager
+    def suspend_task_app(self) -> Iterator[None]:
+        yield
+
+    @contextlib.asynccontextmanager
+    async def task_screen(
+        self, tasks: list[TaskSpec], parallel: bool
+    ) -> AsyncIterator[TaskScreen]:
+        self.total_tasks = len(tasks)
+        self.tasks = []
+        self.parallel = parallel
+        try:
+            logging.info(f"Running {self.total_tasks} tasks...", stacklevel=3)
+            yield TaskScreen()
+        finally:
+            # Log final results
+            if self.tasks:
+                self._log_results()
+
+    @contextlib.contextmanager
+    def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
+        # Create and yield task display
+        task = TaskWithResult(profile, None)
+        self.tasks.append(task)
+        yield LogTaskDisplay(task)
+        self._log_status()
+
+    def display_counter(self, caption: str, value: str) -> None:
+        logging.info(f"{caption}: {value}", stacklevel=2)
+
+    def _log_status(self) -> None:
+        """Log status updates for all tasks"""
+        completed_tasks = sum(1 for task in self.tasks if task.result is not None)
+        total_tasks = len(self.tasks)
+        logging.info(f"{completed_tasks}/{total_tasks} tasks complete", stacklevel=4)
+
+    def _log_results(self) -> None:
+        """Log final results"""
+        results = tasks_results(self.tasks)
+        console = Console(width=120)
+        console.log(results, _stack_offset=4)
+
+
+class LogProgress(Progress):
+    def __init__(self, total: int):
+        self.total = total
+        self.current = 0
+
+    def update(self, n: int = 1) -> None:
+        self.current += n
+
+    def complete(self) -> None:
+        self.current = self.total
+
+
+class LogTaskDisplay(TaskDisplay):
+    def __init__(self, task: TaskWithResult):
+        self.task = task
+        self.progress_display: LogProgress | None = None
+        self.samples_complete = 0
+        self.samples_total = 0
+        self.current_metrics: list[TaskDisplayMetric] | None = None
+
+    @contextlib.contextmanager
+    def progress(self) -> Iterator[Progress]:
+        self.progress_display = LogProgress(self.task.profile.steps)
+        yield self.progress_display
+
+    @throttle(5)
+    def _log_status_throttled(self, stacklevel: int) -> None:
+        self._log_status(stacklevel=stacklevel + 2)
+
+    def _log_status(self, stacklevel: int) -> None:
+        """Log status updates"""
+        status_parts: list[str] = []
+
+        # Add task name and model
+        status_parts.append(f"Task: {self.task.profile.name}")
+        status_parts.append(f"Model: {self.task.profile.model}")
+
+        # Add step progress
+        if self.progress_display:
+            progress_percent = int(
+                self.progress_display.current / self.progress_display.total * 100
+            )
+            status_parts.append(
+                f"Steps: {self.progress_display.current}/{self.progress_display.total} {progress_percent}%"
+            )
+
+        # Add sample progress
+        status_parts.append(f"Samples: {self.samples_complete}/{self.samples_total}")
+
+        # Add metrics
+        if self.current_metrics:
+            metric_str = task_metric(self.current_metrics)
+            status_parts.append(metric_str)
+
+        # Add resource usage
+        resources_dict: dict[str, str] = {}
+        for model, resource in concurrency_status_display().items():
+            resources_dict[model] = f"{resource[0]}/{resource[1]}"
+        resources = ", ".join(
+            [f"{key}: {value}" for key, value in resources_dict.items()]
+        )
+        status_parts.append(resources)
+
+        # Add rate limits
+        rate_limits = task_http_retries_str()
+        if rate_limits:
+            status_parts.append(rate_limits)
+
+        # Print on new line
+        logging.info(", ".join(status_parts), stacklevel=stacklevel)
+
+    def sample_complete(self, complete: int, total: int) -> None:
+        self.samples_complete = complete
+        self.samples_total = total
+        self._log_status_throttled(stacklevel=3)
+
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        self.current_metrics = metrics
+        self._log_status_throttled(stacklevel=3)
+
+    def complete(self, result: TaskResult) -> None:
+        self.task.result = result
+        self._log_status(stacklevel=3)
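
Because the LogDisplay above reports everything via logging.info(), a handler at INFO level is needed for the status lines to appear anywhere. A minimal sketch, assuming a hypothetical "example_task"; the exact record format is whatever your logging configuration produces:

import logging
import os

from inspect_ai import eval

# Surface LogDisplay's status records ("Task: ...", "Samples: 3/10", metrics,
# concurrency and rate-limit counters) on stderr.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

os.environ["INSPECT_DISPLAY"] = "log"
eval("example_task", model="openai/gpt-4o")  # placeholder task name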
inspect_ai/_display/plain/display.py
CHANGED
@@ -25,7 +25,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_http_retries_str
-from ..core.panel import task_panel
+from ..core.panel import task_panel
 from ..core.results import task_metric, tasks_results


@@ -79,7 +79,7 @@ class PlainDisplay(Display):
             profile=profile,
             show_model=True,
             body="",  # Empty body since we haven't started yet
-            subtitle=
+            subtitle=task_config(profile),
             footer=None,
             log_location=None,
         )
inspect_ai/_display/rich/display.py
CHANGED
@@ -32,7 +32,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import task_panel,
+from ..core.panel import task_panel, task_title, tasks_title
 from ..core.progress import (
     RichProgress,
     progress_description,
@@ -311,15 +311,13 @@ def task_live_status(

     # the panel contents
     config = task_config(tasks[0].profile, style=theme.light)
-    targets = task_targets(tasks[0].profile)
-    subtitle = config, targets

     # the panel
     return task_panel(
         profile=tasks[0].profile,
         show_model=len(tasks) == 1,
         body=Group("", progress),
-        subtitle=
+        subtitle=config,
         footer=task_footer(counters, theme.light),
         log_location=None,
     )
inspect_ai/_display/textual/app.py
CHANGED
@@ -42,7 +42,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import
+from ..core.panel import task_title, tasks_title
 from ..core.rich import record_console_input, rich_initialise, rich_theme
 from .theme import inspect_dark, inspect_light
 from .widgets.console import ConsoleView
@@ -296,13 +296,8 @@ class TaskScreenApp(App[TR]):
             tasks.config = task_config(
                 self._tasks[0].profile, generate_config=not self._parallel
             )
-            if not self._parallel:
-                tasks.targets = task_targets(self._tasks[0].profile)
-            else:
-                tasks.targets = " \n "
         else:
             tasks.config = ""
-            tasks.targets = ""

     def update_samples(self) -> None:
         samples_view = self.query_one(SamplesView)
inspect_ai/_display/textual/widgets/task_detail.py
CHANGED
@@ -30,6 +30,8 @@ class TaskDetail(Widget):
         width: 100%;
         height: auto;
         grid-gutter: 1 3;
+        grid-size-columns: 3;
+        grid-columns: 1fr 1fr 1fr;
     }
     """

@@ -92,20 +94,6 @@ class TaskDetail(Widget):
         if len(self.by_reducer) == 0:
             return

-        # Compute the row and column count
-        row_count = len(self.by_reducer)
-        col_count = len(next(iter(self.by_reducer.values())))
-
-        # If this can fit in a single row, make it fit
-        # otherwise place each reducer on their own row
-        self.grid.styles.grid_columns = "auto"
-        if row_count * col_count < 4:
-            self.grid.styles.grid_size_columns = row_count * col_count
-            self.grid.styles.grid_size_rows = 1
-        else:
-            self.grid.styles.grid_size_columns = col_count
-            self.grid.styles.grid_size_rows = row_count
-
         # In order to reduce flashing the below tracks use of widgets
         # and updates them when possible (removing and adding them as needed)
         # Makes keys for tracking Task Metric widgets
@@ -142,6 +130,7 @@ class TaskMetrics(Widget):
     TaskMetrics {
         width: auto;
        height: auto;
+        border: solid $foreground 20%;
     }
     TaskMetrics Grid {
         width: auto;
inspect_ai/_eval/eval.py
CHANGED
@@ -105,6 +105,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
         log_shared: Sync sample events to log directory so that users on other systems
             can see log updates in realtime (defaults to no syncing). Specify `True`
             to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+        log_header_only: If `True`, the function should return only log headers rather
+            than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=log_header_only,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        log_shared: Indicate that the log directory is shared, which results in additional
+        log_shared: Indicate that the log directory is shared, which results in additional
+            syncing of realtime log data for Inspect View.
+        log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
     # resolve log_shared
     log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared

+    # resolve header only
+    log_header_only = log_header_only is True
+
     # validate that --log-shared can't use used with 'json' format
     if log_shared and log_format == JSON_LOG_FORMAT:
         raise PrerequisiteError(
@@ -507,6 +517,7 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
+            header_only=log_header_only,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -532,6 +543,7 @@ async def eval_async(
                 eval_config=eval_config,
                 eval_sandbox=sandbox,
                 recorder=recorder,
+                header_only=log_header_only,
                 epochs_reducer=epochs_reducer,
                 solver=solver,
                 tags=tags,
@@ -800,7 +812,7 @@ async def eval_retry_async(
         model_roles = model_roles_config_to_model_roles(eval_log.eval.model_roles)

         # collect the rest of the params we need for the eval
-        task_args = eval_log.eval.
+        task_args = eval_log.eval.task_args_passed
         tags = eval_log.eval.tags
         limit = eval_log.eval.config.limit
         sample_id = eval_log.eval.config.sample_id
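
The new log_header_only option (set automatically by the CLI and by eval_set, as shown above and below) makes eval() return header-only logs. A sketch of opting in directly from Python, assuming a hypothetical "example_task"; attribute names follow the public EvalLog API:

from inspect_ai import eval

# Header-only logs carry the eval spec, results and stats but omit sample
# bodies, which keeps memory use low for large runs; samples can still be
# read later from the log files on disk.
logs = eval("example_task", model="openai/gpt-4o", log_header_only=True)
for log in logs:
    print(log.status, log.location)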
inspect_ai/_eval/evalset.py
CHANGED
@@ -114,7 +114,7 @@ def eval_set(
            (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
            per-retry will in no case by longer than 1 hour.
         retry_connections: Reduce max_connections at this rate with each retry
-            (defaults to 0.
+            (defaults to 1.0, which results in no reduction).
         retry_cleanup: Cleanup failed log files after retries
            (defaults to True)
         model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
@@ -235,6 +235,7 @@ def eval_set(
             log_images=log_images,
             log_buffer=log_buffer,
             log_shared=log_shared,
+            log_header_only=True,
             score=score,
             **kwargs,
         )
@@ -274,7 +275,7 @@ def eval_set(
     fs.mkdir(log_dir, exist_ok=True)

     # resolve some parameters
-    retry_connections = retry_connections or 0
+    retry_connections = retry_connections or 1.0
     retry_cleanup = retry_cleanup is not False
     max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
     max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)
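
Per the option help above, retry_connections scales max_connections on each retry pass, and the default is now 1.0 (no reduction) rather than a reducing factor. A sketch with an explicit reduction, assuming a hypothetical task and log directory:

from inspect_ai import eval_set

# With max_connections=32 and retry_connections=0.5, successive retry passes
# run with 32 -> 16 -> 8 -> ... connections; retry_connections=1.0 (the new
# default) keeps 32 throughout.
success, logs = eval_set(
    tasks="example_task",        # placeholder task name
    log_dir="logs/example-set",  # placeholder log directory
    model="openai/gpt-4o",
    max_connections=32,
    retry_connections=0.5,
)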
inspect_ai/_eval/registry.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.package import get_installed_package_name
 from inspect_ai._util.registry import (
     RegistryInfo,
+    extract_named_params,
     registry_add,
     registry_create,
     registry_info,
@@ -17,7 +18,7 @@ from inspect_ai._util.registry import (
 )

 from .task import Task
-from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
+from .task.constants import TASK_ALL_PARAMS_ATTR, TASK_FILE_ATTR, TASK_RUN_DIR_ATTR

 MODEL_PARAM = "model"

@@ -133,6 +134,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
                 **w_kwargs,
             )

+            # extract all task parameters including defaults
+            named_params = extract_named_params(task_type, True, *w_args, **w_kwargs)
+            setattr(task_instance, TASK_ALL_PARAMS_ATTR, named_params)
+
             # if its not from an installed package then it is a "local"
             # module import, so set its task file and run dir
             if get_installed_package_name(task_type) is None:
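
With the registry change above, the @task decorator records every parameter of the task function (defaults included), and the eval_run change below writes that full set to the log as task_args while the values actually passed are preserved separately as task_args_passed (used by eval retry). A sketch with a hypothetical task:

from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset

@task
def example_task(difficulty: str = "easy", epochs: int = 1) -> Task:
    # hypothetical task: both parameters are captured, even when the caller
    # relies on their defaults
    return Task(dataset=example_dataset("theory_of_mind"), epochs=epochs)

# Only `difficulty` is passed explicitly; the resulting log should record
# roughly task_args={"difficulty": "hard", "epochs": 1} and
# task_args_passed={"difficulty": "hard"}.
my_task = example_task(difficulty="hard")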
inspect_ai/_eval/run.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import sys
 from typing import Any, Awaitable, Callable, Set, cast

+from inspect_ai._eval.task.constants import TASK_ALL_PARAMS_ATTR
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.environ import environ_vars
 from inspect_ai._util.trace import trace_action
@@ -63,6 +64,7 @@ async def eval_run(
     eval_config: EvalConfig,
     eval_sandbox: SandboxEnvironmentType | None,
     recorder: Recorder,
+    header_only: bool,
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
@@ -207,11 +209,15 @@ async def eval_run(
                 metrics=eval_metrics,
                 sandbox=resolved_task.sandbox,
                 task_attribs=task.attribs,
-                task_args=
+                task_args=getattr(
+                    task, TASK_ALL_PARAMS_ATTR, resolved_task.task_args
+                ),
+                task_args_passed=resolved_task.task_args,
                 model_args=resolved_task.model.model_args,
                 eval_config=task_eval_config,
                 metadata=((metadata or {}) | (task.metadata or {})) or None,
                 recorder=recorder,
+                header_only=header_only,
             )
             await logger.init()
