inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +2 -1
- inspect_ai/_cli/eval.py +2 -2
- inspect_ai/_display/core/active.py +3 -0
- inspect_ai/_display/core/config.py +1 -0
- inspect_ai/_display/core/panel.py +21 -13
- inspect_ai/_display/core/results.py +3 -7
- inspect_ai/_display/core/rich.py +3 -5
- inspect_ai/_display/log/__init__.py +0 -0
- inspect_ai/_display/log/display.py +173 -0
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_display/rich/display.py +2 -4
- inspect_ai/_display/textual/app.py +1 -6
- inspect_ai/_display/textual/widgets/task_detail.py +3 -14
- inspect_ai/_display/textual/widgets/tasks.py +1 -1
- inspect_ai/_eval/eval.py +1 -1
- inspect_ai/_eval/evalset.py +3 -3
- inspect_ai/_eval/registry.py +6 -1
- inspect_ai/_eval/run.py +5 -1
- inspect_ai/_eval/task/constants.py +1 -0
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/run.py +65 -39
- inspect_ai/_util/citation.py +88 -0
- inspect_ai/_util/content.py +24 -2
- inspect_ai/_util/json.py +17 -2
- inspect_ai/_util/registry.py +19 -4
- inspect_ai/_view/schema.py +0 -6
- inspect_ai/_view/server.py +17 -0
- inspect_ai/_view/www/dist/assets/index.css +93 -31
- inspect_ai/_view/www/dist/assets/index.js +10639 -10011
- inspect_ai/_view/www/log-schema.json +418 -1
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/@types/log.d.ts +140 -39
- inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
- inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
- inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
- inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
- inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
- inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
- inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
- inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
- inspect_ai/_view/www/src/client/api/types.ts +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
- inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
- inspect_ai/_view/www/src/tests/README.md +2 -2
- inspect_ai/_view/www/src/utils/git.ts +3 -1
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/agent/_handoff.py +8 -5
- inspect_ai/agent/_react.py +5 -5
- inspect_ai/dataset/_dataset.py +1 -1
- inspect_ai/log/_condense.py +5 -0
- inspect_ai/log/_file.py +4 -1
- inspect_ai/log/_log.py +9 -4
- inspect_ai/log/_recorders/json.py +4 -2
- inspect_ai/log/_samples.py +5 -0
- inspect_ai/log/_util.py +2 -0
- inspect_ai/model/__init__.py +14 -0
- inspect_ai/model/_call_tools.py +17 -8
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_openai_responses.py +80 -34
- inspect_ai/model/_providers/_anthropic_citations.py +158 -0
- inspect_ai/model/_providers/_google_citations.py +100 -0
- inspect_ai/model/_providers/anthropic.py +219 -36
- inspect_ai/model/_providers/google.py +98 -22
- inspect_ai/model/_providers/mistral.py +20 -7
- inspect_ai/model/_providers/openai.py +11 -10
- inspect_ai/model/_providers/openai_compatible.py +3 -2
- inspect_ai/model/_providers/openai_responses.py +2 -5
- inspect_ai/model/_providers/perplexity.py +123 -0
- inspect_ai/model/_providers/providers.py +13 -2
- inspect_ai/model/_providers/vertex.py +3 -0
- inspect_ai/model/_trim.py +5 -0
- inspect_ai/tool/__init__.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +5 -2
- inspect_ai/tool/_mcp/sampling.py +19 -3
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tool.py +10 -1
- inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
- inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
- inspect_ai/tool/_tools/_web_search/_google.py +22 -25
- inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
- inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
- inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
- inspect_ai/util/__init__.py +8 -0
- inspect_ai/util/_background.py +64 -0
- inspect_ai/util/_display.py +11 -2
- inspect_ai/util/_limit.py +72 -5
- inspect_ai/util/_sandbox/__init__.py +2 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/service.py +28 -7
- inspect_ai/util/_span.py +12 -1
- inspect_ai/util/_subprocess.py +51 -38
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +134 -109
- /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
- /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py
CHANGED
@@ -60,7 +60,8 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
60
60
|
@click.option(
|
61
61
|
"--display",
|
62
62
|
type=click.Choice(
|
63
|
-
["full", "conversation", "rich", "plain", "none"],
|
63
|
+
["full", "conversation", "rich", "plain", "log", "none"],
|
64
|
+
case_sensitive=False,
|
64
65
|
),
|
65
66
|
default=DEFAULT_DISPLAY,
|
66
67
|
envvar="INSPECT_DISPLAY",
|
inspect_ai/_cli/eval.py
CHANGED
@@ -641,7 +641,7 @@ def eval_command(
|
|
641
641
|
@click.option(
|
642
642
|
"--retry-connections",
|
643
643
|
type=float,
|
644
|
-
help="Reduce max_connections at this rate with each retry (defaults to 0.
|
644
|
+
help="Reduce max_connections at this rate with each retry (defaults to 1.0, which results in no reduction).",
|
645
645
|
envvar="INSPECT_EVAL_RETRY_CONNECTIONS",
|
646
646
|
)
|
647
647
|
@click.option(
|
@@ -966,7 +966,7 @@ def eval_exec(
|
|
966
966
|
success, _ = eval_set(**params)
|
967
967
|
return success
|
968
968
|
else:
|
969
|
-
params["log_header_only"] =
|
969
|
+
params["log_header_only"] = True # cli invocation doesn't need full log
|
970
970
|
eval(**params)
|
971
971
|
return True
|
972
972
|
|
inspect_ai/_display/core/active.py
CHANGED
@@ -5,6 +5,7 @@ import rich
|
|
5
5
|
|
6
6
|
from inspect_ai.util._display import display_type
|
7
7
|
|
8
|
+
from ..log.display import LogDisplay
|
8
9
|
from ..plain.display import PlainDisplay
|
9
10
|
from ..rich.display import RichDisplay
|
10
11
|
from ..textual.display import TextualDisplay
|
@@ -24,6 +25,8 @@ def display() -> Display:
|
|
24
25
|
and not rich.get_console().is_jupyter
|
25
26
|
):
|
26
27
|
_active_display = TextualDisplay()
|
28
|
+
elif display_type() == "log":
|
29
|
+
_active_display = LogDisplay()
|
27
30
|
else:
|
28
31
|
_active_display = RichDisplay()
|
29
32
|
|
inspect_ai/_display/core/config.py
CHANGED
@@ -30,6 +30,7 @@ def task_config(
|
|
30
30
|
config = dict(profile.generate_config.model_dump(exclude_none=True)) | config
|
31
31
|
if profile.tags:
|
32
32
|
config["tags"] = ",".join(profile.tags)
|
33
|
+
config["dataset"] = profile.dataset
|
33
34
|
config_print: list[str] = []
|
34
35
|
for name, value in config.items():
|
35
36
|
if name == "approval" and isinstance(value, dict):
|
inspect_ai/_display/core/panel.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from typing import Tuple
|
2
2
|
|
3
3
|
import rich
|
4
|
-
from rich.console import RenderableType
|
4
|
+
from rich.console import Group, RenderableType
|
5
5
|
from rich.panel import Panel
|
6
6
|
from rich.table import Table
|
7
7
|
from rich.text import Text
|
@@ -9,7 +9,7 @@ from rich.text import Text
|
|
9
9
|
from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
|
10
10
|
from inspect_ai._util.path import cwd_relative_path
|
11
11
|
from inspect_ai._util.registry import registry_unqualified_name
|
12
|
-
from inspect_ai.util._display import
|
12
|
+
from inspect_ai.util._display import display_type_plain
|
13
13
|
|
14
14
|
from .display import TaskProfile
|
15
15
|
from .rich import is_vscode_notebook, rich_theme
|
@@ -27,7 +27,7 @@ def task_panel(
|
|
27
27
|
log_location: str | None,
|
28
28
|
) -> RenderableType:
|
29
29
|
# dispatch to plain handler if we are in plain mode
|
30
|
-
if
|
30
|
+
if display_type_plain():
|
31
31
|
return task_panel_plain(
|
32
32
|
profile, show_model, body, subtitle, footer, log_location
|
33
33
|
)
|
@@ -89,23 +89,31 @@ def task_panel(
|
|
89
89
|
log_location_relative = log_location
|
90
90
|
|
91
91
|
root = Table.grid(expand=True)
|
92
|
-
root.add_column()
|
92
|
+
root.add_column(overflow="fold")
|
93
93
|
root.add_row(table)
|
94
94
|
root.add_row()
|
95
95
|
root.add_row(
|
96
96
|
f"[bold][{theme.light}]Log:[/{theme.light}][/bold] "
|
97
97
|
+ f"[{theme.link}]{log_location_relative}[/{theme.link}]"
|
98
98
|
)
|
99
|
+
root.add_row()
|
99
100
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
101
|
+
panel = Panel(
|
102
|
+
task_panel_title(profile, show_model),
|
103
|
+
padding=(0, 0),
|
104
|
+
width=width,
|
105
|
+
height=3,
|
106
|
+
expand=True,
|
107
|
+
)
|
108
|
+
return Group(panel, root)
|
109
|
+
else:
|
110
|
+
return Panel(
|
111
|
+
root,
|
112
|
+
title=task_panel_title(profile, show_model),
|
113
|
+
title_align="left",
|
114
|
+
width=width,
|
115
|
+
expand=True,
|
116
|
+
)
|
109
117
|
|
110
118
|
|
111
119
|
def task_panel_plain(
|
inspect_ai/_display/core/results.py
CHANGED
@@ -18,7 +18,7 @@ from .display import (
|
|
18
18
|
TaskSuccess,
|
19
19
|
TaskWithResult,
|
20
20
|
)
|
21
|
-
from .panel import task_panel, task_targets
|
21
|
+
from .panel import task_panel
|
22
22
|
from .rich import rich_theme
|
23
23
|
|
24
24
|
|
@@ -41,8 +41,6 @@ def task_result_cancelled(
|
|
41
41
|
) -> RenderableType:
|
42
42
|
# The contents of the panel
|
43
43
|
config = task_config(profile)
|
44
|
-
targets = task_targets(profile)
|
45
|
-
subtitle = config, targets
|
46
44
|
body = task_stats(cancelled.stats)
|
47
45
|
|
48
46
|
# The panel
|
@@ -50,7 +48,7 @@ def task_result_cancelled(
|
|
50
48
|
profile=profile,
|
51
49
|
show_model=True,
|
52
50
|
body=body,
|
53
|
-
subtitle=subtitle,
|
51
|
+
subtitle=config,
|
54
52
|
footer=task_interrupted(profile, cancelled.samples_completed),
|
55
53
|
log_location=profile.log_location,
|
56
54
|
)
|
@@ -76,8 +74,6 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
|
|
76
74
|
def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
|
77
75
|
# The contents of the panel
|
78
76
|
config = task_config(profile)
|
79
|
-
targets = task_targets(profile)
|
80
|
-
subtitle = config, targets
|
81
77
|
body = task_stats(success.stats)
|
82
78
|
|
83
79
|
# the panel
|
@@ -85,7 +81,7 @@ def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> Renderabl
|
|
85
81
|
profile=profile,
|
86
82
|
show_model=True,
|
87
83
|
body=body,
|
88
|
-
subtitle=subtitle,
|
84
|
+
subtitle=config,
|
89
85
|
footer=task_results(profile, success),
|
90
86
|
log_location=profile.log_location,
|
91
87
|
)
|
inspect_ai/_display/core/rich.py
CHANGED
@@ -11,7 +11,7 @@ from typing_extensions import override
|
|
11
11
|
|
12
12
|
from inspect_ai._util.platform import is_running_in_jupyterlab, is_running_in_vscode
|
13
13
|
from inspect_ai._util.transcript import transcript_code_theme
|
14
|
-
from inspect_ai.util._display import display_type
|
14
|
+
from inspect_ai.util._display import display_type, display_type_plain
|
15
15
|
|
16
16
|
|
17
17
|
def is_vscode_notebook(console: Console) -> bool:
|
@@ -20,15 +20,13 @@ def is_vscode_notebook(console: Console) -> bool:
|
|
20
20
|
|
21
21
|
def rich_no_color() -> bool:
|
22
22
|
return (
|
23
|
-
|
24
|
-
or not is_running_in_vscode()
|
25
|
-
or is_running_in_jupyterlab()
|
23
|
+
display_type_plain() or not is_running_in_vscode() or is_running_in_jupyterlab()
|
26
24
|
)
|
27
25
|
|
28
26
|
|
29
27
|
def rich_initialise() -> None:
|
30
28
|
# reflect ansi prefs
|
31
|
-
if
|
29
|
+
if display_type_plain():
|
32
30
|
rich.reconfigure(no_color=True, force_terminal=False, force_interactive=False)
|
33
31
|
elif rich_no_color():
|
34
32
|
rich.reconfigure(no_color=True)
|
inspect_ai/_display/log/__init__.py
File without changes
|
inspect_ai/_display/log/display.py
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
import contextlib
|
2
|
+
import logging
|
3
|
+
from typing import AsyncIterator, Callable, Coroutine, Iterator
|
4
|
+
|
5
|
+
import anyio
|
6
|
+
from rich.console import Console
|
7
|
+
|
8
|
+
from inspect_ai._util._async import configured_async_backend, run_coroutine
|
9
|
+
from inspect_ai._util.platform import running_in_notebook
|
10
|
+
|
11
|
+
from ...util import throttle
|
12
|
+
from ...util._concurrency import concurrency_status_display
|
13
|
+
from ..core.display import (
|
14
|
+
TR,
|
15
|
+
Display,
|
16
|
+
Progress,
|
17
|
+
TaskDisplay,
|
18
|
+
TaskDisplayMetric,
|
19
|
+
TaskProfile,
|
20
|
+
TaskResult,
|
21
|
+
TaskScreen,
|
22
|
+
TaskSpec,
|
23
|
+
TaskWithResult,
|
24
|
+
)
|
25
|
+
from ..core.footer import task_http_retries_str
|
26
|
+
from ..core.results import task_metric, tasks_results
|
27
|
+
|
28
|
+
|
29
|
+
class LogDisplay(Display):
|
30
|
+
def __init__(self) -> None:
|
31
|
+
self.total_tasks: int = 0
|
32
|
+
self.tasks: list[TaskWithResult] = []
|
33
|
+
self.parallel = False
|
34
|
+
|
35
|
+
def print(self, message: str) -> None:
|
36
|
+
logging.info(message, stacklevel=2)
|
37
|
+
|
38
|
+
@contextlib.contextmanager
|
39
|
+
def progress(self, total: int) -> Iterator[Progress]:
|
40
|
+
yield LogProgress(total)
|
41
|
+
|
42
|
+
def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
|
43
|
+
if running_in_notebook():
|
44
|
+
return run_coroutine(main())
|
45
|
+
else:
|
46
|
+
return anyio.run(main, backend=configured_async_backend())
|
47
|
+
|
48
|
+
@contextlib.contextmanager
|
49
|
+
def suspend_task_app(self) -> Iterator[None]:
|
50
|
+
yield
|
51
|
+
|
52
|
+
@contextlib.asynccontextmanager
|
53
|
+
async def task_screen(
|
54
|
+
self, tasks: list[TaskSpec], parallel: bool
|
55
|
+
) -> AsyncIterator[TaskScreen]:
|
56
|
+
self.total_tasks = len(tasks)
|
57
|
+
self.tasks = []
|
58
|
+
self.parallel = parallel
|
59
|
+
try:
|
60
|
+
logging.info(f"Running {self.total_tasks} tasks...", stacklevel=3)
|
61
|
+
yield TaskScreen()
|
62
|
+
finally:
|
63
|
+
# Log final results
|
64
|
+
if self.tasks:
|
65
|
+
self._log_results()
|
66
|
+
|
67
|
+
@contextlib.contextmanager
|
68
|
+
def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
|
69
|
+
# Create and yield task display
|
70
|
+
task = TaskWithResult(profile, None)
|
71
|
+
self.tasks.append(task)
|
72
|
+
yield LogTaskDisplay(task)
|
73
|
+
self._log_status()
|
74
|
+
|
75
|
+
def display_counter(self, caption: str, value: str) -> None:
|
76
|
+
logging.info(f"{caption}: {value}", stacklevel=2)
|
77
|
+
|
78
|
+
def _log_status(self) -> None:
|
79
|
+
"""Log status updates for all tasks"""
|
80
|
+
completed_tasks = sum(1 for task in self.tasks if task.result is not None)
|
81
|
+
total_tasks = len(self.tasks)
|
82
|
+
logging.info(f"{completed_tasks}/{total_tasks} tasks complete", stacklevel=4)
|
83
|
+
|
84
|
+
def _log_results(self) -> None:
|
85
|
+
"""Log final results"""
|
86
|
+
results = tasks_results(self.tasks)
|
87
|
+
console = Console(width=120)
|
88
|
+
console.log(results, _stack_offset=4)
|
89
|
+
|
90
|
+
|
91
|
+
class LogProgress(Progress):
|
92
|
+
def __init__(self, total: int):
|
93
|
+
self.total = total
|
94
|
+
self.current = 0
|
95
|
+
|
96
|
+
def update(self, n: int = 1) -> None:
|
97
|
+
self.current += n
|
98
|
+
|
99
|
+
def complete(self) -> None:
|
100
|
+
self.current = self.total
|
101
|
+
|
102
|
+
|
103
|
+
class LogTaskDisplay(TaskDisplay):
|
104
|
+
def __init__(self, task: TaskWithResult):
|
105
|
+
self.task = task
|
106
|
+
self.progress_display: LogProgress | None = None
|
107
|
+
self.samples_complete = 0
|
108
|
+
self.samples_total = 0
|
109
|
+
self.current_metrics: list[TaskDisplayMetric] | None = None
|
110
|
+
|
111
|
+
@contextlib.contextmanager
|
112
|
+
def progress(self) -> Iterator[Progress]:
|
113
|
+
self.progress_display = LogProgress(self.task.profile.steps)
|
114
|
+
yield self.progress_display
|
115
|
+
|
116
|
+
@throttle(5)
|
117
|
+
def _log_status_throttled(self, stacklevel: int) -> None:
|
118
|
+
self._log_status(stacklevel=stacklevel + 2)
|
119
|
+
|
120
|
+
def _log_status(self, stacklevel: int) -> None:
|
121
|
+
"""Log status updates"""
|
122
|
+
status_parts: list[str] = []
|
123
|
+
|
124
|
+
# Add task name and model
|
125
|
+
status_parts.append(f"Task: {self.task.profile.name}")
|
126
|
+
status_parts.append(f"Model: {self.task.profile.model}")
|
127
|
+
|
128
|
+
# Add step progress
|
129
|
+
if self.progress_display:
|
130
|
+
progress_percent = int(
|
131
|
+
self.progress_display.current / self.progress_display.total * 100
|
132
|
+
)
|
133
|
+
status_parts.append(
|
134
|
+
f"Steps: {self.progress_display.current}/{self.progress_display.total} {progress_percent}%"
|
135
|
+
)
|
136
|
+
|
137
|
+
# Add sample progress
|
138
|
+
status_parts.append(f"Samples: {self.samples_complete}/{self.samples_total}")
|
139
|
+
|
140
|
+
# Add metrics
|
141
|
+
if self.current_metrics:
|
142
|
+
metric_str = task_metric(self.current_metrics)
|
143
|
+
status_parts.append(metric_str)
|
144
|
+
|
145
|
+
# Add resource usage
|
146
|
+
resources_dict: dict[str, str] = {}
|
147
|
+
for model, resource in concurrency_status_display().items():
|
148
|
+
resources_dict[model] = f"{resource[0]}/{resource[1]}"
|
149
|
+
resources = ", ".join(
|
150
|
+
[f"{key}: {value}" for key, value in resources_dict.items()]
|
151
|
+
)
|
152
|
+
status_parts.append(resources)
|
153
|
+
|
154
|
+
# Add rate limits
|
155
|
+
rate_limits = task_http_retries_str()
|
156
|
+
if rate_limits:
|
157
|
+
status_parts.append(rate_limits)
|
158
|
+
|
159
|
+
# Print on new line
|
160
|
+
logging.info(", ".join(status_parts), stacklevel=stacklevel)
|
161
|
+
|
162
|
+
def sample_complete(self, complete: int, total: int) -> None:
|
163
|
+
self.samples_complete = complete
|
164
|
+
self.samples_total = total
|
165
|
+
self._log_status_throttled(stacklevel=3)
|
166
|
+
|
167
|
+
def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
|
168
|
+
self.current_metrics = metrics
|
169
|
+
self._log_status_throttled(stacklevel=3)
|
170
|
+
|
171
|
+
def complete(self, result: TaskResult) -> None:
|
172
|
+
self.task.result = result
|
173
|
+
self._log_status(stacklevel=3)
|
inspect_ai/_display/plain/display.py
CHANGED
@@ -25,7 +25,7 @@ from ..core.display import (
|
|
25
25
|
TaskWithResult,
|
26
26
|
)
|
27
27
|
from ..core.footer import task_http_retries_str
|
28
|
-
from ..core.panel import task_panel, task_targets
|
28
|
+
from ..core.panel import task_panel
|
29
29
|
from ..core.results import task_metric, tasks_results
|
30
30
|
|
31
31
|
|
@@ -79,7 +79,7 @@ class PlainDisplay(Display):
|
|
79
79
|
profile=profile,
|
80
80
|
show_model=True,
|
81
81
|
body="", # Empty body since we haven't started yet
|
82
|
-
subtitle=
|
82
|
+
subtitle=task_config(profile),
|
83
83
|
footer=None,
|
84
84
|
log_location=None,
|
85
85
|
)
|
inspect_ai/_display/rich/display.py
CHANGED
@@ -32,7 +32,7 @@ from ..core.display import (
|
|
32
32
|
TaskWithResult,
|
33
33
|
)
|
34
34
|
from ..core.footer import task_footer
|
35
|
-
from ..core.panel import task_panel, task_targets, task_title, tasks_title
|
35
|
+
from ..core.panel import task_panel, task_title, tasks_title
|
36
36
|
from ..core.progress import (
|
37
37
|
RichProgress,
|
38
38
|
progress_description,
|
@@ -311,15 +311,13 @@ def task_live_status(
|
|
311
311
|
|
312
312
|
# the panel contents
|
313
313
|
config = task_config(tasks[0].profile, style=theme.light)
|
314
|
-
targets = task_targets(tasks[0].profile)
|
315
|
-
subtitle = config, targets
|
316
314
|
|
317
315
|
# the panel
|
318
316
|
return task_panel(
|
319
317
|
profile=tasks[0].profile,
|
320
318
|
show_model=len(tasks) == 1,
|
321
319
|
body=Group("", progress),
|
322
|
-
subtitle=subtitle,
|
320
|
+
subtitle=config,
|
323
321
|
footer=task_footer(counters, theme.light),
|
324
322
|
log_location=None,
|
325
323
|
)
|
inspect_ai/_display/textual/app.py
CHANGED
@@ -42,7 +42,7 @@ from ..core.display import (
|
|
42
42
|
TaskWithResult,
|
43
43
|
)
|
44
44
|
from ..core.footer import task_footer
|
45
|
-
from ..core.panel import task_targets, task_title, tasks_title
|
45
|
+
from ..core.panel import task_title, tasks_title
|
46
46
|
from ..core.rich import record_console_input, rich_initialise, rich_theme
|
47
47
|
from .theme import inspect_dark, inspect_light
|
48
48
|
from .widgets.console import ConsoleView
|
@@ -296,13 +296,8 @@ class TaskScreenApp(App[TR]):
|
|
296
296
|
tasks.config = task_config(
|
297
297
|
self._tasks[0].profile, generate_config=not self._parallel
|
298
298
|
)
|
299
|
-
if not self._parallel:
|
300
|
-
tasks.targets = task_targets(self._tasks[0].profile)
|
301
|
-
else:
|
302
|
-
tasks.targets = " \n "
|
303
299
|
else:
|
304
300
|
tasks.config = ""
|
305
|
-
tasks.targets = ""
|
306
301
|
|
307
302
|
def update_samples(self) -> None:
|
308
303
|
samples_view = self.query_one(SamplesView)
|
inspect_ai/_display/textual/widgets/task_detail.py
CHANGED
@@ -30,6 +30,8 @@ class TaskDetail(Widget):
|
|
30
30
|
width: 100%;
|
31
31
|
height: auto;
|
32
32
|
grid-gutter: 1 3;
|
33
|
+
grid-size-columns: 3;
|
34
|
+
grid-columns: 1fr 1fr 1fr;
|
33
35
|
}
|
34
36
|
"""
|
35
37
|
|
@@ -92,20 +94,6 @@ class TaskDetail(Widget):
|
|
92
94
|
if len(self.by_reducer) == 0:
|
93
95
|
return
|
94
96
|
|
95
|
-
# Compute the row and column count
|
96
|
-
row_count = len(self.by_reducer)
|
97
|
-
col_count = len(next(iter(self.by_reducer.values())))
|
98
|
-
|
99
|
-
# If this can fit in a single row, make it fit
|
100
|
-
# otherwise place each reducer on their own row
|
101
|
-
self.grid.styles.grid_columns = "auto"
|
102
|
-
if row_count * col_count < 4:
|
103
|
-
self.grid.styles.grid_size_columns = row_count * col_count
|
104
|
-
self.grid.styles.grid_size_rows = 1
|
105
|
-
else:
|
106
|
-
self.grid.styles.grid_size_columns = col_count
|
107
|
-
self.grid.styles.grid_size_rows = row_count
|
108
|
-
|
109
97
|
# In order to reduce flashing the below tracks use of widgets
|
110
98
|
# and updates them when possible (removing and adding them as needed)
|
111
99
|
# Makes keys for tracking Task Metric widgets
|
@@ -142,6 +130,7 @@ class TaskMetrics(Widget):
|
|
142
130
|
TaskMetrics {
|
143
131
|
width: auto;
|
144
132
|
height: auto;
|
133
|
+
border: solid $foreground 20%;
|
145
134
|
}
|
146
135
|
TaskMetrics Grid {
|
147
136
|
width: auto;
|
inspect_ai/_eval/eval.py
CHANGED
@@ -812,7 +812,7 @@ async def eval_retry_async(
|
|
812
812
|
model_roles = model_roles_config_to_model_roles(eval_log.eval.model_roles)
|
813
813
|
|
814
814
|
# collect the rest of the params we need for the eval
|
815
|
-
task_args = eval_log.eval.task_args
|
815
|
+
task_args = eval_log.eval.task_args_passed
|
816
816
|
tags = eval_log.eval.tags
|
817
817
|
limit = eval_log.eval.config.limit
|
818
818
|
sample_id = eval_log.eval.config.sample_id
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -114,7 +114,7 @@ def eval_set(
|
|
114
114
|
(defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
|
115
115
|
per-retry will in no case by longer than 1 hour.
|
116
116
|
retry_connections: Reduce max_connections at this rate with each retry
|
117
|
-
(defaults to 0.
|
117
|
+
(defaults to 1.0, which results in no reduction).
|
118
118
|
retry_cleanup: Cleanup failed log files after retries
|
119
119
|
(defaults to True)
|
120
120
|
model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
|
@@ -275,7 +275,7 @@ def eval_set(
|
|
275
275
|
fs.mkdir(log_dir, exist_ok=True)
|
276
276
|
|
277
277
|
# resolve some parameters
|
278
|
-
retry_connections = retry_connections or 0
|
278
|
+
retry_connections = retry_connections or 1.0
|
279
279
|
retry_cleanup = retry_cleanup is not False
|
280
280
|
max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
|
281
281
|
max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)
|
@@ -578,7 +578,7 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
|
|
578
578
|
else:
|
579
579
|
task_file = task.eval.task_file or ""
|
580
580
|
task_name = task.eval.task
|
581
|
-
task_args = task.eval.task_args
|
581
|
+
task_args = task.eval.task_args_passed
|
582
582
|
model = str(task.eval.model)
|
583
583
|
model_roles = task.eval.model_roles or {}
|
584
584
|
|
inspect_ai/_eval/registry.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai._util.error import PrerequisiteError
|
|
8
8
|
from inspect_ai._util.package import get_installed_package_name
|
9
9
|
from inspect_ai._util.registry import (
|
10
10
|
RegistryInfo,
|
11
|
+
extract_named_params,
|
11
12
|
registry_add,
|
12
13
|
registry_create,
|
13
14
|
registry_info,
|
@@ -17,7 +18,7 @@ from inspect_ai._util.registry import (
|
|
17
18
|
)
|
18
19
|
|
19
20
|
from .task import Task
|
20
|
-
from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
|
21
|
+
from .task.constants import TASK_ALL_PARAMS_ATTR, TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
|
21
22
|
|
22
23
|
MODEL_PARAM = "model"
|
23
24
|
|
@@ -133,6 +134,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
133
134
|
**w_kwargs,
|
134
135
|
)
|
135
136
|
|
137
|
+
# extract all task parameters including defaults
|
138
|
+
named_params = extract_named_params(task_type, True, *w_args, **w_kwargs)
|
139
|
+
setattr(task_instance, TASK_ALL_PARAMS_ATTR, named_params)
|
140
|
+
|
136
141
|
# if its not from an installed package then it is a "local"
|
137
142
|
# module import, so set its task file and run dir
|
138
143
|
if get_installed_package_name(task_type) is None:
|
inspect_ai/_eval/run.py
CHANGED
@@ -3,6 +3,7 @@ import os
|
|
3
3
|
import sys
|
4
4
|
from typing import Any, Awaitable, Callable, Set, cast
|
5
5
|
|
6
|
+
from inspect_ai._eval.task.constants import TASK_ALL_PARAMS_ATTR
|
6
7
|
from inspect_ai._eval.task.task import Task
|
7
8
|
from inspect_ai._util.environ import environ_vars
|
8
9
|
from inspect_ai._util.trace import trace_action
|
@@ -208,7 +209,10 @@ async def eval_run(
|
|
208
209
|
metrics=eval_metrics,
|
209
210
|
sandbox=resolved_task.sandbox,
|
210
211
|
task_attribs=task.attribs,
|
211
|
-
task_args=resolved_task.task_args,
|
212
|
+
task_args=getattr(
|
213
|
+
task, TASK_ALL_PARAMS_ATTR, resolved_task.task_args
|
214
|
+
),
|
215
|
+
task_args_passed=resolved_task.task_args,
|
212
216
|
model_args=resolved_task.model.model_args,
|
213
217
|
eval_config=task_eval_config,
|
214
218
|
metadata=((metadata or {}) | (task.metadata or {})) or None,
|
inspect_ai/_eval/task/log.py
CHANGED
@@ -71,6 +71,7 @@ class TaskLogger:
|
|
71
71
|
sandbox: SandboxEnvironmentSpec | None,
|
72
72
|
task_attribs: dict[str, Any],
|
73
73
|
task_args: dict[str, Any],
|
74
|
+
task_args_passed: dict[str, Any],
|
74
75
|
model_args: dict[str, Any],
|
75
76
|
eval_config: EvalConfig,
|
76
77
|
metadata: dict[str, Any] | None,
|
@@ -128,6 +129,7 @@ class TaskLogger:
|
|
128
129
|
task_registry_name=task_registry_name,
|
129
130
|
task_attribs=task_attribs,
|
130
131
|
task_args=task_args,
|
132
|
+
task_args_passed=task_args_passed,
|
131
133
|
solver=solver.solver if solver else None,
|
132
134
|
tags=tags,
|
133
135
|
solver_args=solver.args if solver else None,
|