inspect-ai 0.3.99__py3-none-any.whl → 0.3.101__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_display/core/config.py +11 -5
- inspect_ai/_display/core/panel.py +66 -2
- inspect_ai/_display/core/textual.py +5 -2
- inspect_ai/_display/plain/display.py +1 -0
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/widgets/transcript.py +37 -9
- inspect_ai/_eval/eval.py +13 -1
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/run.py +2 -0
- inspect_ai/_eval/score.py +2 -4
- inspect_ai/_eval/task/log.py +3 -1
- inspect_ai/_eval/task/run.py +59 -81
- inspect_ai/_util/content.py +11 -6
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/text.py +7 -0
- inspect_ai/_util/working.py +8 -37
- inspect_ai/_view/__init__.py +0 -0
- inspect_ai/_view/schema.py +2 -1
- inspect_ai/_view/www/CLAUDE.md +15 -0
- inspect_ai/_view/www/dist/assets/index.css +307 -171
- inspect_ai/_view/www/dist/assets/index.js +24733 -21641
- inspect_ai/_view/www/log-schema.json +77 -3
- inspect_ai/_view/www/package.json +9 -5
- inspect_ai/_view/www/src/@types/log.d.ts +9 -0
- inspect_ai/_view/www/src/app/App.tsx +1 -15
- inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +220 -205
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
- inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
- inspect_ai/_view/www/src/app/routing/url.ts +84 -4
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
- inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +24 -17
- inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
- inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
- inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
- inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
- inspect_ai/_view/www/src/app/types.ts +5 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
- inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
- inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
- inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
- inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
- inspect_ai/_view/www/src/state/hooks.ts +52 -2
- inspect_ai/_view/www/src/state/logSlice.ts +4 -3
- inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
- inspect_ai/_view/www/src/state/scrolling.ts +152 -0
- inspect_ai/_view/www/src/utils/attachments.ts +7 -0
- inspect_ai/_view/www/src/utils/python.ts +18 -0
- inspect_ai/_view/www/yarn.lock +290 -33
- inspect_ai/agent/_react.py +12 -7
- inspect_ai/agent/_run.py +2 -3
- inspect_ai/analysis/beta/__init__.py +2 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
- inspect_ai/dataset/_sources/csv.py +2 -6
- inspect_ai/dataset/_sources/hf.py +2 -6
- inspect_ai/dataset/_sources/json.py +2 -6
- inspect_ai/dataset/_util.py +23 -0
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/file.py +2 -9
- inspect_ai/log/_recorders/json.py +1 -0
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/log/_transcript.py +1 -1
- inspect_ai/model/_call_tools.py +6 -2
- inspect_ai/model/_openai.py +1 -1
- inspect_ai/model/_openai_responses.py +85 -41
- inspect_ai/model/_openai_web_search.py +38 -0
- inspect_ai/model/_providers/azureai.py +72 -3
- inspect_ai/model/_providers/openai.py +4 -1
- inspect_ai/model/_providers/openai_responses.py +5 -1
- inspect_ai/scorer/_metric.py +1 -2
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/solver/_task_state.py +2 -2
- inspect_ai/tool/_tool.py +6 -2
- inspect_ai/tool/_tool_def.py +27 -4
- inspect_ai/tool/_tool_info.py +2 -0
- inspect_ai/tool/_tools/_web_search/_google.py +43 -15
- inspect_ai/tool/_tools/_web_search/_tavily.py +46 -13
- inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_json.py +3 -0
- inspect_ai/util/_limit.py +230 -20
- inspect_ai/util/_sandbox/docker/compose.py +20 -11
- inspect_ai/util/_span.py +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +138 -124
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -35,7 +35,7 @@ from .util import (
 )
 
 MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
-MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
+MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1 for eval and 4 for eval-set)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
@@ -949,6 +949,7 @@ def eval_exec(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=True,  # cli invocation doesn't need full log
         score=score,
         score_display=score_display,
     )
inspect_ai/_display/core/config.py
CHANGED
@@ -1,4 +1,8 @@
+from rich.console import RenderableType
+from rich.text import Text
+
 from inspect_ai._util.registry import is_model_dict, is_registry_dict
+from inspect_ai._util.text import truncate_text
 from inspect_ai.log._log import eval_config_defaults
 
 from .display import TaskProfile
@@ -6,7 +10,7 @@ from .display import TaskProfile
 
 def task_config(
     profile: TaskProfile, generate_config: bool = True, style: str = ""
-) -> str:
+) -> RenderableType:
     # merge config
     # wind params back for display
     task_args = dict(profile.task_args)
@@ -39,15 +43,17 @@ def task_config(
         elif name not in ["limit", "model", "response_schema", "log_shared"]:
             if isinstance(value, list):
                 value = ",".join([str(v) for v in value])
+            elif isinstance(value, dict):
+                value = "{...}"
             if isinstance(value, str):
+                value = truncate_text(value, 50)
                 value = value.replace("[", "\\[")
             config_print.append(f"{name}: {value}")
     values = ", ".join(config_print)
     if values:
-
-
-
-        return values
+        values_text = Text(values, style=style)
+        values_text.truncate(500, overflow="ellipsis")
+        return values_text
     else:
         return ""
 
inspect_ai/_display/core/panel.py
CHANGED
@@ -9,6 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai.util._display import display_type
 
 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -24,7 +25,13 @@ def task_panel(
     | None,
     footer: RenderableType | tuple[RenderableType, RenderableType] | None,
     log_location: str | None,
-) -> Panel:
+) -> RenderableType:
+    # dispatch to plain handler if we are in plain mode
+    if display_type() == "plain":
+        return task_panel_plain(
+            profile, show_model, body, subtitle, footer, log_location
+        )
+
     # rendering context
     theme = rich_theme()
     console = rich.get_console()
@@ -93,7 +100,7 @@
     # create panel w/ title
     panel = Panel(
         root,
-        title=
+        title=task_panel_title(profile, show_model),
         title_align="left",
         width=width,
         expand=True,
@@ -101,6 +108,63 @@
     return panel
 
 
+def task_panel_plain(
+    profile: TaskProfile,
+    show_model: bool,
+    body: RenderableType,
+    subtitle: RenderableType
+    | str
+    | Tuple[RenderableType | str, RenderableType | str]
+    | None,
+    footer: RenderableType | tuple[RenderableType, RenderableType] | None,
+    log_location: str | None,
+) -> RenderableType:
+    # delimiter text
+    delimeter = "---------------------------------------------------------"
+
+    # root table for output
+    table = Table.grid(expand=False)
+    table.add_column()
+    table.add_row(delimeter)
+
+    # title and subtitle
+    table.add_row(task_panel_title(profile, show_model))
+    if isinstance(subtitle, tuple):
+        subtitle = subtitle[0]
+    table.add_row(subtitle)
+
+    # task info
+    if body:
+        table.add_row(body)
+
+    # footer
+    if isinstance(footer, tuple):
+        footer = footer[0]
+    if footer:
+        table.add_row(footer)
+
+    # log location
+    if log_location:
+        # Print a cwd relative path
+        try:
+            log_location_relative = cwd_relative_path(log_location, walk_up=True)
+        except ValueError:
+            log_location_relative = log_location
+        table.add_row(f"Log: {log_location_relative}")
+
+    table.add_row(delimeter)
+    table.add_row("")
+
+    return table
+
+
+def task_panel_title(profile: TaskProfile, show_model: bool) -> str:
+    theme = rich_theme()
+    return (
+        f"[bold][{theme.meta}]{task_title(profile, show_model)}[/{theme.meta}][/bold]"
+    )
+
+
 def to_renderable(item: RenderableType | str, style: str = "") -> RenderableType:
     if isinstance(item, str):
         return Text.from_markup(item, style=style)
inspect_ai/_display/core/textual.py
CHANGED
@@ -8,8 +8,6 @@ logger = getLogger(__name__)
 # force mouse support for textual -- this works around an issue where
 # mouse events are disabled after a reload of the vs code ide, see:
 # https://github.com/Textualize/textual/issues/5380
-# ansi codes for enabling mouse support are idempotent so it is fine
-# to do this even in cases where mouse support is already enabled.
 # we try/catch since we aren't 100% sure there aren't cases where doing
 # this won't raise and we'd rather not fail hard in in these case
 def textual_enable_mouse_support(driver: Driver) -> None:
@@ -17,5 +15,10 @@ def textual_enable_mouse_support(driver: Driver) -> None:
     if enable_mouse_support:
         try:
             enable_mouse_support()
+            # Re-enable SGR-Pixels format if it was previously enabled.
+            # See #1943.
+            enable_mouse_pixels = getattr(driver, "_enable_mouse_pixels", None)
+            if enable_mouse_pixels and getattr(driver, "_mouse_pixels", False):
+                enable_mouse_pixels()
         except Exception as ex:
             logger.warning(f"Error enabling mouse support: {ex}")
inspect_ai/_display/rich/display.py
CHANGED
@@ -341,8 +341,6 @@ def tasks_live_status(
 
     # get config
     config = task_config(tasks[0].profile, generate_config=False, style=theme.light)
-    if config:
-        config += "\n"
 
     # build footer table
     footer_table = Table.grid(expand=True)
@@ -356,6 +354,8 @@
     layout_table = Table.grid(expand=True)
     layout_table.add_column()
    layout_table.add_row(config)
+    if config:
+        layout_table.add_row("")
     layout_table.add_row(progress)
     layout_table.add_row(footer_table)
 
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -105,9 +105,28 @@ class TranscriptView(ScrollableContainer):
         self, events: Sequence[Event], limit: int = 10
     ) -> list[Widget]:
         widgets: list[Widget] = []
-
-        #
-
+
+        # filter the events to the <limit> most recent
+        filtered_events = events
+        if len(events) > limit:
+            filtered_events = filtered_events[-limit:]
+
+        # find the sample init event
+        sample_init: SampleInitEvent | None = None
+        for event in events:
+            if isinstance(event, SampleInitEvent):
+                sample_init = event
+                break
+
+        # add the sample init event if it isn't already in the event list
+        if sample_init and sample_init not in filtered_events:
+            filtered_events = [sample_init] + list(filtered_events)
+
+        # compute how many events we filtered out
+        filtered_count = len(events) - len(filtered_events)
+        showed_filtered_count = False
+
+        for event in filtered_events:
             display = render_event(event)
             if display:
                 for d in display:
@@ -123,14 +142,23 @@ class TranscriptView(ScrollableContainer):
                     set_transcript_markdown_options(d.content)
                     widgets.append(Static(d.content, markup=False))
                     widgets.append(Static(Text(" ")))
-                    widget_count += 1
 
-
-
-
+            if not showed_filtered_count and filtered_count > 0:
+                showed_filtered_count = True
+
+                widgets.append(
+                    Static(
+                        transcript_separator(
+                            f"{filtered_count} events..."
+                            if filtered_count > 1
+                            else "1 event...",
+                            self.app.current_theme.primary,
+                        )
+                    )
+                )
+                widgets.append(Static(Text(" ")))
 
-
-        return widgets[::-1]
+        return widgets
 
 
 class EventDisplay(NamedTuple):
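The reworked event rendering in `TranscriptView` keeps only the most recent `limit` events, re-attaches the `SampleInitEvent` when it was trimmed away, and shows an "N events..." separator for whatever was hidden. A standalone sketch of just that selection step (helper names here are my own illustration, not the widget code itself):

```python
from typing import Callable, Sequence, TypeVar

T = TypeVar("T")


def select_recent(
    events: Sequence[T], limit: int, is_init: Callable[[T], bool]
) -> tuple[list[T], int]:
    """Keep the last `limit` events, prepend the init event if it was
    trimmed, and report how many events ended up hidden."""
    filtered = list(events[-limit:]) if len(events) > limit else list(events)

    # re-attach the init event if trimming dropped it
    init = next((e for e in events if is_init(e)), None)
    if init is not None and init not in filtered:
        filtered = [init] + filtered

    hidden = len(events) - len(filtered)
    return filtered, hidden
```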
inspect_ai/_eval/eval.py
CHANGED
@@ -105,6 +105,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
         log_shared: Sync sample events to log directory so that users on other systems
             can see log updates in realtime (defaults to no syncing). Specify `True`
             to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+        log_header_only: If `True`, the function should return only log headers rather
+            than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
             log_images=log_images,
             log_buffer=log_buffer,
             log_shared=log_shared,
+            log_header_only=log_header_only,
             score=score,
             score_display=score_display,
             **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        log_shared: Indicate that the log directory is shared, which results in additional
+        log_shared: Indicate that the log directory is shared, which results in additional
+            syncing of realtime log data for Inspect View.
+        log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
     # resolve log_shared
     log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
 
+    # resolve header only
+    log_header_only = log_header_only is True
+
     # validate that --log-shared can't use used with 'json' format
     if log_shared and log_format == JSON_LOG_FORMAT:
         raise PrerequisiteError(
@@ -507,6 +517,7 @@ async def eval_async(
                 eval_config=eval_config,
                 eval_sandbox=sandbox,
                 recorder=recorder,
+                header_only=log_header_only,
                 epochs_reducer=epochs_reducer,
                 solver=solver,
                 tags=tags,
@@ -532,6 +543,7 @@ async def eval_async(
                 eval_config=eval_config,
                 eval_sandbox=sandbox,
                 recorder=recorder,
+                header_only=log_header_only,
                 epochs_reducer=epochs_reducer,
                 solver=solver,
                 tags=tags,
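The new `log_header_only` option threads from `eval()` / `eval_async()` through `eval_run()` and `TaskLogger` down to `Recorder.log_finish()`. A minimal sketch of opting in from the Python API (the task path and model below are placeholders, not taken from this diff):

```python
from inspect_ai import eval

# Sketch only: ask eval() to return header-only logs (status, config,
# results, stats) rather than full logs with samples.
logs = eval(
    "examples/my_task.py",  # placeholder task
    model="openai/gpt-4o",  # placeholder model
    log_header_only=True,
)

# Presumably the full log (including samples) is still written to the log
# directory; only the returned EvalLog objects are header-only.
for log in logs:
    print(log.status, log.location)
```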
inspect_ai/_eval/evalset.py
CHANGED
@@ -168,7 +168,7 @@ def eval_set(
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
-            (defaults to number of models being evaluated)
+            (defaults to the greater of 4 and the number of models being evaluated)
         max_subprocesses: Maximum number of subprocesses to
             run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -235,6 +235,7 @@ def eval_set(
             log_images=log_images,
             log_buffer=log_buffer,
             log_shared=log_shared,
+            log_header_only=True,
             score=score,
             **kwargs,
         )
@@ -277,7 +278,7 @@ def eval_set(
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
     max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
-    max_tasks = max_tasks if max_tasks is not None else len(models)
+    max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)
 
     # prepare console/status
     console = rich.get_console()
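The docstring and the runtime default now describe the same rule for task parallelism. A one-liner restating how `max_tasks` appears to be resolved in `eval_set()` (helper name is illustrative):

```python
def resolve_max_tasks(max_tasks: int | None, models: list[str]) -> int:
    # mirrors the updated default: at least 4 parallel tasks, or one per
    # model when more than 4 models are being evaluated
    return max_tasks if max_tasks is not None else max(len(models), 4)
```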
inspect_ai/_eval/run.py
CHANGED
@@ -63,6 +63,7 @@ async def eval_run(
     eval_config: EvalConfig,
     eval_sandbox: SandboxEnvironmentType | None,
     recorder: Recorder,
+    header_only: bool,
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
@@ -212,6 +213,7 @@ async def eval_run(
                 eval_config=task_eval_config,
                 metadata=((metadata or {}) | (task.metadata or {})) or None,
                 recorder=recorder,
+                header_only=header_only,
             )
             await logger.init()
 
inspect_ai/_eval/score.py
CHANGED
@@ -1,7 +1,7 @@
 import functools
 from copy import deepcopy
 from pathlib import Path
-from typing import Any, Callable, Literal, cast
+from typing import Any, Callable, Literal
 
 import anyio
 
@@ -270,9 +270,7 @@ def metrics_from_log(log: EvalLog) -> list[Metric] | dict[str, list[Metric]] | None:
 
 
 def metric_from_log(metric: EvalMetricDefinition) -> Metric:
-    return cast(
-        Metric, registry_create("metric", metric.name, **(metric.options or {}))
-    )
+    return registry_create("metric", metric.name, **(metric.options or {}))
 
 
 def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
inspect_ai/_eval/task/log.py
CHANGED
@@ -75,6 +75,7 @@ class TaskLogger:
         eval_config: EvalConfig,
         metadata: dict[str, Any] | None,
         recorder: Recorder,
+        header_only: bool,
     ) -> None:
         # determine versions
         git = git_context()
@@ -153,6 +154,7 @@ class TaskLogger:
 
         # stack recorder and location
         self.recorder = recorder
+        self.header_only = header_only
 
         # number of samples logged
         self._samples_completed = 0
@@ -238,7 +240,7 @@
     ) -> EvalLog:
         # finish and get log
         log = await self.recorder.log_finish(
-            self.eval, status, stats, results, reductions, error
+            self.eval, status, stats, results, reductions, error, self.header_only
         )
 
         # cleanup the events db
inspect_ai/_eval/task/run.py
CHANGED
@@ -35,11 +35,7 @@ from inspect_ai._util.registry import (
     registry_log_name,
     registry_unqualified_name,
 )
-from inspect_ai._util.working import (
-    end_sample_working_limit,
-    init_sample_working_limit,
-    sample_waiting_time,
-)
+from inspect_ai._util.working import init_sample_working_time, sample_waiting_time
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -90,6 +86,8 @@ from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
 from inspect_ai.util._limit import LimitExceededError
+from inspect_ai.util._limit import time_limit as create_time_limit
+from inspect_ai.util._limit import working_limit as create_working_limit
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._span import span
@@ -635,10 +633,6 @@ async def task_run_sample(
     )
 
     async with sandboxenv_cm:
-        timeout_cm: (
-            contextlib._GeneratorContextManager[anyio.CancelScope]
-            | contextlib.nullcontext[None]
-        ) = contextlib.nullcontext()
         try:
             # update active sample wth sandboxes now that we are initialised
             # (ensure that we still exit init context in presence of sandbox error)
@@ -647,19 +641,17 @@ async def task_run_sample(
             finally:
                 await init_span.__aexit__(None, None, None)
 
-            # initialise timeout context manager
-            timeout_cm = (
-                anyio.fail_after(time_limit)
-                if time_limit is not None
-                else contextlib.nullcontext()
-            )
-
             # record start time
             start_time = time.monotonic()
-
-
-            # run sample w/ optional
-            with
+            init_sample_working_time(start_time)
+
+            # run sample w/ optional limits
+            with (
+                state._token_limit,
+                state._message_limit,
+                create_time_limit(time_limit),
+                create_working_limit(working_limit),
+            ):
                 # mark started
                 active.started = datetime.now().timestamp()
 
@@ -675,24 +667,15 @@ async def task_run_sample(
                     )
 
                     # set progress for plan then run it
-
-
-                    # disable sample working limit after execution
-                    end_sample_working_limit()
+                    async with span("solvers"):
+                        state = await plan(state, generate)
 
                 except TimeoutError:
-
-
-
-
-
-                            limit=time_limit,
-                        )
-                    )
-                    else:
-                        py_logger.warning(
-                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                        )
+                    # Scoped time limits manifest themselves as LimitExceededError, not
+                    # TimeoutError.
+                    py_logger.warning(
+                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                    )
 
                 # capture most recent state for scoring
                 state = sample_state() or state
@@ -737,54 +720,59 @@ async def task_run_sample(
            # the cause of the timeout is a hung container and scoring requires
            # interacting with the container). as a middle ground we use half
            # of the original timeout value for scoring.
-            if time_limit
-                timeout_cm = anyio.fail_after(time_limit / 2)
+            scoring_time_limit = time_limit / 2 if time_limit else None
 
            set_sample_state(state)
 
            # scoring
            try:
                # timeout during scoring will result in an ordinary sample error
-                with
+                with create_time_limit(scoring_time_limit):
                    if error is None:
-
-
-
-
-                        async with span(name=scorer_name, type="scorer"):
-                            score_result = (
-                                await scorer(state, Target(sample.target))
-                                if scorer
-                                else None
+                        async with span(name="scorers"):
+                            for scorer in scorers or []:
+                                scorer_name = unique_scorer_name(
+                                    scorer, list(results.keys())
                                )
-
-
-
-
-
-
+                                async with span(name=scorer_name, type="scorer"):
+                                    score_result = (
+                                        await scorer(state, Target(sample.target))
+                                        if scorer
+                                        else None
+                                    )
+                                    if score_result is not None:
+                                        sample_score = SampleScore(
+                                            score=score_result,
+                                            sample_id=sample.id,
+                                            sample_metadata=sample.metadata,
+                                            scorer=registry_unqualified_name(
+                                                scorer
+                                            ),
+                                        )
+                                        transcript()._event(
+                                            ScoreEvent(
+                                                score=score_result,
+                                                target=sample.target,
+                                            )
+                                        )
+                                        results[scorer_name] = sample_score
+
+                        # add scores returned by solvers
+                        if state.scores is not None:
+                            for name, score in state.scores.items():
+                                results[name] = SampleScore(
+                                    score=score,
+                                    sample_id=state.sample_id,
+                                    sample_metadata=state.metadata,
                                )
                                transcript()._event(
                                    ScoreEvent(
-                                        score=
+                                        score=score, target=sample.target
                                    )
                                )
-                            results[scorer_name] = sample_score
-
-                        # add scores returned by solvers
-                        if state.scores is not None:
-                            for name, score in state.scores.items():
-                                results[name] = SampleScore(
-                                    score=score,
-                                    sample_id=state.sample_id,
-                                    sample_metadata=state.metadata,
-                                )
-                                transcript()._event(
-                                    ScoreEvent(score=score, target=sample.target)
-                                )
 
-
-
+                        # propagate results into scores
+                        state.scores = {k: v.score for k, v in results.items()}
 
            except anyio.get_cancelled_exc_class():
                if active.interrupt_action:
@@ -798,17 +786,7 @@ async def task_run_sample(
                raise
 
            except BaseException as ex:
-                #
-                if isinstance(ex, TimeoutError):
-                    transcript()._event(
-                        SampleLimitEvent(
-                            type="time",
-                            message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                            limit=time_limit,
-                        )
-                    )
-
-                # handle error (this will throw if we've exceeded the limit)
+                # handle error
                error, raise_error = handle_error(ex)
 
            except Exception as ex:
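Per the new imports, the hand-rolled `anyio.fail_after` timeout plumbing is replaced by the scoped limit context managers in `inspect_ai.util._limit`. A rough sketch of the pattern the sample runner now follows, assuming `time_limit()` / `working_limit()` accept an optional number of seconds (with `None` imposing no limit) and report violations as `LimitExceededError`:

```python
from inspect_ai.util._limit import LimitExceededError, time_limit, working_limit


async def run_solvers_with_limits(plan, state, generate,
                                  time_limit_s, working_limit_s):
    # Sketch: apply both scoped limits around the solver run, as
    # task_run_sample() does above; a None value means "no limit".
    try:
        with time_limit(time_limit_s), working_limit(working_limit_s):
            return await plan(state, generate)
    except LimitExceededError:
        # in the real runner this surfaces as a sample limit / sample error
        raise
```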