inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/plain/display.py +9 -11
- inspect_ai/_display/textual/app.py +5 -5
- inspect_ai/_display/textual/widgets/samples.py +47 -18
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +15 -0
- inspect_ai/_util/logger.py +23 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +301 -1
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +905 -751
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -5
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +35 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +358 -302
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +81 -52
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +6 -6
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +3 -3
- inspect_ai/solver/_solver.py +3 -0
- inspect_ai/solver/_task_state.py +10 -1
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
- inspect_ai/util/_sandbox/docker/compose.py +5 -9
- inspect_ai/util/_sandbox/docker/docker.py +20 -6
- inspect_ai/util/_sandbox/docker/util.py +10 -1
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +2 -1
- inspect_ai/util/_subprocess.py +4 -1
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 @click.option(
     "--time-limit",
     type=int,
-    help="Limit on total
+    help="Limit on total running time for each sample.",
     envvar="INSPECT_EVAL_TIME_LIMIT",
 )
+@click.option(
+    "--working-limit",
+    type=int,
+    help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+    envvar="INSPECT_EVAL_WORKING_LIMIT",
+)
 @click.option(
     "--fail-on-error",
     type=float,
@@ -468,6 +474,7 @@ def eval_command(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -518,6 +525,7 @@ def eval_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -629,6 +637,7 @@ def eval_set_command(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -684,6 +693,7 @@ def eval_set_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -737,6 +747,7 @@ def eval_exec(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -817,6 +828,7 @@ def eval_exec(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
inspect_ai/_display/plain/display.py
CHANGED
@@ -119,14 +119,14 @@ class PlainTaskDisplay(TaskDisplay):
         self.samples_complete = 0
         self.samples_total = 0
         self.current_metrics: list[TaskDisplayMetric] | None = None
-        self.last_progress = 0
+        self.last_progress = 0

     @contextlib.contextmanager
     def progress(self) -> Iterator[Progress]:
         self.progress_display = PlainProgress(self.task.profile.steps)
         yield self.progress_display

-    @throttle(
+    @throttle(5)
     def _print_status_throttled(self) -> None:
         self._print_status()

@@ -135,13 +135,8 @@ class PlainTaskDisplay(TaskDisplay):
         if not self.progress_display:
             return

-        #
-        current_progress = int(
-            self.progress_display.current / self.progress_display.total * 100
-        )
-
-        # Only print on percentage changes to avoid too much output
-        if current_progress != self.last_progress:
+        # Only print when step count changes to avoid too much output
+        if self.progress_display.current != self.last_progress:
             status_parts: list[str] = []

             # if this is parallel print task and model to distinguish (limit both to 12 chars)
@@ -154,8 +149,11 @@ class PlainTaskDisplay(TaskDisplay):
             )

             # Add step progress
+            progress_percent = int(
+                self.progress_display.current / self.progress_display.total * 100
+            )
             status_parts.append(
-                f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {
+                f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {progress_percent:3d}%"
             )

             # Add sample progress
@@ -187,7 +185,7 @@ class PlainTaskDisplay(TaskDisplay):
             # Print on new line
             print(" | ".join(status_parts))

-            self.last_progress =
+            self.last_progress = self.progress_display.current

     def sample_complete(self, complete: int, total: int) -> None:
         self.samples_complete = complete
inspect_ai/_display/textual/app.py
CHANGED
@@ -13,7 +13,6 @@ from typing import (

 import rich
 from rich.console import Console
-from rich.text import Text
 from textual.app import App, ComposeResult
 from textual.binding import Binding, BindingType
 from textual.css.query import NoMatches
@@ -186,7 +185,8 @@ class TaskScreenApp(App[TR]):
         # force repaint
         self.refresh(repaint=True)

-        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # however is fixed in textual 2.1)
         assert self.app._driver
         textual_enable_mouse_support(self.app._driver)

@@ -316,9 +316,9 @@ class TaskScreenApp(App[TR]):

         def set_unread(unread: int | None) -> None:
             if unread is not None:
-                console_tab.label =
+                console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
             else:
-                console_tab.label =
+                console_tab.label = "Console"  # type: ignore[assignment]

         self.watch(console_view, "unread", set_unread)

@@ -385,7 +385,7 @@ class TaskScreenApp(App[TR]):
     def set_title(self, title: str) -> None:
         tabs = self.app.query_one(TabbedContent)
         tab = tabs.get_tab(self.tab_id)
-        tab.label =
+        tab.label = title  # type: ignore[assignment]

     def activate(self) -> None:
         # show the tab
inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -6,6 +6,7 @@ from rich.table import Table
 from rich.text import Text
 from textual.app import ComposeResult
 from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
+from textual.css.query import NoMatches
 from textual.reactive import reactive
 from textual.widget import Widget
 from textual.widgets import (
@@ -38,7 +39,7 @@ class SamplesView(Widget):
         padding: 0 1 0 1;
         layout: grid;
         grid-size: 2 3;
-        grid-rows: auto 1fr
+        grid-rows: auto 1fr 3;
         grid-columns: 32 1fr;
         grid-gutter: 1;
     }
@@ -61,7 +62,10 @@ class SamplesView(Widget):
         )

     async def notify_active(self, active: bool) -> None:
-        await self.query_one(TranscriptView).notify_active(active)
+        try:
+            await self.query_one(TranscriptView).notify_active(active)
+        except NoMatches:
+            pass

     def set_samples(self, samples: list[ActiveSample]) -> None:
         # throttle to no more than 1 second per 100 samples
@@ -137,8 +141,8 @@ class SamplesList(OptionList):
         if highlighted_sample and (highlighted_sample not in self.samples):
             self.samples.append(highlighted_sample)

-        # sort the samples by
-        self.samples.sort(key=lambda sample: sample.
+        # sort the samples by running time
+        self.samples.sort(key=lambda sample: sample.running_time, reverse=True)

         # rebuild the list
         self.clear_options()
@@ -150,9 +154,7 @@ class SamplesList(OptionList):
         table.add_column(width=1)
         task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
         task_name.truncate(18, overflow="ellipsis", pad=True)
-        task_time = Text.from_markup(
-            f"{format_progress_time(sample.execution_time)}"
-        )
+        task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
         table.add_row(task_name, task_time, " ")
         sample_id = Text.from_markup(f"id: {sample.sample.id}")
         sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -408,11 +410,17 @@ class SampleToolbar(Horizontal):
     PENDING_STATUS = "pending_status"
     PENDING_CAPTION = "pending_caption"

+    TIMEOUT_TOOL_CALL_ENABLED = (
+        "Cancel the tool call and report a timeout to the model."
+    )
+    TIMEOUT_TOOL_CALL_DISABLED = "Cancelling tool call..."
+    CANCEL_SCORE_OUTPUT_ENABLED = (
+        "Cancel the sample and score whatever output has been generated so far."
+    )
+    CANCEL_RAISE_ERROR_ENABLED = "Cancel the sample and raise an error"
+    CANCEL_DISABLED = "Cancelling sample..."
+
     DEFAULT_CSS = f"""
-    SampleToolbar {{
-        grid-size: 5 1;
-        grid-columns: auto auto 1fr auto auto;
-    }}
     SampleToolbar #{STATUS_GROUP} {{
         width: 22;
     }}
@@ -445,18 +453,18 @@ class SampleToolbar(Horizontal):
         yield Button(
             Text("Timeout Tool"),
             id=self.TIMEOUT_TOOL_CALL,
-            tooltip=
+            tooltip=self.TIMEOUT_TOOL_CALL_ENABLED,
         )
         yield Horizontal()
         yield Button(
             Text("Cancel (Score)"),
             id=self.CANCEL_SCORE_OUTPUT,
-            tooltip=
+            tooltip=self.CANCEL_SCORE_OUTPUT_ENABLED,
         )
         yield Button(
             Text("Cancel (Error)"),
             id=self.CANCEL_RAISE_ERROR,
-            tooltip=
+            tooltip=self.CANCEL_RAISE_ERROR_ENABLED,
         )

     def on_mount(self) -> None:
@@ -475,14 +483,26 @@ class SampleToolbar(Horizontal):
            )
            if isinstance(last_event, ToolEvent):
                last_event._cancel()
-
-
-
-                self.
+                event.button.disabled = True
+                event.button.tooltip = self.TIMEOUT_TOOL_CALL_DISABLED
+        else:
+            if event.button.id == self.CANCEL_SCORE_OUTPUT:
+                self.sample.interrupt("score")
+            elif event.button.id == self.CANCEL_RAISE_ERROR:
+                self.sample.interrupt("error")
+            cancel_score_output = self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
+            cancel_score_output.disabled = True
+            cancel_score_output.tooltip = self.CANCEL_DISABLED
+            cancel_with_error = self.query_one("#" + self.CANCEL_RAISE_ERROR)
+            cancel_with_error.disabled = True
+            cancel_with_error.tooltip = self.CANCEL_DISABLED

     async def sync_sample(self, sample: ActiveSample | None) -> None:
         from inspect_ai.log._transcript import ModelEvent

+        # is it a new sample?
+        new_sample = sample != self.sample
+
         # track the sample
         self.sample = sample

@@ -499,6 +519,13 @@ class SampleToolbar(Horizontal):
            cancel_score_output.display = True
            cancel_with_error.display = not sample.fails_on_error

+           # if its a new sample then reset enabled states
+           if new_sample:
+               cancel_score_output.disabled = False
+               cancel_score_output.tooltip = self.CANCEL_SCORE_OUTPUT_ENABLED
+               cancel_with_error.disabled = False
+               cancel_with_error.tooltip = self.CANCEL_RAISE_ERROR_ENABLED
+
            # if we have a pending event then start the clock and show pending status
            last_event = (
                sample.transcript.events[-1]
@@ -520,6 +547,8 @@ class SampleToolbar(Horizontal):
                )

                timeout_tool.display = isinstance(last_event, ToolEvent)
+               timeout_tool.disabled = False
+               timeout_tool.tooltip = self.TIMEOUT_TOOL_CALL_ENABLED

                clock.start(last_event.timestamp.timestamp())
            else:
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -193,16 +193,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     return EventDisplay(f"model: {event.model}", Group(*content))


-def
-
-
-
-    for
-
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    for e in events:
+        event_displays = render_event(e) or []
+        for d in event_displays:
+            if d.content:
+                content.append(Text(" "))
+                content.append(transcript_separator(d.title, "black", "··"))
+                if isinstance(d.content, Markdown):
+                    set_transcript_markdown_options(d.content)
+                content.append(d.content)
+
+    return content
+

+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)

+    # render sub-events
+    if event.events:
+        content.extend(render_sub_events(event.events))
+
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -220,7 +233,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         result = str(result).strip()
         content.extend(lines_display(result, 50))

-    return
+    return [EventDisplay("tool call", Group(*content))]


 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +270,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:


 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+    # render header
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
     # render sub-events
-    display: list[EventDisplay] = []
     if event.events:
-
-        display.extend(render_event(e) or [])
+        content.extend(render_sub_events(event.events))

-    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +284,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
     else:
         content.append(render_as_json(event.result))

-    return
+    return [EventDisplay(f"subtask: {event.name}", Group(*content))]


 def render_input_event(event: InputEvent) -> EventDisplay:
inspect_ai/_eval/eval.py
CHANGED
@@ -75,6 +75,7 @@ def eval(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
         token_limit: Limit on total tokens used for each sample.
-        time_limit: Limit on time (in seconds) for
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
             so they can be debugged (defaults to False).
         message_limit (int | None): Limit on total messages used for each sample.
         token_limit (int | None): Limit on total tokens used for each sample.
-        time_limit
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples (int | None): Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
     message_limit = eval_log.eval.config.message_limit
     token_limit = eval_log.eval.config.token_limit
     time_limit = eval_log.eval.config.time_limit
+    working_limit = eval_log.eval.config.working_limit
     max_samples = max_samples or eval_log.eval.config.max_samples
     max_tasks = max_tasks or eval_log.eval.config.max_tasks
    max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
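
Note: to illustrate how the new parameter is passed from the Python API, here is a minimal sketch (not part of the diff). The task path and model name are hypothetical placeholders; only the time_limit and working_limit keyword arguments come from the signature change above.

    from inspect_ai import eval

    # hypothetical task file and model; limit arguments mirror the diff above
    logs = eval(
        "my_task.py",
        model="openai/gpt-4o",
        time_limit=1200,    # clock-time limit per sample (seconds)
        working_limit=600,  # working-time limit per sample (seconds)
    )

Per the updated docstring, working time counts model generation and tool calls but excludes time spent waiting on retries or shared resources.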
inspect_ai/_eval/evalset.py
CHANGED
@@ -79,6 +79,7 @@ def eval_set(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
         token_limit: Limit on total tokens used for each sample.
-        time_limit: Limit on time (in seconds) for
+        time_limit: Limit on clock time (in seconds) for samples.
+        working_limit: Limit on working time (in seconds) for sample. Working
+            time includes model generation, tool calls, etc. but does not include
+            time spent waiting on retries or shared resources.
         max_samples: Maximum number of samples to run in parallel
             (default is max_connections)
         max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
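
The same parameter is threaded through eval_set(). A minimal sketch (task name and log_dir are illustrative, and the log_dir argument and tuple return shape are assumptions based on existing eval_set usage rather than this diff):

    from inspect_ai import eval_set

    success, logs = eval_set(
        ["my_task.py"],        # hypothetical task
        log_dir="logs/run-01", # assumed eval_set argument, not shown in this diff
        working_limit=600,     # same per-sample working-time limit as eval()
    )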
inspect_ai/_eval/run.py
CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
         else:
             task.time_limit = task_eval_config.time_limit

+        # sample execution limit
+        if task_eval_config.working_limit is None:
+            task_eval_config.working_limit = task.working_limit
+        else:
+            task.working_limit = task_eval_config.working_limit
+
         # fail_on_error
         if task_eval_config.fail_on_error is None:
             task_eval_config.fail_on_error = task.fail_on_error
inspect_ai/_eval/task/run.py
CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+    init_sample_working_limit,
+    sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -56,6 +60,7 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
+    StepEvent,
     transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +187,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if isinstance(solver, Plan):
         plan = solver
     elif isinstance(solver, Chain):
-        plan = Plan(list(solver), internal=True)
+        plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
     else:
-        plan = Plan(unroll(solver), internal=True)
+        plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)

     # add setup solver(s) if specified
     if task.setup:
@@ -308,6 +313,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                 or config.fail_on_error is True
             ),
             time_limit=config.time_limit,
+            working_limit=config.working_limit,
             semaphore=sample_semaphore,
         )
         for (sample, state) in zip(samples, states)
@@ -500,6 +506,7 @@ async def task_run_sample(
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
+    working_limit: int | None,
     semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
     # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +577,37 @@ async def task_run_sample(
             message_limit=state.message_limit,
             token_limit=state.token_limit,
             time_limit=time_limit,
+            working_limit=working_limit,
             fails_on_error=fails_on_error,
             transcript=sample_transcript,
         ) as active,
     ):
+        start_time: float | None = None
         error: EvalError | None = None
         raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
+            # begin init
+            transcript()._event(StepEvent(action="begin", name="init"))
+
+            # sample init event (remove file bodies as they have content or absolute paths)
+            event_sample = sample.model_copy(
+                update=dict(files={k: "" for k in sample.files.keys()})
+                if sample.files
+                else None
+            )
+            transcript()._event(
+                SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+            )
+
             async with sandboxenv_cm:
                 try:
                     # update active sample wth sandboxes now that we are initialised
                     active.sandboxes = await sandbox_connections()

+                    # end init
+                    transcript()._event(StepEvent(action="end", name="init"))
+
                     # initialise timeout context manager
                     timeout_cm = (
                         timeout(time_limit)
@@ -590,23 +615,15 @@ async def task_run_sample(
                         else contextlib.nullcontext()
                     )

+                    # record start time
+                    start_time = time.monotonic()
+                    init_sample_working_limit(start_time, working_limit)
+
                     # run sample w/ optional timeout
                     async with timeout_cm:
                         # mark started
                         active.started = datetime.now().timestamp()

-                        # sample init event (remove file bodies as they have content or absolute paths)
-                        event_sample = sample.model_copy(
-                            update=dict(files={k: "" for k in sample.files.keys()})
-                            if sample.files
-                            else None
-                        )
-                        transcript()._event(
-                            SampleInitEvent(
-                                sample=event_sample, state=state_jsonable(state)
-                            )
-                        )
-
                         # set progress for plan then run it
                         state = await plan(state, generate)

@@ -661,11 +678,13 @@ async def task_run_sample(

             # capture most recent state for scoring
             state = ex.state or sample_state() or state
-            state.completed = True

         except BaseException as ex:
             error, raise_error = handle_error(ex)

+        # mark completed
+        state.completed = True
+
         # set timeout for scoring. if the original timeout was hit we still
         # want to provide opportunity for scoring, but we don't necessarily
         # want to wait the full timeout again (especially in the case where
@@ -768,6 +787,7 @@ async def task_run_sample(

         # log the sample
         await log_sample(
+            start_time=start_time,
             logger=logger,
             sample=sample,
             state=state,
@@ -788,6 +808,7 @@ async def task_run_sample(


 async def log_sample(
+    start_time: float | None,
     logger: TaskLogger,
     sample: Sample,
     state: TaskState,
@@ -804,6 +825,9 @@ async def log_sample(

     # construct sample for logging

+    # compute total time if we can
+    total_time = time.monotonic() - start_time if start_time is not None else None
+
     # if a limit was hit, note that in the Eval Sample
     limit = None
     for e in transcript().events:
@@ -827,8 +851,13 @@ async def log_sample(
         output=state.output,
         scores={k: v.score for k, v in scores.items()},
         store=dict(state.store.items()),
+        uuid=state.uuid,
         events=list(transcript().events),
         model_usage=sample_model_usage(),
+        total_time=round(total_time, 3) if total_time is not None else None,
+        working_time=round(total_time - sample_waiting_time(), 3)
+        if total_time is not None
+        else None,
         error=error,
         limit=limit,
     )
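
The arithmetic implied by the hunks above is: total_time = time.monotonic() - start_time, and working_time = total_time - sample_waiting_time(). The new module inspect_ai/_util/working.py (+46 -0 in the file summary) is not shown in this diff, so the following is only a rough sketch of the accounting that the calls init_sample_working_limit() and sample_waiting_time() suggest; the contextvar-based internals and the report_waiting_time() helper are assumptions, not the actual implementation.

    # Hedged sketch of per-sample working-time accounting (assumed design,
    # not the contents of inspect_ai/_util/working.py).
    import time
    from contextvars import ContextVar

    _start_time: ContextVar[float] = ContextVar("_start_time")
    _working_limit: ContextVar[int | None] = ContextVar("_working_limit", default=None)
    _waiting_time: ContextVar[float] = ContextVar("_waiting_time", default=0.0)

    def init_sample_working_limit(start_time: float, working_limit: int | None) -> None:
        # remember when the sample started and its working-time budget
        _start_time.set(start_time)
        _working_limit.set(working_limit)
        _waiting_time.set(0.0)

    def sample_waiting_time() -> float:
        # total time the sample spent waiting (e.g. on retries or shared resources)
        return _waiting_time.get()

    def report_waiting_time(seconds: float) -> None:  # hypothetical helper
        # callers add waiting time; working time is total time minus this value
        _waiting_time.set(_waiting_time.get() + seconds)

Under this reading, the logged working_time excludes whatever waiting time providers and utilities report, which matches the docstring added to eval(): working time covers model generation and tool calls but not time spent waiting on retries or shared resources.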
|