inspect-ai 0.3.69__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/textual/app.py +3 -2
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +904 -750
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +32 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +2 -0
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +74 -67
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
 @click.option(
     "--time-limit",
     type=int,
-    help="Limit on total
+    help="Limit on total running time for each sample.",
     envvar="INSPECT_EVAL_TIME_LIMIT",
 )
+@click.option(
+    "--working-limit",
+    type=int,
+    help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+    envvar="INSPECT_EVAL_WORKING_LIMIT",
+)
 @click.option(
     "--fail-on-error",
     type=float,
@@ -468,6 +474,7 @@ def eval_command(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -518,6 +525,7 @@ def eval_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -629,6 +637,7 @@ def eval_set_command(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -684,6 +693,7 @@ def eval_set_command(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
@@ -737,6 +747,7 @@ def eval_exec(
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
+    working_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
@@ -817,6 +828,7 @@ def eval_exec(
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
+        working_limit=working_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
inspect_ai/_display/textual/app.py
CHANGED
@@ -185,7 +185,8 @@ class TaskScreenApp(App[TR]):
         # force repaint
         self.refresh(repaint=True)

-        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # enable mouse support (this broke in textual 2.0 when running in VS Code
+        # however is fixed in textual 2.1)
         assert self.app._driver
         textual_enable_mouse_support(self.app._driver)

@@ -315,7 +316,7 @@ class TaskScreenApp(App[TR]):

         def set_unread(unread: int | None) -> None:
             if unread is not None:
-                console_tab.label = f"Console ({unread}"  # type: ignore[assignment]
+                console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
             else:
                 console_tab.label = "Console"  # type: ignore[assignment]

inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -39,7 +39,7 @@ class SamplesView(Widget):
         padding: 0 1 0 1;
         layout: grid;
         grid-size: 2 3;
-        grid-rows: auto 1fr
+        grid-rows: auto 1fr 3;
         grid-columns: 32 1fr;
         grid-gutter: 1;
     }
@@ -141,8 +141,8 @@ class SamplesList(OptionList):
         if highlighted_sample and (highlighted_sample not in self.samples):
             self.samples.append(highlighted_sample)

-        # sort the samples by
-        self.samples.sort(key=lambda sample: sample.
+        # sort the samples by running time
+        self.samples.sort(key=lambda sample: sample.running_time, reverse=True)

         # rebuild the list
         self.clear_options()
@@ -154,9 +154,7 @@ class SamplesList(OptionList):
             table.add_column(width=1)
             task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
             task_name.truncate(18, overflow="ellipsis", pad=True)
-            task_time = Text.from_markup(
-                f"{format_progress_time(sample.execution_time)}"
-            )
+            task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
             table.add_row(task_name, task_time, " ")
             sample_id = Text.from_markup(f"id: {sample.sample.id}")
             sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -423,10 +421,6 @@ class SampleToolbar(Horizontal):
     CANCEL_DISABLED = "Cancelling sample..."

     DEFAULT_CSS = f"""
-    SampleToolbar {{
-        grid-size: 5 1;
-        grid-columns: auto auto 1fr auto auto;
-    }}
     SampleToolbar #{STATUS_GROUP} {{
         width: 22;
     }}
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -193,16 +193,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     return EventDisplay(f"model: {event.model}", Group(*content))


-def 
-
-
-
-    for 
-
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    for e in events:
+        event_displays = render_event(e) or []
+        for d in event_displays:
+            if d.content:
+                content.append(Text(" "))
+                content.append(transcript_separator(d.title, "black", "··"))
+                if isinstance(d.content, Markdown):
+                    set_transcript_markdown_options(d.content)
+                content.append(d.content)
+
+    return content
+

+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)

+    # render sub-events
+    if event.events:
+        content.extend(render_sub_events(event.events))
+
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -220,7 +233,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         result = str(result).strip()
         content.extend(lines_display(result, 50))

-    return 
+    return [EventDisplay("tool call", Group(*content))]


 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +270,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:


 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+    # render header
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
     # render sub-events
-    display: list[EventDisplay] = []
     if event.events:
-
-        display.extend(render_event(e) or [])
+        content.extend(render_sub_events(event.events))

-    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +284,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
         else:
             content.append(render_as_json(event.result))

-    return 
+    return [EventDisplay(f"subtask: {event.name}", Group(*content))]


 def render_input_event(event: InputEvent) -> EventDisplay:
inspect_ai/_eval/eval.py
CHANGED
@@ -75,6 +75,7 @@ def eval(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for 
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+          time includes model generation, tool calls, etc. but does not include
+          time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
           (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
            so they can be debugged (defaults to False).
        message_limit (int | None): Limit on total messages used for each sample.
        token_limit (int | None): Limit on total tokens used for each sample.
-       time_limit
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+          time includes model generation, tool calls, etc. but does not include
+          time spent waiting on retries or shared resources.
        max_samples (int | None): Maximum number of samples to run in parallel
           (default is max_connections)
        max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
        message_limit = eval_log.eval.config.message_limit
        token_limit = eval_log.eval.config.token_limit
        time_limit = eval_log.eval.config.time_limit
+       working_limit = eval_log.eval.config.working_limit
        max_samples = max_samples or eval_log.eval.config.max_samples
        max_tasks = max_tasks or eval_log.eval.config.max_tasks
        max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
            message_limit=message_limit,
            token_limit=token_limit,
            time_limit=time_limit,
+           working_limit=working_limit,
            max_samples=max_samples,
            max_tasks=max_tasks,
            max_subprocesses=max_subprocesses,
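For orientation, a minimal sketch of how the new parameter is used from the Python API (the task file name and limit values are hypothetical; `time_limit` and `working_limit` are the `eval()` parameters shown in this diff, and the same setting is exposed on the CLI as `--working-limit` / `INSPECT_EVAL_WORKING_LIMIT`):

```python
from inspect_ai import eval

# hypothetical task file; both limits apply per sample
logs = eval(
    "ctf.py",
    time_limit=3600,     # clock time: 1 hour per sample
    working_limit=1800,  # working time (model generation, tool calls, ...): 30 minutes
)
```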
inspect_ai/_eval/evalset.py
CHANGED
@@ -79,6 +79,7 @@ def eval_set(
     message_limit: int | None = None,
     token_limit: int | None = None,
     time_limit: int | None = None,
+    working_limit: int | None = None,
     max_samples: int | None = None,
     max_tasks: int | None = None,
     max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for 
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+          time includes model generation, tool calls, etc. but does not include
+          time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
           (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py
CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
         else:
             task.time_limit = task_eval_config.time_limit

+        # sample execution limit
+        if task_eval_config.working_limit is None:
+            task_eval_config.working_limit = task.working_limit
+        else:
+            task.working_limit = task_eval_config.working_limit
+
         # fail_on_error
         if task_eval_config.fail_on_error is None:
             task_eval_config.fail_on_error = task.fail_on_error
inspect_ai/_eval/task/run.py
CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+    init_sample_working_limit,
+    sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -56,6 +60,7 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
+    StepEvent,
     transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +187,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
     if isinstance(solver, Plan):
         plan = solver
     elif isinstance(solver, Chain):
-        plan = Plan(list(solver), internal=True)
+        plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
     else:
-        plan = Plan(unroll(solver), internal=True)
+        plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)

     # add setup solver(s) if specified
     if task.setup:
@@ -308,6 +313,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                 or config.fail_on_error is True
             ),
             time_limit=config.time_limit,
+            working_limit=config.working_limit,
             semaphore=sample_semaphore,
         )
         for (sample, state) in zip(samples, states)
@@ -500,6 +506,7 @@ async def task_run_sample(
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
+    working_limit: int | None,
     semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
     # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +577,37 @@ async def task_run_sample(
             message_limit=state.message_limit,
             token_limit=state.token_limit,
             time_limit=time_limit,
+            working_limit=working_limit,
             fails_on_error=fails_on_error,
             transcript=sample_transcript,
         ) as active,
     ):
+        start_time: float | None = None
         error: EvalError | None = None
         raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
+            # begin init
+            transcript()._event(StepEvent(action="begin", name="init"))
+
+            # sample init event (remove file bodies as they have content or absolute paths)
+            event_sample = sample.model_copy(
+                update=dict(files={k: "" for k in sample.files.keys()})
+                if sample.files
+                else None
+            )
+            transcript()._event(
+                SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+            )
+
            async with sandboxenv_cm:
                try:
                    # update active sample wth sandboxes now that we are initialised
                    active.sandboxes = await sandbox_connections()

+                    # end init
+                    transcript()._event(StepEvent(action="end", name="init"))
+
                    # initialise timeout context manager
                    timeout_cm = (
                        timeout(time_limit)
@@ -590,23 +615,15 @@ async def task_run_sample(
                        else contextlib.nullcontext()
                    )

+                    # record start time
+                    start_time = time.monotonic()
+                    init_sample_working_limit(start_time, working_limit)
+
                    # run sample w/ optional timeout
                    async with timeout_cm:
                        # mark started
                        active.started = datetime.now().timestamp()

-                        # sample init event (remove file bodies as they have content or absolute paths)
-                        event_sample = sample.model_copy(
-                            update=dict(files={k: "" for k in sample.files.keys()})
-                            if sample.files
-                            else None
-                        )
-                        transcript()._event(
-                            SampleInitEvent(
-                                sample=event_sample, state=state_jsonable(state)
-                            )
-                        )
-
                        # set progress for plan then run it
                        state = await plan(state, generate)

@@ -661,11 +678,13 @@ async def task_run_sample(

                    # capture most recent state for scoring
                    state = ex.state or sample_state() or state
-                    state.completed = True

                except BaseException as ex:
                    error, raise_error = handle_error(ex)

+                # mark completed
+                state.completed = True
+
                # set timeout for scoring. if the original timeout was hit we still
                # want to provide opportunity for scoring, but we don't necessarily
                # want to wait the full timeout again (especially in the case where
@@ -768,6 +787,7 @@ async def task_run_sample(

        # log the sample
        await log_sample(
+           start_time=start_time,
            logger=logger,
            sample=sample,
            state=state,
@@ -788,6 +808,7 @@ async def task_run_sample(


 async def log_sample(
+    start_time: float | None,
     logger: TaskLogger,
     sample: Sample,
     state: TaskState,
@@ -804,6 +825,9 @@ async def log_sample(

     # construct sample for logging

+    # compute total time if we can
+    total_time = time.monotonic() - start_time if start_time is not None else None
+
     # if a limit was hit, note that in the Eval Sample
     limit = None
     for e in transcript().events:
@@ -827,8 +851,13 @@ async def log_sample(
         output=state.output,
         scores={k: v.score for k, v in scores.items()},
         store=dict(state.store.items()),
+        uuid=state.uuid,
         events=list(transcript().events),
         model_usage=sample_model_usage(),
+        total_time=round(total_time, 3) if total_time is not None else None,
+        working_time=round(total_time - sample_waiting_time(), 3)
+        if total_time is not None
+        else None,
         error=error,
         limit=limit,
     )
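A short sketch of how the per-sample timing recorded by `log_sample()` fits together. Only `total_time`, `working_time`, and `sample_waiting_time()` come from the diff above; the numeric values are illustrative:

```python
# illustrative values, in seconds
total_time = 250.0    # wall-clock duration (time.monotonic() - start_time), logged as total_time
waiting_time = 40.0   # time spent waiting on retries / shared resources (sample_waiting_time())
working_time = round(total_time - waiting_time, 3)  # 210.0, logged as working_time

# time_limit constrains total_time, while the new working_limit constrains working_time
assert working_time <= total_time
```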
inspect_ai/_eval/task/task.py
CHANGED
@@ -1,7 +1,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
-from typing import Any, Callable, Sequence, cast
+from typing import Any, Awaitable, Callable, Sequence, cast

 from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
@@ -17,6 +17,7 @@ from inspect_ai.scorer import Metric, Scorer
 from inspect_ai.scorer._reducer import ScoreReducers, create_reducers
 from inspect_ai.solver import Plan, Solver, generate
 from inspect_ai.solver._chain import chain
+from inspect_ai.solver._task_state import TaskState
 from inspect_ai.util._sandbox.environment import (
     SandboxEnvironmentSpec,
     SandboxEnvironmentType,
@@ -46,6 +47,7 @@ class Task:
         dataset: Dataset | Sequence[Sample] | None = None,
         setup: Solver | list[Solver] | None = None,
         solver: Solver | list[Solver] = generate(),
+        cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
         scorer: Scorer | list[Scorer] | None = None,
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
         config: GenerateConfig = GenerateConfig(),
@@ -56,6 +58,7 @@ class Task:
         message_limit: int | None = None,
         token_limit: int | None = None,
         time_limit: int | None = None,
+        working_limit: int | None = None,
         name: str | None = None,
         version: int = 0,
         metadata: dict[str, Any] | None = None,
@@ -69,6 +72,9 @@ class Task:
              even when the main `solver` is replaced).
           solver: (Solver | list[Solver]): Solver or list of solvers.
              Defaults to generate(), a normal call to the model.
+          cleanup: Optional cleanup function for task. Called after
+             all solvers have run for each sample (including if an
+             exception occurs during the run)
           scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
           metrics (list[Metric] | dict[str, list[Metric]] | None):
              Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -86,7 +92,10 @@ class Task:
              eval if a count of samples fails.
           message_limit (int | None): Limit on total messages used for each sample.
           token_limit (int | None): Limit on total tokens used for each sample.
-          time_limit
+          time_limit: Limit on clock time (in seconds) for samples.
+          working_limit: Limit on working time (in seconds) for sample. Working
+             time includes model generation, tool calls, etc. but does not include
+             time spent waiting on retries or shared resources.
           name: (str | None): Task name. If not specified is automatically
              determined based on the name of the task directory (or "task")
              if its anonymous task (e.g. created in a notebook and passed to
@@ -123,6 +132,7 @@ class Task:
         self.dataset = resolve_dataset(dataset)
         self.setup = setup
         self.solver = resolve_solver(solver)
+        self.cleanup = cleanup
         self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
@@ -135,6 +145,7 @@ class Task:
         self.message_limit = message_limit
         self.token_limit = token_limit
         self.time_limit = time_limit
+        self.working_limit = working_limit
         self.version = version
         self._name = name
         self.metadata = metadata
@@ -162,6 +173,7 @@ def task_with(
     dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
     setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
     solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    cleanup: Callable[[TaskState], Awaitable[None]] | None | NotGiven = NOT_GIVEN,
     scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
@@ -172,6 +184,7 @@ def task_with(
     message_limit: int | None | NotGiven = NOT_GIVEN,
     token_limit: int | None | NotGiven = NOT_GIVEN,
     time_limit: int | None | NotGiven = NOT_GIVEN,
+    working_limit: int | None | NotGiven = NOT_GIVEN,
     name: str | None | NotGiven = NOT_GIVEN,
     version: int | NotGiven = NOT_GIVEN,
     metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
@@ -185,6 +198,9 @@ def task_with(
          even when the main `solver` is replaced).
       solver: (Solver | list[Solver]): Solver or list of solvers.
          Defaults to generate(), a normal call to the model.
+      cleanup: Optional cleanup function for task. Called after
+         all solvers have run for each sample (including if an
+         exception occurs during the run)
       scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
       metrics (list[Metric] | dict[str, list[Metric]] | None):
          Alternative metrics (overrides the metrics provided by the specified scorer).
@@ -202,7 +218,10 @@ def task_with(
          eval if a count of samples fails.
       message_limit (int | None): Limit on total messages used for each sample.
       token_limit (int | None): Limit on total tokens used for each sample.
-      time_limit
+      time_limit: Limit on clock time (in seconds) for samples.
+      working_limit: Limit on execution time (in seconds) for sample. Execution
+         time includes model generation, tool calls, etc. but does not include
+         time spent waiting on retries or shared resources.
       name: (str | None): Task name. If not specified is automatically
          determined based on the name of the task directory (or "task")
         if its anonymous task (e.g. created in a notebook and passed to
@@ -223,6 +242,8 @@ def task_with(
         task.setup = setup
     if not isinstance(solver, NotGiven):
         task.solver = resolve_solver(solver)
+    if not isinstance(cleanup, NotGiven):
+        task.cleanup = cleanup
     if not isinstance(scorer, NotGiven):
         task.scorer = resolve_scorer(scorer)
     if not isinstance(metrics, NotGiven):
@@ -245,6 +266,8 @@ def task_with(
         task.token_limit = token_limit
     if not isinstance(time_limit, NotGiven):
         task.time_limit = time_limit
+    if not isinstance(working_limit, NotGiven):
+        task.working_limit = working_limit
     if not isinstance(version, NotGiven):
         task.version = version
     if not isinstance(name, NotGiven):
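A hedged sketch of a task that uses the two new parameters together (the dataset, scorer choice, and cleanup body are placeholders; the `cleanup` callable signature and `working_limit` parameter are taken from the diff above):

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import match
from inspect_ai.solver import TaskState, generate


async def release_resources(state: TaskState) -> None:
    # placeholder: runs after all solvers for each sample, even if an exception occurred
    ...


@task
def demo_task() -> Task:
    return Task(
        dataset=[Sample(input="Say hello.", target="hello")],
        solver=generate(),
        scorer=match(),
        cleanup=release_resources,
        working_limit=600,  # 10 minutes of working time per sample
    )
```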
inspect_ai/_util/interrupt.py
CHANGED
@@ -1,9 +1,15 @@
 import asyncio

+from .working import check_sample_working_limit
+

 def check_sample_interrupt() -> None:
     from inspect_ai.log._samples import sample_active

+    # check for user interrupt
     sample = sample_active()
     if sample and sample.interrupt_action:
         raise asyncio.CancelledError()
+
+    # check for working_limit
+    check_sample_working_limit()
inspect_ai/_util/logger.py
CHANGED
@@ -160,7 +160,9 @@ def init_logger(

     # init logging handler on demand
     global _logHandler
+    removed_root_handlers = False
     if not _logHandler:
+        removed_root_handlers = remove_non_pytest_root_logger_handlers()
         _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
         getLogger().addHandler(_logHandler)

@@ -173,6 +175,11 @@
     getLogger("httpx").setLevel(capture_level)
     getLogger("botocore").setLevel(DEBUG)

+    if removed_root_handlers:
+        getLogger(PKG_NAME).warning(
+            "Inspect removed pre-existing root logger handlers and replaced them with its own handler."
+        )
+
     # set the levelno on the global handler
     _logHandler.display_level = levelno

@@ -180,6 +187,18 @@
 _logHandler: LogHandler | None = None


+def remove_non_pytest_root_logger_handlers() -> bool:
+    root_logger = getLogger()
+    non_pytest_handlers = [
+        handler
+        for handler in root_logger.handlers
+        if handler.__module__ != "_pytest.logging"
+    ]
+    for handler in non_pytest_handlers:
+        root_logger.removeHandler(handler)
+    return len(non_pytest_handlers) > 0
+
+
 def notify_logger_record(record: LogRecord, write: bool) -> None:
     from inspect_ai.log._message import LoggingMessage
     from inspect_ai.log._transcript import LoggerEvent, transcript