inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +31 -0
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +13 -20
- inspect_ai/_util/local_server.py +368 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +159 -146
- inspect_ai/_view/www/dist/assets/index.js +1020 -1061
- inspect_ai/_view/www/log-schema.json +4 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +3 -2
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +20 -12
- inspect_ai/agent/_as_tool.py +15 -3
- inspect_ai/agent/_handoff.py +8 -1
- inspect_ai/agent/_run.py +11 -3
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +0 -8
- inspect_ai/log/_transcript.py +7 -1
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +32 -12
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +21 -48
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_openai_responses.py +13 -1
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +241 -0
- inspect_ai/model/_providers/vllm.py +207 -400
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +2 -0
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +12 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -43,6 +43,9 @@ MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
+NO_LOG_REALTIME_HELP = (
+    "Do not log events in realtime (affects live viewing of samples in inspect view)"
+)
 NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
 RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
 LOG_IMAGES_HELP = (
@@ -281,6 +284,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=NO_LOG_SAMPLES_HELP,
     envvar="INSPECT_EVAL_NO_LOG_SAMPLES",
 )
+@click.option(
+    "--no-log-realtime",
+    type=bool,
+    is_flag=True,
+    help=NO_LOG_REALTIME_HELP,
+    envvar="INSPECT_EVAL_NO_LOG_REALTIME",
+)
 @click.option(
     "--log-images/--no-log-images",
     type=bool,
@@ -544,6 +554,7 @@ def eval_command(
     no_fail_on_error: bool | None,
     retry_on_error: int | None,
     no_log_samples: bool | None,
+    no_log_realtime: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     log_shared: int | None,
@@ -600,6 +611,7 @@ def eval_command(
         retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         no_log_samples=no_log_samples,
+        no_log_realtime=no_log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -718,6 +730,7 @@ def eval_set_command(
     no_fail_on_error: bool | None,
     retry_on_error: int | None,
     no_log_samples: bool | None,
+    no_log_realtime: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     log_shared: int | None,
@@ -779,6 +792,7 @@ def eval_set_command(
         retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         no_log_samples=no_log_samples,
+        no_log_realtime=no_log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -837,6 +851,7 @@ def eval_exec(
     retry_on_error: int | None,
     debug_errors: bool | None,
     no_log_samples: bool | None,
+    no_log_realtime: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     log_shared: int | None,
@@ -889,6 +904,7 @@ def eval_exec(
     # resolve negating options
     sandbox_cleanup = False if no_sandbox_cleanup else None
     log_samples = False if no_log_samples else None
+    log_realtime = False if no_log_realtime else None
     log_images = False if log_images is False else None
     trace = True if trace else None
     score = False if no_score else True
@@ -929,6 +945,7 @@ def eval_exec(
         max_subprocesses=max_subprocesses,
         max_sandboxes=max_sandboxes,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -1069,6 +1086,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=NO_LOG_SAMPLES_HELP,
     envvar="INSPECT_EVAL_LOG_SAMPLES",
 )
+@click.option(
+    "--no-log-realtime",
+    type=bool,
+    is_flag=True,
+    help=NO_LOG_REALTIME_HELP,
+    envvar="INSPECT_EVAL_LOG_REALTIME",
+)
 @click.option(
     "--log-images/--no-log-images",
     type=bool,
@@ -1136,6 +1160,7 @@ def eval_retry_command(
     no_fail_on_error: bool | None,
     retry_on_error: int | None,
     no_log_samples: bool | None,
+    no_log_realtime: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     log_shared: int | None,
@@ -1154,6 +1179,7 @@ def eval_retry_command(
     # resolve negating options
     sandbox_cleanup = False if no_sandbox_cleanup else None
     log_samples = False if no_log_samples else None
+    log_realtime = False if no_log_realtime else None
     log_images = False if log_images is False else None
     score = False if no_score else True
     score_display = False if no_score_display else None
@@ -1164,6 +1190,10 @@ def eval_retry_command(
     elif fail_on_error == 0.0:
         fail_on_error = True
 
+    # resolve retry on error
+    if retry_on_error == 0:
+        retry_on_error = None
+
     # resolve log file
     retry_log_files = [
         log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -1185,6 +1215,7 @@ def eval_retry_command(
         retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
inspect_ai/_eval/eval.py
CHANGED
@@ -101,6 +101,7 @@ def eval(
     max_subprocesses: int | None = None,
     max_sandboxes: int | None = None,
     log_samples: bool | None = None,
+    log_realtime: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
@@ -145,7 +146,7 @@ def eval(
             to "eval", the native high-performance format).
         limit: Limit evaluated samples
             (defaults to all samples).
-        sample_id: Evaluate specific sample(s) from the dataset.
+        sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
         epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error: `True` to fail on first sample error
@@ -171,6 +172,7 @@ def eval(
         max_sandboxes: Maximum number of sandboxes (per-provider)
             to run in parallel.
         log_samples: Log detailed samples and scores (defaults to True)
+        log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
         log_images: Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
         log_buffer: Number of samples to buffer before writing log file.
@@ -228,6 +230,7 @@ def eval(
         max_subprocesses=max_subprocesses,
         max_sandboxes=max_sandboxes,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -281,6 +284,7 @@ async def eval_async(
     max_subprocesses: int | None = None,
     max_sandboxes: int | None = None,
     log_samples: bool | None = None,
+    log_realtime: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
@@ -314,7 +318,7 @@ async def eval_async(
         log_dir: Output path for logging results (defaults to file log in ./logs directory).
         log_format: Format for writing log files (defaults to "eval", the native high-performance format).
         limit: Limit evaluated samples (defaults to all samples).
-        sample_id: Evaluate specific sample(s) from the dataset.
+        sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
         epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error: `True` to fail on first sample error
@@ -335,6 +339,7 @@ async def eval_async(
         max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
         max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
         log_samples: Log detailed samples and scores (defaults to True)
+        log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
         log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
@@ -473,6 +478,7 @@ async def eval_async(
         max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -562,6 +568,7 @@ def eval_retry(
     retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
+    log_realtime: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
@@ -603,6 +610,7 @@ def eval_retry(
         debug_errors: Raise task errors (rather than logging them)
             so they can be debugged (defaults to False).
         log_samples: Log detailed samples and scores (defaults to True)
+        log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
         log_images: Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
         log_buffer: Number of samples to buffer before writing log file.
@@ -645,6 +653,7 @@ def eval_retry(
         retry_on_error=retry_on_error,
         debug_errors=debug_errors,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
@@ -673,6 +682,7 @@ async def eval_retry_async(
     retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
+    log_realtime: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
@@ -707,6 +717,7 @@ async def eval_retry_async(
         debug_errors: Raise task errors (rather than logging them)
             so they can be debugged (defaults to False).
         log_samples: Log detailed samples and scores (defaults to True)
+        log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
         log_images: Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
         log_buffer: Number of samples to buffer before writing log file.
@@ -817,6 +828,11 @@ async def eval_retry_async(
     log_samples = (
         log_samples if log_samples is not None else eval_log.eval.config.log_samples
     )
+    log_realtime = (
+        log_realtime
+        if log_realtime is not None
+        else eval_log.eval.config.log_realtime
+    )
     log_images = (
         log_images if log_images is not None else eval_log.eval.config.log_images
     )
@@ -875,6 +891,7 @@ async def eval_retry_async(
         max_subprocesses=max_subprocesses,
         max_sandboxes=max_sandboxes,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
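
Both of the additions above surface directly in the `eval()` signature: `log_realtime` can be set to `False` to skip realtime event logging, and `sample_id` values can be prefixed with a task name to scope them to a single task in a multi-task run. A minimal usage sketch follows; the task file and ids are hypothetical and not taken from the diff:

    from inspect_ai import eval

    # hypothetical task file defining tasks named "popularity" and "security";
    # "popularity:10" applies only to the popularity task, 7 applies to all tasks
    logs = eval(
        ["tasks.py"],
        sample_id=["popularity:10", 7],
        log_realtime=False,  # don't write the realtime sample buffer
    )
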
inspect_ai/_eval/evalset.py
CHANGED
@@ -93,6 +93,7 @@ def eval_set(
     max_subprocesses: int | None = None,
     max_sandboxes: int | None = None,
     log_samples: bool | None = None,
+    log_realtime: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
@@ -147,7 +148,7 @@ def eval_set(
             log files (defaults to "eval", the native high-performance format).
         limit: Limit evaluated samples
             (defaults to all samples).
-        sample_id: Evaluate specific sample(s) from the dataset.
+        sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
         epochs: Epochs to repeat samples for and optional score
             reducer function(s) used to combine sample scores (defaults to "mean")
         fail_on_error: `True` to fail on first sample error
@@ -173,6 +174,7 @@ def eval_set(
         max_sandboxes: Maximum number of sandboxes (per-provider)
             to run in parallel.
         log_samples: Log detailed samples and scores (defaults to True)
+        log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
         log_images: Log base64 encoded version of images,
             even if specified as a filename or URL (defaults to False)
         log_buffer: Number of samples to buffer before writing log file.
@@ -229,6 +231,7 @@ def eval_set(
         max_subprocesses=max_subprocesses,
         max_sandboxes=max_sandboxes,
         log_samples=log_samples,
+        log_realtime=log_realtime,
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
inspect_ai/_eval/run.py
CHANGED
@@ -122,6 +122,11 @@ async def eval_run(
         task = resolved_task.task
         task_eval_config = eval_config.model_copy()
 
+        # sample_ids can be specified per task
+        task_eval_config.sample_id = resolve_task_sample_ids(
+            resolved_task.task.name, task_eval_config.sample_id
+        )
+
         # resolve the task scorers
         eval_scorer_specs = (
             [as_scorer_spec(scorer) for scorer in task.scorer]
@@ -424,6 +429,42 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalLog]:
     return results
 
 
+def resolve_task_sample_ids(
+    task: str, sample_id: str | int | list[str] | list[int] | list[str | int] | None
+) -> str | int | list[str] | list[int] | list[str | int] | None:
+    def collect_for_task(sample: str | int) -> str | int | None:
+        if isinstance(sample, str):
+            scoped = sample.split(":", maxsplit=1)
+            if len(scoped) > 1:
+                if scoped[0].lower() == task.lower():
+                    return scoped[1]
+                else:
+                    return None
+            else:
+                return sample
+        else:
+            return sample
+
+    if sample_id is not None:
+        if isinstance(sample_id, list):
+            ids: list[int | str] = []
+            for id in sample_id:
+                collect = collect_for_task(id)
+                if collect is not None:
+                    ids.append(collect)
+            return ids
+
+        else:
+            collect = collect_for_task(sample_id)
+            if collect is not None:
+                return collect
+            else:
+                return []
+
+    else:
+        return sample_id
+
+
 async def startup_sandbox_environments(
     eval_sandbox: SandboxEnvironmentSpec | None,
     tasks: list[ResolvedTask],
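
For reference, the `resolve_task_sample_ids()` helper added above keeps a task-prefixed id only for the task it names (returning the portion after the colon as a string), passes unprefixed ids through unchanged, and yields an empty list when a single prefixed id targets a different task. A small illustrative check against the function as shown, with hypothetical task names:

    # behaviour implied by the function body above (hypothetical task names)
    assert resolve_task_sample_ids("popularity", ["popularity:10", "security:5", 7]) == ["10", 7]
    assert resolve_task_sample_ids("security", "popularity:10") == []
    assert resolve_task_sample_ids("security", None) is None
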
inspect_ai/_eval/task/generate.py
CHANGED
@@ -4,7 +4,6 @@ from inspect_ai.model import CachePolicy, GenerateConfig, Model
 from inspect_ai.model._cache import epoch
 from inspect_ai.model._call_tools import execute_tools
 from inspect_ai.solver import TaskState
-from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
 
 
@@ -18,53 +17,48 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice
 
-
-
-
-
-
-            epoch.set(state.epoch)
+    while True:
+        # If we don't update the epoch here as we go, it's entirely possible
+        # we'd cache the same response for every single epoch, which would
+        # completely defeat the point!
+        epoch.set(state.epoch)
 
-
-
-
-
-
-
-
-
-
-            # append the assistant message
-            message = state.output.message
-            state.messages.append(message)
-
-            # check for completed
-            if state.completed:
-                return state
+        # call the model
+        state.output = await model.generate(
+            input=state.messages,
+            tools=state.tools,
+            tool_choice=tool_choice,
+            config=config,
+            cache=cache,
+        )
 
-
-
-
-            messages, output = await execute_tools(
-                state.messages, state.tools, config.max_tool_output
-            )
-            state.messages.extend(messages)
-                state.output = output
+        # append the assistant message
+        message = state.output.message
+        state.messages.append(message)
 
-
-
-
+        # check for completed
+        if state.completed:
+            return state
 
-
-
-
-
+        # resolve tool calls if necessary
+        if tool_calls != "none" and message.tool_calls:
+            # call tools and update messages and output
+            messages, output = await execute_tools(
+                state.messages, state.tools, config.max_tool_output
+            )
+            state.messages.extend(messages)
+            if output is not None:
+                state.output = output
 
-            #
-
+            # check for completed or only executing a single tool call
+            if state.completed or tool_calls == "single":
                 return state
 
-
-
-
+            # if a tool_call was forced set tool_choice to 'auto'
+            # (otherwise it will get forced over and over again)
+            if isinstance(tool_choice, ToolFunction):
+                tool_choice = "auto"
+
+        # no tool calls or not resolving tool calls, we are done!
+        else:
+            return state
inspect_ai/_eval/task/log.py
CHANGED
@@ -30,13 +30,14 @@ from inspect_ai.log._log import (
     EvalLog,
     EvalMetricDefinition,
     EvalSampleReductions,
+    EvalSampleSummary,
     EvalScorer,
     eval_config_defaults,
 )
 from inspect_ai.log._model import model_args_for_log, model_roles_to_model_roles_config
 from inspect_ai.log._recorders import Recorder
 from inspect_ai.log._recorders.buffer import SampleBufferDatabase
-from inspect_ai.log._recorders.types import SampleEvent
+from inspect_ai.log._recorders.types import SampleEvent
 from inspect_ai.log._transcript import Event
 from inspect_ai.model import (
     GenerateConfig,
@@ -160,13 +161,17 @@ class TaskLogger:
         self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
         self.flush_pending: list[tuple[str | int, int]] = []
 
+        # sample buffer db
+        self._buffer_db: SampleBufferDatabase | None = None
+
     async def init(self) -> None:
         self._location = await self.recorder.log_init(self.eval)
-        self.
-
-
-
-
+        if self.eval.config.log_realtime is not False:
+            self._buffer_db = SampleBufferDatabase(
+                location=self._location,
+                log_images=self.eval.config.log_images is not False,
+                log_shared=self.eval.config.log_shared,
+            )
 
     @property
     def location(self) -> str:
@@ -180,36 +185,26 @@ class TaskLogger:
         await self.recorder.log_start(self.eval, plan)
         await self.recorder.flush(self.eval)
 
-    async def start_sample(self, sample:
-        self._buffer_db
+    async def start_sample(self, sample: EvalSampleSummary) -> None:
+        if self._buffer_db is not None:
+            self._buffer_db.start_sample(sample)
 
     def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
         # log the sample event
-        self._buffer_db
+        if self._buffer_db is not None:
+            self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
 
     def remove_sample(self, id: str | int, epoch: int) -> None:
-        self._buffer_db
+        if self._buffer_db is not None:
+            self._buffer_db.remove_samples([(id, epoch)])
 
     async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
         # log the sample
         await self.recorder.log_sample(self.eval, sample)
 
         # mark complete
-        self._buffer_db
-
-                id=sample.id,
-                epoch=sample.epoch,
-                input=sample.input,
-                target=sample.target,
-                completed=True,
-                scores=sample.scores,
-                error=sample.error.message if sample.error is not None else None,
-                limit=f"{sample.limit.type}" if sample.limit is not None else None,
-                retries=len(sample.error_retries)
-                if sample.error_retries is not None
-                else None,
-            )
-        )
+        if self._buffer_db is not None:
+            self._buffer_db.complete_sample(sample.summary())
 
         # flush if requested
         if flush:
@@ -219,7 +214,8 @@ class TaskLogger:
             await self.recorder.flush(self.eval)
 
             # notify the event db it can remove these
-            self._buffer_db
+            if self._buffer_db is not None:
+                self._buffer_db.remove_samples(self.flush_pending)
 
             # Clear
             self.flush_pending.clear()
@@ -229,7 +225,8 @@ class TaskLogger:
         self._samples_completed += 1
 
     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-        self._buffer_db
+        if self._buffer_db is not None:
+            self._buffer_db.update_metrics(metrics)
 
     async def log_finish(
         self,
@@ -245,7 +242,8 @@ class TaskLogger:
         )
 
         # cleanup the events db
-        self._buffer_db
+        if self._buffer_db is not None:
+            self._buffer_db.cleanup()
 
         # return log
         return log
inspect_ai/_eval/task/run.py
CHANGED
@@ -51,8 +51,12 @@ from inspect_ai.log import (
 )
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
-from inspect_ai.log._log import
-
+from inspect_ai.log._log import (
+    EvalSampleLimit,
+    EvalSampleReductions,
+    EvalSampleSummary,
+    eval_error,
+)
 from inspect_ai.log._samples import (
     active_sample,
 )
@@ -82,9 +86,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
-from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import LimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -649,17 +653,18 @@ async def task_run_sample(
             init_sample_working_limit(start_time, working_limit)
 
             # run sample w/ optional timeout
-            with timeout_cm:
+            with timeout_cm, state._token_limit, state._message_limit:
                 # mark started
                 active.started = datetime.now().timestamp()
 
                 if logger is not None:
                     await logger.start_sample(
-
+                        EvalSampleSummary(
                             id=sample_id,
                             epoch=state.epoch,
                             input=sample.input,
                             target=sample.target,
+                            metadata=sample.metadata or {},
                         )
                     )
 
@@ -707,18 +712,9 @@ async def task_run_sample(
                 # handle the cancel exception
                 raise
 
-            except SampleLimitExceededError as ex:
-                # sample limit event
-                transcript()._event(
-                    SampleLimitEvent(
-                        type=ex.type,
-                        limit=ex.limit,
-                        message=f"Sample completed: {ex.message}",
-                    )
-                )
-
+            except LimitExceededError:
                 # capture most recent state for scoring
-                state =
+                state = sample_state() or state
 
             except BaseException as ex:
                 error, raise_error = handle_error(ex)
@@ -735,9 +731,6 @@ async def task_run_sample(
             if time_limit is not None:
                 timeout_cm = anyio.fail_after(time_limit / 2)
 
-            # turn off message and token limits
-            state.message_limit = None
-            state.token_limit = None
             set_sample_state(state)
 
             # scoring
@@ -929,7 +922,7 @@ async def log_sample(
         input=sample.input,
         choices=sample.choices,
         target=sample.target,
-        metadata=
+        metadata=sample.metadata or {},
         sandbox=sample.sandbox,
         files=list(sample.files.keys()) if sample.files else None,
         setup=sample.setup,