inspect-ai 0.3.81__py3-none-any.whl → 0.3.82__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +35 -2
- inspect_ai/_cli/util.py +44 -1
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +13 -4
- inspect_ai/_display/core/results.py +1 -1
- inspect_ai/_display/textual/widgets/task_detail.py +5 -4
- inspect_ai/_eval/eval.py +38 -1
- inspect_ai/_eval/evalset.py +5 -0
- inspect_ai/_eval/run.py +5 -2
- inspect_ai/_eval/task/log.py +53 -6
- inspect_ai/_eval/task/run.py +51 -10
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/file.py +17 -1
- inspect_ai/_util/json.py +36 -1
- inspect_ai/_view/server.py +113 -1
- inspect_ai/_view/www/App.css +1 -1
- inspect_ai/_view/www/dist/assets/index.css +518 -296
- inspect_ai/_view/www/dist/assets/index.js +38803 -36307
- inspect_ai/_view/www/eslint.config.mjs +1 -1
- inspect_ai/_view/www/log-schema.json +13 -0
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/package.json +8 -2
- inspect_ai/_view/www/src/App.tsx +151 -855
- inspect_ai/_view/www/src/api/api-browser.ts +176 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
- inspect_ai/_view/www/src/api/client-api.ts +66 -10
- inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
- inspect_ai/_view/www/src/api/types.ts +107 -2
- inspect_ai/_view/www/src/appearance/icons.ts +1 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
- inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
- inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
- inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -3
- inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
- inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
- inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
- inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
- inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
- inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
- inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
- inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
- inspect_ai/_view/www/src/index.tsx +26 -94
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +67 -28
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +51 -22
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +144 -90
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +82 -35
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +23 -30
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +4 -1
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +3 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +34 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +10 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +25 -17
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +21 -3
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +20 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +105 -85
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +27 -14
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
- inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
- inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +7 -9
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +7 -11
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +8 -13
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +52 -58
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +30 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
- inspect_ai/_view/www/src/scoring/utils.ts +87 -0
- inspect_ai/_view/www/src/state/appSlice.ts +244 -0
- inspect_ai/_view/www/src/state/hooks.ts +397 -0
- inspect_ai/_view/www/src/state/logPolling.ts +196 -0
- inspect_ai/_view/www/src/state/logSlice.ts +214 -0
- inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
- inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +311 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +127 -0
- inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
- inspect_ai/_view/www/src/state/scrolling.ts +206 -0
- inspect_ai/_view/www/src/state/store.ts +168 -0
- inspect_ai/_view/www/src/state/store_filter.ts +84 -0
- inspect_ai/_view/www/src/state/utils.ts +23 -0
- inspect_ai/_view/www/src/storage/index.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +2 -0
- inspect_ai/_view/www/src/types.ts +94 -32
- inspect_ai/_view/www/src/utils/attachments.ts +58 -23
- inspect_ai/_view/www/src/utils/logger.ts +52 -0
- inspect_ai/_view/www/src/utils/polling.ts +100 -0
- inspect_ai/_view/www/src/utils/react.ts +30 -0
- inspect_ai/_view/www/src/utils/vscode.ts +1 -1
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +181 -216
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +0 -1
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +98 -39
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +11 -13
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +110 -115
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
- inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
- inspect_ai/_view/www/src/workspace/types.ts +4 -3
- inspect_ai/_view/www/src/workspace/utils.ts +4 -4
- inspect_ai/_view/www/vite.config.js +6 -0
- inspect_ai/_view/www/yarn.lock +370 -354
- inspect_ai/log/_condense.py +26 -0
- inspect_ai/log/_log.py +6 -3
- inspect_ai/log/_recorders/buffer/__init__.py +14 -0
- inspect_ai/log/_recorders/buffer/buffer.py +30 -0
- inspect_ai/log/_recorders/buffer/database.py +685 -0
- inspect_ai/log/_recorders/buffer/filestore.py +259 -0
- inspect_ai/log/_recorders/buffer/types.py +84 -0
- inspect_ai/log/_recorders/eval.py +2 -11
- inspect_ai/log/_recorders/types.py +30 -0
- inspect_ai/log/_transcript.py +27 -1
- inspect_ai/model/_call_tools.py +1 -0
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +1 -0
- inspect_ai/tool/_tool_support_helpers.py +4 -4
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/util/_subtask.py +1 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/RECORD +178 -138
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -10,6 +10,7 @@ from inspect_ai._util.constants import (
|
|
10
10
|
ALL_LOG_LEVELS,
|
11
11
|
DEFAULT_EPOCHS,
|
12
12
|
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
13
|
+
DEFAULT_LOG_SHARED,
|
13
14
|
DEFAULT_MAX_CONNECTIONS,
|
14
15
|
)
|
15
16
|
from inspect_ai._util.file import filesystem
|
@@ -25,7 +26,12 @@ from .common import (
|
|
25
26
|
common_options,
|
26
27
|
process_common_options,
|
27
28
|
)
|
28
|
-
from .util import
|
29
|
+
from .util import (
|
30
|
+
int_or_bool_flag_callback,
|
31
|
+
parse_cli_args,
|
32
|
+
parse_cli_config,
|
33
|
+
parse_sandbox,
|
34
|
+
)
|
29
35
|
|
30
36
|
MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
|
31
37
|
MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
|
@@ -41,6 +47,7 @@ LOG_IMAGES_HELP = (
|
|
41
47
|
"Include base64 encoded versions of filename or URL based images in the log file."
|
42
48
|
)
|
43
49
|
LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not specified, an appropriate default for the format and filesystem is chosen (10 for most all cases, 100 for JSON logs on remote filesystems)."
|
50
|
+
LOG_SHARED_HELP = "Sync sample events to log directory so that users on other systems can see log updates in realtime (defaults to no syncing). If enabled will sync every 10 seconds (or pass a value to sync every `n` seconds)."
|
44
51
|
NO_SCORE_HELP = (
|
45
52
|
"Do not score model output (use the inspect score command to score output later)"
|
46
53
|
)
|
@@ -266,6 +273,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
266
273
|
@click.option(
|
267
274
|
"--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
|
268
275
|
)
|
276
|
+
@click.option(
|
277
|
+
"--log-shared",
|
278
|
+
is_flag=False,
|
279
|
+
flag_value="true",
|
280
|
+
default=None,
|
281
|
+
callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
|
282
|
+
help=LOG_SHARED_HELP,
|
283
|
+
envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
|
284
|
+
)
|
269
285
|
@click.option(
|
270
286
|
"--no-score",
|
271
287
|
type=bool,
|
@@ -396,7 +412,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
396
412
|
@click.option(
|
397
413
|
"--reasoning-effort",
|
398
414
|
type=click.Choice(["low", "medium", "high"]),
|
399
|
-
help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
|
415
|
+
help="Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o-series models only.",
|
400
416
|
envvar="INSPECT_EVAL_REASONING_EFFORT",
|
401
417
|
)
|
402
418
|
@click.option(
|
@@ -503,6 +519,7 @@ def eval_command(
|
|
503
519
|
no_log_samples: bool | None,
|
504
520
|
log_images: bool | None,
|
505
521
|
log_buffer: int | None,
|
522
|
+
log_shared: int | None,
|
506
523
|
no_score: bool | None,
|
507
524
|
no_score_display: bool | None,
|
508
525
|
log_format: Literal["eval", "json"] | None,
|
@@ -556,6 +573,7 @@ def eval_command(
|
|
556
573
|
no_log_samples=no_log_samples,
|
557
574
|
log_images=log_images,
|
558
575
|
log_buffer=log_buffer,
|
576
|
+
log_shared=log_shared,
|
559
577
|
no_score=no_score,
|
560
578
|
no_score_display=no_score_display,
|
561
579
|
is_eval_set=False,
|
@@ -670,6 +688,7 @@ def eval_set_command(
|
|
670
688
|
no_log_samples: bool | None,
|
671
689
|
log_images: bool | None,
|
672
690
|
log_buffer: int | None,
|
691
|
+
log_shared: int | None,
|
673
692
|
no_score: bool | None,
|
674
693
|
no_score_display: bool | None,
|
675
694
|
bundle_dir: str | None,
|
@@ -728,6 +747,7 @@ def eval_set_command(
|
|
728
747
|
no_log_samples=no_log_samples,
|
729
748
|
log_images=log_images,
|
730
749
|
log_buffer=log_buffer,
|
750
|
+
log_shared=log_shared,
|
731
751
|
no_score=no_score,
|
732
752
|
no_score_display=no_score_display,
|
733
753
|
is_eval_set=True,
|
@@ -783,6 +803,7 @@ def eval_exec(
|
|
783
803
|
no_log_samples: bool | None,
|
784
804
|
log_images: bool | None,
|
785
805
|
log_buffer: int | None,
|
806
|
+
log_shared: int | None,
|
786
807
|
no_score: bool | None,
|
787
808
|
no_score_display: bool | None,
|
788
809
|
is_eval_set: bool = False,
|
@@ -865,6 +886,7 @@ def eval_exec(
|
|
865
886
|
log_samples=log_samples,
|
866
887
|
log_images=log_images,
|
867
888
|
log_buffer=log_buffer,
|
889
|
+
log_shared=log_shared,
|
868
890
|
score=score,
|
869
891
|
score_display=score_display,
|
870
892
|
)
|
@@ -1004,6 +1026,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
|
|
1004
1026
|
@click.option(
|
1005
1027
|
"--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
|
1006
1028
|
)
|
1029
|
+
@click.option(
|
1030
|
+
"--log-shared",
|
1031
|
+
is_flag=False,
|
1032
|
+
flag_value="true",
|
1033
|
+
default=None,
|
1034
|
+
callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
|
1035
|
+
help=LOG_SHARED_HELP,
|
1036
|
+
envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
|
1037
|
+
)
|
1007
1038
|
@click.option(
|
1008
1039
|
"--no-score",
|
1009
1040
|
type=bool,
|
@@ -1052,6 +1083,7 @@ def eval_retry_command(
|
|
1052
1083
|
no_log_samples: bool | None,
|
1053
1084
|
log_images: bool | None,
|
1054
1085
|
log_buffer: int | None,
|
1086
|
+
log_shared: int | None,
|
1055
1087
|
no_score: bool | None,
|
1056
1088
|
no_score_display: bool | None,
|
1057
1089
|
max_connections: int | None,
|
@@ -1099,6 +1131,7 @@ def eval_retry_command(
|
|
1099
1131
|
log_samples=log_samples,
|
1100
1132
|
log_images=log_images,
|
1101
1133
|
log_buffer=log_buffer,
|
1134
|
+
log_shared=log_shared,
|
1102
1135
|
score=score,
|
1103
1136
|
score_display=score_display,
|
1104
1137
|
max_retries=max_retries,
|
inspect_ai/_cli/util.py
CHANGED
@@ -1,11 +1,54 @@
|
|
1
|
-
from typing import Any
|
1
|
+
from typing import Any, Callable
|
2
2
|
|
3
|
+
import click
|
3
4
|
import yaml
|
4
5
|
|
5
6
|
from inspect_ai._util.config import resolve_args
|
6
7
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
7
8
|
|
8
9
|
|
10
|
+
def int_or_bool_flag_callback(
|
11
|
+
true_value: int, false_value: int = 0
|
12
|
+
) -> Callable[[click.Context, click.Parameter, Any], int]:
|
13
|
+
def callback(ctx: click.Context, param: click.Parameter, value: Any) -> int:
|
14
|
+
"""Callback to parse the an option that can either be a boolean flag or integer.
|
15
|
+
|
16
|
+
Desired behavior:
|
17
|
+
- Not specified at all -> false_value
|
18
|
+
- Specified with no value -> true_value
|
19
|
+
- Specified with "true"/"false" -> true_value or false_value respectively
|
20
|
+
- Specified with an integer -> that integer
|
21
|
+
"""
|
22
|
+
# 1. If this parameter was never given on the command line,
|
23
|
+
# then we return false_value (which defaults to 0).
|
24
|
+
source = ctx.get_parameter_source(param.name) if param.name else ""
|
25
|
+
if source == click.core.ParameterSource.DEFAULT:
|
26
|
+
# Means the user did NOT specify the flag at all
|
27
|
+
return false_value
|
28
|
+
|
29
|
+
# 2. The user did specify the flag. If value is None,
|
30
|
+
# that means they used the flag with no argument, e.g. --my-flag
|
31
|
+
if value is None:
|
32
|
+
return true_value
|
33
|
+
|
34
|
+
# 3. If there is a value, try to parse booleans or an integer.
|
35
|
+
lower_val = value.lower()
|
36
|
+
if lower_val in ("true", "yes", "1"):
|
37
|
+
return true_value
|
38
|
+
elif lower_val in ("false", "no", "0"):
|
39
|
+
return false_value
|
40
|
+
else:
|
41
|
+
# 4. Otherwise, assume it is an integer
|
42
|
+
try:
|
43
|
+
return int(value)
|
44
|
+
except ValueError:
|
45
|
+
raise click.BadParameter(
|
46
|
+
f"Expected 'true', 'false', or an integer for --{param.name}. Got: {value}"
|
47
|
+
)
|
48
|
+
|
49
|
+
return callback
|
50
|
+
|
51
|
+
|
9
52
|
def parse_cli_config(
|
10
53
|
args: tuple[str] | list[str] | None, config: str | None
|
11
54
|
) -> dict[str, Any]:
|
@@ -36,7 +36,7 @@ def task_config(
|
|
36
36
|
value = value if isinstance(value, list) else [value]
|
37
37
|
value = [str(v) for v in value]
|
38
38
|
config_print.append(f"{name}: {','.join(value)}")
|
39
|
-
elif name not in ["limit", "model", "response_schema"]:
|
39
|
+
elif name not in ["limit", "model", "response_schema", "log_shared"]:
|
40
40
|
if isinstance(value, list):
|
41
41
|
value = ",".join([str(v) for v in value])
|
42
42
|
if isinstance(value, str):
|
@@ -15,6 +15,7 @@ from typing import (
|
|
15
15
|
)
|
16
16
|
|
17
17
|
import rich
|
18
|
+
from pydantic import BaseModel, Field, field_validator
|
18
19
|
from rich.console import Console
|
19
20
|
|
20
21
|
from inspect_ai.log import EvalConfig, EvalResults, EvalStats
|
@@ -104,12 +105,20 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
|
|
104
105
|
raise NotImplementedError("input_panel not implemented by current display")
|
105
106
|
|
106
107
|
|
107
|
-
|
108
|
-
class TaskDisplayMetric:
|
108
|
+
class TaskDisplayMetric(BaseModel):
|
109
109
|
scorer: str
|
110
110
|
name: str
|
111
|
-
value: float | int
|
112
|
-
reducer: str | None
|
111
|
+
value: float | int | None = Field(default=None)
|
112
|
+
reducer: str | None = Field(default=None)
|
113
|
+
|
114
|
+
@field_validator("value", mode="before")
|
115
|
+
@classmethod
|
116
|
+
def handle_null_value(cls, v: Any) -> Union[float, int, None]:
|
117
|
+
if v is None:
|
118
|
+
return None
|
119
|
+
if isinstance(v, float | int):
|
120
|
+
return v
|
121
|
+
raise ValueError(f"Expected float, int, or None, got {type(v)}")
|
113
122
|
|
114
123
|
|
115
124
|
@runtime_checkable
|
@@ -180,7 +180,7 @@ def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> s
|
|
180
180
|
)
|
181
181
|
|
182
182
|
metric = metrics[0]
|
183
|
-
if np.isnan(metric.value):
|
183
|
+
if metric.value is None or np.isnan(metric.value):
|
184
184
|
value = " n/a"
|
185
185
|
else:
|
186
186
|
value = f"{metric.value:.2f}"
|
@@ -14,7 +14,7 @@ from inspect_ai._display.core.display import TaskDisplayMetric
|
|
14
14
|
@dataclass
|
15
15
|
class TaskMetric:
|
16
16
|
name: str
|
17
|
-
value: float
|
17
|
+
value: float | int | None
|
18
18
|
|
19
19
|
|
20
20
|
class TaskDetail(Widget):
|
@@ -233,9 +233,10 @@ class TaskMetrics(Widget):
|
|
233
233
|
for metric in self.metrics:
|
234
234
|
# Add the value static but keep it around
|
235
235
|
# for future updates
|
236
|
-
|
237
|
-
self.
|
238
|
-
|
236
|
+
if metric.value is not None:
|
237
|
+
self.value_widgets[metric.name] = Static(
|
238
|
+
self._metric_value(metric.value), markup=False
|
239
|
+
)
|
239
240
|
|
240
241
|
grid.mount(Static(metric.name, markup=False))
|
241
242
|
grid.mount(self.value_widgets[metric.name])
|
inspect_ai/_eval/eval.py
CHANGED
@@ -15,7 +15,11 @@ from typing_extensions import Unpack
|
|
15
15
|
from inspect_ai._cli.util import parse_cli_args
|
16
16
|
from inspect_ai._display.core.active import display as task_display
|
17
17
|
from inspect_ai._util.config import resolve_args
|
18
|
-
from inspect_ai._util.constants import
|
18
|
+
from inspect_ai._util.constants import (
|
19
|
+
DEFAULT_LOG_FORMAT,
|
20
|
+
DEFAULT_LOG_SHARED,
|
21
|
+
JSON_LOG_FORMAT,
|
22
|
+
)
|
19
23
|
from inspect_ai._util.error import PrerequisiteError
|
20
24
|
from inspect_ai._util.file import absolute_file_path
|
21
25
|
from inspect_ai._util.logger import warn_once
|
@@ -31,6 +35,7 @@ from inspect_ai.approval._policy import (
|
|
31
35
|
from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
|
32
36
|
from inspect_ai.log._file import read_eval_log_async
|
33
37
|
from inspect_ai.log._recorders import create_recorder_for_format
|
38
|
+
from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
|
34
39
|
from inspect_ai.model import (
|
35
40
|
GenerateConfig,
|
36
41
|
GenerateConfigArgs,
|
@@ -92,6 +97,7 @@ def eval(
|
|
92
97
|
log_samples: bool | None = None,
|
93
98
|
log_images: bool | None = None,
|
94
99
|
log_buffer: int | None = None,
|
100
|
+
log_shared: bool | int | None = None,
|
95
101
|
score: bool = True,
|
96
102
|
score_display: bool | None = None,
|
97
103
|
**kwargs: Unpack[GenerateConfigArgs],
|
@@ -161,6 +167,9 @@ def eval(
|
|
161
167
|
log_buffer: Number of samples to buffer before writing log file.
|
162
168
|
If not specified, an appropriate default for the format and filesystem is
|
163
169
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
170
|
+
log_shared: Sync sample events to log directory so that users on other systems
|
171
|
+
can see log updates in realtime (defaults to no syncing). Specify `True`
|
172
|
+
to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
|
164
173
|
score: Score output (defaults to True)
|
165
174
|
score_display: Show scoring metrics in realtime (defaults to True)
|
166
175
|
**kwargs: Model generation options.
|
@@ -210,6 +219,7 @@ def eval(
|
|
210
219
|
log_samples=log_samples,
|
211
220
|
log_images=log_images,
|
212
221
|
log_buffer=log_buffer,
|
222
|
+
log_shared=log_shared,
|
213
223
|
score=score,
|
214
224
|
score_display=score_display,
|
215
225
|
**kwargs,
|
@@ -260,6 +270,7 @@ async def eval_async(
|
|
260
270
|
log_samples: bool | None = None,
|
261
271
|
log_images: bool | None = None,
|
262
272
|
log_buffer: int | None = None,
|
273
|
+
log_shared: bool | int | None = None,
|
263
274
|
score: bool = True,
|
264
275
|
score_display: bool | None = None,
|
265
276
|
**kwargs: Unpack[GenerateConfigArgs],
|
@@ -312,6 +323,7 @@ async def eval_async(
|
|
312
323
|
log_buffer: Number of samples to buffer before writing log file.
|
313
324
|
If not specified, an appropriate default for the format and filesystem is
|
314
325
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
326
|
+
log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
|
315
327
|
score: Score output (defaults to True)
|
316
328
|
score_display: Show scoring metrics in realtime (defaults to True)
|
317
329
|
**kwargs: Model generation options.
|
@@ -390,6 +402,15 @@ async def eval_async(
|
|
390
402
|
f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
|
391
403
|
)
|
392
404
|
|
405
|
+
# resolve log_shared
|
406
|
+
log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
|
407
|
+
|
408
|
+
# validate that --log-shared can't be used with 'json' format
|
409
|
+
if log_shared and log_format == JSON_LOG_FORMAT:
|
410
|
+
raise PrerequisiteError(
|
411
|
+
"ERROR: --log-shared is not compatible with the json log format."
|
412
|
+
)
|
413
|
+
|
393
414
|
# resolve solver
|
394
415
|
solver = chain(solver) if isinstance(solver, list) else solver
|
395
416
|
|
@@ -426,6 +447,7 @@ async def eval_async(
|
|
426
447
|
log_samples=log_samples,
|
427
448
|
log_images=log_images,
|
428
449
|
log_buffer=log_buffer,
|
450
|
+
log_shared=log_shared,
|
429
451
|
score_display=score_display,
|
430
452
|
)
|
431
453
|
|
@@ -485,6 +507,9 @@ async def eval_async(
|
|
485
507
|
)
|
486
508
|
logs = EvalLogs(results)
|
487
509
|
|
510
|
+
# cleanup sample buffers if required
|
511
|
+
cleanup_sample_buffers(log_dir)
|
512
|
+
|
488
513
|
finally:
|
489
514
|
_eval_async_running = False
|
490
515
|
|
@@ -510,6 +535,7 @@ def eval_retry(
|
|
510
535
|
log_samples: bool | None = None,
|
511
536
|
log_images: bool | None = None,
|
512
537
|
log_buffer: int | None = None,
|
538
|
+
log_shared: bool | int | None = None,
|
513
539
|
score: bool = True,
|
514
540
|
score_display: bool | None = None,
|
515
541
|
max_retries: int | None = None,
|
@@ -551,6 +577,9 @@ def eval_retry(
|
|
551
577
|
log_buffer: Number of samples to buffer before writing log file.
|
552
578
|
If not specified, an appropriate default for the format and filesystem is
|
553
579
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
580
|
+
log_shared: Sync sample events to log directory so that users on other systems
|
581
|
+
can see log updates in realtime (defaults to no syncing). Specify `True`
|
582
|
+
to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
|
554
583
|
score: Score output (defaults to True)
|
555
584
|
score_display: Show scoring metrics in realtime (defaults to True)
|
556
585
|
max_retries:
|
@@ -586,6 +615,7 @@ def eval_retry(
|
|
586
615
|
log_samples=log_samples,
|
587
616
|
log_images=log_images,
|
588
617
|
log_buffer=log_buffer,
|
618
|
+
log_shared=log_shared,
|
589
619
|
score=score,
|
590
620
|
score_display=score_display,
|
591
621
|
max_retries=max_retries,
|
@@ -612,6 +642,7 @@ async def eval_retry_async(
|
|
612
642
|
log_samples: bool | None = None,
|
613
643
|
log_images: bool | None = None,
|
614
644
|
log_buffer: int | None = None,
|
645
|
+
log_shared: bool | int | None = None,
|
615
646
|
score: bool = True,
|
616
647
|
score_display: bool | None = None,
|
617
648
|
max_retries: int | None = None,
|
@@ -651,6 +682,8 @@ async def eval_retry_async(
|
|
651
682
|
log_buffer: (int | None): Number of samples to buffer before writing log file.
|
652
683
|
If not specified, an appropriate default for the format and filesystem is
|
653
684
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
685
|
+
log_shared: Indicate that the log directory is shared, which results in
|
686
|
+
additional syncing of realtime log data for Inspect View.
|
654
687
|
score (bool): Score output (defaults to True)
|
655
688
|
score_display (bool | None): Show scoring metrics in realtime (defaults to True)
|
656
689
|
max_retries (int | None):
|
@@ -750,6 +783,9 @@ async def eval_retry_async(
|
|
750
783
|
log_buffer = (
|
751
784
|
log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
|
752
785
|
)
|
786
|
+
log_shared = (
|
787
|
+
log_shared if log_shared is not None else eval_log.eval.config.log_shared
|
788
|
+
)
|
753
789
|
score_display = (
|
754
790
|
score_display
|
755
791
|
if score_display is not None
|
@@ -796,6 +832,7 @@ async def eval_retry_async(
|
|
796
832
|
log_samples=log_samples,
|
797
833
|
log_images=log_images,
|
798
834
|
log_buffer=log_buffer,
|
835
|
+
log_shared=log_shared,
|
799
836
|
score=score,
|
800
837
|
score_display=score_display,
|
801
838
|
**dict(config),
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -92,6 +92,7 @@ def eval_set(
|
|
92
92
|
log_samples: bool | None = None,
|
93
93
|
log_images: bool | None = None,
|
94
94
|
log_buffer: int | None = None,
|
95
|
+
log_shared: bool | int | None = None,
|
95
96
|
bundle_dir: str | None = None,
|
96
97
|
bundle_overwrite: bool = False,
|
97
98
|
**kwargs: Unpack[GenerateConfigArgs],
|
@@ -171,6 +172,9 @@ def eval_set(
|
|
171
172
|
log_buffer: Number of samples to buffer before writing log file.
|
172
173
|
If not specified, an appropriate default for the format and filesystem is
|
173
174
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
175
|
+
log_shared: Sync sample events to log directory so that users on other systems
|
176
|
+
can see log updates in realtime (defaults to no syncing). Specify `True`
|
177
|
+
to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
|
174
178
|
bundle_dir: If specified, the log viewer and logs generated
|
175
179
|
by this eval set will be bundled into this directory.
|
176
180
|
bundle_overwrite: Whether to overwrite files in the bundle_dir.
|
@@ -219,6 +223,7 @@ def eval_set(
|
|
219
223
|
log_samples=log_samples,
|
220
224
|
log_images=log_images,
|
221
225
|
log_buffer=log_buffer,
|
226
|
+
log_shared=log_shared,
|
222
227
|
score=score,
|
223
228
|
**kwargs,
|
224
229
|
)
|
inspect_ai/_eval/run.py
CHANGED
@@ -407,12 +407,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
|
|
407
407
|
# Use anyio task group instead of manual task management
|
408
408
|
try:
|
409
409
|
async with anyio.create_task_group() as tg:
|
410
|
+
# compute number of workers (never more than total_tasks)
|
411
|
+
num_workers = min(parallel, total_tasks)
|
412
|
+
|
410
413
|
# start worker tasks
|
411
|
-
for _ in range(
|
414
|
+
for _ in range(num_workers):
|
412
415
|
tg.start_soon(worker)
|
413
416
|
|
414
417
|
# enqueue initial set of tasks
|
415
|
-
for _ in range(
|
418
|
+
for _ in range(num_workers):
|
416
419
|
await enque_next_task()
|
417
420
|
except anyio.get_cancelled_exc_class():
|
418
421
|
pass
|
inspect_ai/_eval/task/log.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Iterator, Literal, cast
|
|
4
4
|
|
5
5
|
from shortuuid import uuid
|
6
6
|
|
7
|
+
from inspect_ai._display.core.display import TaskDisplayMetric
|
7
8
|
from inspect_ai._eval.task.util import slice_dataset
|
8
9
|
from inspect_ai._util.constants import PKG_NAME
|
9
10
|
from inspect_ai._util.datetime import iso_now
|
@@ -34,6 +35,9 @@ from inspect_ai.log._log import (
|
|
34
35
|
eval_config_defaults,
|
35
36
|
)
|
36
37
|
from inspect_ai.log._recorders import Recorder
|
38
|
+
from inspect_ai.log._recorders.buffer import SampleBufferDatabase
|
39
|
+
from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
|
40
|
+
from inspect_ai.log._transcript import Event
|
37
41
|
from inspect_ai.model import (
|
38
42
|
GenerateConfig,
|
39
43
|
Model,
|
@@ -159,10 +163,15 @@ class TaskLogger:
|
|
159
163
|
|
160
164
|
# size of flush buffer (how many samples we buffer before hitting storage)
|
161
165
|
self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
|
162
|
-
self.flush_pending =
|
166
|
+
self.flush_pending: list[tuple[str | int, int]] = []
|
163
167
|
|
164
168
|
async def init(self) -> None:
|
165
169
|
self._location = await self.recorder.log_init(self.eval)
|
170
|
+
self._buffer_db = SampleBufferDatabase(
|
171
|
+
location=self._location,
|
172
|
+
log_images=self.eval.config.log_images is not False,
|
173
|
+
log_shared=self.eval.config.log_shared,
|
174
|
+
)
|
166
175
|
|
167
176
|
@property
|
168
177
|
def location(self) -> str:
|
@@ -174,22 +183,53 @@ class TaskLogger:
|
|
174
183
|
|
175
184
|
async def log_start(self, plan: EvalPlan) -> None:
|
176
185
|
await self.recorder.log_start(self.eval, plan)
|
186
|
+
await self.recorder.flush(self.eval)
|
187
|
+
|
188
|
+
async def start_sample(self, sample: SampleSummary) -> None:
|
189
|
+
self._buffer_db.start_sample(sample)
|
190
|
+
|
191
|
+
def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
|
192
|
+
# log the sample event
|
193
|
+
self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
|
177
194
|
|
178
|
-
async def
|
195
|
+
async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
|
179
196
|
# log the sample
|
180
197
|
await self.recorder.log_sample(self.eval, sample)
|
181
198
|
|
199
|
+
# mark complete
|
200
|
+
self._buffer_db.complete_sample(
|
201
|
+
SampleSummary(
|
202
|
+
id=sample.id,
|
203
|
+
epoch=sample.epoch,
|
204
|
+
input=sample.input,
|
205
|
+
target=sample.target,
|
206
|
+
completed=True,
|
207
|
+
scores=sample.scores,
|
208
|
+
error=sample.error.message if sample.error is not None else None,
|
209
|
+
limit=f"{sample.limit.type}" if sample.limit is not None else None,
|
210
|
+
)
|
211
|
+
)
|
212
|
+
|
182
213
|
# flush if requested
|
183
214
|
if flush:
|
184
|
-
self.flush_pending
|
185
|
-
if self.flush_pending >= self.flush_buffer:
|
215
|
+
self.flush_pending.append((sample.id, sample.epoch))
|
216
|
+
if len(self.flush_pending) >= self.flush_buffer:
|
217
|
+
# flush to disk
|
186
218
|
await self.recorder.flush(self.eval)
|
187
|
-
|
219
|
+
|
220
|
+
# notify the event db it can remove these
|
221
|
+
self._buffer_db.remove_samples(self.flush_pending)
|
222
|
+
|
223
|
+
# Clear
|
224
|
+
self.flush_pending.clear()
|
188
225
|
|
189
226
|
# track successful samples logged
|
190
227
|
if sample.error is None:
|
191
228
|
self._samples_completed += 1
|
192
229
|
|
230
|
+
def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
|
231
|
+
self._buffer_db.update_metrics(metrics)
|
232
|
+
|
193
233
|
async def log_finish(
|
194
234
|
self,
|
195
235
|
status: Literal["success", "cancelled", "error"],
|
@@ -198,10 +238,17 @@ class TaskLogger:
|
|
198
238
|
reductions: list[EvalSampleReductions] | None = None,
|
199
239
|
error: EvalError | None = None,
|
200
240
|
) -> EvalLog:
|
201
|
-
|
241
|
+
# finish and get log
|
242
|
+
log = await self.recorder.log_finish(
|
202
243
|
self.eval, status, stats, results, reductions, error
|
203
244
|
)
|
204
245
|
|
246
|
+
# cleanup the events db
|
247
|
+
self._buffer_db.cleanup()
|
248
|
+
|
249
|
+
# return log
|
250
|
+
return log
|
251
|
+
|
205
252
|
|
206
253
|
async def log_start(
|
207
254
|
logger: TaskLogger,
|