PyPI - inspect-ai - Versions diffs - 0.3.81__py3-none-any.whl → 0.3.82__py3-none-any.whl - Mend

inspect-ai 0.3.81__py3-none-any.whl → 0.3.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

inspect_ai/_cli/eval.py +35 -2
inspect_ai/_cli/util.py +44 -1
inspect_ai/_display/core/config.py +1 -1
inspect_ai/_display/core/display.py +13 -4
inspect_ai/_display/core/results.py +1 -1
inspect_ai/_display/textual/widgets/task_detail.py +5 -4
inspect_ai/_eval/eval.py +38 -1
inspect_ai/_eval/evalset.py +5 -0
inspect_ai/_eval/run.py +5 -2
inspect_ai/_eval/task/log.py +53 -6
inspect_ai/_eval/task/run.py +51 -10
inspect_ai/_util/constants.py +2 -0
inspect_ai/_util/file.py +17 -1
inspect_ai/_util/json.py +36 -1
inspect_ai/_view/server.py +113 -1
inspect_ai/_view/www/App.css +1 -1
inspect_ai/_view/www/dist/assets/index.css +518 -296
inspect_ai/_view/www/dist/assets/index.js +38803 -36307
inspect_ai/_view/www/eslint.config.mjs +1 -1
inspect_ai/_view/www/log-schema.json +13 -0
inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
inspect_ai/_view/www/package.json +8 -2
inspect_ai/_view/www/src/App.tsx +151 -855
inspect_ai/_view/www/src/api/api-browser.ts +176 -5
inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
inspect_ai/_view/www/src/api/client-api.ts +66 -10
inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
inspect_ai/_view/www/src/api/types.ts +107 -2
inspect_ai/_view/www/src/appearance/icons.ts +1 -0
inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -3
inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
inspect_ai/_view/www/src/index.tsx +26 -94
inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +67 -28
inspect_ai/_view/www/src/samples/SampleDialog.tsx +51 -22
inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
inspect_ai/_view/www/src/samples/SampleDisplay.tsx +144 -90
inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +82 -35
inspect_ai/_view/www/src/samples/SamplesTools.tsx +23 -30
inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +4 -1
inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +3 -0
inspect_ai/_view/www/src/samples/chat/messages.ts +34 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +10 -1
inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +25 -17
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +21 -3
inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +20 -1
inspect_ai/_view/www/src/samples/list/SampleList.tsx +105 -85
inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
inspect_ai/_view/www/src/samples/list/SampleRow.tsx +27 -14
inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +7 -9
inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +7 -11
inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +8 -13
inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +52 -58
inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +30 -1
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
inspect_ai/_view/www/src/scoring/utils.ts +87 -0
inspect_ai/_view/www/src/state/appSlice.ts +244 -0
inspect_ai/_view/www/src/state/hooks.ts +397 -0
inspect_ai/_view/www/src/state/logPolling.ts +196 -0
inspect_ai/_view/www/src/state/logSlice.ts +214 -0
inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
inspect_ai/_view/www/src/state/samplePolling.ts +311 -0
inspect_ai/_view/www/src/state/sampleSlice.ts +127 -0
inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
inspect_ai/_view/www/src/state/scrolling.ts +206 -0
inspect_ai/_view/www/src/state/store.ts +168 -0
inspect_ai/_view/www/src/state/store_filter.ts +84 -0
inspect_ai/_view/www/src/state/utils.ts +23 -0
inspect_ai/_view/www/src/storage/index.ts +26 -0
inspect_ai/_view/www/src/types/log.d.ts +2 -0
inspect_ai/_view/www/src/types.ts +94 -32
inspect_ai/_view/www/src/utils/attachments.ts +58 -23
inspect_ai/_view/www/src/utils/logger.ts +52 -0
inspect_ai/_view/www/src/utils/polling.ts +100 -0
inspect_ai/_view/www/src/utils/react.ts +30 -0
inspect_ai/_view/www/src/utils/vscode.ts +1 -1
inspect_ai/_view/www/src/workspace/WorkSpace.tsx +181 -216
inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +0 -1
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +98 -39
inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +11 -13
inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +110 -115
inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
inspect_ai/_view/www/src/workspace/types.ts +4 -3
inspect_ai/_view/www/src/workspace/utils.ts +4 -4
inspect_ai/_view/www/vite.config.js +6 -0
inspect_ai/_view/www/yarn.lock +370 -354
inspect_ai/log/_condense.py +26 -0
inspect_ai/log/_log.py +6 -3
inspect_ai/log/_recorders/buffer/__init__.py +14 -0
inspect_ai/log/_recorders/buffer/buffer.py +30 -0
inspect_ai/log/_recorders/buffer/database.py +685 -0
inspect_ai/log/_recorders/buffer/filestore.py +259 -0
inspect_ai/log/_recorders/buffer/types.py +84 -0
inspect_ai/log/_recorders/eval.py +2 -11
inspect_ai/log/_recorders/types.py +30 -0
inspect_ai/log/_transcript.py +27 -1
inspect_ai/model/_call_tools.py +1 -0
inspect_ai/model/_generate_config.py +2 -2
inspect_ai/model/_model.py +1 -0
inspect_ai/tool/_tool_support_helpers.py +4 -4
inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
inspect_ai/util/_subtask.py +1 -0
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/METADATA +1 -1
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/RECORD +178 -138
inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.81.dist-info → inspect_ai-0.3.82.dist-info}/top_level.txt +0 -0

inspect_ai/_cli/eval.py CHANGED Viewed

@@ -10,6 +10,7 @@ from inspect_ai._util.constants import (
     ALL_LOG_LEVELS,
     DEFAULT_EPOCHS,
     DEFAULT_LOG_LEVEL_TRANSCRIPT,
+    DEFAULT_LOG_SHARED,
     DEFAULT_MAX_CONNECTIONS,
 )
 from inspect_ai._util.file import filesystem
@@ -25,7 +26,12 @@ from .common import (
     common_options,
     process_common_options,
 )
-from .util import parse_cli_args, parse_cli_config, parse_sandbox
+from .util import (
+    int_or_bool_flag_callback,
+    parse_cli_args,
+    parse_cli_config,
+    parse_sandbox,
+)
 MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
 MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
@@ -41,6 +47,7 @@ LOG_IMAGES_HELP = (
     "Include base64 encoded versions of filename or URL based images in the log file."
 )
 LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not specified, an appropriate default for the format and filesystem is chosen (10 for most all cases, 100 for JSON logs on remote filesystems)."
+LOG_SHARED_HELP = "Sync sample events to log directory so that users on other systems can see log updates in realtime (defaults to no syncing). If enabled will sync every 10 seconds (or pass a value to sync every `n` seconds)."
 NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
@@ -266,6 +273,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
     )
+    @click.option(
+        "--log-shared",
+        is_flag=False,
+        flag_value="true",
+        default=None,
+        callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
+        help=LOG_SHARED_HELP,
+        envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
+    )
     @click.option(
         "--no-score",
         type=bool,
@@ -396,7 +412,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--reasoning-effort",
         type=click.Choice(["low", "medium", "high"]),
-        help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
+        help="Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o-series models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
     @click.option(
@@ -503,6 +519,7 @@ def eval_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -556,6 +573,7 @@ def eval_command(
         no_log_samples=no_log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         no_score=no_score,
         no_score_display=no_score_display,
         is_eval_set=False,
@@ -670,6 +688,7 @@ def eval_set_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     bundle_dir: str | None,
@@ -728,6 +747,7 @@ def eval_set_command(
         no_log_samples=no_log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         no_score=no_score,
         no_score_display=no_score_display,
         is_eval_set=True,
@@ -783,6 +803,7 @@ def eval_exec(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     is_eval_set: bool = False,
@@ -865,6 +886,7 @@ def eval_exec(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score=score,
             score_display=score_display,
         )
@@ -1004,6 +1026,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
 @click.option(
     "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
 )
+@click.option(
+    "--log-shared",
+    is_flag=False,
+    flag_value="true",
+    default=None,
+    callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
+    help=LOG_SHARED_HELP,
+    envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
+)
 @click.option(
     "--no-score",
     type=bool,
@@ -1052,6 +1083,7 @@ def eval_retry_command(
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
+    log_shared: int | None,
     no_score: bool | None,
     no_score_display: bool | None,
     max_connections: int | None,
@@ -1099,6 +1131,7 @@ def eval_retry_command(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,

inspect_ai/_cli/util.py CHANGED Viewed

@@ -1,11 +1,54 @@
-from typing import Any
+from typing import Any, Callable
+import click
 import yaml
 from inspect_ai._util.config import resolve_args
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+def int_or_bool_flag_callback(
+    true_value: int, false_value: int = 0
+) -> Callable[[click.Context, click.Parameter, Any], int]:
+    def callback(ctx: click.Context, param: click.Parameter, value: Any) -> int:
+        """Callback to parse the an option that can either be a boolean flag or integer.
+        Desired behavior:
+        - Not specified at all -> false_value
+        - Specified with no value -> true_value
+        - Specified with "true"/"false" -> true_value or false_value respectively
+        - Specified with an integer -> that integer
+        """
+        # 1. If this parameter was never given on the command line,
+        #    then we return 0.
+        source = ctx.get_parameter_source(param.name) if param.name else ""
+        if source == click.core.ParameterSource.DEFAULT:
+            # Means the user did NOT specify the flag at all
+            return false_value
+        # 2. The user did specify the flag. If value is None,
+        #    that means they used the flag with no argument, e.g. --my-flag
+        if value is None:
+            return true_value
+        # 3. If there is a value, try to parse booleans or an integer.
+        lower_val = value.lower()
+        if lower_val in ("true", "yes", "1"):
+            return true_value
+        elif lower_val in ("false", "no", "0"):
+            return false_value
+        else:
+            # 4. Otherwise, assume it is an integer
+            try:
+                return int(value)
+            except ValueError:
+                raise click.BadParameter(
+                    f"Expected 'true', 'false', or an integer for --{param.name}. Got: {value}"
+                )
+    return callback
 def parse_cli_config(
     args: tuple[str] | list[str] | None, config: str | None
 ) -> dict[str, Any]:

inspect_ai/_display/core/config.py CHANGED Viewed

@@ -36,7 +36,7 @@ def task_config(
             value = value if isinstance(value, list) else [value]
             value = [str(v) for v in value]
             config_print.append(f"{name}: {','.join(value)}")
-        elif name not in ["limit", "model", "response_schema"]:
+        elif name not in ["limit", "model", "response_schema", "log_shared"]:
             if isinstance(value, list):
                 value = ",".join([str(v) for v in value])
             if isinstance(value, str):

inspect_ai/_display/core/display.py CHANGED Viewed

@@ -15,6 +15,7 @@ from typing import (
 )
 import rich
+from pydantic import BaseModel, Field, field_validator
 from rich.console import Console
 from inspect_ai.log import EvalConfig, EvalResults, EvalStats
@@ -104,12 +105,20 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
         raise NotImplementedError("input_panel not implemented by current display")
-@dataclass
-class TaskDisplayMetric:
+class TaskDisplayMetric(BaseModel):
     scorer: str
     name: str
-    value: float | int
-    reducer: str | None
+    value: float | int | None = Field(default=None)
+    reducer: str | None = Field(default=None)
+    @field_validator("value", mode="before")
+    @classmethod
+    def handle_null_value(cls, v: Any) -> Union[float, int, None]:
+        if v is None:
+            return None
+        if isinstance(v, float | int):
+            return v
+        raise ValueError(f"Expected float, int, or None, got {type(v)}")
 @runtime_checkable

inspect_ai/_display/core/results.py CHANGED Viewed

@@ -180,7 +180,7 @@ def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> s
     )
     metric = metrics[0]
-    if np.isnan(metric.value):
+    if metric.value is None or np.isnan(metric.value):
         value = " n/a"
     else:
         value = f"{metric.value:.2f}"

inspect_ai/_display/textual/widgets/task_detail.py CHANGED Viewed

@@ -14,7 +14,7 @@ from inspect_ai._display.core.display import TaskDisplayMetric
 @dataclass
 class TaskMetric:
     name: str
-    value: float
+    value: float | int | None
 class TaskDetail(Widget):
@@ -233,9 +233,10 @@ class TaskMetrics(Widget):
         for metric in self.metrics:
             # Add the value static but keep it around
             # for future updates
-            self.value_widgets[metric.name] = Static(
-                self._metric_value(metric.value), markup=False
-            )
+            if metric.value is not None:
+                self.value_widgets[metric.name] = Static(
+                    self._metric_value(metric.value), markup=False
+                )
             grid.mount(Static(metric.name, markup=False))
             grid.mount(self.value_widgets[metric.name])

inspect_ai/_eval/eval.py CHANGED Viewed

@@ -15,7 +15,11 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_args
 from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
-from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
+from inspect_ai._util.constants import (
+    DEFAULT_LOG_FORMAT,
+    DEFAULT_LOG_SHARED,
+    JSON_LOG_FORMAT,
+)
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
 from inspect_ai._util.logger import warn_once
@@ -31,6 +35,7 @@ from inspect_ai.approval._policy import (
 from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
 from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
+from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
 from inspect_ai.model import (
     GenerateConfig,
     GenerateConfigArgs,
@@ -92,6 +97,7 @@ def eval(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -161,6 +167,9 @@ def eval(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -210,6 +219,7 @@ def eval(
                 log_samples=log_samples,
                 log_images=log_images,
                 log_buffer=log_buffer,
+                log_shared=log_shared,
                 score=score,
                 score_display=score_display,
                 **kwargs,
@@ -260,6 +270,7 @@ async def eval_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -312,6 +323,7 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -390,6 +402,15 @@ async def eval_async(
                 f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
             )
+        # resolve log_shared
+        log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
+        # validate that --log-shared can't use used with 'json' format
+        if log_shared and log_format == JSON_LOG_FORMAT:
+            raise PrerequisiteError(
+                "ERROR: --log-shared is not compatible with the json log format."
+            )
         # resolve solver
         solver = chain(solver) if isinstance(solver, list) else solver
@@ -426,6 +447,7 @@ async def eval_async(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score_display=score_display,
         )
@@ -485,6 +507,9 @@ async def eval_async(
             )
             logs = EvalLogs(results)
+        # cleanup sample buffers if required
+        cleanup_sample_buffers(log_dir)
     finally:
         _eval_async_running = False
@@ -510,6 +535,7 @@ def eval_retry(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -551,6 +577,9 @@ def eval_retry(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         max_retries:
@@ -586,6 +615,7 @@ def eval_retry(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score=score,
             score_display=score_display,
             max_retries=max_retries,
@@ -612,6 +642,7 @@ async def eval_retry_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -651,6 +682,8 @@ async def eval_retry_async(
         log_buffer: (int | None): Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in
+            additional syncing of realtime log data for Inspect View.
         score (bool): Score output (defaults to True)
         score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
@@ -750,6 +783,9 @@ async def eval_retry_async(
         log_buffer = (
             log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
         )
+        log_shared = (
+            log_shared if log_shared is not None else eval_log.eval.config.log_shared
+        )
         score_display = (
             score_display
             if score_display is not None
@@ -796,6 +832,7 @@ async def eval_retry_async(
                 log_samples=log_samples,
                 log_images=log_images,
                 log_buffer=log_buffer,
+                log_shared=log_shared,
                 score=score,
                 score_display=score_display,
                 **dict(config),

inspect_ai/_eval/evalset.py CHANGED Viewed

@@ -92,6 +92,7 @@ def eval_set(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     bundle_dir: str | None = None,
     bundle_overwrite: bool = False,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -171,6 +172,9 @@ def eval_set(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+            can see log updates in realtime (defaults to no syncing). Specify `True`
+            to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         bundle_dir: If specified, the log viewer and logs generated
             by this eval set will be bundled into this directory.
         bundle_overwrite: Whether to overwrite files in the bundle_dir.
@@ -219,6 +223,7 @@ def eval_set(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            log_shared=log_shared,
             score=score,
             **kwargs,
         )

inspect_ai/_eval/run.py CHANGED Viewed

@@ -407,12 +407,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
         # Use anyio task group instead of manual task management
         try:
             async with anyio.create_task_group() as tg:
+                # computer number of workers (never more than total_tasks)
+                num_workers = min(parallel, total_tasks)
                 # start worker tasks
-                for _ in range(parallel):
+                for _ in range(num_workers):
                     tg.start_soon(worker)
                 # enqueue initial set of tasks
-                for _ in range(min(parallel, total_tasks)):
+                for _ in range(num_workers):
                     await enque_next_task()
         except anyio.get_cancelled_exc_class():
             pass

inspect_ai/_eval/task/log.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Any, Iterator, Literal, cast
 from shortuuid import uuid
+from inspect_ai._display.core.display import TaskDisplayMetric
 from inspect_ai._eval.task.util import slice_dataset
 from inspect_ai._util.constants import PKG_NAME
 from inspect_ai._util.datetime import iso_now
@@ -34,6 +35,9 @@ from inspect_ai.log._log import (
     eval_config_defaults,
 )
 from inspect_ai.log._recorders import Recorder
+from inspect_ai.log._recorders.buffer import SampleBufferDatabase
+from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
+from inspect_ai.log._transcript import Event
 from inspect_ai.model import (
     GenerateConfig,
     Model,
@@ -159,10 +163,15 @@ class TaskLogger:
         # size of flush buffer (how many samples we buffer before hitting storage)
         self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
-        self.flush_pending = 0
+        self.flush_pending: list[tuple[str | int, int]] = []
     async def init(self) -> None:
         self._location = await self.recorder.log_init(self.eval)
+        self._buffer_db = SampleBufferDatabase(
+            location=self._location,
+            log_images=self.eval.config.log_images is not False,
+            log_shared=self.eval.config.log_shared,
+        )
     @property
     def location(self) -> str:
@@ -174,22 +183,53 @@ class TaskLogger:
     async def log_start(self, plan: EvalPlan) -> None:
         await self.recorder.log_start(self.eval, plan)
+        await self.recorder.flush(self.eval)
+    async def start_sample(self, sample: SampleSummary) -> None:
+        self._buffer_db.start_sample(sample)
+    def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
+        # log the sample event
+        self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
-    async def log_sample(self, sample: EvalSample, *, flush: bool) -> None:
+    async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
         # log the sample
         await self.recorder.log_sample(self.eval, sample)
+        # mark complete
+        self._buffer_db.complete_sample(
+            SampleSummary(
+                id=sample.id,
+                epoch=sample.epoch,
+                input=sample.input,
+                target=sample.target,
+                completed=True,
+                scores=sample.scores,
+                error=sample.error.message if sample.error is not None else None,
+                limit=f"{sample.limit.type}" if sample.limit is not None else None,
+            )
+        )
         # flush if requested
         if flush:
-            self.flush_pending += 1
-            if self.flush_pending >= self.flush_buffer:
+            self.flush_pending.append((sample.id, sample.epoch))
+            if len(self.flush_pending) >= self.flush_buffer:
+                # flush to disk
                 await self.recorder.flush(self.eval)
-                self.flush_pending = 0
+                # notify the event db it can remove these
+                self._buffer_db.remove_samples(self.flush_pending)
+                # Clear
+                self.flush_pending.clear()
         # track sucessful samples logged
         if sample.error is None:
             self._samples_completed += 1
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        self._buffer_db.update_metrics(metrics)
     async def log_finish(
         self,
         status: Literal["success", "cancelled", "error"],
@@ -198,10 +238,17 @@ class TaskLogger:
         reductions: list[EvalSampleReductions] | None = None,
         error: EvalError | None = None,
     ) -> EvalLog:
-        return await self.recorder.log_finish(
+        # finish and get log
+        log = await self.recorder.log_finish(
             self.eval, status, stats, results, reductions, error
         )
+        # cleanup the events db
+        self._buffer_db.cleanup()
+        # return log
+        return log
 async def log_start(
     logger: TaskLogger,

inspect-ai 0.3.81__py3-none-any.whl → 0.3.82__py3-none-any.whl

inspect-ai 0.3.81__py3-none-any.whl → 0.3.82__py3-none-any.whl