inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -43,6 +43,9 @@ MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in paral
  NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
  FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
  NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
+ NO_LOG_REALTIME_HELP = (
+ "Do not log events in realtime (affects live viewing of samples in inspect view)"
+ )
  NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
  RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
  LOG_IMAGES_HELP = (
@@ -281,6 +284,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  help=NO_LOG_SAMPLES_HELP,
  envvar="INSPECT_EVAL_NO_LOG_SAMPLES",
  )
+ @click.option(
+ "--no-log-realtime",
+ type=bool,
+ is_flag=True,
+ help=NO_LOG_REALTIME_HELP,
+ envvar="INSPECT_EVAL_NO_LOG_REALTIME",
+ )
  @click.option(
  "--log-images/--no-log-images",
  type=bool,
@@ -544,6 +554,7 @@ def eval_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -600,6 +611,7 @@ def eval_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  no_log_samples=no_log_samples,
+ no_log_realtime=no_log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -718,6 +730,7 @@ def eval_set_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -779,6 +792,7 @@ def eval_set_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  no_log_samples=no_log_samples,
+ no_log_realtime=no_log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -837,6 +851,7 @@ def eval_exec(
  retry_on_error: int | None,
  debug_errors: bool | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -889,6 +904,7 @@ def eval_exec(
  # resolve negating options
  sandbox_cleanup = False if no_sandbox_cleanup else None
  log_samples = False if no_log_samples else None
+ log_realtime = False if no_log_realtime else None
  log_images = False if log_images is False else None
  trace = True if trace else None
  score = False if no_score else True
@@ -929,6 +945,7 @@ def eval_exec(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -1069,6 +1086,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
  help=NO_LOG_SAMPLES_HELP,
  envvar="INSPECT_EVAL_LOG_SAMPLES",
  )
+ @click.option(
+ "--no-log-realtime",
+ type=bool,
+ is_flag=True,
+ help=NO_LOG_REALTIME_HELP,
+ envvar="INSPECT_EVAL_LOG_REALTIME",
+ )
  @click.option(
  "--log-images/--no-log-images",
  type=bool,
@@ -1136,6 +1160,7 @@ def eval_retry_command(
  no_fail_on_error: bool | None,
  retry_on_error: int | None,
  no_log_samples: bool | None,
+ no_log_realtime: bool | None,
  log_images: bool | None,
  log_buffer: int | None,
  log_shared: int | None,
@@ -1154,6 +1179,7 @@ def eval_retry_command(
  # resolve negating options
  sandbox_cleanup = False if no_sandbox_cleanup else None
  log_samples = False if no_log_samples else None
+ log_realtime = False if no_log_realtime else None
  log_images = False if log_images is False else None
  score = False if no_score else True
  score_display = False if no_score_display else None
@@ -1189,6 +1215,7 @@ def eval_retry_command(
  retry_on_error=retry_on_error,
  debug_errors=common["debug_errors"],
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
  )
  if isinstance(last_event, ModelEvent):
  # see if there are retries in play
- if sample.retry_count > 0:
- suffix = "retry" if sample.retry_count == 1 else "retries"
+ if last_event.retries:
+ suffix = "retry" if last_event.retries == 1 else "retries"
  pending_caption_text = (
- f"Generating ({sample.retry_count:,} {suffix})..."
+ f"Generating ({last_event.retries:,} {suffix})..."
  )
  else:
  pending_caption_text = "Generating..."
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
  SampleInitEvent,
  SampleLimitEvent,
  ScoreEvent,
- StepEvent,
+ SpanBeginEvent,
  SubtaskEvent,
  ToolEvent,
  )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
  # render the call
  content = transcript_tool_call(event)

- # render sub-events
- if event.events:
- content.extend(render_sub_events(event.events))
-
  # render the output
  if isinstance(event.result, list):
  result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
  return [EventDisplay("tool call", Group(*content))]


- def render_step_event(event: StepEvent) -> EventDisplay:
- if event.type == "solver":
- return render_solver_event(event)
- if event.type == "scorer":
- return render_scorer_event(event)
- else:
- return EventDisplay(step_title(event))
-
-
- def render_solver_event(event: StepEvent) -> EventDisplay:
- return EventDisplay(step_title(event))
-
-
- def render_scorer_event(event: StepEvent) -> EventDisplay:
- return EventDisplay(step_title(event))
-
-
  def render_score_event(event: ScoreEvent) -> EventDisplay:
  table = Table(box=None, show_header=False)
  table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
  # render header
  content: list[RenderableType] = [transcript_function(event.name, event.input)]

- # render sub-events
- if event.events:
- content.extend(render_sub_events(event.events))
-
  if event.result:
  content.append(Text())
  if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
  return content


- def step_title(event: StepEvent) -> str:
- return f"{event.type or 'step'}: {event.name}"
+ def span_title(event: SpanBeginEvent) -> str:
+ return f"{event.type or 'span'}: {event.name}"


  EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
  _renderers: list[tuple[Type[Event], EventRenderer]] = [
  (SampleInitEvent, render_sample_init_event),
  (SampleLimitEvent, render_sample_limit_event),
- (StepEvent, render_step_event),
  (ModelEvent, render_model_event),
  (ToolEvent, render_tool_event),
  (SubtaskEvent, render_subtask_event),
inspect_ai/_eval/eval.py CHANGED
@@ -101,6 +101,7 @@ def eval(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -145,7 +146,7 @@ def eval(
  to "eval", the native high-performance format).
  limit: Limit evaluated samples
  (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -171,6 +172,7 @@ def eval(
  max_sandboxes: Maximum number of sandboxes (per-provider)
  to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -228,6 +230,7 @@ def eval(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -281,6 +284,7 @@ async def eval_async(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -314,7 +318,7 @@ async def eval_async(
  log_dir: Output path for logging results (defaults to file log in ./logs directory).
  log_format: Format for writing log files (defaults to "eval", the native high-performance format).
  limit: Limit evaluated samples (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -335,6 +339,7 @@ async def eval_async(
  max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
  max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
  If not specified, an appropriate default for the format and filesystem is
@@ -473,6 +478,7 @@ async def eval_async(
  max_sandboxes=max_sandboxes,
  sandbox_cleanup=sandbox_cleanup,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -562,6 +568,7 @@ def eval_retry(
  retry_on_error: int | None = None,
  debug_errors: bool | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -603,6 +610,7 @@ def eval_retry(
  debug_errors: Raise task errors (rather than logging them)
  so they can be debugged (defaults to False).
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -645,6 +653,7 @@ def eval_retry(
  retry_on_error=retry_on_error,
  debug_errors=debug_errors,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
@@ -673,6 +682,7 @@ async def eval_retry_async(
  retry_on_error: int | None = None,
  debug_errors: bool | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -707,6 +717,7 @@ async def eval_retry_async(
  debug_errors: Raise task errors (rather than logging them)
  so they can be debugged (defaults to False).
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -817,6 +828,11 @@ async def eval_retry_async(
  log_samples = (
  log_samples if log_samples is not None else eval_log.eval.config.log_samples
  )
+ log_realtime = (
+ log_realtime
+ if log_realtime is not None
+ else eval_log.eval.config.log_realtime
+ )
  log_images = (
  log_images if log_images is not None else eval_log.eval.config.log_images
  )
@@ -875,6 +891,7 @@ async def eval_retry_async(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
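
Note: the `log_realtime` option added above is surfaced both as the `--no-log-realtime` CLI flag and as a parameter on `eval()`, `eval_set()`, and `eval_retry()`. A minimal sketch of disabling it from the Python API (the task path and model below are illustrative placeholders, not part of this diff):

    from inspect_ai import eval

    # skip realtime event buffering; samples are still written to the log file,
    # but live viewing of in-progress samples in `inspect view` is unavailable
    logs = eval(
        "examples/popularity.py",  # illustrative task
        model="openai/gpt-4o",     # illustrative model
        log_realtime=False,
    )
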
inspect_ai/_eval/evalset.py CHANGED
@@ -93,6 +93,7 @@ def eval_set(
  max_subprocesses: int | None = None,
  max_sandboxes: int | None = None,
  log_samples: bool | None = None,
+ log_realtime: bool | None = None,
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
@@ -147,7 +148,7 @@ def eval_set(
  log files (defaults to "eval", the native high-performance format).
  limit: Limit evaluated samples
  (defaults to all samples).
- sample_id: Evaluate specific sample(s) from the dataset.
+ sample_id: Evaluate specific sample(s) from the dataset. Use plain ids or preface with task names as required to disambiguate ids across tasks (e.g. `popularity:10`).
  epochs: Epochs to repeat samples for and optional score
  reducer function(s) used to combine sample scores (defaults to "mean")
  fail_on_error: `True` to fail on first sample error
@@ -173,6 +174,7 @@ def eval_set(
  max_sandboxes: Maximum number of sandboxes (per-provider)
  to run in parallel.
  log_samples: Log detailed samples and scores (defaults to True)
+ log_realtime: Log events in realtime (enables live viewing of samples in inspect view). Defaults to True.
  log_images: Log base64 encoded version of images,
  even if specified as a filename or URL (defaults to False)
  log_buffer: Number of samples to buffer before writing log file.
@@ -229,6 +231,7 @@ def eval_set(
  max_subprocesses=max_subprocesses,
  max_sandboxes=max_sandboxes,
  log_samples=log_samples,
+ log_realtime=log_realtime,
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
inspect_ai/_eval/run.py CHANGED
@@ -122,6 +122,11 @@ async def eval_run(
  task = resolved_task.task
  task_eval_config = eval_config.model_copy()

+ # sample_ids can be specified per task
+ task_eval_config.sample_id = resolve_task_sample_ids(
+ resolved_task.task.name, task_eval_config.sample_id
+ )
+
  # resolve the task scorers
  eval_scorer_specs = (
  [as_scorer_spec(scorer) for scorer in task.scorer]
@@ -424,6 +429,42 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
  return results


+ def resolve_task_sample_ids(
+ task: str, sample_id: str | int | list[str] | list[int] | list[str | int] | None
+ ) -> str | int | list[str] | list[int] | list[str | int] | None:
+ def collect_for_task(sample: str | int) -> str | int | None:
+ if isinstance(sample, str):
+ scoped = sample.split(":", maxsplit=1)
+ if len(scoped) > 1:
+ if scoped[0].lower() == task.lower():
+ return scoped[1]
+ else:
+ return None
+ else:
+ return sample
+ else:
+ return sample
+
+ if sample_id is not None:
+ if isinstance(sample_id, list):
+ ids: list[int | str] = []
+ for id in sample_id:
+ collect = collect_for_task(id)
+ if collect is not None:
+ ids.append(collect)
+ return ids
+
+ else:
+ collect = collect_for_task(sample_id)
+ if collect is not None:
+ return collect
+ else:
+ return []
+
+ else:
+ return sample_id
+
+
  async def startup_sandbox_environments(
  eval_sandbox: SandboxEnvironmentSpec | None,
  tasks: list[ResolvedTask],
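
The new `resolve_task_sample_ids()` above implements the task-scoped `sample_id` syntax documented in the `eval()` and `eval_set()` docstrings: plain ids apply to every task, while `task:id` values apply only to the named task. A hedged sketch of the calling convention (task files and ids are illustrative, not from this diff):

    from inspect_ai import eval

    # "popularity:10" is routed only to the popularity task;
    # the bare id 1 is passed through to both tasks
    logs = eval(
        tasks=["popularity.py", "security_guide.py"],  # illustrative tasks
        sample_id=["popularity:10", 1],
    )
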
inspect_ai/_eval/task/generate.py CHANGED
@@ -4,7 +4,6 @@ from inspect_ai.model import CachePolicy, GenerateConfig, Model
  from inspect_ai.model._cache import epoch
  from inspect_ai.model._call_tools import execute_tools
  from inspect_ai.solver import TaskState
- from inspect_ai.solver._limit import SampleLimitExceededError
  from inspect_ai.tool import ToolFunction


@@ -18,53 +17,48 @@ async def task_generate(
  # track tool_choice (revert to "auto" after first forced call of a tool)
  tool_choice = state.tool_choice

- try:
- while True:
- # If we don't update the epoch here as we go, it's entirely possible
- # we'd cache the same response for every single epoch, which would
- # completely defeat the point!
- epoch.set(state.epoch)
+ while True:
+ # If we don't update the epoch here as we go, it's entirely possible
+ # we'd cache the same response for every single epoch, which would
+ # completely defeat the point!
+ epoch.set(state.epoch)

- # call the model
- state.output = await model.generate(
- input=state.messages,
- tools=state.tools,
- tool_choice=tool_choice,
- config=config,
- cache=cache,
- )
-
- # append the assistant message
- message = state.output.message
- state.messages.append(message)
-
- # check for completed
- if state.completed:
- return state
+ # call the model
+ state.output = await model.generate(
+ input=state.messages,
+ tools=state.tools,
+ tool_choice=tool_choice,
+ config=config,
+ cache=cache,
+ )

- # resolve tool calls if necessary
- if tool_calls != "none" and message.tool_calls:
- # call tools and update messages and output
- messages, output = await execute_tools(
- state.messages, state.tools, config.max_tool_output
- )
- state.messages.extend(messages)
- if output is not None:
- state.output = output
+ # append the assistant message
+ message = state.output.message
+ state.messages.append(message)

- # check for completed or only executing a single tool call
- if state.completed or tool_calls == "single":
- return state
+ # check for completed
+ if state.completed:
+ return state

- # if a tool_call was forced set tool_choice to 'auto'
- # (otherwise it will get forced over and over again)
- if isinstance(tool_choice, ToolFunction):
- tool_choice = "auto"
+ # resolve tool calls if necessary
+ if tool_calls != "none" and message.tool_calls:
+ # call tools and update messages and output
+ messages, output = await execute_tools(
+ state.messages, state.tools, config.max_tool_output
+ )
+ state.messages.extend(messages)
+ if output is not None:
+ state.output = output

- # no tool calls or not resolving tool calls, we are done!
- else:
+ # check for completed or only executing a single tool call
+ if state.completed or tool_calls == "single":
  return state

- # propagate current state along with sample limit exceeded
- except SampleLimitExceededError as ex:
- raise ex.with_state(state)
+ # if a tool_call was forced set tool_choice to 'auto'
+ # (otherwise it will get forced over and over again)
+ if isinstance(tool_choice, ToolFunction):
+ tool_choice = "auto"
+
+ # no tool calls or not resolving tool calls, we are done!
+ else:
+ return state
inspect_ai/_eval/task/log.py CHANGED
@@ -30,13 +30,14 @@ from inspect_ai.log._log import (
  EvalLog,
  EvalMetricDefinition,
  EvalSampleReductions,
+ EvalSampleSummary,
  EvalScorer,
  eval_config_defaults,
  )
  from inspect_ai.log._model import model_args_for_log, model_roles_to_model_roles_config
  from inspect_ai.log._recorders import Recorder
  from inspect_ai.log._recorders.buffer import SampleBufferDatabase
- from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
+ from inspect_ai.log._recorders.types import SampleEvent
  from inspect_ai.log._transcript import Event
  from inspect_ai.model import (
  GenerateConfig,
@@ -160,13 +161,17 @@ class TaskLogger:
  self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
  self.flush_pending: list[tuple[str | int, int]] = []

+ # sample buffer db
+ self._buffer_db: SampleBufferDatabase | None = None
+
  async def init(self) -> None:
  self._location = await self.recorder.log_init(self.eval)
- self._buffer_db = SampleBufferDatabase(
- location=self._location,
- log_images=self.eval.config.log_images is not False,
- log_shared=self.eval.config.log_shared,
- )
+ if self.eval.config.log_realtime is not False:
+ self._buffer_db = SampleBufferDatabase(
+ location=self._location,
+ log_images=self.eval.config.log_images is not False,
+ log_shared=self.eval.config.log_shared,
+ )

  @property
  def location(self) -> str:
@@ -180,36 +185,26 @@
  await self.recorder.log_start(self.eval, plan)
  await self.recorder.flush(self.eval)

- async def start_sample(self, sample: SampleSummary) -> None:
- self._buffer_db.start_sample(sample)
+ async def start_sample(self, sample: EvalSampleSummary) -> None:
+ if self._buffer_db is not None:
+ self._buffer_db.start_sample(sample)

  def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
  # log the sample event
- self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
+ if self._buffer_db is not None:
+ self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])

  def remove_sample(self, id: str | int, epoch: int) -> None:
- self._buffer_db.remove_samples([(id, epoch)])
+ if self._buffer_db is not None:
+ self._buffer_db.remove_samples([(id, epoch)])

  async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
  # log the sample
  await self.recorder.log_sample(self.eval, sample)

  # mark complete
- self._buffer_db.complete_sample(
- SampleSummary(
- id=sample.id,
- epoch=sample.epoch,
- input=sample.input,
- target=sample.target,
- completed=True,
- scores=sample.scores,
- error=sample.error.message if sample.error is not None else None,
- limit=f"{sample.limit.type}" if sample.limit is not None else None,
- retries=len(sample.error_retries)
- if sample.error_retries is not None
- else None,
- )
- )
+ if self._buffer_db is not None:
+ self._buffer_db.complete_sample(sample.summary())

  # flush if requested
  if flush:
@@ -219,7 +214,8 @@
  await self.recorder.flush(self.eval)

  # notify the event db it can remove these
- self._buffer_db.remove_samples(self.flush_pending)
+ if self._buffer_db is not None:
+ self._buffer_db.remove_samples(self.flush_pending)

  # Clear
  self.flush_pending.clear()
@@ -229,7 +225,8 @@
  self._samples_completed += 1

  def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
- self._buffer_db.update_metrics(metrics)
+ if self._buffer_db is not None:
+ self._buffer_db.update_metrics(metrics)

  async def log_finish(
  self,
@@ -245,7 +242,8 @@
  )

  # cleanup the events db
- self._buffer_db.cleanup()
+ if self._buffer_db is not None:
+ self._buffer_db.cleanup()

  # return log
  return log