inspect-ai 0.3.100__py3-none-any.whl → 0.3.101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_eval/eval.py +13 -1
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/run.py +2 -0
- inspect_ai/_eval/task/log.py +3 -1
- inspect_ai/_view/www/dist/assets/index.css +44 -12
- inspect_ai/_view/www/dist/assets/index.js +1499 -1467
- inspect_ai/_view/www/package.json +4 -4
- inspect_ai/_view/www/src/app/log-view/tabs/grouping.ts +4 -4
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +22 -25
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +17 -5
- inspect_ai/_view/www/src/state/hooks.ts +1 -1
- inspect_ai/_view/www/yarn.lock +21 -27
- inspect_ai/analysis/beta/__init__.py +2 -0
- inspect_ai/dataset/_sources/csv.py +2 -6
- inspect_ai/dataset/_sources/hf.py +2 -6
- inspect_ai/dataset/_sources/json.py +2 -6
- inspect_ai/dataset/_util.py +23 -0
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/json.py +1 -0
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/model/_openai_responses.py +11 -6
- inspect_ai/model/_openai_web_search.py +9 -2
- inspect_ai/model/_providers/openai.py +3 -1
- inspect_ai/model/_providers/openai_responses.py +5 -1
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/tool/_tools/_web_search/_google.py +28 -11
- inspect_ai/tool/_tools/_web_search/_tavily.py +11 -1
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/RECORD +34 -34
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.100.dist-info → inspect_ai-0.3.101.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -35,7 +35,7 @@ from .util import (
  )

  MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
- MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
+ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1 for eval and 4 for eval-set)"
  MAX_SUBPROCESSES_HELP = (
  "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
  )
@@ -949,6 +949,7 @@ def eval_exec(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=True, # cli invocation doesn't need full log
  score=score,
  score_display=score_display,
  )
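Because the CLI always writes complete logs to the log directory, `eval_exec` can request header-only results without losing anything; sample-level detail stays available by reading the written log file. A minimal sketch, assuming the existing `read_eval_log` API and a hypothetical log file path:

    from inspect_ai.log import read_eval_log

    # hypothetical log file produced by an `inspect eval` run
    log_file = "logs/2025-01-01T12-00-00_my-task_abc123.eval"

    # header only: status, results, stats -- no samples loaded
    header = read_eval_log(log_file, header_only=True)
    print(header.status, header.results)

    # full read when per-sample detail is needed
    full = read_eval_log(log_file)
    print(len(full.samples or []))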
inspect_ai/_eval/eval.py
CHANGED
@@ -105,6 +105,7 @@ def eval(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
+ log_header_only: bool | None = None,
  score: bool = True,
  score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
  log_shared: Sync sample events to log directory so that users on other systems
  can see log updates in realtime (defaults to no syncing). Specify `True`
  to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+ log_header_only: If `True`, the function should return only log headers rather
+ than full logs with samples (defaults to `False`).
  score: Score output (defaults to True)
  score_display: Show scoring metrics in realtime (defaults to True)
  **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=log_header_only,
  score=score,
  score_display=score_display,
  **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  log_shared: bool | int | None = None,
+ log_header_only: bool | None = None,
  score: bool = True,
  score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
  log_buffer: Number of samples to buffer before writing log file.
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
- log_shared: Indicate that the log directory is shared, which results in additional
+ log_shared: Indicate that the log directory is shared, which results in additional
+ syncing of realtime log data for Inspect View.
+ log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
  score: Score output (defaults to True)
  score_display: Show scoring metrics in realtime (defaults to True)
  **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
  # resolve log_shared
  log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared

+ # resolve header only
+ log_header_only = log_header_only is True
+
  # validate that --log-shared can't use used with 'json' format
  if log_shared and log_format == JSON_LOG_FORMAT:
  raise PrerequisiteError(
@@ -507,6 +517,7 @@
  eval_config=eval_config,
  eval_sandbox=sandbox,
  recorder=recorder,
+ header_only=log_header_only,
  epochs_reducer=epochs_reducer,
  solver=solver,
  tags=tags,
@@ -532,6 +543,7 @@
  eval_config=eval_config,
  eval_sandbox=sandbox,
  recorder=recorder,
+ header_only=log_header_only,
  epochs_reducer=epochs_reducer,
  solver=solver,
  tags=tags,
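For callers of the Python API, `log_header_only` is opt-in. A minimal sketch of the new parameter, assuming a hypothetical registered task and model (per the docstring above, the returned logs then carry headers only, not sample bodies):

    from inspect_ai import eval

    logs = eval(
        "theory_of_mind",        # hypothetical registered task name
        model="openai/gpt-4o",   # hypothetical model
        log_header_only=True,
    )

    # header-level fields are populated on the returned logs
    log = logs[0]
    print(log.status, log.results)

    # per the docstring above, sample bodies are not part of the return;
    # read them back from the log file on disk if needed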
inspect_ai/_eval/evalset.py
CHANGED
@@ -168,7 +168,7 @@ def eval_set(
  max_samples: Maximum number of samples to run in parallel
  (default is max_connections)
  max_tasks: Maximum number of tasks to run in parallel
- (defaults to number of models being evaluated)
+ (defaults to the greater of 4 and the number of models being evaluated)
  max_subprocesses: Maximum number of subprocesses to
  run in parallel (default is os.cpu_count())
  max_sandboxes: Maximum number of sandboxes (per-provider)
@@ -235,6 +235,7 @@ def eval_set(
  log_images=log_images,
  log_buffer=log_buffer,
  log_shared=log_shared,
+ log_header_only=True,
  score=score,
  **kwargs,
  )
@@ -277,7 +278,7 @@ def eval_set(
  retry_connections = retry_connections or 0.5
  retry_cleanup = retry_cleanup is not False
  max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
- max_tasks = max_tasks if max_tasks is not None else len(models)
+ max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)

  # prepare console/status
  console = rich.get_console()
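The net effect is that `eval_set()` now runs up to four tasks in parallel even for a single model, while still honoring an explicit `max_tasks`. A small sketch of the default resolution changed above (the `resolve_max_tasks` helper is purely illustrative):

    # mirrors the changed default: max(len(models), 4) when max_tasks is unset
    def resolve_max_tasks(max_tasks: int | None, models: list[str]) -> int:
        return max_tasks if max_tasks is not None else max(len(models), 4)

    assert resolve_max_tasks(None, ["openai/gpt-4o-mini"]) == 4   # previously 1 model -> 1 task
    assert resolve_max_tasks(None, ["m1", "m2", "m3", "m4", "m5"]) == 5
    assert resolve_max_tasks(2, ["m1", "m2", "m3"]) == 2          # explicit value wins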
inspect_ai/_eval/run.py
CHANGED
@@ -63,6 +63,7 @@ async def eval_run(
  eval_config: EvalConfig,
  eval_sandbox: SandboxEnvironmentType | None,
  recorder: Recorder,
+ header_only: bool,
  epochs_reducer: list[ScoreReducer] | None = None,
  solver: Solver | SolverSpec | None = None,
  tags: list[str] | None = None,
@@ -212,6 +213,7 @@ async def eval_run(
  eval_config=task_eval_config,
  metadata=((metadata or {}) | (task.metadata or {})) or None,
  recorder=recorder,
+ header_only=header_only,
  )
  await logger.init()

inspect_ai/_eval/task/log.py
CHANGED
@@ -75,6 +75,7 @@ class TaskLogger:
  eval_config: EvalConfig,
  metadata: dict[str, Any] | None,
  recorder: Recorder,
+ header_only: bool,
  ) -> None:
  # determine versions
  git = git_context()
@@ -153,6 +154,7 @@ class TaskLogger:

  # stack recorder and location
  self.recorder = recorder
+ self.header_only = header_only

  # number of samples logged
  self._samples_completed = 0
@@ -238,7 +240,7 @@ class TaskLogger:
  ) -> EvalLog:
  # finish and get log
  log = await self.recorder.log_finish(
- self.eval, status, stats, results, reductions, error
+ self.eval, status, stats, results, reductions, error, self.header_only
  )

  # cleanup the events db
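Together with the `run.py` change above, the flag flows eval(log_header_only=...) → eval_run(header_only=...) → TaskLogger(header_only=...) → recorder.log_finish(..., header_only). A toy sketch of what the recorder-side behavior amounts to (this is not the actual inspect_ai `Recorder`; names other than `header_only` are illustrative):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class ToyLog:
        status: str
        stats: dict[str, Any]
        samples: list[Any] | None = None

    class ToyRecorder:
        def __init__(self) -> None:
            self._samples: list[Any] = []

        def log_sample(self, sample: Any) -> None:
            self._samples.append(sample)

        def log_finish(self, status: str, stats: dict[str, Any], header_only: bool) -> ToyLog:
            # when header_only is passed (as TaskLogger now does with self.header_only),
            # the returned log omits sample bodies (in inspect_ai they remain in the
            # written log file rather than in the in-memory result)
            samples = None if header_only else list(self._samples)
            return ToyLog(status=status, stats=stats, samples=samples)

    recorder = ToyRecorder()
    recorder.log_sample({"id": 1})
    print(recorder.log_finish("success", {"total": 1}, header_only=True).samples)   # None
    print(recorder.log_finish("success", {"total": 1}, header_only=False).samples)  # [{'id': 1}]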