PyPI - inspect-ai - Versions diffs - 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl - Mend

inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

inspect_ai/_eval/evalset.py +1 -1
inspect_ai/_eval/task/run.py +64 -38
inspect_ai/_view/server.py +17 -0
inspect_ai/_view/www/dist/assets/index.css +33 -29
inspect_ai/_view/www/dist/assets/index.js +559 -247
inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
inspect_ai/_view/www/src/client/api/types.ts +3 -0
inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
inspect_ai/agent/_handoff.py +5 -2
inspect_ai/agent/_react.py +5 -5
inspect_ai/dataset/_dataset.py +1 -1
inspect_ai/log/_samples.py +5 -0
inspect_ai/model/_call_tools.py +4 -4
inspect_ai/model/_providers/anthropic.py +23 -2
inspect_ai/model/_providers/google.py +5 -1
inspect_ai/util/__init__.py +8 -0
inspect_ai/util/_background.py +64 -0
inspect_ai/util/_limit.py +72 -5
inspect_ai/util/_sandbox/__init__.py +2 -0
inspect_ai/util/_sandbox/service.py +28 -7
inspect_ai/util/_subprocess.py +51 -38
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +1 -1
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +38 -37
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0

inspect_ai/_eval/evalset.py CHANGED Viewed

@@ -578,7 +578,7 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
     else:
         task_file = task.eval.task_file or ""
         task_name = task.eval.task
-        task_args = task.eval.task_args
+        task_args = task.eval.task_args_passed
         model = str(task.eval.model)
         model_roles = task.eval.model_roles or {}

inspect_ai/_eval/task/run.py CHANGED Viewed

@@ -10,6 +10,7 @@ from pathlib import PurePath
 from typing import Callable, Literal
 import anyio
+from anyio.abc import TaskGroup
 from typing_extensions import Unpack
 from inspect_ai._display import (
@@ -306,37 +307,57 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                     task.metrics,
                 )
+                async def run_sample(
+                    sample: Sample, state: TaskState
+                ) -> dict[str, SampleScore] | None:
+                    result: dict[str, SampleScore] | None = None
+                    async def run(tg: TaskGroup) -> None:
+                        try:
+                            nonlocal result
+                            result = await task_run_sample(
+                                tg=tg,
+                                task_name=task.name,
+                                log_location=profile.log_location,
+                                sample=sample,
+                                state=state,
+                                sandbox=sandbox,
+                                max_sandboxes=config.max_sandboxes,
+                                sandbox_cleanup=sandbox_cleanup,
+                                plan=plan,
+                                scorers=scorers,
+                                generate=generate,
+                                progress=progress,
+                                logger=logger if log_samples else None,
+                                log_images=log_images,
+                                sample_source=sample_source,
+                                sample_error=sample_error_handler,
+                                sample_complete=sample_complete,
+                                fails_on_error=(
+                                    config.fail_on_error is None
+                                    or config.fail_on_error is True
+                                ),
+                                retry_on_error=config.retry_on_error or 0,
+                                error_retries=[],
+                                time_limit=config.time_limit,
+                                working_limit=config.working_limit,
+                                semaphore=sample_semaphore,
+                            )
+                        finally:
+                            tg.cancel_scope.cancel()
+                    async with anyio.create_task_group() as tg:
+                        tg.start_soon(run, tg)
+                    return result
                 sample_results = await tg_collect(
                     [
-                        functools.partial(
-                            task_run_sample,
-                            task_name=task.name,
-                            log_location=profile.log_location,
-                            sample=sample,
-                            state=state,
-                            sandbox=sandbox,
-                            max_sandboxes=config.max_sandboxes,
-                            sandbox_cleanup=sandbox_cleanup,
-                            plan=plan,
-                            scorers=scorers,
-                            generate=generate,
-                            progress=progress,
-                            logger=logger if log_samples else None,
-                            log_images=log_images,
-                            sample_source=sample_source,
-                            sample_error=sample_error_handler,
-                            sample_complete=sample_complete,
-                            fails_on_error=(
-                                config.fail_on_error is None
-                                or config.fail_on_error is True
-                            ),
-                            retry_on_error=config.retry_on_error or 0,
-                            error_retries=[],
-                            time_limit=config.time_limit,
-                            working_limit=config.working_limit,
-                            semaphore=sample_semaphore,
+                        functools.partial(run_sample, sample, state)
+                        for (sample, state) in zip(
+                            samples,
+                            states,
                         )
-                        for (sample, state) in zip(samples, states)
                     ]
                 )
@@ -492,6 +513,7 @@ def update_metrics_display_fn(
 async def task_run_sample(
     *,
+    tg: TaskGroup,
     task_name: str,
     log_location: str,
     sample: Sample,
@@ -611,12 +633,14 @@ async def task_run_sample(
             working_limit=working_limit,
             fails_on_error=fails_on_error or (retry_on_error > 0),
             transcript=sample_transcript,
+            tg=tg,
         ) as active,
     ):
         start_time: float | None = None
         error: EvalError | None = None
         raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
+        limit: EvalSampleLimit | None = None
         try:
             # begin init
             init_span = span("init", type="init")
@@ -704,9 +728,17 @@ async def task_run_sample(
                         # handle the cancel exception
                         raise
-                except (LimitExceededError, TerminateSampleError):
+                except LimitExceededError as ex:
+                    # capture most recent state for scoring
+                    state = sample_state() or state
+                    limit = EvalSampleLimit(
+                        type=ex.type, limit=ex.limit if ex.limit is not None else -1
+                    )
+                except TerminateSampleError:
                     # capture most recent state for scoring
                     state = sample_state() or state
+                    limit = EvalSampleLimit(type="operator", limit=1)
                 except BaseException as ex:
                     error, raise_error = handle_error(ex)
@@ -815,6 +847,7 @@ async def task_run_sample(
                     state=state,
                     scores=results,
                     error=error,
+                    limit=limit,
                     error_retries=error_retries,
                     log_images=log_images,
                 )
@@ -854,6 +887,7 @@ async def task_run_sample(
             time_limit=time_limit,
             working_limit=working_limit,
             semaphore=semaphore,
+            tg=tg,
         )
     # no error
@@ -879,6 +913,7 @@ async def log_sample(
     state: TaskState,
     scores: dict[str, SampleScore],
     error: EvalError | None,
+    limit: EvalSampleLimit | None,
     error_retries: list[EvalError],
     log_images: bool,
 ) -> None:
@@ -894,15 +929,6 @@ async def log_sample(
     # compute total time if we can
     total_time = time.monotonic() - start_time if start_time is not None else None
-    # if a limit was hit, note that in the Eval Sample
-    limit = None
-    for e in transcript().events:
-        if e.event == "sample_limit":
-            limit = EvalSampleLimit(
-                type=e.type, limit=e.limit if e.limit is not None else -1
-            )
-            break
     eval_sample = EvalSample(
         id=id,
         epoch=state.epoch,

inspect_ai/_view/server.py CHANGED Viewed

@@ -155,6 +155,23 @@ def view_server(
                 body=samples.model_dump_json(), headers={"ETag": samples.etag}
             )
+    @routes.get("/api/log-message")
+    async def api_log_message(request: web.Request) -> web.Response:
+        # log file requested
+        file = query_param_required("log_file", request, str)
+        file = urllib.parse.unquote(file)
+        validate_log_file_request(file)
+        # message to log
+        message = query_param_required("message", request, str)
+        # log the message
+        logger.warning(f"[CLIENT MESSAGE] ({file}): {message}")
+        # respond
+        return web.Response(status=204)
     @routes.get("/api/pending-sample-data")
     async def api_sample_events(request: web.Request) -> web.Response:
         # log file requested

inspect_ai/_view/www/dist/assets/index.css CHANGED Viewed

@@ -15655,18 +15655,40 @@ pre[class*="language-"] {
   background-color: var(--bs-light-border-subtle);
   margin-top: -1px;
 }
-._message_17kai_1 {
+._keyPairContainer_1ltuo_1 {
+  display: grid;
+  grid-template-columns: max-content auto;
+  column-gap: 0.5em;
+  padding-top: 4px;
+  padding-bottom: 4px;
+  border-bottom: solid 1px var(--bs-border-color);
+}
+._key_1ltuo_1 {
+  display: grid;
+  grid-template-columns: 1em auto;
+  cursor: pointer;
+}
+._pre_1ltuo_16 {
+  margin-bottom: 0;
+}
+._treeIcon_1ltuo_20 {
+  margin-top: -3px;
+}
+._message_1ivu3_1 {
   font-weight: 300;
   margin-left: 0;
   margin-right: 0;
   white-space: normal;
 }
-._systemRole_17kai_8 {
+._systemRole_1ivu3_8 {
   opacity: 0.7;
 }
-._messageGrid_17kai_12 {
+._messageGrid_1ivu3_12 {
   display: grid;
   grid-template-columns: max-content max-content max-content;
   column-gap: 0.3em;
@@ -15674,24 +15696,28 @@ pre[class*="language-"] {
   margin-bottom: 0.3em;
 }
-._messageContents_17kai_20 {
+._messageContents_1ivu3_20 {
   margin-left: 0;
   padding-bottom: 0;
 }
-._messageContents_17kai_20._indented_17kai_25 {
+._messageContents_1ivu3_20._indented_1ivu3_25 {
   margin-left: 0rem;
 }
-._copyLink_17kai_29 {
+._copyLink_1ivu3_29 {
   opacity: 0;
   padding-left: 0;
   padding-right: 2em;
 }
-._copyLink_17kai_29:hover {
+._copyLink_1ivu3_29:hover {
   opacity: 1;
 }
+._metadataLabel_1ivu3_39 {
+  padding-top: 1em;
+}
 ._webSearch_1376z_1 {
   display: grid;
   grid-template-columns: max-content 1fr;
@@ -15702,28 +15728,6 @@ pre[class*="language-"] {
 ._query_1376z_8 {
   font-family: var(--bs-font-monospace);
 }
-._keyPairContainer_1ltuo_1 {
-  display: grid;
-  grid-template-columns: max-content auto;
-  column-gap: 0.5em;
-  padding-top: 4px;
-  padding-bottom: 4px;
-  border-bottom: solid 1px var(--bs-border-color);
-}
-._key_1ltuo_1 {
-  display: grid;
-  grid-template-columns: 1em auto;
-  cursor: pointer;
-}
-._pre_1ltuo_16 {
-  margin-bottom: 0;
-}
-._treeIcon_1ltuo_20 {
-  margin-top: -3px;
-}
 ._contentData_1sd1z_1 {
   border: solid var(--bs-light-border-subtle) 1px;
   padding: 0.5em;

inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl

inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl