inspect-ai 0.3.98__py3-none-any.whl → 0.3.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -0
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_display/textual/widgets/transcript.py +15 -3
- inspect_ai/_eval/run.py +12 -4
- inspect_ai/_eval/task/log.py +1 -1
- inspect_ai/_eval/task/task.py +1 -1
- inspect_ai/_util/_async.py +1 -1
- inspect_ai/_view/schema.py +1 -0
- inspect_ai/_view/view.py +14 -0
- inspect_ai/_view/www/dist/assets/index.css +10 -10
- inspect_ai/_view/www/dist/assets/index.js +10 -10
- inspect_ai/_view/www/log-schema.json +45 -5
- inspect_ai/_view/www/src/@types/log.d.ts +11 -2
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -1
- inspect_ai/agent/_run.py +44 -8
- inspect_ai/log/_bundle.py +5 -3
- inspect_ai/log/_log.py +2 -2
- inspect_ai/model/_providers/anthropic.py +3 -6
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_limit.py +160 -137
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/RECORD +30 -30
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
```diff
@@ -10,6 +10,7 @@ from inspect_ai._eval.score import score, score_async
 from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
 from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
+from inspect_ai._view.view import view
 from inspect_ai.agent._human.agent import human_cli
 from inspect_ai.solver._human_agent import human_agent
 
@@ -32,4 +33,5 @@ __all__ = [
     "TaskInfo",
     "task",
     "task_with",
+    "view",
 ]
```
inspect_ai/_cli/log.py
CHANGED
```diff
@@ -199,6 +199,6 @@ def view_resource(file: str) -> str:
 
 
 def view_type_resource(file: str) -> str:
-    resource = PKG_PATH / "_view" / "www" / "src" / "types" / file
+    resource = PKG_PATH / "_view" / "www" / "src" / "@types" / file
     with open(resource, "r", encoding="utf-8") as f:
         return f.read()
```
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
```diff
@@ -84,6 +84,7 @@ class TranscriptView(ScrollableContainer):
             scroll_to_end = (
                 new_sample or abs(self.scroll_y - self.max_scroll_y) <= 20
             )
+
             async with self.batch():
                 await self.remove_children()
                 await self.mount_all(
@@ -100,9 +101,13 @@ class TranscriptView(ScrollableContainer):
         else:
             self._pending_sample = sample
 
-    def _widgets_for_events(
+    def _widgets_for_events(
+        self, events: Sequence[Event], limit: int = 10
+    ) -> list[Widget]:
         widgets: list[Widget] = []
-
+        widget_count = 0
+        # reverse the events so that the N most recents events are displayed
+        for event in events[::-1]:
             display = render_event(event)
             if display:
                 for d in display:
@@ -118,7 +123,14 @@ class TranscriptView(ScrollableContainer):
                         set_transcript_markdown_options(d.content)
                         widgets.append(Static(d.content, markup=False))
                         widgets.append(Static(Text(" ")))
-
+                        widget_count += 1
+
+            # only render the N most recent events
+            if widget_count >= limit:
+                break
+
+        # reverse the list since we added the events in reverse order
+        return widgets[::-1]
 
 
 class EventDisplay(NamedTuple):
```
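The effect of the new `limit` parameter: events are walked newest-first, rendering stops once `limit` widgets have been produced, and the list is then flipped back into chronological order. A standalone sketch of the same pattern (plain strings stand in for Textual widgets; names are illustrative):

```python
from typing import Sequence

def most_recent_rendered(events: Sequence[str], limit: int = 10) -> list[str]:
    rendered: list[str] = []
    # walk newest-first so we can stop as soon as `limit` items are rendered
    for event in events[::-1]:
        rendered.append(f"[{event}]")
        if len(rendered) >= limit:
            break
    # we appended in reverse order, so flip back to chronological order
    return rendered[::-1]

assert most_recent_rendered([str(i) for i in range(100)], limit=3) == ["[97]", "[98]", "[99]"]
```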
inspect_ai/_eval/run.py
CHANGED
```diff
@@ -298,10 +298,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
     # setup pending tasks, queue, and results
     pending_tasks = tasks.copy()
-    results: list[EvalLog] = []
+    results: list[tuple[int, EvalLog]] = []
     tasks_completed = 0
     total_tasks = len(tasks)
 
+    # Create a mapping from task to its original index
+    task_to_original_index = {id(task): i for i, task in enumerate(tasks)}
+
     # produce/consume tasks
     send_channel, receive_channel = anyio.create_memory_object_stream[TaskRunOptions](
         parallel * 2
@@ -322,7 +325,7 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             # among those models, pick one with the least usage
             model = min(models_with_pending, key=lambda m: model_counts[m])
 
-            # now we know there
+            # now we know there's at least one pending task for this model so it's safe to pick it
             next_task = next(t for t in pending_tasks if str(t.model) == model)
             pending_tasks.remove(next_task)
             model_counts[str(next_task.model)] += 1
@@ -339,6 +342,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
         nonlocal tasks_completed
         async for task_options in receive_channel:
             result: EvalLog | None = None
+            # Get the original index of this task
+            original_index = task_to_original_index[id(task_options)]
 
             # run the task
             try:
@@ -354,11 +359,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                 # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
                 def create_task_runner(
                     options: TaskRunOptions = task_options,
+                    idx: int = original_index,
                 ) -> Callable[[], Awaitable[None]]:
                     async def run_task() -> None:
                         nonlocal result
                         result = await task_run(options)
-
+                        # Store result with its original index
+                        results.append((idx, result))
 
                     return run_task
 
@@ -426,7 +433,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
     clear_task_screen()
 
-    return
+    # Sort results by original index and return just the values
+    return [r for _, r in sorted(results)]
 
 
 def resolve_task_sample_ids(
```
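The fix here is ordering: workers append results as tasks finish, which with `parallel > 1` is not submission order. Tagging each task with its original index (keyed by `id()`) and sorting on return restores input order. A minimal sketch of the same pattern outside of anyio (the `work` coroutine is a placeholder):

```python
import asyncio

async def work(x: int) -> int:
    # later items finish first, so completion order != submission order
    await asyncio.sleep(0.01 * (4 - x))
    return x * 10

async def run_all(items: list[int]) -> list[int]:
    results: list[tuple[int, int]] = []

    async def runner(idx: int, item: int) -> None:
        # workers append (original_index, result) in completion order
        results.append((idx, await work(item)))

    await asyncio.gather(*(runner(i, item) for i, item in enumerate(items)))
    # sort by the original index, then drop it
    return [r for _, r in sorted(results)]

assert asyncio.run(run_all([0, 1, 2, 3])) == [0, 10, 20, 30]
```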
inspect_ai/_eval/task/log.py
CHANGED
inspect_ai/_eval/task/task.py
CHANGED
inspect_ai/_util/_async.py
CHANGED
```diff
@@ -136,7 +136,7 @@ def current_async_backend() -> Literal["asyncio", "trio"] | None:
 
 
 def configured_async_backend() -> Literal["asyncio", "trio"]:
-    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()
+    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio"
    return _validate_backend(backend)
 
 
```
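This small fix matters because `os.environ.get` applies its default only when the variable is entirely absent: `INSPECT_ASYNC_BACKEND=""` (set but empty) previously yielded an empty backend name. A quick demonstration:

```python
import os

os.environ["INSPECT_ASYNC_BACKEND"] = ""  # set, but empty

# the .get() default only kicks in when the variable is absent
assert os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() == ""

# the `or` fallback also covers the empty-string case
assert (os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio") == "asyncio"
```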
inspect_ai/_view/schema.py
CHANGED
inspect_ai/_view/view.py
CHANGED
```diff
@@ -30,6 +30,20 @@ def view(
     log_level: str | None = None,
     fs_options: dict[str, Any] = {},
 ) -> None:
+    """Run the Inspect View server.
+
+    Args:
+       log_dir: Directory to view logs from.
+       recursive: Recursively list files in `log_dir`.
+       host: Tcp/ip host (defaults to "127.0.0.1").
+       port: Tcp/ip port (defaults to 7575).
+       authorization: Validate requests by checking for this authorization header.
+       log_level: Level for logging to the console: "debug", "http", "sandbox",
+          "info", "warning", "error", or "critical" (defaults to "warning").
+       fs_options: Additional arguments to pass through to the filesystem provider
+          (e.g. `S3FileSystem`). Use `{"anon": True }` if you are accessing a
+          public S3 bucket with no credentials.
+    """
     init_dotenv()
     init_logger(log_level)
 
```
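With `view` now exported from the top-level package (see the `__init__.py` change above), the documented arguments can be used directly. A short usage sketch (`view()` blocks while the server runs, and the S3 bucket name is illustrative):

```python
from inspect_ai import view

# serve logs from a local directory on the default 127.0.0.1:7575
view(log_dir="./logs")

# or: view logs in a public S3 bucket, no credentials required
# view(log_dir="s3://my-eval-logs", fs_options={"anon": True})
```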
inspect_ai/_view/www/dist/assets/index.css
CHANGED
```diff
@@ -17166,41 +17166,41 @@ thead th {
   flex-direction: column;
   padding-top: 0.1em;
 }
-.
+._container_181fj_1 {
   display: grid;
   grid-template-columns:
-    minmax(
-
+    minmax(0, max-content) minmax(0, max-content) minmax(0, max-content)
+    5fr;
   column-gap: 0.75em;
 }
 
-.
+._container_181fj_1 ._cell_181fj_9 {
   margin-bottom: 0.5em;
 }
 
-.
+._fullWidth_181fj_13 {
   grid-column: 1 / -1;
 }
 
-.
+._heading_181fj_17 {
   font-weight: 600;
 }
 
-.
+._padded_181fj_21 {
   padding-bottom: 3em;
 }
 
-.
+._separator_181fj_25 {
   height: 1px;
   background-color: var(--bs-light-border-subtle);
 }
 
-.
+._separatorPadded_181fj_30 {
   margin-top: 0.5em;
   margin-bottom: 0.5em;
 }
 
-.
+._headerSep_181fj_35 {
   margin-top: 0.1em;
   margin-bottom: 0.2em;
 }
```
inspect_ai/_view/www/dist/assets/index.js
CHANGED
```diff
@@ -39446,7 +39446,7 @@ Please change the parent <Route path="${parentPath}"> to <Route path="${parentPa
       const rendered = entry2.value.trim();
       if (options2.renderString === "markdown") {
         return {
-          rendered
+          rendered: /* @__PURE__ */ jsxRuntimeExports.jsx(MarkdownDiv, { markdown: rendered })
         };
       } else {
         return {
@@ -51898,12 +51898,12 @@ self.onmessage = function (e) {
   );
   return scorerDescriptor == null ? void 0 : scorerDescriptor.render(scoreData.value);
 };
-const container$6 = "
-const cell$1 = "
-const fullWidth = "
-const separator$2 = "
-const separatorPadded = "
-const headerSep = "
+const container$6 = "_container_181fj_1";
+const cell$1 = "_cell_181fj_9";
+const fullWidth = "_fullWidth_181fj_13";
+const separator$2 = "_separator_181fj_25";
+const separatorPadded = "_separatorPadded_181fj_30";
+const headerSep = "_headerSep_181fj_35";
 const styles$x = {
   container: container$6,
   cell: cell$1,
@@ -52473,7 +52473,7 @@ self.onmessage = function (e) {
         {
           output: event.error.traceback_ansi,
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
@@ -61749,7 +61749,7 @@ ${events}
           output: sample2.error.traceback_ansi,
           className: clsx("text-size-small", styles$A.ansi),
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
@@ -61764,7 +61764,7 @@ ${events}
           output: retry.traceback_ansi,
           className: clsx("text-size-small", styles$A.ansi),
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
```
|
@@ -1136,6 +1136,18 @@
|
|
1136
1136
|
"default": null,
|
1137
1137
|
"title": "Log Samples"
|
1138
1138
|
},
|
1139
|
+
"log_realtime": {
|
1140
|
+
"anyOf": [
|
1141
|
+
{
|
1142
|
+
"type": "boolean"
|
1143
|
+
},
|
1144
|
+
{
|
1145
|
+
"type": "null"
|
1146
|
+
}
|
1147
|
+
],
|
1148
|
+
"default": null,
|
1149
|
+
"title": "Log Realtime"
|
1150
|
+
},
|
1139
1151
|
"log_images": {
|
1140
1152
|
"anyOf": [
|
1141
1153
|
{
|
@@ -1205,6 +1217,7 @@
|
|
1205
1217
|
"max_sandboxes",
|
1206
1218
|
"sandbox_cleanup",
|
1207
1219
|
"log_samples",
|
1220
|
+
"log_realtime",
|
1208
1221
|
"log_images",
|
1209
1222
|
"log_buffer",
|
1210
1223
|
"log_shared",
|
@@ -1502,7 +1515,8 @@
|
|
1502
1515
|
"reasoning_tokens": null,
|
1503
1516
|
"reasoning_summary": null,
|
1504
1517
|
"reasoning_history": null,
|
1505
|
-
"response_schema": null
|
1518
|
+
"response_schema": null,
|
1519
|
+
"extra_body": null
|
1506
1520
|
}
|
1507
1521
|
}
|
1508
1522
|
},
|
@@ -1944,7 +1958,7 @@
|
|
1944
1958
|
"additionalProperties": false
|
1945
1959
|
},
|
1946
1960
|
"EvalSampleLimit": {
|
1947
|
-
"description": "Limit
|
1961
|
+
"description": "Limit encountered by sample.",
|
1948
1962
|
"properties": {
|
1949
1963
|
"type": {
|
1950
1964
|
"enum": [
|
@@ -2277,6 +2291,10 @@
|
|
2277
2291
|
"EvalSpec": {
|
2278
2292
|
"description": "Eval target and configuration.",
|
2279
2293
|
"properties": {
|
2294
|
+
"eval_id": {
|
2295
|
+
"title": "Eval Id",
|
2296
|
+
"type": "string"
|
2297
|
+
},
|
2280
2298
|
"run_id": {
|
2281
2299
|
"title": "Run Id",
|
2282
2300
|
"type": "string"
|
@@ -2294,9 +2312,16 @@
|
|
2294
2312
|
"type": "string"
|
2295
2313
|
},
|
2296
2314
|
"task_version": {
|
2315
|
+
"anyOf": [
|
2316
|
+
{
|
2317
|
+
"type": "integer"
|
2318
|
+
},
|
2319
|
+
{
|
2320
|
+
"type": "string"
|
2321
|
+
}
|
2322
|
+
],
|
2297
2323
|
"default": 0,
|
2298
|
-
"title": "Task Version"
|
2299
|
-
"type": "integer"
|
2324
|
+
"title": "Task Version"
|
2300
2325
|
},
|
2301
2326
|
"task_file": {
|
2302
2327
|
"anyOf": [
|
@@ -2500,6 +2525,7 @@
|
|
2500
2525
|
}
|
2501
2526
|
},
|
2502
2527
|
"required": [
|
2528
|
+
"eval_id",
|
2503
2529
|
"run_id",
|
2504
2530
|
"created",
|
2505
2531
|
"task",
|
@@ -2897,6 +2923,19 @@
|
|
2897
2923
|
}
|
2898
2924
|
],
|
2899
2925
|
"default": null
|
2926
|
+
},
|
2927
|
+
"extra_body": {
|
2928
|
+
"anyOf": [
|
2929
|
+
{
|
2930
|
+
"additionalProperties": true,
|
2931
|
+
"type": "object"
|
2932
|
+
},
|
2933
|
+
{
|
2934
|
+
"type": "null"
|
2935
|
+
}
|
2936
|
+
],
|
2937
|
+
"default": null,
|
2938
|
+
"title": "Extra Body"
|
2900
2939
|
}
|
2901
2940
|
},
|
2902
2941
|
"title": "GenerateConfig",
|
@@ -2927,7 +2966,8 @@
|
|
2927
2966
|
"reasoning_tokens",
|
2928
2967
|
"reasoning_summary",
|
2929
2968
|
"reasoning_history",
|
2930
|
-
"response_schema"
|
2969
|
+
"response_schema",
|
2970
|
+
"extra_body"
|
2931
2971
|
],
|
2932
2972
|
"additionalProperties": false
|
2933
2973
|
},
|
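The schema additions mirror a new nullable free-form `extra_body` field on the Python `GenerateConfig` model (also visible in the TypeScript typings below): per the schema it defaults to `null` and accepts arbitrary keys. A hedged sketch of setting it from Python (the payload key is illustrative, not part of inspect_ai):

```python
from inspect_ai.model import GenerateConfig

# provider-specific request-body parameters, passed through as-is
config = GenerateConfig(extra_body={"safe_prompt": True})
assert config.extra_body == {"safe_prompt": True}
```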
inspect_ai/_view/www/src/@types/log.d.ts
CHANGED
```diff
@@ -7,11 +7,12 @@
 
 export type Version = number;
 export type Status = "started" | "success" | "cancelled" | "error";
+export type EvalId = string;
 export type RunId = string;
 export type Created = string;
 export type Task = string;
 export type TaskId = string;
-export type TaskVersion = number;
+export type TaskVersion = number | string;
 export type TaskFile = string | null;
 export type TaskRegistryName = string | null;
 export type Solver = string | null;
@@ -68,6 +69,9 @@ export type Anyof = JSONSchema[] | null;
 export type Required = string[] | null;
 export type Description1 = string | null;
 export type Strict = boolean | null;
+export type ExtraBody = {
+  [k: string]: unknown;
+} | null;
 export type ModelBaseUrl = string | null;
 export type ModelRoles = {
   [k: string]: EvalModelConfig;
@@ -99,6 +103,7 @@ export type MaxSubprocesses = number | null;
 export type MaxSandboxes = number | null;
 export type SandboxCleanup = boolean | null;
 export type LogSamples = boolean | null;
+export type LogRealtime = boolean | null;
 export type LogImages = boolean | null;
 export type LogBuffer = number | null;
 export type LogShared = number | null;
@@ -640,6 +645,7 @@ export interface EvalLog {
  * Eval target and configuration.
  */
 export interface EvalSpec {
+  eval_id: EvalId;
   run_id: RunId;
   created: Created;
   task: Task;
@@ -722,6 +728,7 @@ export interface GenerateConfig {
   reasoning_summary: ReasoningSummary;
   reasoning_history: ReasoningHistory;
   response_schema: ResponseSchema | null;
+  extra_body: ExtraBody;
 }
 /**
  * Schema for model response when using Structured Output.
@@ -786,6 +793,7 @@ export interface EvalConfig {
   max_sandboxes: MaxSandboxes;
   sandbox_cleanup: SandboxCleanup;
   log_samples: LogSamples;
+  log_realtime: LogRealtime;
   log_images: LogImages;
   log_buffer: LogBuffer;
   log_shared: LogShared;
@@ -888,6 +896,7 @@ export interface GenerateConfig1 {
   reasoning_summary: ReasoningSummary;
   reasoning_history: ReasoningHistory;
   response_schema: ResponseSchema | null;
+  extra_body: ExtraBody;
 }
 /**
  * Scoring results from evaluation.
@@ -1525,7 +1534,7 @@ export interface Attachments {
   [k: string]: string;
 }
 /**
- * Limit
+ * Limit encountered by sample.
 */
 export interface EvalSampleLimit {
   type: Type16;
```
inspect_ai/_view/www/src/app/content/RenderedContent.tsx
CHANGED
```diff
@@ -9,6 +9,7 @@ import { MetaDataView } from "./MetaDataView";
 import clsx from "clsx";
 import { FC, Fragment, isValidElement, JSX, ReactNode } from "react";
 import JSONPanel from "../../components/JsonPanel";
+import { MarkdownDiv } from "../../components/MarkdownDiv";
 import { isJson } from "../../utils/json";
 import styles from "./RenderedContent.module.css";
 import { Buckets, ContentRenderer, RenderOptions } from "./types";
@@ -142,7 +143,7 @@ const contentRenderers: Record<string, ContentRenderer> = {
       const rendered = entry.value.trim();
       if (options.renderString === "markdown") {
         return {
-          rendered: rendered
+          rendered: <MarkdownDiv markdown={rendered} />,
         };
       } else {
         return {
```
inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx
CHANGED
```diff
@@ -275,7 +275,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
           output={sample.error.traceback_ansi}
           className={clsx("text-size-small", styles.ansi)}
           style={{
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0",
           }}
         />
@@ -291,7 +291,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
           output={retry.traceback_ansi}
           className={clsx("text-size-small", styles.ansi)}
           style={{
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0",
           }}
         />
```
inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx
CHANGED
```diff
@@ -32,7 +32,7 @@ export const ErrorEventView: FC<ErrorEventViewProps> = ({
       <ANSIDisplay
         output={event.error.traceback_ansi}
         style={{
-          fontSize: "clamp(0.
+          fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
           margin: "0.5em 0",
         }}
       />
```
inspect_ai/agent/_run.py
CHANGED
```diff
@@ -1,20 +1,43 @@
 from copy import copy
-from typing import Any
+from typing import Any, overload
 
 from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
-from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._limit import Limit, LimitExceededError, apply_limits
 from inspect_ai.util._span import span
 
 from ._agent import Agent, AgentState
 
 
+@overload
 async def run(
     agent: Agent,
     input: str | list[ChatMessage] | AgentState,
     limits: list[Limit] = [],
+    *,
+    name: str | None = None,
     **agent_kwargs: Any,
-) -> AgentState:
+) -> tuple[AgentState, LimitExceededError | None]: ...
+
+
+@overload
+async def run(
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    *,
+    name: str | None = None,
+    **agent_kwargs: Any,
+) -> AgentState: ...
+
+
+async def run(
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    *,
+    name: str | None = None,
+    **agent_kwargs: Any,
+) -> AgentState | tuple[AgentState, LimitExceededError | None]:
     """Run an agent.
 
     The input messages(s) will be copied prior to running so are
@@ -26,10 +49,16 @@ async def run(
         limits: List of limits to apply to the agent. Should a limit be
             exceeded, a LimitExceededError is raised which the caller may
             handle as appropriate.
+        name: Optional display name for the transcript entry. If not provided, the
+            agent's name as defined in the registry will be used.
         **agent_kwargs: Additional arguments to pass to agent.
 
     Returns:
-        AgentState: Messages and generated output.
+        AgentState: Messages and generated output. This is all that is returned if no
+            limits are supplied.
+        LimitExceededError | None: If a non-empty limits list is supplied, a tuple is
+            returned. If a limit was exceeded, the second value in the tuple is the
+            exception instance. If no limit was exceeded, the second element is None.
     """
     # copy input so we don't mutate it in place
     input = copy(input)
@@ -52,9 +81,16 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)
 
-    # run the agent with limits
-    with apply_limits(limits):
+    # run the agent with limits, catching errors which are a direct result of our limits
+    with apply_limits(limits, catch_errors=True) as limit_scope:
         # run the agent
-        agent_name = registry_unqualified_name(agent)
+        agent_name = name or registry_unqualified_name(agent)
         async with span(name=agent_name, type="agent"):
-
+            state = await agent(state, **agent_kwargs)
+        if limits:
+            return state, None
+        else:
+            return state
+
+    # execution reaches this point iff one of "our" limits was exceeded
+    return state, limit_scope.limit_error
```
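The overloads encode a change in calling convention: with no `limits` argument, `run()` still returns a bare `AgentState`; with a non-empty `limits` list it now returns a `(state, error)` tuple rather than letting the `LimitExceededError` propagate. A usage sketch based on the signatures above (`my_agent` is a placeholder; `message_limit` comes from `inspect_ai.util`):

```python
from inspect_ai.agent import Agent, AgentState, run
from inspect_ai.util import message_limit

async def demo(my_agent: Agent) -> AgentState:
    # no limits: returns AgentState, as before
    state = await run(my_agent, "Write me a haiku.")

    # with limits: returns (AgentState, LimitExceededError | None) instead of raising
    state, limit_error = await run(
        my_agent, "Write me a haiku.", limits=[message_limit(10)]
    )
    if limit_error is not None:
        print(f"agent stopped by a {limit_error.type} limit")
    return state
```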
inspect_ai/log/_bundle.py
CHANGED
```diff
@@ -146,7 +146,7 @@ def copy_log_files(
     log_fs = filesystem(log_dir, fs_options)
     if log_fs.exists(log_dir):
         eval_logs = log_files_from_ls(
-            log_fs.ls(log_dir, recursive=True), ["json", "eval"],
+            log_fs.ls(log_dir, recursive=True), ["json", "eval"], False
         )
         if len(eval_logs) == 0:
             raise PrerequisiteError(
@@ -201,8 +201,10 @@ def move_output(
             output_fs.mkdir(dir_path)
             tick()
 
-        # Copy the files
-        for working_file in
+        # Copy the files, preserving relative mtime ordering
+        for _, working_file in sorted(
+            (os.stat(os.path.join(root, f)).st_mtime, f) for f in files
+        ):
             target_path = (
                 os.path.join(relative_dir, working_file)
                 if relative_dir != "."
```
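The bundling change is a decorate-sort-undecorate over file modification times, so files land in the output in the same relative mtime order they had in the working directory. The same pattern in isolation (hypothetical `root` and `files`):

```python
import os

def in_mtime_order(root: str, files: list[str]) -> list[str]:
    # pair each file with its mtime, sort by that key, then drop the key
    keyed = ((os.stat(os.path.join(root, f)).st_mtime, f) for f in files)
    return [f for _, f in sorted(keyed)]
```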
inspect_ai/log/_log.py
CHANGED
```diff
@@ -158,7 +158,7 @@ class EvalConfig(BaseModel):
 
 
 class EvalSampleLimit(BaseModel):
-    """Limit
+    """Limit encountered by sample."""
 
     type: Literal[
         "context", "time", "working", "message", "token", "operator", "custom"
@@ -694,7 +694,7 @@ class EvalSpec(BaseModel):
     task_id: str = Field(default_factory=str)
     """Unique task id."""
 
-    task_version: int = Field(default=0)
+    task_version: int | str = Field(default=0)
     """Task version."""
 
     task_file: str | None = Field(default=None)
```