PyPI - inspect-ai - Versions diffs - 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl - Mend

inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

inspect_ai/_cli/eval.py +16 -0
inspect_ai/_cli/score.py +1 -12
inspect_ai/_cli/util.py +4 -2
inspect_ai/_display/core/footer.py +2 -2
inspect_ai/_display/plain/display.py +2 -2
inspect_ai/_eval/context.py +7 -1
inspect_ai/_eval/eval.py +51 -27
inspect_ai/_eval/evalset.py +27 -10
inspect_ai/_eval/loader.py +7 -8
inspect_ai/_eval/run.py +23 -31
inspect_ai/_eval/score.py +18 -1
inspect_ai/_eval/task/log.py +5 -13
inspect_ai/_eval/task/resolved.py +1 -0
inspect_ai/_eval/task/run.py +231 -244
inspect_ai/_eval/task/task.py +25 -2
inspect_ai/_eval/task/util.py +1 -8
inspect_ai/_util/constants.py +1 -0
inspect_ai/_util/json.py +8 -3
inspect_ai/_util/registry.py +30 -13
inspect_ai/_view/www/App.css +5 -0
inspect_ai/_view/www/dist/assets/index.css +55 -18
inspect_ai/_view/www/dist/assets/index.js +550 -458
inspect_ai/_view/www/log-schema.json +84 -1
inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
inspect_ai/_view/www/src/types/log.d.ts +150 -129
inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
inspect_ai/agent/_agent.py +12 -0
inspect_ai/agent/_as_tool.py +1 -1
inspect_ai/agent/_bridge/bridge.py +9 -2
inspect_ai/agent/_react.py +142 -74
inspect_ai/agent/_run.py +13 -2
inspect_ai/agent/_types.py +6 -0
inspect_ai/approval/_apply.py +6 -9
inspect_ai/approval/_approver.py +3 -3
inspect_ai/approval/_auto.py +2 -2
inspect_ai/approval/_call.py +20 -4
inspect_ai/approval/_human/approver.py +3 -3
inspect_ai/approval/_human/manager.py +2 -2
inspect_ai/approval/_human/panel.py +3 -3
inspect_ai/approval/_policy.py +3 -3
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_log.py +23 -2
inspect_ai/log/_model.py +58 -0
inspect_ai/log/_recorders/file.py +14 -3
inspect_ai/log/_transcript.py +3 -0
inspect_ai/model/__init__.py +2 -0
inspect_ai/model/_call_tools.py +15 -2
inspect_ai/model/_model.py +49 -3
inspect_ai/model/_openai.py +151 -21
inspect_ai/model/_providers/anthropic.py +25 -14
inspect_ai/model/_providers/bedrock.py +3 -3
inspect_ai/model/_providers/cloudflare.py +29 -108
inspect_ai/model/_providers/google.py +21 -10
inspect_ai/model/_providers/grok.py +23 -17
inspect_ai/model/_providers/groq.py +61 -37
inspect_ai/model/_providers/llama_cpp_python.py +8 -9
inspect_ai/model/_providers/mistral.py +8 -3
inspect_ai/model/_providers/ollama.py +8 -9
inspect_ai/model/_providers/openai.py +53 -157
inspect_ai/model/_providers/openai_compatible.py +195 -0
inspect_ai/model/_providers/openrouter.py +4 -15
inspect_ai/model/_providers/providers.py +11 -0
inspect_ai/model/_providers/together.py +25 -23
inspect_ai/model/_trim.py +83 -0
inspect_ai/solver/_plan.py +5 -3
inspect_ai/tool/_tool_call.py +3 -0
inspect_ai/tool/_tool_def.py +8 -2
inspect_ai/util/__init__.py +3 -0
inspect_ai/util/_concurrency.py +15 -2
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
inspect_ai/_eval/task/rundir.py +0 -78
inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0

inspect_ai/_eval/task/task.py CHANGED Viewed

@@ -54,6 +54,7 @@ class Task:
         metrics: list[Metric] | dict[str, list[Metric]] | None = None,
         model: str | Model | None = None,
         config: GenerateConfig = GenerateConfig(),
+        model_roles: dict[str, str | Model] | None = None,
         sandbox: SandboxEnvironmentType | None = None,
         approval: str | list[ApprovalPolicy] | None = None,
         epochs: int | Epochs | None = None,
@@ -79,7 +80,8 @@ class Task:
             scorer: Scorer used to evaluate model output.
             metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
             model: Default model for task (Optional, defaults to eval model).
-            config: Model generation config.
+            config: Model generation config for default model (does not apply to model roles)
+            model_roles: Named roles for use in `get_model()`.
             sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
             approval: Tool use approval policies.
                 Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
@@ -136,6 +138,7 @@ class Task:
         self.metrics = metrics
         self.model = resolve_model(model)
         self.config = config
+        self.model_roles = resolve_model_roles(model_roles)
         self.sandbox = resolve_sandbox_environment(sandbox)
         self.approval = resolve_approval(approval)
         epochs = resolve_epochs(epochs)
@@ -185,6 +188,7 @@ def task_with(
     metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
     model: str | Model | NotGiven = NOT_GIVEN,
     config: GenerateConfig | NotGiven = NOT_GIVEN,
+    model_roles: dict[str, str | Model] | NotGiven = NOT_GIVEN,
     sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
     approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
     epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
@@ -214,7 +218,8 @@ def task_with(
         scorer: Scorer used to evaluate model output.
         metrics: Alternative metrics (overrides the metrics provided by the specified scorer).
         model: Default model for task (Optional, defaults to eval model).
-        config: Model generation config.
+        config: Model generation config for default model (does not apply to model roles)
+        model_roles: Named roles for use in `get_model()`.
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies. Defaults to no approval policy.
@@ -257,6 +262,8 @@ def task_with(
         task.model = resolve_model(model)
     if not isinstance(config, NotGiven):
         task.config = config
+    if not isinstance(model_roles, NotGiven):
+        task.model_roles = resolve_model_roles(model_roles)
     if not isinstance(sandbox, NotGiven):
         task.sandbox = resolve_sandbox_environment(sandbox)
     if not isinstance(approval, NotGiven):
@@ -315,6 +322,7 @@ class PreviousTask:
     task: str | Task
     task_args: dict[str, Any]
     model: Model | None
+    model_roles: dict[str, Model] | None
     log: EvalLog
@@ -365,6 +373,21 @@ def resolve_model(model: str | Model | None) -> Model | None:
         return model
+def resolve_model_roles(
+    model_roles: dict[str, str | Model] | None,
+) -> dict[str, Model] | None:
+    if model_roles is not None:
+        resolved_model_roles = {
+            k: get_model(v, memoize=False) if isinstance(v, str) else v
+            for k, v in model_roles.items()
+        }
+        for k, v in resolved_model_roles.items():
+            v._set_role(k)
+        return resolved_model_roles
+    else:
+        return None
 def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
     return (
         scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None

inspect_ai/_eval/task/util.py CHANGED Viewed

@@ -25,13 +25,6 @@ def task_run_dir(task: Task) -> str:
     return getattr(task, TASK_RUN_DIR_ATTR, os.getcwd())
-def task_chdir(task: Task) -> str | None:
-    if task.attribs.get("chdir", False) is True:
-        return task_run_dir(task)
-    else:
-        return None
 def task_file(task: Task, relative: bool = False) -> str | None:
     file = cast(str | None, getattr(task, TASK_FILE_ATTR, None))
     if file:
@@ -46,7 +39,7 @@ def task_file(task: Task, relative: bool = False) -> str | None:
 def slice_dataset(
     dataset: Dataset,
     limit: int | tuple[int, int] | None,
-    sample_id: str | int | list[str | int] | None,
+    sample_id: str | int | list[str] | list[int] | list[str | int] | None,
 ) -> Dataset:
     def normalise(id: str | int | None) -> str:
         if isinstance(id, str) and id.isdigit():

inspect_ai/_util/constants.py CHANGED Viewed

@@ -38,6 +38,7 @@ CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
 SANDBOX_SETUP_TIMEOUT = 300
 NO_CONTENT = "(no content)"
+MODEL_NONE = "none/none"
 DESERIALIZING = "deserializing"
 DESERIALIZING_CONTEXT = {DESERIALIZING: True}

inspect_ai/_util/json.py CHANGED Viewed

@@ -93,9 +93,14 @@ def json_changes(
                 replaced = before
                 for path in paths:
                     decoded_path = decode_json_pointer_segment(path)
-                    index: Any = (
-                        int(decoded_path) if decoded_path.isnumeric() else decoded_path
-                    )
+                    if isinstance(replaced, list):
+                        if not decoded_path.isnumeric():
+                            raise ValueError(
+                                f"Invalid JSON Pointer segment for list: {decoded_path}"
+                            )
+                        index = int(decoded_path)
+                    else:
+                        index = decoded_path
                     replaced = replaced[index]
                 json_change.replaced = replaced
             changes.append(json_change)

inspect_ai/_util/registry.py CHANGED Viewed

@@ -14,18 +14,24 @@ from .entrypoints import ensure_entry_points
 obj_type = type
 RegistryType = Literal[
-    "modelapi",
     "task",
     "solver",
-    "plan",
+    "agent",
+    "tool",
     "scorer",
     "metric",
-    "tool",
-    "agent",
-    "sandboxenv",
     "score_reducer",
+    "modelapi",
+    "sandboxenv",
     "approver",
 ]
+"""Enumeration of registry object types.
+These are the types of objects in this system that can be
+registered using a decorator (e.g. `@task`, `@solver`).
+Registered objects can in turn be created dynamically using
+the `registry_create()` function.
+"""
 class RegistryInfo(BaseModel):
@@ -181,17 +187,28 @@ def registry_find(predicate: Callable[[RegistryInfo], bool]) -> list[object]:
 def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
     r"""Create a registry object.
-    Registry objects can be ordinary functions that implement a protocol,
-    factory functions that return a function based on **kwargs, or classes
-    deriving that can be created using **kwargs
+    Creates objects registered via decorator (e.g. `@task`, `@solver`). Note
+    that this can also create registered objects within Python packages, in
+    which case the name of the package should be used a prefix, e.g.
+    ```python
+    registry_create("scorer", "mypackage/myscorer", ...)
+    ```
+    Object within the Inspect package do not require a prefix, nor do
+    objects from imported modules that aren't in a package.
     Args:
-        type (RegistryType): Type of registry object to create
-        name (str): Name of registry options to create
-        **kwargs (Any): Optional creation arguments
+        type: Type of registry object to create
+        name: Name of registry object to create
+        **kwargs: Optional creation arguments
     Returns:
-        Registry object with registry info attribute
+        Instance of specified name and type.
+    Raises:
+        LookupError: If the named object was not found in the registry.
+        TypeError: If the specified parameters are not valid for the object.
     """
     # lookup the object
     obj = registry_lookup(type, name)
@@ -225,7 +242,7 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
         else:
             return obj
     else:
-        raise ValueError(f"{name} was not found in the registry")
+        raise LookupError(f"{name} was not found in the registry")
 def registry_info(o: object) -> RegistryInfo:

inspect_ai/_view/www/App.css CHANGED Viewed

@@ -31,6 +31,7 @@
   --inspect-font-size-base: 0.9rem;
   --inspect-font-size-small: 0.8rem;
   --inspect-font-size-smaller: 0.8rem;
+  --inspect-font-size-smallest: 0.7rem;
   /* Inspect Glass */
   --inspect-glass-color: #000000;
@@ -113,6 +114,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-smaller);
 }
+.text-size-smallest {
+  font-size: var(--inspect-font-size-smallest);
+}
 .text-truncate {
   white-space: nowrap;
   text-overflow: ellipsis;

inspect_ai/_view/www/dist/assets/index.css CHANGED Viewed

@@ -14304,6 +14304,7 @@ pre[class*="language-"] {
   --inspect-font-size-base: 0.9rem;
   --inspect-font-size-small: 0.8rem;
   --inspect-font-size-smaller: 0.8rem;
+  --inspect-font-size-smallest: 0.7rem;
   /* Inspect Glass */
   --inspect-glass-color: #000000;
@@ -14386,6 +14387,10 @@ body[class^="vscode-"] .app-main-grid {
   font-size: var(--inspect-font-size-smaller);
 }
+.text-size-smallest {
+  font-size: var(--inspect-font-size-smallest);
+}
 .text-truncate {
   white-space: nowrap;
   text-overflow: ellipsis;
@@ -15934,37 +15939,37 @@ ul.jsondiffpatch-textdiff {
 ._number_140x5_7 {
   margin-top: 0.1em;
 }
-._table_1memb_1 {
+._table_9qith_1 {
   padding-left: 0;
   margin-left: 0;
   margin-bottom: 0.2rem;
 }
-._th_1memb_7 {
+._th_9qith_7 {
   padding: 0;
 }
-._cell_1memb_11 {
-  padding: 0.3em 0.3em 0.3em 0em;
+._cell_9qith_11 {
+  padding: 0em 0.5em 0.3em 0em !important;
 }
-._compact_1memb_15 ._cell_1memb_11 {
+._compact_9qith_15 ._cell_9qith_11 {
   padding: 0;
 }
-._cellKey_1memb_19 {
+._cellKey_9qith_19 {
   font-weight: 400;
   padding-right: 1em;
   white-space: nowrap;
 }
-._compact_1memb_15 ._cellKey_1memb_19 {
+._compact_9qith_15 ._cellKey_9qith_19 {
   font-weight: 400;
   padding-right: 0.2em;
   white-space: nowrap;
 }
-._cellValue_1memb_31 {
+._cellValue_9qith_31 {
   font-weight: 300;
   white-space: pre-wrap;
   word-wrap: anywhere;
@@ -19645,6 +19650,22 @@ span.ap-marker-container:hover span.ap-marker {
 ._text_1yknn_20 {
   margin-top: -2px;
 }
+._container_304w9_1 {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+  row-gap: 2em;
+  column-gap: 1em;
+}
+._modelInfo_304w9_8 {
+  display: grid;
+  grid-template-columns: max-content auto;
+  column-gap: 1em;
+}
+._role_304w9_14 {
+  grid-column: -1/1;
+}
 ._item_1uzhd_1 {
   margin-bottom: 0em;
 }
@@ -19808,55 +19829,71 @@ span.ap-marker-container:hover span.ap-marker {
 ._copyButton_1goi8_1:hover {
   opacity: 0.75;
 }
-._container_14b93_1 {
+._container_q17yq_1 {
+  display: flex;
+  flex-direction: row;
+  flex-wrap: wrap;
+  gap: 0;
+  margin-top: -0.2rem;
+  margin-bottom: 0.2rem;
+}
+._grid_q17yq_10 {
+  display: grid;
+  grid-template-rows: repeat(auto-fill, minmax(10px, 1fr));
+  grid-template-columns: 1fr;
+  gap: 0.1em;
+  padding-right: 1em;
+}
+._container_291sb_1 {
   display: flex;
   padding-top: 0;
   margin-left: 0.5rem;
   min-width: 250px;
 }
-._wrapper_14b93_8 {
+._wrapper_291sb_8 {
   display: grid;
   grid-template-columns: minmax(auto, 1fr) 1fr;
   width: 100%;
 }
-._toggle_14b93_14 {
+._toggle_291sb_14 {
   padding: 0rem 0.1rem 0.1rem 0rem;
   display: flex;
 }
-._body_14b93_19 {
+._body_291sb_19 {
   display: flex;
   flex-direction: column;
   margin-left: 0.2rem;
 }
-._bodyContainer_14b93_25 {
+._bodyContainer_291sb_25 {
   margin-top: 0.1rem;
   display: grid;
   grid-template-columns: minmax(30px, max-content) minmax(100px, max-content);
 }
-._taskTitle_14b93_31 {
+._taskTitle_291sb_31 {
   font-weight: 600;
   margin-right: 0.3rem;
 }
-._taskModel_14b93_36 {
+._taskModel_291sb_36 {
   padding-top: 0.4rem;
 }
-._taskStatus_14b93_40 {
+._taskStatus_291sb_40 {
   display: flex;
   justify-content: end;
   margin-right: 1em;
   margin-bottom: 0;
 }
-._secondaryContainer_14b93_47 {
+._secondaryContainer_291sb_47 {
   opacity: 0.7;
-  margin-top: 0.1rem;
+  margin-top: -0.1rem;
   padding-bottom: 0;
   display: grid;
   grid-template-columns: minmax(0, max-content) max-content;

inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl

inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl