inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -41,36 +41,36 @@ def hf_dataset(
     `datasets` package, including remote datasets on Hugging Face Hub.

     Args:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        path: Path or name of the dataset. Depending on path, the dataset
+            builder that is used comes from a generic dataset script (JSON, CSV,
+            Parquet, text etc.) or from the dataset script (a python file) inside
+            the dataset directory.
+        split: Which split of the data to load.
+        name: Name of the dataset configuration.
+        data_dir: data_dir of the dataset configuration
+            to read data from.
+        revision: Specific revision to load (e.g. "main", a branch
+            name, or a specific commit SHA). When using `revision` the `cached` option
+            is ignored and datasets are revalidated on Hugging Face before loading.
+        sample_fields: Method of mapping underlying
+            fields in the data source to Sample objects. Pass `None` if the data is already
+            stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
+            `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
             handle mapping with a custom function that returns one or more samples.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        auto_id: Assign an auto-incrementing ID for each sample.
+        shuffle: Randomly shuffle the dataset order.
+        seed: Seed used for random shuffle.
+        shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
+        limit: Limit the number of records to read.
+        trust: Whether or not to allow for datasets defined on the Hub
+            using a dataset script. This option should only be set to True for
+            repositories you trust and in which you have read the code, as it
+            will execute code present on the Hub on your local machine.
+        cached: By default, datasets are read once from HuggingFace
+            Hub and then cached for future reads. Pass `cached=False` to force
+            re-reading the dataset from Hugging Face. Ignored when the `revision`
+            option is specified.
+        **kwargs (dict[str, Any]): Additional arguments to pass through to the
             `load_dataset` function of the `datasets` package.

     Returns:
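The rewritten `hf_dataset` docstring above documents, among other things, the new interaction between `revision` and `cached`. A minimal usage sketch, assuming a hypothetical Hub dataset id and column names:

from inspect_ai.dataset import FieldSpec, hf_dataset

dataset = hf_dataset(
    "my-org/my-dataset",  # hypothetical dataset id
    split="test",
    sample_fields=FieldSpec(input="question", target="answer"),  # hypothetical columns
    revision="main",  # per the docstring, `cached` is ignored when revision is set
    limit=100,
)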
inspect_ai/dataset/_sources/json.py
CHANGED
@@ -39,23 +39,23 @@ def json_dataset(
     the `sample_fields` argument.

     Args:
-        json_file
+        json_file: Path to JSON file. Can be a local filesystem path or
            a path to an S3 bucket (e.g. "s3://my-bucket"). Use `fs_options`
            to pass arguments through to the `S3FileSystem` constructor.
-        sample_fields
+        sample_fields: Method of mapping underlying
            fields in the data source to `Sample` objects. Pass `None` if the data is already
            stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a
            `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
            handle mapping with a custom function that returns one or more samples.
-        auto_id
-        shuffle
-        seed:
-        shuffle_choices:
-        limit
-        encoding
-        name
+        auto_id: Assign an auto-incrementing ID for each sample.
+        shuffle: Randomly shuffle the dataset order.
+        seed: Seed used for random shuffle.
+        shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
+        limit: Limit the number of records to read.
+        encoding: Text encoding for file (defaults to "utf-8").
+        name: Optional name for dataset (for logging). If not specified,
            defaults to the stem of the filename.
-        fs_options
+        fs_options: Optional. Additional arguments to pass through
            to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
            if you are accessing a public S3 bucket with no credentials.

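A companion sketch for `json_dataset`, reading JSON Lines from a public S3 bucket with anonymous access (bucket and key are hypothetical):

from inspect_ai.dataset import FieldSpec, json_dataset

dataset = json_dataset(
    "s3://my-bucket/samples.jsonl",  # hypothetical bucket/key
    sample_fields=FieldSpec(input="input", target="target"),
    fs_options={"anon": True},  # public bucket, no credentials
    shuffle=True,
    seed=42,
)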
inspect_ai/log/__init__.py
CHANGED
@@ -22,6 +22,7 @@ from ._log import (
     EvalResults,
     EvalRevision,
     EvalSample,
+    EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
     EvalScore,
@@ -61,6 +62,7 @@ __all__ = [
     "EvalResults",
     "EvalRevision",
     "EvalSample",
+    "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
     "EvalScore",
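With `EvalSampleLimit` now exported, sample termination reasons can be inspected using only public types. A sketch (log path hypothetical):

from inspect_ai.log import EvalSampleLimit, read_eval_log

log = read_eval_log("logs/example.eval")  # hypothetical log file
for sample in log.samples or []:
    limit: EvalSampleLimit | None = sample.limit
    if limit is not None:
        print(f"sample {sample.id} stopped by a {limit.type} limit")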
inspect_ai/log/_convert.py
CHANGED
@@ -20,12 +20,12 @@ def convert_eval_logs(

     Args:
         path (str): Path to source log file(s). Should be either a single
-
+            log file or a directory containing log files.
         to (Literal["eval", "json"]): Format to convert to. If a file is
-
+            already in the target format it will just be copied to the output dir.
         output_dir (str): Output directory to write converted log file(s) to.
         overwrite (bool): Overwrite existing log files (defaults to `False`,
-
+            raising an error if the output file path already exists).
     """
     from inspect_ai._display import display

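A sketch of the call documented above, assuming `convert_eval_logs` is importable from `inspect_ai.log` and using hypothetical paths:

from inspect_ai.log import convert_eval_logs

# convert JSON logs to the binary eval format; per the docstring, files
# already in the target format are copied to the output dir unchanged
convert_eval_logs("./logs", to="eval", output_dir="./logs-eval", overwrite=False)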
inspect_ai/log/_file.py
CHANGED
@@ -3,6 +3,7 @@ import re
 from logging import getLogger
 from typing import Any, Callable, Generator, Literal, cast

+from pydantic import BaseModel
 from pydantic_core import to_json

 from inspect_ai._util._async import run_coroutine
@@ -22,7 +23,21 @@ from ._recorders import recorder_type_for_format, recorder_type_for_location
 logger = getLogger(__name__)


-class EvalLogInfo(
+class EvalLogInfo(BaseModel):
+    """File info and task identifiers for eval log."""
+
+    name: str
+    """Name of file."""
+
+    type: str
+    """Type of file (file or directory)"""
+
+    size: int
+    """File size in bytes."""
+
+    mtime: float | None
+    """File modification time (None if the file is a directory on S3)."""
+
     task: str
     """Task name."""

@@ -231,7 +246,7 @@ def write_log_dir_manifest(


 def read_eval_log(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -241,7 +256,7 @@ def read_eval_log(
     Args:
         log_file (str | FileInfo): Log file to read.
         header_only (bool): Read only the header (i.e. exclude
-
+            the "samples" and "logging" fields). Defaults to False.
         resolve_attachments (bool): Resolve attachments (e.g. images)
            to their full content.
         format (Literal["eval", "json", "auto"]): Read from format
@@ -256,7 +271,7 @@ def read_eval_log(


 async def read_eval_log_async(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -304,13 +319,13 @@ async def read_eval_log_async(


 def read_eval_log_headers(
-    log_files: list[str] | list[
+    log_files: list[str] | list[EvalLogInfo],
 ) -> list[EvalLog]:
     return run_coroutine(read_eval_log_headers_async(log_files))


 async def read_eval_log_headers_async(
-    log_files: list[str] | list[
+    log_files: list[str] | list[EvalLogInfo],
 ) -> list[EvalLog]:
     return [
         await read_eval_log_async(log_file, header_only=True) for log_file in log_files
@@ -318,7 +333,7 @@ async def read_eval_log_headers_async(


 def read_eval_log_sample(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -347,7 +362,7 @@ def read_eval_log_sample(


 async def read_eval_log_sample_async(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -386,7 +401,7 @@ async def read_eval_log_sample_async(


 def read_eval_log_samples(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     all_samples_required: bool = True,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
inspect_ai/log/_log.py
CHANGED
@@ -4,11 +4,17 @@ import sys
 import traceback
 from logging import getLogger
 from types import TracebackType
-from typing import Any, Literal, Type
+from typing import Any, Literal, Type, TypedDict

 import click
 import tenacity
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    PrivateAttr,
+    model_validator,
+)
 from rich.console import Console, RenderableType
 from rich.traceback import Traceback

@@ -30,7 +36,31 @@ logger = getLogger(__name__)
 SCORER_PLACEHOLDER = "88F74D2C"


+class EvalConfigDefaults(TypedDict):
+    epochs: int
+    epochs_reducer: list[str]
+    fail_on_error: bool
+    sandbox_cleanup: bool
+    log_samples: bool
+    log_images: bool
+    score_display: bool
+
+
+def eval_config_defaults() -> EvalConfigDefaults:
+    return {
+        "epochs": 1,
+        "epochs_reducer": ["mean"],
+        "fail_on_error": True,
+        "sandbox_cleanup": True,
+        "log_samples": True,
+        "log_images": True,
+        "score_display": True,
+    }
+
+
 class EvalConfig(BaseModel):
+    """Configuration used for evaluation."""
+
     limit: int | tuple[int, int] | None = Field(default=None)
     """Sample limit (number of samples or range of samples)."""

@@ -109,6 +139,8 @@ class EvalConfig(BaseModel):


 class EvalSampleLimit(BaseModel):
+    """Limit encontered by sample."""
+
     type: Literal["context", "time", "message", "token", "operator", "custom"]
     """The type of limit"""

@@ -117,6 +149,8 @@ class EvalSampleLimit(BaseModel):


 class EvalSample(BaseModel):
+    """Sample from evaluation task."""
+
     id: int | str
     """Unique id for sample."""

@@ -191,7 +225,7 @@ class EvalSample(BaseModel):
     """Attachments referenced from messages and events.

     Resolve attachments for a sample (replacing attachment://* references with
-    attachment content)
+    attachment content) by passing `resolve_attachments=True` to log reading functions.
     """

     limit: EvalSampleLimit | None = Field(default=None)
@@ -262,6 +296,8 @@ class EvalEvents(BaseModel):


 class EvalPlanStep(BaseModel):
+    """Solver step."""
+
     solver: str
     """Name of solver."""

@@ -270,6 +306,8 @@ class EvalPlanStep(BaseModel):


 class EvalPlan(BaseModel):
+    """Plan (solvers) used in evaluation."""
+
     name: str = Field(default="plan")
     """Plan name."""

@@ -284,20 +322,24 @@ class EvalPlan(BaseModel):


 class EvalMetric(BaseModel):
+    """Metric for evaluation score."""
+
     name: str
     """Metric name."""

     value: int | float
     """Metric value."""

-
-    """
+    params: dict[str, Any] = Field(default_factory=dict)
+    """Params specified when creating metric."""

     metadata: dict[str, Any] | None = Field(default=None)
     """Additional metadata associated with metric."""


 class EvalScore(BaseModel):
+    """Score for evaluation task."""
+
     name: str
     """Score name."""

@@ -318,10 +360,15 @@ class EvalScore(BaseModel):


 class EvalSampleScore(Score):
+    """Score and sample_id scored."""
+
     sample_id: str | int | None = Field(default=None)
+    """Sample ID."""


 class EvalSampleReductions(BaseModel):
+    """Score reductions."""
+
     scorer: str
     """Name the of scorer"""

@@ -333,6 +380,8 @@ class EvalSampleReductions(BaseModel):


 class EvalResults(BaseModel):
+    """Scoring results from evaluation."""
+
     total_samples: int = Field(default=0)
     """Total samples in eval (dataset samples * epochs)"""

@@ -415,6 +464,8 @@ class EvalResults(BaseModel):


 class EvalDataset(BaseModel):
+    """Dataset used for evaluation."""
+
     name: str | None = Field(default=None)
     """Dataset name."""

@@ -431,7 +482,33 @@ class EvalDataset(BaseModel):
     """Was the dataset shuffled after reading."""


+class EvalMetricDefinition(BaseModel):
+    name: str
+    """Metric name"""
+
+    options: dict[str, Any] | None = Field(default=None)
+
+
+class EvalScorer(BaseModel):
+    name: str
+    """Scorer name"""
+
+    options: dict[str, Any] | None = Field(default=None)
+    """Scorer arguments"""
+
+    metrics: (
+        list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
+        | dict[str, list[EvalMetricDefinition]]
+        | None
+    ) = Field(default=None)
+
+    metadata: dict[str, Any] | None = Field(default=None)
+    """Scorer metadata"""
+
+
 class EvalRevision(BaseModel):
+    """Git revision for evaluation."""
+
     type: Literal["git"]
     """Type of revision (currently only "git")"""

@@ -443,6 +520,8 @@ class EvalRevision(BaseModel):


 class EvalSpec(BaseModel):
+    """Eval target and configuration."""
+
     run_id: str = Field(default_factory=str)
     """Unique run id"""

@@ -503,6 +582,14 @@ class EvalSpec(BaseModel):
     metadata: dict[str, Any] | None = Field(default=None)
     """Additional eval metadata."""

+    scorers: list[EvalScorer] | None = Field(default=None)
+    """Scorers and args for this eval"""
+
+    metrics: (
+        list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None
+    ) = Field(default=None)
+    """metrics and args for this eval"""
+
     # allow field model_args
     model_config = ConfigDict(protected_namespaces=())

@@ -546,6 +633,8 @@ def rich_traceback(


 class EvalStats(BaseModel):
+    """Timing and usage statistics."""
+
     started_at: str = Field(default_factory=str)
     """Evaluation start time."""

@@ -560,6 +649,8 @@ class EvalStats(BaseModel):


 class EvalLog(BaseModel):
+    """Evaluation log."""
+
     # WARNING: The order of these fields is important for the log file format.
     # Do not change the order of these fields without incrementing the version number,
     # updating the log file read/write functionality (such as read_eval_log),
@@ -575,13 +666,13 @@ class EvalLog(BaseModel):
     eval: EvalSpec
     """Eval identity and configuration."""

-    plan: EvalPlan = Field(
+    plan: EvalPlan = Field(default_factory=EvalPlan)
     """Eval plan (solvers and config)"""

     results: EvalResults | None = None
     """Eval results (scores and metrics)."""

-    stats: EvalStats = Field(
+    stats: EvalStats = Field(default_factory=EvalStats)
     """Eval stats (runtime, model usage)"""

     error: EvalError | None = Field(default=None)
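One user-visible effect of the new `EvalMetric.params` field is that metric arguments survive into the log. A sketch of reading them back (log path hypothetical):

from inspect_ai.log import read_eval_log

log = read_eval_log("logs/example.eval")  # hypothetical log file
if log.results is not None:
    for score in log.results.scores:
        for metric in score.metrics.values():
            # params is new in this release: arguments the metric was created with
            print(score.name, metric.name, metric.value, metric.params)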
inspect_ai/log/_message.py
CHANGED
@@ -11,6 +11,8 @@ LoggingLevel = Literal[


 class LoggingMessage(BaseModel):
+    """Message written to Python log."""
+
     name: str | None = Field(default=None)
     """Logger name (e.g. 'httpx')"""

@@ -33,7 +35,7 @@ class LoggingMessage(BaseModel):
     """Logged from line number."""

     @staticmethod
-    def
+    def _from_log_record(record: LogRecord) -> "LoggingMessage":
         """Create a LoggingMesssage from a LogRecord.

         Args:
inspect_ai/log/_recorders/recorder.py
CHANGED
@@ -21,6 +21,9 @@ class Recorder(abc.ABC):
     @abc.abstractmethod
     def default_log_buffer(self) -> int: ...

+    @abc.abstractmethod
+    def is_writeable(self) -> bool: ...
+
     @abc.abstractmethod
     async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...

inspect_ai/log/_transcript.py
CHANGED
@@ -167,7 +167,7 @@ class ToolEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for tool."""

-    def
+    def _set_result(
         self,
         result: ToolResult,
         truncated: tuple[int, int] | None,
@@ -182,11 +182,11 @@ class ToolEvent(BaseEvent):

     # mechanism for operator to cancel the tool call

-    def
+    def _set_task(self, task: asyncio.Task[Any]) -> None:
         """Set the tool task (for possible cancellation)"""
         self._task = task

-    def
+    def _cancel(self) -> None:
         """Cancel the tool task."""
         if self._task:
             self._cancelled = True
@@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
     event: Literal["info"] = Field(default="info")
     """Event type."""

+    source: str | None = Field(default=None)
+    """Optional source for info event."""
+
     data: JsonValue
     """Data provided with event."""

@@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):


 class ScoreEvent(BaseEvent):
-    """Event with
+    """Event with score.
+
+    Can be the final score for a `Sample`, or can be an intermediate score
+    resulting from a call to `score`.
+    """

     event: Literal["score"] = Field(default="score")
     """Event type."""

     score: Score
-    """
+    """Score value."""

     target: str | list[str] | None = Field(default=None)
     """"Sample target."""

+    intermediate: bool = Field(default=False)
+    """Was this an intermediate scoring?"""
+

 class StepEvent(BaseEvent):
     """Step within current sample or subtask."""
@@ -355,13 +365,14 @@ class Transcript:
         self.name = name
         self._events: list[Event] = []

-    def info(self, data: JsonValue) -> None:
+    def info(self, data: JsonValue, *, source: str | None = None) -> None:
         """Add an `InfoEvent` to the transcript.

         Args:
-            data
+            data: Data associated with the event.
+            source: Optional event source.
         """
-        self._event(InfoEvent(data=data))
+        self._event(InfoEvent(source=source, data=data))

     @contextlib.contextmanager
     def step(self, name: str, type: str | None = None) -> Iterator[None]:
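The new `source` parameter on `Transcript.info` flows through to the `InfoEvent.source` field. A sketch of tagging info events from a custom solver (the solver itself is hypothetical):

from inspect_ai.log import transcript
from inspect_ai.solver import Generate, TaskState, solver

@solver
def noted_solver():
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # source tags the resulting InfoEvent with its origin
        transcript().info("starting generation", source="noted_solver")
        return await generate(state)

    return solve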
inspect_ai/model/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from ._call_tools import call_tools
 from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
+    ChatMessageBase,
     ChatMessageSystem,
     ChatMessageTool,
     ChatMessageUser,
@@ -54,6 +55,7 @@ __all__ = [
     "ContentVideo",
     "Content",
     "ChatMessage",
+    "ChatMessageBase",
     "ChatMessageSystem",
     "ChatMessageUser",
     "ChatMessageAssistant",
"ChatMessageAssistant",
|