inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -41,36 +41,36 @@ def hf_dataset(
     `datasets` package, including remote datasets on Hugging Face Hub.

     Args:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        path: Path or name of the dataset. Depending on path, the dataset
+            builder that is used comes from a generic dataset script (JSON, CSV,
+            Parquet, text etc.) or from the dataset script (a python file) inside
+            the dataset directory.
+        split: Which split of the data to load.
+        name: Name of the dataset configuration.
+        data_dir: data_dir of the dataset configuration
+            to read data from.
+        revision: Specific revision to load (e.g. "main", a branch
+            name, or a specific commit SHA). When using `revision` the `cached` option
+            is ignored and datasets are revalidated on Hugging Face before loading.
+        sample_fields: Method of mapping underlying
+            fields in the data source to Sample objects. Pass `None` if the data is already
+            stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
+            `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
             handle mapping with a custom function that returns one or more samples.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        auto_id: Assign an auto-incrementing ID for each sample.
+        shuffle: Randomly shuffle the dataset order.
+        seed: Seed used for random shuffle.
+        shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
+        limit: Limit the number of records to read.
+        trust: Whether or not to allow for datasets defined on the Hub
+            using a dataset script. This option should only be set to True for
+            repositories you trust and in which you have read the code, as it
+            will execute code present on the Hub on your local machine.
+        cached: By default, datasets are read once from HuggingFace
+            Hub and then cached for future reads. Pass `cached=False` to force
+            re-reading the dataset from Hugging Face. Ignored when the `revision`
+            option is specified.
+        **kwargs (dict[str, Any]): Additional arguments to pass through to the
             `load_dataset` function of the `datasets` package.

     Returns:
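The rewritten `hf_dataset` docstring above documents, among other things, the new interaction between `revision` and `cached`. A minimal usage sketch, assuming a hypothetical Hub dataset id and column names:

from inspect_ai.dataset import FieldSpec, hf_dataset

dataset = hf_dataset(
    "my-org/my-dataset",  # hypothetical dataset id
    split="test",
    sample_fields=FieldSpec(input="question", target="answer"),  # hypothetical columns
    revision="main",  # per the docstring, `cached` is ignored when revision is set
    limit=100,
)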
inspect_ai/dataset/_sources/json.py
CHANGED
@@ -39,23 +39,23 @@ def json_dataset(
     the `sample_fields` argument.

     Args:
-        json_file
+        json_file: Path to JSON file. Can be a local filesystem path or
            a path to an S3 bucket (e.g. "s3://my-bucket"). Use `fs_options`
            to pass arguments through to the `S3FileSystem` constructor.
-        sample_fields
+        sample_fields: Method of mapping underlying
            fields in the data source to `Sample` objects. Pass `None` if the data is already
            stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a
            `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
            handle mapping with a custom function that returns one or more samples.
-        auto_id
-        shuffle
-        seed:
-        shuffle_choices:
-        limit
-        encoding
-        name
+        auto_id: Assign an auto-incrementing ID for each sample.
+        shuffle: Randomly shuffle the dataset order.
+        seed: Seed used for random shuffle.
+        shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
+        limit: Limit the number of records to read.
+        encoding: Text encoding for file (defaults to "utf-8").
+        name: Optional name for dataset (for logging). If not specified,
            defaults to the stem of the filename.
-        fs_options
+        fs_options: Optional. Additional arguments to pass through
            to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
            if you are accessing a public S3 bucket with no credentials.

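A companion sketch for `json_dataset`, reading JSON Lines from a public S3 bucket with anonymous access (bucket and key are hypothetical):

from inspect_ai.dataset import FieldSpec, json_dataset

dataset = json_dataset(
    "s3://my-bucket/samples.jsonl",  # hypothetical bucket/key
    sample_fields=FieldSpec(input="input", target="target"),
    fs_options={"anon": True},  # public bucket, no credentials
    shuffle=True,
    seed=42,
)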
inspect_ai/log/__init__.py
CHANGED
@@ -22,6 +22,7 @@ from ._log import (
     EvalResults,
     EvalRevision,
     EvalSample,
+    EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
     EvalScore,
@@ -61,6 +62,7 @@ __all__ = [
     "EvalResults",
     "EvalRevision",
     "EvalSample",
+    "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
     "EvalScore",
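With `EvalSampleLimit` now exported, sample termination reasons can be inspected using only public types. A sketch (log path hypothetical):

from inspect_ai.log import EvalSampleLimit, read_eval_log

log = read_eval_log("logs/example.eval")  # hypothetical log file
for sample in log.samples or []:
    limit: EvalSampleLimit | None = sample.limit
    if limit is not None:
        print(f"sample {sample.id} stopped by a {limit.type} limit")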
inspect_ai/log/_convert.py
CHANGED
@@ -20,12 +20,12 @@ def convert_eval_logs(

     Args:
         path (str): Path to source log file(s). Should be either a single
-
+            log file or a directory containing log files.
         to (Literal["eval", "json"]): Format to convert to. If a file is
-
+            already in the target format it will just be copied to the output dir.
         output_dir (str): Output directory to write converted log file(s) to.
         overwrite (bool): Overwrite existing log files (defaults to `False`,
-
+            raising an error if the output file path already exists).
     """
     from inspect_ai._display import display

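A sketch of the call documented above, assuming `convert_eval_logs` is importable from `inspect_ai.log` and using hypothetical paths:

from inspect_ai.log import convert_eval_logs

# convert JSON logs to the binary eval format; per the docstring, files
# already in the target format are copied to the output dir unchanged
convert_eval_logs("./logs", to="eval", output_dir="./logs-eval", overwrite=False)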
inspect_ai/log/_file.py
CHANGED
@@ -3,6 +3,7 @@ import re
 from logging import getLogger
 from typing import Any, Callable, Generator, Literal, cast

+from pydantic import BaseModel
 from pydantic_core import to_json

 from inspect_ai._util._async import run_coroutine
@@ -22,7 +23,21 @@ from ._recorders import recorder_type_for_format, recorder_type_for_location
 logger = getLogger(__name__)


-class EvalLogInfo(
+class EvalLogInfo(BaseModel):
+    """File info and task identifiers for eval log."""
+
+    name: str
+    """Name of file."""
+
+    type: str
+    """Type of file (file or directory)"""
+
+    size: int
+    """File size in bytes."""
+
+    mtime: float | None
+    """File modification time (None if the file is a directory on S3)."""
+
     task: str
     """Task name."""

@@ -231,7 +246,7 @@ def write_log_dir_manifest(


 def read_eval_log(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -241,7 +256,7 @@ def read_eval_log(
     Args:
         log_file (str | FileInfo): Log file to read.
         header_only (bool): Read only the header (i.e. exclude
-
+            the "samples" and "logging" fields). Defaults to False.
         resolve_attachments (bool): Resolve attachments (e.g. images)
            to their full content.
         format (Literal["eval", "json", "auto"]): Read from format
@@ -256,7 +271,7 @@ def read_eval_log(


 async def read_eval_log_async(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     header_only: bool = False,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
@@ -304,13 +319,13 @@ async def read_eval_log_async(


 def read_eval_log_headers(
-    log_files: list[str] | list[
+    log_files: list[str] | list[EvalLogInfo],
 ) -> list[EvalLog]:
     return run_coroutine(read_eval_log_headers_async(log_files))


 async def read_eval_log_headers_async(
-    log_files: list[str] | list[
+    log_files: list[str] | list[EvalLogInfo],
 ) -> list[EvalLog]:
     return [
         await read_eval_log_async(log_file, header_only=True) for log_file in log_files
@@ -318,7 +333,7 @@ async def read_eval_log_headers_async(


 def read_eval_log_sample(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -347,7 +362,7 @@ def read_eval_log_sample(


 async def read_eval_log_sample_async(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     id: int | str,
     epoch: int = 1,
     resolve_attachments: bool = False,
@@ -386,7 +401,7 @@ async def read_eval_log_sample_async(


 def read_eval_log_samples(
-    log_file: str |
+    log_file: str | EvalLogInfo,
     all_samples_required: bool = True,
     resolve_attachments: bool = False,
     format: Literal["eval", "json", "auto"] = "auto",
inspect_ai/log/_log.py
CHANGED
@@ -4,11 +4,17 @@ import sys
 import traceback
 from logging import getLogger
 from types import TracebackType
-from typing import Any, Literal, Type
+from typing import Any, Literal, Type, TypedDict

 import click
 import tenacity
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    PrivateAttr,
+    model_validator,
+)
 from rich.console import Console, RenderableType
 from rich.traceback import Traceback

@@ -30,7 +36,31 @@ logger = getLogger(__name__)
 SCORER_PLACEHOLDER = "88F74D2C"


+class EvalConfigDefaults(TypedDict):
+    epochs: int
+    epochs_reducer: list[str]
+    fail_on_error: bool
+    sandbox_cleanup: bool
+    log_samples: bool
+    log_images: bool
+    score_display: bool
+
+
+def eval_config_defaults() -> EvalConfigDefaults:
+    return {
+        "epochs": 1,
+        "epochs_reducer": ["mean"],
+        "fail_on_error": True,
+        "sandbox_cleanup": True,
+        "log_samples": True,
+        "log_images": True,
+        "score_display": True,
+    }
+
+
 class EvalConfig(BaseModel):
+    """Configuration used for evaluation."""
+
     limit: int | tuple[int, int] | None = Field(default=None)
     """Sample limit (number of samples or range of samples)."""

@@ -109,6 +139,8 @@ class EvalConfig(BaseModel):


 class EvalSampleLimit(BaseModel):
+    """Limit encontered by sample."""
+
     type: Literal["context", "time", "message", "token", "operator", "custom"]
     """The type of limit"""

@@ -117,6 +149,8 @@ class EvalSampleLimit(BaseModel):


 class EvalSample(BaseModel):
+    """Sample from evaluation task."""
+
     id: int | str
     """Unique id for sample."""

@@ -191,7 +225,7 @@ class EvalSample(BaseModel):
     """Attachments referenced from messages and events.

     Resolve attachments for a sample (replacing attachment://* references with
-    attachment content)
+    attachment content) by passing `resolve_attachments=True` to log reading functions.
     """

     limit: EvalSampleLimit | None = Field(default=None)
@@ -262,6 +296,8 @@ class EvalEvents(BaseModel):


 class EvalPlanStep(BaseModel):
+    """Solver step."""
+
     solver: str
     """Name of solver."""

@@ -270,6 +306,8 @@ class EvalPlanStep(BaseModel):


 class EvalPlan(BaseModel):
+    """Plan (solvers) used in evaluation."""
+
     name: str = Field(default="plan")
     """Plan name."""

@@ -284,20 +322,24 @@ class EvalPlan(BaseModel):


 class EvalMetric(BaseModel):
+    """Metric for evaluation score."""
+
     name: str
     """Metric name."""

     value: int | float
     """Metric value."""

-
-    """
+    params: dict[str, Any] = Field(default_factory=dict)
+    """Params specified when creating metric."""

     metadata: dict[str, Any] | None = Field(default=None)
     """Additional metadata associated with metric."""


 class EvalScore(BaseModel):
+    """Score for evaluation task."""
+
     name: str
     """Score name."""

@@ -318,10 +360,15 @@ class EvalScore(BaseModel):


 class EvalSampleScore(Score):
+    """Score and sample_id scored."""
+
     sample_id: str | int | None = Field(default=None)
+    """Sample ID."""


 class EvalSampleReductions(BaseModel):
+    """Score reductions."""
+
     scorer: str
     """Name the of scorer"""

@@ -333,6 +380,8 @@ class EvalSampleReductions(BaseModel):


 class EvalResults(BaseModel):
+    """Scoring results from evaluation."""
+
     total_samples: int = Field(default=0)
     """Total samples in eval (dataset samples * epochs)"""

@@ -415,6 +464,8 @@ class EvalResults(BaseModel):


 class EvalDataset(BaseModel):
+    """Dataset used for evaluation."""
+
     name: str | None = Field(default=None)
     """Dataset name."""

@@ -431,7 +482,33 @@ class EvalDataset(BaseModel):
     """Was the dataset shuffled after reading."""


+class EvalMetricDefinition(BaseModel):
+    name: str
+    """Metric name"""
+
+    options: dict[str, Any] | None = Field(default=None)
+
+
+class EvalScorer(BaseModel):
+    name: str
+    """Scorer name"""
+
+    options: dict[str, Any] | None = Field(default=None)
+    """Scorer arguments"""
+
+    metrics: (
+        list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
+        | dict[str, list[EvalMetricDefinition]]
+        | None
+    ) = Field(default=None)
+
+    metadata: dict[str, Any] | None = Field(default=None)
+    """Scorer metadata"""
+
+
 class EvalRevision(BaseModel):
+    """Git revision for evaluation."""
+
     type: Literal["git"]
     """Type of revision (currently only "git")"""

@@ -443,6 +520,8 @@ class EvalRevision(BaseModel):


 class EvalSpec(BaseModel):
+    """Eval target and configuration."""
+
     run_id: str = Field(default_factory=str)
     """Unique run id"""

@@ -503,6 +582,14 @@ class EvalSpec(BaseModel):
     metadata: dict[str, Any] | None = Field(default=None)
     """Additional eval metadata."""

+    scorers: list[EvalScorer] | None = Field(default=None)
+    """Scorers and args for this eval"""
+
+    metrics: (
+        list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None
+    ) = Field(default=None)
+    """metrics and args for this eval"""
+
     # allow field model_args
     model_config = ConfigDict(protected_namespaces=())

@@ -546,6 +633,8 @@ def rich_traceback(


 class EvalStats(BaseModel):
+    """Timing and usage statistics."""
+
     started_at: str = Field(default_factory=str)
     """Evaluation start time."""

@@ -560,6 +649,8 @@ class EvalStats(BaseModel):


 class EvalLog(BaseModel):
+    """Evaluation log."""
+
     # WARNING: The order of these fields is important for the log file format.
     # Do not change the order of these fields without incrementing the version number,
     # updating the log file read/write functionality (such as read_eval_log),
@@ -575,13 +666,13 @@ class EvalLog(BaseModel):
     eval: EvalSpec
     """Eval identity and configuration."""

-    plan: EvalPlan = Field(
+    plan: EvalPlan = Field(default_factory=EvalPlan)
     """Eval plan (solvers and config)"""

     results: EvalResults | None = None
     """Eval results (scores and metrics)."""

-    stats: EvalStats = Field(
+    stats: EvalStats = Field(default_factory=EvalStats)
     """Eval stats (runtime, model usage)"""

     error: EvalError | None = Field(default=None)
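One user-visible effect of the new `EvalMetric.params` field is that metric arguments survive into the log. A sketch of reading them back (log path hypothetical):

from inspect_ai.log import read_eval_log

log = read_eval_log("logs/example.eval")  # hypothetical log file
if log.results is not None:
    for score in log.results.scores:
        for metric in score.metrics.values():
            # params is new in this release: arguments the metric was created with
            print(score.name, metric.name, metric.value, metric.params)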
inspect_ai/log/_message.py
CHANGED
@@ -11,6 +11,8 @@ LoggingLevel = Literal[


 class LoggingMessage(BaseModel):
+    """Message written to Python log."""
+
     name: str | None = Field(default=None)
     """Logger name (e.g. 'httpx')"""

@@ -33,7 +35,7 @@ class LoggingMessage(BaseModel):
     """Logged from line number."""

     @staticmethod
-    def
+    def _from_log_record(record: LogRecord) -> "LoggingMessage":
         """Create a LoggingMesssage from a LogRecord.

         Args:
inspect_ai/log/_recorders/recorder.py
CHANGED
@@ -21,6 +21,9 @@ class Recorder(abc.ABC):
     @abc.abstractmethod
     def default_log_buffer(self) -> int: ...

+    @abc.abstractmethod
+    def is_writeable(self) -> bool: ...
+
     @abc.abstractmethod
     async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...

inspect_ai/log/_transcript.py
CHANGED
@@ -167,7 +167,7 @@ class ToolEvent(BaseEvent):
     events: list["Event"] = Field(default_factory=list)
     """Transcript of events for tool."""

-    def
+    def _set_result(
         self,
         result: ToolResult,
         truncated: tuple[int, int] | None,
@@ -182,11 +182,11 @@ class ToolEvent(BaseEvent):

     # mechanism for operator to cancel the tool call

-    def
+    def _set_task(self, task: asyncio.Task[Any]) -> None:
         """Set the tool task (for possible cancellation)"""
         self._task = task

-    def
+    def _cancel(self) -> None:
         """Cancel the tool task."""
         if self._task:
             self._cancelled = True
@@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
     event: Literal["info"] = Field(default="info")
     """Event type."""

+    source: str | None = Field(default=None)
+    """Optional source for info event."""
+
     data: JsonValue
     """Data provided with event."""

@@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):


 class ScoreEvent(BaseEvent):
-    """Event with
+    """Event with score.
+
+    Can be the final score for a `Sample`, or can be an intermediate score
+    resulting from a call to `score`.
+    """

     event: Literal["score"] = Field(default="score")
     """Event type."""

     score: Score
-    """
+    """Score value."""

     target: str | list[str] | None = Field(default=None)
     """"Sample target."""

+    intermediate: bool = Field(default=False)
+    """Was this an intermediate scoring?"""
+

 class StepEvent(BaseEvent):
     """Step within current sample or subtask."""
@@ -355,13 +365,14 @@ class Transcript:
         self.name = name
         self._events: list[Event] = []

-    def info(self, data: JsonValue) -> None:
+    def info(self, data: JsonValue, *, source: str | None = None) -> None:
         """Add an `InfoEvent` to the transcript.

         Args:
-            data
+            data: Data associated with the event.
+            source: Optional event source.
         """
-        self._event(InfoEvent(data=data))
+        self._event(InfoEvent(source=source, data=data))

     @contextlib.contextmanager
     def step(self, name: str, type: str | None = None) -> Iterator[None]:
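The new `source` parameter on `Transcript.info` flows through to the `InfoEvent.source` field. A sketch of tagging info events from a custom solver (the solver itself is hypothetical):

from inspect_ai.log import transcript
from inspect_ai.solver import Generate, TaskState, solver

@solver
def noted_solver():
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # source tags the resulting InfoEvent with its origin
        transcript().info("starting generation", source="noted_solver")
        return await generate(state)

    return solve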
inspect_ai/model/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from ._call_tools import call_tools
 from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
+    ChatMessageBase,
     ChatMessageSystem,
     ChatMessageTool,
     ChatMessageUser,
@@ -54,6 +55,7 @@ __all__ = [
     "ContentVideo",
     "Content",
     "ChatMessage",
+    "ChatMessageBase",
     "ChatMessageSystem",
     "ChatMessageUser",
     "ChatMessageAssistant",
"ChatMessageAssistant",
|