inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -41,36 +41,36 @@ def hf_dataset(
41
41
  `datasets` package, including remote datasets on Hugging Face Hub.
42
42
 
43
43
  Args:
44
- path (str): Path or name of the dataset. Depending on path, the dataset
45
- builder that is used comes from a generic dataset script (JSON, CSV,
46
- Parquet, text etc.) or from the dataset script (a python file) inside
47
- the dataset directory.
48
- split (str): Which split of the data to load.
49
- name (str | None): Name of the dataset configuration.
50
- data_dir (str | None): data_dir of the dataset configuration
51
- to read data from.
52
- revision (str | None): Specific revision to load (e.g. "main", a branch
53
- name, or a specific commit SHA). When using `revision` the `cached` option
54
- is ignored and datasets are revalidated on Hugging Face before loading.
55
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
56
- fields in the data source to Sample objects. Pass `None` if the data is already
57
- stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
58
- `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
44
+ path: Path or name of the dataset. Depending on path, the dataset
45
+ builder that is used comes from a generic dataset script (JSON, CSV,
46
+ Parquet, text etc.) or from the dataset script (a python file) inside
47
+ the dataset directory.
48
+ split: Which split of the data to load.
49
+ name: Name of the dataset configuration.
50
+ data_dir: data_dir of the dataset configuration
51
+ to read data from.
52
+ revision: Specific revision to load (e.g. "main", a branch
53
+ name, or a specific commit SHA). When using `revision` the `cached` option
54
+ is ignored and datasets are revalidated on Hugging Face before loading.
55
+ sample_fields: Method of mapping underlying
56
+ fields in the data source to Sample objects. Pass `None` if the data is already
57
+ stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
58
+ `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
59
59
  handle mapping with a custom function that returns one or more samples.
60
- auto_id (bool): Assign an auto-incrementing ID for each sample.
61
- shuffle (bool): Randomly shuffle the dataset order.
62
- seed: (int | None): Seed used for random shuffle.
63
- shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
64
- limit (int | None): Limit the number of records to read.
65
- trust (bool): Whether or not to allow for datasets defined on the Hub
66
- using a dataset script. This option should only be set to True for
67
- repositories you trust and in which you have read the code, as it
68
- will execute code present on the Hub on your local machine.
69
- cached (bool): By default, datasets are read once from HuggingFace
70
- Hub and then cached for future reads. Pass `cached=False` to force
71
- re-reading the dataset from Hugging Face. Ignored when the `revision`
72
- option is specified.
73
- **kwargs (dict[str, Any]): Additional arguments to pass through to the
60
+ auto_id: Assign an auto-incrementing ID for each sample.
61
+ shuffle: Randomly shuffle the dataset order.
62
+ seed: Seed used for random shuffle.
63
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
64
+ limit: Limit the number of records to read.
65
+ trust: Whether or not to allow for datasets defined on the Hub
66
+ using a dataset script. This option should only be set to True for
67
+ repositories you trust and in which you have read the code, as it
68
+ will execute code present on the Hub on your local machine.
69
+ cached: By default, datasets are read once from HuggingFace
70
+ Hub and then cached for future reads. Pass `cached=False` to force
71
+ re-reading the dataset from Hugging Face. Ignored when the `revision`
72
+ option is specified.
73
+ **kwargs (dict[str, Any]): Additional arguments to pass through to the
74
74
  `load_dataset` function of the `datasets` package.
75
75
 
76
76
  Returns:
@@ -39,23 +39,23 @@ def json_dataset(
39
39
  the `sample_fields` argument.
40
40
 
41
41
  Args:
42
- json_file (str): Path to JSON file. Can be a local filesystem path or
42
+ json_file: Path to JSON file. Can be a local filesystem path or
43
43
  a path to an S3 bucket (e.g. "s3://my-bucket"). Use `fs_options`
44
44
  to pass arguments through to the `S3FileSystem` constructor.
45
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
45
+ sample_fields: Method of mapping underlying
46
46
  fields in the data source to `Sample` objects. Pass `None` if the data is already
47
47
  stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a
48
48
  `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
49
49
  handle mapping with a custom function that returns one or more samples.
50
- auto_id (bool): Assign an auto-incrementing ID for each sample.
51
- shuffle (bool): Randomly shuffle the dataset order.
52
- seed: (int | None): Seed used for random shuffle.
53
- shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
54
- limit (int | None): Limit the number of records to read.
55
- encoding (str): Text encoding for file (defaults to "utf-8").
56
- name (str): Optional name for dataset (for logging). If not specified,
50
+ auto_id: Assign an auto-incrementing ID for each sample.
51
+ shuffle: Randomly shuffle the dataset order.
52
+ seed: Seed used for random shuffle.
53
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
54
+ limit: Limit the number of records to read.
55
+ encoding: Text encoding for file (defaults to "utf-8").
56
+ name: Optional name for dataset (for logging). If not specified,
57
57
  defaults to the stem of the filename.
58
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through
58
+ fs_options: Optional. Additional arguments to pass through
59
59
  to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
60
60
  if you are accessing a public S3 bucket with no credentials.
61
61
 
@@ -22,6 +22,7 @@ from ._log import (
22
22
  EvalResults,
23
23
  EvalRevision,
24
24
  EvalSample,
25
+ EvalSampleLimit,
25
26
  EvalSampleReductions,
26
27
  EvalSampleScore,
27
28
  EvalScore,
@@ -61,6 +62,7 @@ __all__ = [
61
62
  "EvalResults",
62
63
  "EvalRevision",
63
64
  "EvalSample",
65
+ "EvalSampleLimit",
64
66
  "EvalSampleScore",
65
67
  "EvalSampleReductions",
66
68
  "EvalScore",
@@ -20,12 +20,12 @@ def convert_eval_logs(
20
20
 
21
21
  Args:
22
22
  path (str): Path to source log file(s). Should be either a single
23
- log file or a directory containing log files.
23
+ log file or a directory containing log files.
24
24
  to (Literal["eval", "json"]): Format to convert to. If a file is
25
- already in the target format it will just be copied to the output dir.
25
+ already in the target format it will just be copied to the output dir.
26
26
  output_dir (str): Output directory to write converted log file(s) to.
27
27
  overwrite (bool): Overwrite existing log files (defaults to `False`,
28
- raising an error if the output file path already exists).
28
+ raising an error if the output file path already exists).
29
29
  """
30
30
  from inspect_ai._display import display
31
31
 
inspect_ai/log/_file.py CHANGED
@@ -3,6 +3,7 @@ import re
3
3
  from logging import getLogger
4
4
  from typing import Any, Callable, Generator, Literal, cast
5
5
 
6
+ from pydantic import BaseModel
6
7
  from pydantic_core import to_json
7
8
 
8
9
  from inspect_ai._util._async import run_coroutine
@@ -22,7 +23,21 @@ from ._recorders import recorder_type_for_format, recorder_type_for_location
22
23
  logger = getLogger(__name__)
23
24
 
24
25
 
25
- class EvalLogInfo(FileInfo):
26
+ class EvalLogInfo(BaseModel):
27
+ """File info and task identifiers for eval log."""
28
+
29
+ name: str
30
+ """Name of file."""
31
+
32
+ type: str
33
+ """Type of file (file or directory)"""
34
+
35
+ size: int
36
+ """File size in bytes."""
37
+
38
+ mtime: float | None
39
+ """File modification time (None if the file is a directory on S3)."""
40
+
26
41
  task: str
27
42
  """Task name."""
28
43
 
@@ -231,7 +246,7 @@ def write_log_dir_manifest(
231
246
 
232
247
 
233
248
  def read_eval_log(
234
- log_file: str | FileInfo,
249
+ log_file: str | EvalLogInfo,
235
250
  header_only: bool = False,
236
251
  resolve_attachments: bool = False,
237
252
  format: Literal["eval", "json", "auto"] = "auto",
@@ -241,7 +256,7 @@ def read_eval_log(
241
256
  Args:
242
257
  log_file (str | FileInfo): Log file to read.
243
258
  header_only (bool): Read only the header (i.e. exclude
244
- the "samples" and "logging" fields). Defaults to False.
259
+ the "samples" and "logging" fields). Defaults to False.
245
260
  resolve_attachments (bool): Resolve attachments (e.g. images)
246
261
  to their full content.
247
262
  format (Literal["eval", "json", "auto"]): Read from format
@@ -256,7 +271,7 @@ def read_eval_log(
256
271
 
257
272
 
258
273
  async def read_eval_log_async(
259
- log_file: str | FileInfo,
274
+ log_file: str | EvalLogInfo,
260
275
  header_only: bool = False,
261
276
  resolve_attachments: bool = False,
262
277
  format: Literal["eval", "json", "auto"] = "auto",
@@ -304,13 +319,13 @@ async def read_eval_log_async(
304
319
 
305
320
 
306
321
  def read_eval_log_headers(
307
- log_files: list[str] | list[FileInfo] | list[EvalLogInfo],
322
+ log_files: list[str] | list[EvalLogInfo],
308
323
  ) -> list[EvalLog]:
309
324
  return run_coroutine(read_eval_log_headers_async(log_files))
310
325
 
311
326
 
312
327
  async def read_eval_log_headers_async(
313
- log_files: list[str] | list[FileInfo] | list[EvalLogInfo],
328
+ log_files: list[str] | list[EvalLogInfo],
314
329
  ) -> list[EvalLog]:
315
330
  return [
316
331
  await read_eval_log_async(log_file, header_only=True) for log_file in log_files
@@ -318,7 +333,7 @@ async def read_eval_log_headers_async(
318
333
 
319
334
 
320
335
  def read_eval_log_sample(
321
- log_file: str | FileInfo,
336
+ log_file: str | EvalLogInfo,
322
337
  id: int | str,
323
338
  epoch: int = 1,
324
339
  resolve_attachments: bool = False,
@@ -347,7 +362,7 @@ def read_eval_log_sample(
347
362
 
348
363
 
349
364
  async def read_eval_log_sample_async(
350
- log_file: str | FileInfo,
365
+ log_file: str | EvalLogInfo,
351
366
  id: int | str,
352
367
  epoch: int = 1,
353
368
  resolve_attachments: bool = False,
@@ -386,7 +401,7 @@ async def read_eval_log_sample_async(
386
401
 
387
402
 
388
403
  def read_eval_log_samples(
389
- log_file: str | FileInfo,
404
+ log_file: str | EvalLogInfo,
390
405
  all_samples_required: bool = True,
391
406
  resolve_attachments: bool = False,
392
407
  format: Literal["eval", "json", "auto"] = "auto",
inspect_ai/log/_log.py CHANGED
@@ -4,11 +4,17 @@ import sys
4
4
  import traceback
5
5
  from logging import getLogger
6
6
  from types import TracebackType
7
- from typing import Any, Literal, Type
7
+ from typing import Any, Literal, Type, TypedDict
8
8
 
9
9
  import click
10
10
  import tenacity
11
- from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator
11
+ from pydantic import (
12
+ BaseModel,
13
+ ConfigDict,
14
+ Field,
15
+ PrivateAttr,
16
+ model_validator,
17
+ )
12
18
  from rich.console import Console, RenderableType
13
19
  from rich.traceback import Traceback
14
20
 
@@ -30,7 +36,31 @@ logger = getLogger(__name__)
30
36
  SCORER_PLACEHOLDER = "88F74D2C"
31
37
 
32
38
 
39
+ class EvalConfigDefaults(TypedDict):
40
+ epochs: int
41
+ epochs_reducer: list[str]
42
+ fail_on_error: bool
43
+ sandbox_cleanup: bool
44
+ log_samples: bool
45
+ log_images: bool
46
+ score_display: bool
47
+
48
+
49
+ def eval_config_defaults() -> EvalConfigDefaults:
50
+ return {
51
+ "epochs": 1,
52
+ "epochs_reducer": ["mean"],
53
+ "fail_on_error": True,
54
+ "sandbox_cleanup": True,
55
+ "log_samples": True,
56
+ "log_images": True,
57
+ "score_display": True,
58
+ }
59
+
60
+
33
61
  class EvalConfig(BaseModel):
62
+ """Configuration used for evaluation."""
63
+
34
64
  limit: int | tuple[int, int] | None = Field(default=None)
35
65
  """Sample limit (number of samples or range of samples)."""
36
66
 
@@ -109,6 +139,8 @@ class EvalConfig(BaseModel):
109
139
 
110
140
 
111
141
  class EvalSampleLimit(BaseModel):
142
+ """Limit encountered by sample."""
143
+
112
144
  type: Literal["context", "time", "message", "token", "operator", "custom"]
113
145
  """The type of limit"""
114
146
 
@@ -117,6 +149,8 @@ class EvalSampleLimit(BaseModel):
117
149
 
118
150
 
119
151
  class EvalSample(BaseModel):
152
+ """Sample from evaluation task."""
153
+
120
154
  id: int | str
121
155
  """Unique id for sample."""
122
156
 
@@ -191,7 +225,7 @@ class EvalSample(BaseModel):
191
225
  """Attachments referenced from messages and events.
192
226
 
193
227
  Resolve attachments for a sample (replacing attachment://* references with
194
- attachment content) with the resolve_sample_attachments() function.
228
+ attachment content) by passing `resolve_attachments=True` to log reading functions.
195
229
  """
196
230
 
197
231
  limit: EvalSampleLimit | None = Field(default=None)
@@ -262,6 +296,8 @@ class EvalEvents(BaseModel):
262
296
 
263
297
 
264
298
  class EvalPlanStep(BaseModel):
299
+ """Solver step."""
300
+
265
301
  solver: str
266
302
  """Name of solver."""
267
303
 
@@ -270,6 +306,8 @@ class EvalPlanStep(BaseModel):
270
306
 
271
307
 
272
308
  class EvalPlan(BaseModel):
309
+ """Plan (solvers) used in evaluation."""
310
+
273
311
  name: str = Field(default="plan")
274
312
  """Plan name."""
275
313
 
@@ -284,20 +322,24 @@ class EvalPlan(BaseModel):
284
322
 
285
323
 
286
324
  class EvalMetric(BaseModel):
325
+ """Metric for evaluation score."""
326
+
287
327
  name: str
288
328
  """Metric name."""
289
329
 
290
330
  value: int | float
291
331
  """Metric value."""
292
332
 
293
- options: dict[str, Any] = Field(default_factory=dict)
294
- """Options specified when creating metric."""
333
+ params: dict[str, Any] = Field(default_factory=dict)
334
+ """Params specified when creating metric."""
295
335
 
296
336
  metadata: dict[str, Any] | None = Field(default=None)
297
337
  """Additional metadata associated with metric."""
298
338
 
299
339
 
300
340
  class EvalScore(BaseModel):
341
+ """Score for evaluation task."""
342
+
301
343
  name: str
302
344
  """Score name."""
303
345
 
@@ -318,10 +360,15 @@ class EvalScore(BaseModel):
318
360
 
319
361
 
320
362
  class EvalSampleScore(Score):
363
+ """Score and sample_id scored."""
364
+
321
365
  sample_id: str | int | None = Field(default=None)
366
+ """Sample ID."""
322
367
 
323
368
 
324
369
  class EvalSampleReductions(BaseModel):
370
+ """Score reductions."""
371
+
325
372
  scorer: str
326
373
  """Name of the scorer"""
327
374
 
@@ -333,6 +380,8 @@ class EvalSampleReductions(BaseModel):
333
380
 
334
381
 
335
382
  class EvalResults(BaseModel):
383
+ """Scoring results from evaluation."""
384
+
336
385
  total_samples: int = Field(default=0)
337
386
  """Total samples in eval (dataset samples * epochs)"""
338
387
 
@@ -415,6 +464,8 @@ class EvalResults(BaseModel):
415
464
 
416
465
 
417
466
  class EvalDataset(BaseModel):
467
+ """Dataset used for evaluation."""
468
+
418
469
  name: str | None = Field(default=None)
419
470
  """Dataset name."""
420
471
 
@@ -431,7 +482,33 @@ class EvalDataset(BaseModel):
431
482
  """Was the dataset shuffled after reading."""
432
483
 
433
484
 
485
+ class EvalMetricDefinition(BaseModel):
486
+ name: str
487
+ """Metric name"""
488
+
489
+ options: dict[str, Any] | None = Field(default=None)
490
+
491
+
492
+ class EvalScorer(BaseModel):
493
+ name: str
494
+ """Scorer name"""
495
+
496
+ options: dict[str, Any] | None = Field(default=None)
497
+ """Scorer arguments"""
498
+
499
+ metrics: (
500
+ list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
501
+ | dict[str, list[EvalMetricDefinition]]
502
+ | None
503
+ ) = Field(default=None)
504
+
505
+ metadata: dict[str, Any] | None = Field(default=None)
506
+ """Scorer metadata"""
507
+
508
+
434
509
  class EvalRevision(BaseModel):
510
+ """Git revision for evaluation."""
511
+
435
512
  type: Literal["git"]
436
513
  """Type of revision (currently only "git")"""
437
514
 
@@ -443,6 +520,8 @@ class EvalRevision(BaseModel):
443
520
 
444
521
 
445
522
  class EvalSpec(BaseModel):
523
+ """Eval target and configuration."""
524
+
446
525
  run_id: str = Field(default_factory=str)
447
526
  """Unique run id"""
448
527
 
@@ -503,6 +582,14 @@ class EvalSpec(BaseModel):
503
582
  metadata: dict[str, Any] | None = Field(default=None)
504
583
  """Additional eval metadata."""
505
584
 
585
+ scorers: list[EvalScorer] | None = Field(default=None)
586
+ """Scorers and args for this eval"""
587
+
588
+ metrics: (
589
+ list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None
590
+ ) = Field(default=None)
591
+ """Metrics and args for this eval"""
592
+
506
593
  # allow field model_args
507
594
  model_config = ConfigDict(protected_namespaces=())
508
595
 
@@ -546,6 +633,8 @@ def rich_traceback(
546
633
 
547
634
 
548
635
  class EvalStats(BaseModel):
636
+ """Timing and usage statistics."""
637
+
549
638
  started_at: str = Field(default_factory=str)
550
639
  """Evaluation start time."""
551
640
 
@@ -560,6 +649,8 @@ class EvalStats(BaseModel):
560
649
 
561
650
 
562
651
  class EvalLog(BaseModel):
652
+ """Evaluation log."""
653
+
563
654
  # WARNING: The order of these fields is important for the log file format.
564
655
  # Do not change the order of these fields without incrementing the version number,
565
656
  # updating the log file read/write functionality (such as read_eval_log),
@@ -575,13 +666,13 @@ class EvalLog(BaseModel):
575
666
  eval: EvalSpec
576
667
  """Eval identity and configuration."""
577
668
 
578
- plan: EvalPlan = Field(default=EvalPlan())
669
+ plan: EvalPlan = Field(default_factory=EvalPlan)
579
670
  """Eval plan (solvers and config)"""
580
671
 
581
672
  results: EvalResults | None = None
582
673
  """Eval results (scores and metrics)."""
583
674
 
584
- stats: EvalStats = Field(default=EvalStats())
675
+ stats: EvalStats = Field(default_factory=EvalStats)
585
676
  """Eval stats (runtime, model usage)"""
586
677
 
587
678
  error: EvalError | None = Field(default=None)
@@ -11,6 +11,8 @@ LoggingLevel = Literal[
11
11
 
12
12
 
13
13
  class LoggingMessage(BaseModel):
14
+ """Message written to Python log."""
15
+
14
16
  name: str | None = Field(default=None)
15
17
  """Logger name (e.g. 'httpx')"""
16
18
 
@@ -33,7 +35,7 @@ class LoggingMessage(BaseModel):
33
35
  """Logged from line number."""
34
36
 
35
37
  @staticmethod
36
- def from_log_record(record: LogRecord) -> "LoggingMessage":
38
+ def _from_log_record(record: LogRecord) -> "LoggingMessage":
37
39
  """Create a LoggingMessage from a LogRecord.
38
40
 
39
41
  Args:
@@ -28,6 +28,10 @@ class FileRecorder(Recorder):
28
28
  def is_local(self) -> bool:
29
29
  return self.fs.is_local()
30
30
 
31
+ @override
32
+ def is_writeable(self) -> bool:
33
+ return self.fs.is_writeable(self.log_dir)
34
+
31
35
  @override
32
36
  @classmethod
33
37
  async def read_log_sample(
@@ -21,6 +21,9 @@ class Recorder(abc.ABC):
21
21
  @abc.abstractmethod
22
22
  def default_log_buffer(self) -> int: ...
23
23
 
24
+ @abc.abstractmethod
25
+ def is_writeable(self) -> bool: ...
26
+
24
27
  @abc.abstractmethod
25
28
  async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...
26
29
 
@@ -167,7 +167,7 @@ class ToolEvent(BaseEvent):
167
167
  events: list["Event"] = Field(default_factory=list)
168
168
  """Transcript of events for tool."""
169
169
 
170
- def set_result(
170
+ def _set_result(
171
171
  self,
172
172
  result: ToolResult,
173
173
  truncated: tuple[int, int] | None,
@@ -182,11 +182,11 @@ class ToolEvent(BaseEvent):
182
182
 
183
183
  # mechanism for operator to cancel the tool call
184
184
 
185
- def set_task(self, task: asyncio.Task[Any]) -> None:
185
+ def _set_task(self, task: asyncio.Task[Any]) -> None:
186
186
  """Set the tool task (for possible cancellation)"""
187
187
  self._task = task
188
188
 
189
- def cancel(self) -> None:
189
+ def _cancel(self) -> None:
190
190
  """Cancel the tool task."""
191
191
  if self._task:
192
192
  self._cancelled = True
@@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
264
264
  event: Literal["info"] = Field(default="info")
265
265
  """Event type."""
266
266
 
267
+ source: str | None = Field(default=None)
268
+ """Optional source for info event."""
269
+
267
270
  data: JsonValue
268
271
  """Data provided with event."""
269
272
 
@@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):
279
282
 
280
283
 
281
284
  class ScoreEvent(BaseEvent):
282
- """Event with sample score."""
285
+ """Event with score.
286
+
287
+ Can be the final score for a `Sample`, or can be an intermediate score
288
+ resulting from a call to `score`.
289
+ """
283
290
 
284
291
  event: Literal["score"] = Field(default="score")
285
292
  """Event type."""
286
293
 
287
294
  score: Score
288
- """Sample score."""
295
+ """Score value."""
289
296
 
290
297
  target: str | list[str] | None = Field(default=None)
291
298
  """"Sample target."""
292
299
 
300
+ intermediate: bool = Field(default=False)
301
+ """Was this an intermediate scoring?"""
302
+
293
303
 
294
304
  class StepEvent(BaseEvent):
295
305
  """Step within current sample or subtask."""
@@ -355,13 +365,14 @@ class Transcript:
355
365
  self.name = name
356
366
  self._events: list[Event] = []
357
367
 
358
- def info(self, data: JsonValue) -> None:
368
+ def info(self, data: JsonValue, *, source: str | None = None) -> None:
359
369
  """Add an `InfoEvent` to the transcript.
360
370
 
361
371
  Args:
362
- data (JsonValue): Data associated with the event.
372
+ data: Data associated with the event.
373
+ source: Optional event source.
363
374
  """
364
- self._event(InfoEvent(data=data))
375
+ self._event(InfoEvent(source=source, data=data))
365
376
 
366
377
  @contextlib.contextmanager
367
378
  def step(self, name: str, type: str | None = None) -> Iterator[None]:
@@ -21,6 +21,7 @@ from ._call_tools import call_tools
21
21
  from ._chat_message import (
22
22
  ChatMessage,
23
23
  ChatMessageAssistant,
24
+ ChatMessageBase,
24
25
  ChatMessageSystem,
25
26
  ChatMessageTool,
26
27
  ChatMessageUser,
@@ -54,6 +55,7 @@ __all__ = [
54
55
  "ContentVideo",
55
56
  "Content",
56
57
  "ChatMessage",
58
+ "ChatMessageBase",
57
59
  "ChatMessageSystem",
58
60
  "ChatMessageUser",
59
61
  "ChatMessageAssistant",