inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (118)
  1. inspect_ai/_cli/eval.py +31 -0
  2. inspect_ai/_eval/eval.py +19 -2
  3. inspect_ai/_eval/evalset.py +4 -1
  4. inspect_ai/_eval/run.py +41 -0
  5. inspect_ai/_eval/task/generate.py +38 -44
  6. inspect_ai/_eval/task/log.py +26 -28
  7. inspect_ai/_eval/task/run.py +13 -20
  8. inspect_ai/_util/local_server.py +368 -0
  9. inspect_ai/_util/working.py +10 -4
  10. inspect_ai/_view/www/dist/assets/index.css +159 -146
  11. inspect_ai/_view/www/dist/assets/index.js +1020 -1061
  12. inspect_ai/_view/www/log-schema.json +4 -3
  13. inspect_ai/_view/www/package.json +1 -1
  14. inspect_ai/_view/www/src/@types/log.d.ts +3 -2
  15. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  16. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  17. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  18. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  19. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  20. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  21. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  22. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  23. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  24. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  25. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  26. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  27. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  28. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  29. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  30. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  31. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  32. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  33. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  34. inspect_ai/_view/www/src/components/Card.css +0 -1
  35. inspect_ai/_view/www/src/constants.ts +2 -0
  36. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  37. inspect_ai/agent/_agent.py +3 -3
  38. inspect_ai/agent/_as_solver.py +20 -12
  39. inspect_ai/agent/_as_tool.py +15 -3
  40. inspect_ai/agent/_handoff.py +8 -1
  41. inspect_ai/agent/_run.py +11 -3
  42. inspect_ai/log/__init__.py +4 -0
  43. inspect_ai/log/_file.py +56 -0
  44. inspect_ai/log/_log.py +99 -0
  45. inspect_ai/log/_recorders/__init__.py +2 -0
  46. inspect_ai/log/_recorders/buffer/database.py +12 -11
  47. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  48. inspect_ai/log/_recorders/buffer/types.py +2 -2
  49. inspect_ai/log/_recorders/eval.py +20 -65
  50. inspect_ai/log/_recorders/file.py +28 -6
  51. inspect_ai/log/_recorders/recorder.py +7 -0
  52. inspect_ai/log/_recorders/types.py +1 -23
  53. inspect_ai/log/_samples.py +0 -8
  54. inspect_ai/log/_transcript.py +7 -1
  55. inspect_ai/log/_util.py +52 -0
  56. inspect_ai/model/__init__.py +5 -1
  57. inspect_ai/model/_call_tools.py +32 -12
  58. inspect_ai/model/_generate_config.py +14 -8
  59. inspect_ai/model/_model.py +21 -48
  60. inspect_ai/model/_model_output.py +25 -0
  61. inspect_ai/model/_openai.py +2 -0
  62. inspect_ai/model/_openai_responses.py +13 -1
  63. inspect_ai/model/_providers/anthropic.py +13 -23
  64. inspect_ai/model/_providers/openai_o1.py +8 -2
  65. inspect_ai/model/_providers/providers.py +18 -4
  66. inspect_ai/model/_providers/sglang.py +241 -0
  67. inspect_ai/model/_providers/vllm.py +207 -400
  68. inspect_ai/solver/__init__.py +7 -2
  69. inspect_ai/solver/_basic_agent.py +3 -10
  70. inspect_ai/solver/_task_state.py +26 -88
  71. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  72. inspect_ai/tool/_mcp/_mcp.py +2 -0
  73. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  74. inspect_ai/tool/_mcp/server.py +3 -1
  75. inspect_ai/tool/_tool_call.py +4 -1
  76. inspect_ai/tool/_tool_support_helpers.py +51 -12
  77. inspect_ai/tool/_tools/_bash_session.py +190 -68
  78. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  79. inspect_ai/tool/_tools/_text_editor.py +4 -3
  80. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  81. inspect_ai/util/__init__.py +12 -0
  82. inspect_ai/util/_limit.py +393 -0
  83. inspect_ai/util/_limited_conversation.py +57 -0
  84. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
  85. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
  86. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
  87. inspect_ai/solver/_limit.py +0 -39
  88. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  89. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  90. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  91. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  92. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  93. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  94. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  95. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  96. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  97. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  98. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  99. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  100. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  101. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  102. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  103. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  104. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  105. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  106. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  107. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  108. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  109. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  110. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  111. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  112. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  113. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  114. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  115. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  116. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
  117. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
  118. {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
inspect_ai/log/_file.py CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.file import (
 )
 from inspect_ai._util.json import jsonable_python
 from inspect_ai.log._condense import resolve_sample_attachments
+from inspect_ai.log._log import EvalSampleSummary

 from ._log import EvalLog, EvalSample
 from ._recorders import recorder_type_for_format, recorder_type_for_location
@@ -393,6 +394,61 @@ async def read_eval_log_sample_async(
     return sample


+def read_eval_log_sample_summaries(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # don't mix trio and asyncio
+    if current_async_backend() == "trio":
+        raise RuntimeError(
+            "read_eval_log_sample_summaries cannot be called from a trio async context (please use read_eval_log_sample_summaries_async instead)"
+        )
+
+    # will use s3fs and is not called from main inspect solver/scorer/tool/sandbox
+    # flow, so force the use of asyncio
+    return run_coroutine(read_eval_log_sample_summaries_async(log_file, format))
+
+
+async def read_eval_log_sample_summaries_async(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # resolve to file path
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
+
+    if format == "auto":
+        recorder_type = recorder_type_for_location(log_file)
+    else:
+        recorder_type = recorder_type_for_format(format)
+    return await recorder_type.read_log_sample_summaries(log_file)
+
+
 def read_eval_log_samples(
     log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
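
The new API lets tooling list and inspect samples without parsing full transcripts. A minimal usage sketch, assuming the function is re-exported from inspect_ai.log (the companion change to inspect_ai/log/__init__.py adds four lines) and using a hypothetical log path:

    from inspect_ai.log import read_eval_log_sample_summaries  # assumed re-export

    # summaries are small: ids, targets, thinned scores, token usage, timings
    summaries = read_eval_log_sample_summaries("logs/example_task.eval")  # hypothetical path
    errors = [s for s in summaries if s.error is not None]
    print(f"{len(summaries)} samples, {len(errors)} errored")
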
inspect_ai/log/_log.py CHANGED
@@ -30,6 +30,7 @@ from inspect_ai.util._store import Store
 from inspect_ai.util._store_model import SMT

 from ._transcript import Event
+from ._util import text_input_only, thin_metadata

 logger = getLogger(__name__)

@@ -42,6 +43,7 @@ class EvalConfigDefaults(TypedDict):
     fail_on_error: bool
     sandbox_cleanup: bool
     log_samples: bool
+    log_realtime: bool
     log_images: bool
     score_display: bool

@@ -53,6 +55,7 @@ def eval_config_defaults() -> EvalConfigDefaults:
         "fail_on_error": True,
         "sandbox_cleanup": True,
         "log_samples": True,
+        "log_realtime": True,
         "log_images": True,
         "score_display": True,
     }
@@ -120,6 +123,9 @@ class EvalConfig(BaseModel):
     log_samples: bool | None = Field(default=None)
     """Log detailed information on each sample."""

+    log_realtime: bool | None = Field(default=None)
+    """Log events in realtime (enables live viewing of samples in inspect view)."""
+
     log_images: bool | None = Field(default=None)
     """Log base64 encoded versions of images."""

@@ -161,6 +167,70 @@ class EvalSampleLimit(BaseModel):
     """The limit value"""


+class EvalSampleSummary(BaseModel):
+    """Summary information (including scoring) for a sample."""
+
+    id: int | str
+    """Unique id for sample."""
+
+    epoch: int
+    """Epoch number for sample."""
+
+    input: str | list[ChatMessage]
+    """Sample input (text inputs only)."""
+
+    target: str | list[str]
+    """Sample target value(s)"""
+
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    """Sample metadata (scalar types only, strings truncated to 1k)."""
+
+    scores: dict[str, Score] | None = Field(default=None)
+    """Scores for sample (score values only, no answers, explanations, or metadata)."""
+
+    model_usage: dict[str, ModelUsage] = Field(default_factory=dict)
+    """Model token usage for sample."""
+
+    total_time: float | None = Field(default=None)
+    """Total time that the sample was running."""
+
+    working_time: float | None = Field(default=None)
+    """Time spent working (model generation, sandbox calls, etc.)"""
+
+    uuid: str | None = Field(default=None)
+    """Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)"""
+
+    error: str | None = Field(default=None)
+    """Error that halted sample."""
+
+    limit: str | None = Field(default=None)
+    """Limit that halted the sample"""
+
+    retries: int | None = Field(default=None)
+    """Number of retries for the sample."""
+
+    completed: bool = Field(default=False)
+    """Is the sample complete."""
+
+    @model_validator(mode="after")
+    def thin_data(self) -> "EvalSampleSummary":
+        # thin input
+        self.input = text_input_only(self.input)
+
+        # thin metadata
+        self.metadata = thin_metadata(self.metadata)
+
+        # thin score explanations and metadata
+        if self.scores is not None:
+            self.scores = {
+                key: Score(value=score.value) for key, score in self.scores.items()
+            }
+        return self
+
+    # allow field model_usage
+    model_config = ConfigDict(protected_namespaces=())
+
+
 class EvalSample(BaseModel):
     """Sample from evaluation task."""

@@ -271,6 +341,35 @@ class EvalSample(BaseModel):
     limit: EvalSampleLimit | None = Field(default=None)
     """The limit that halted the sample"""

+    def summary(self) -> EvalSampleSummary:
+        """Summary of sample.
+
+        The summary excludes potentially large fields like messages, output,
+        events, store, and metadata so that it is always fast to load.
+
+        If there are images, audio, or video in the input, they are
+        replaced with a placeholder.
+
+        Returns:
+            Summary of sample.
+        """
+        return EvalSampleSummary(
+            id=self.id,
+            epoch=self.epoch,
+            input=self.input,
+            target=self.target,
+            metadata=self.metadata,
+            scores=self.scores,
+            model_usage=self.model_usage,
+            total_time=self.total_time,
+            working_time=self.working_time,
+            uuid=self.uuid,
+            error=self.error.message if self.error is not None else None,
+            limit=f"{self.limit.type}" if self.limit is not None else None,
+            retries=len(self.error_retries) if self.error_retries is not None else None,
+            completed=True,
+        )
+
     # deprecated properties

     @property
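
EvalSample.summary() is now the single source of truth for the thinned projection (the eval recorder below calls it instead of building summaries by hand). A sketch of what the thinning yields, assuming a log read via read_eval_log and a hypothetical path:

    from inspect_ai.log import read_eval_log

    log = read_eval_log("logs/example_task.eval")  # hypothetical path
    for sample in log.samples or []:
        s = sample.summary()
        # scores keep values only; input keeps text content only;
        # metadata is reduced to scalars with strings truncated to 1k
        print(s.id, s.epoch, s.scores, s.completed)
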
inspect_ai/log/_recorders/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .._log import EvalSampleSummary
 from .create import (
     create_recorder_for_format,
     create_recorder_for_location,
@@ -7,6 +8,7 @@ from .create import (
 from .recorder import Recorder

 __all__ = [
+    "EvalSampleSummary",
     "Recorder",
     "create_recorder_for_format",
     "create_recorder_for_location",
inspect_ai/log/_recorders/buffer/database.py CHANGED
@@ -26,7 +26,8 @@ from ..._condense import (
     walk_input,
     walk_json_dict,
 )
-from ..types import SampleEvent, SampleSummary
+from ..._log import EvalSampleSummary
+from ..types import SampleEvent
 from .filestore import (
     Manifest,
     SampleBufferFilestore,
@@ -141,7 +142,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
         self._sync_time = time.monotonic()

-    def start_sample(self, sample: SampleSummary) -> None:
+    def start_sample(self, sample: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             sample = self._consense_sample(conn, sample)
             conn.execute(
@@ -177,7 +178,7 @@ class SampleBufferDatabase(SampleBuffer):
         # Insert all rows
         conn.execute(sql, values)

-    def complete_sample(self, summary: SampleSummary) -> None:
+    def complete_sample(self, summary: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             summary = self._consense_sample(conn, summary)
             conn.execute(
@@ -307,9 +308,9 @@ class SampleBufferDatabase(SampleBuffer):
         conn.execute("PRAGMA foreign_keys = ON")

         # concurrency setup
-        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA journal_mode=MEMORY")
         conn.execute("PRAGMA busy_timeout=10000")
-        conn.execute("PRAGMA synchronous=NORMAL")
+        conn.execute("PRAGMA synchronous=OFF")

         # do work
         yield conn
@@ -359,7 +360,7 @@ class SampleBufferDatabase(SampleBuffer):

     def _get_samples(
         self, conn: Connection, resolve_attachments: bool = False
-    ) -> Iterator[SampleSummary]:
+    ) -> Iterator[EvalSampleSummary]:
         cursor = conn.execute(
             """
             SELECT s.data as sample_data
@@ -369,7 +370,7 @@ class SampleBufferDatabase(SampleBuffer):
         )

         for row in cursor:
-            summary = SampleSummary.model_validate_json(row["sample_data"])
+            summary = EvalSampleSummary.model_validate_json(row["sample_data"])
             if resolve_attachments:
                 summary = self._resolve_sample_attachments(conn, summary)
             yield summary
@@ -437,8 +438,8 @@ class SampleBufferDatabase(SampleBuffer):
         )

     def _consense_sample(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         # alias attachments
         attachments: dict[str, str] = {}
         sample = sample.model_copy(
@@ -456,8 +457,8 @@ class SampleBufferDatabase(SampleBuffer):
         return sample

     def _resolve_sample_attachments(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         return sample.model_copy(
             update={
                 "input": walk_input(
inspect_ai/log/_recorders/buffer/filestore.py CHANGED
@@ -14,7 +14,7 @@ from inspect_ai._util.file import FileSystem, basename, dirname, file, filesyste
 from inspect_ai._util.json import to_json_safe, to_json_str_safe
 from inspect_ai.log._file import read_eval_log

-from ..types import SampleSummary
+from ..._log import EvalSampleSummary
 from .types import SampleBuffer, SampleData, Samples

 logger = getLogger(__name__)
@@ -33,7 +33,7 @@ class SegmentFile(BaseModel):


 class SampleManifest(BaseModel):
-    summary: SampleSummary
+    summary: EvalSampleSummary
     segments: list[int] = Field(default_factory=list)

inspect_ai/log/_recorders/buffer/types.py CHANGED
@@ -5,13 +5,13 @@ from pydantic import BaseModel, JsonValue

 from inspect_ai._display.core.display import TaskDisplayMetric

-from ..types import SampleSummary
+from ..._log import EvalSampleSummary

 JsonData: TypeAlias = dict[str, JsonValue]


 class Samples(BaseModel):
-    samples: list[SampleSummary]
+    samples: list[EvalSampleSummary]
     metrics: list[TaskDisplayMetric]
     refresh: int
     etag: str
inspect_ai/log/_recorders/eval.py CHANGED
@@ -11,18 +11,10 @@ from pydantic_core import to_json
 from typing_extensions import override

 from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
-from inspect_ai._util.content import (
-    ContentAudio,
-    ContentImage,
-    ContentReasoning,
-    ContentText,
-    ContentVideo,
-)
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.file import FileSystem, dirname, file, filesystem
 from inspect_ai._util.json import jsonable_python
 from inspect_ai._util.trace import trace_action
-from inspect_ai.model._chat_message import ChatMessage

 from .._log import (
     EvalLog,
@@ -30,12 +22,12 @@ from .._log import (
     EvalResults,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleSummary,
     EvalSpec,
     EvalStats,
     sort_samples,
 )
 from .file import FileRecorder
-from .types import SampleSummary

 logger = getLogger(__name__)

@@ -222,6 +214,15 @@ class EvalRecorder(FileRecorder):
                 f"Sample id {id} for epoch {epoch} not found in log {location}"
             )

+    @classmethod
+    @override
+    async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
+        with file(location, "rb") as z:
+            with ZipFile(z, mode="r") as zip:
+                summary_counter = _read_summary_counter(zip)
+                summaries = _read_all_summaries(zip, summary_counter)
+                return summaries
+
     @classmethod
     @override
     async def write_log(cls, location: str, log: EvalLog) -> None:
@@ -236,36 +237,6 @@
     )


-def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
-    # Clean the input of any images
-    if isinstance(inputs, list):
-        input: list[ChatMessage] = []
-        for message in inputs:
-            if not isinstance(message.content, str):
-                filtered_content: list[
-                    ContentText
-                    | ContentReasoning
-                    | ContentImage
-                    | ContentAudio
-                    | ContentVideo
-                ] = []
-                for content in message.content:
-                    if content.type == "text":
-                        filtered_content.append(content)
-                    else:
-                        filtered_content.append(
-                            ContentText(text=f"({content.type.capitalize()})")
-                        )
-                message.content = filtered_content
-                input.append(message)
-            else:
-                input.append(message)
-
-        return input
-    else:
-        return inputs
-
-
 class ZipLogFile:
     _zip: ZipFile | None
     _temp_file: BinaryIO
@@ -273,19 +244,20 @@

     def __init__(self, file: str) -> None:
         self._file = file
+        self._zip = None
         self._fs = filesystem(file)
         self._lock = anyio.Lock()
         self._temp_file = tempfile.TemporaryFile()
         self._samples: list[EvalSample] = []
         self._summary_counter = 0
-        self._summaries: list[SampleSummary] = []
+        self._summaries: list[EvalSampleSummary] = []
         self._log_start: LogStart | None = None

     async def init(
         self,
         log_start: LogStart | None,
         summary_counter: int,
-        summaries: list[SampleSummary],
+        summaries: list[EvalSampleSummary],
     ) -> None:
         async with self._lock:
             self._open()
@@ -309,31 +281,14 @@
     async def write_buffered_samples(self) -> None:
         async with self._lock:
             # Write the buffered samples
-            summaries: list[SampleSummary] = []
+            summaries: list[EvalSampleSummary] = []
             for sample in self._samples:
                 # Write the sample
                 self._zip_writestr(_sample_filename(sample.id, sample.epoch), sample)

                 # Capture the summary
-                summaries.append(
-                    SampleSummary(
-                        id=sample.id,
-                        epoch=sample.epoch,
-                        input=text_inputs(sample.input),
-                        target=sample.target,
-                        completed=True,
-                        scores=sample.scores,
-                        error=sample.error.message
-                        if sample.error is not None
-                        else None,
-                        limit=f"{sample.limit.type}"
-                        if sample.limit is not None
-                        else None,
-                        retries=len(sample.error_retries)
-                        if sample.error_retries is not None
-                        else None,
-                    )
-                )
+                summaries.append(sample.summary())
+
             self._samples.clear()

             # write intermediary summaries and add to master list
@@ -451,12 +406,12 @@ def _read_summary_counter(zip: ZipFile) -> int:
     return current_count


-def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
+def _read_all_summaries(zip: ZipFile, count: int) -> list[EvalSampleSummary]:
     if SUMMARIES_JSON in zip.namelist():
         summaries_raw = _read_json(zip, SUMMARIES_JSON)
         if isinstance(summaries_raw, list):
             return [
-                SampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
+                EvalSampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
                 for value in summaries_raw
             ]
         else:
@@ -464,7 +419,7 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
                 f"Expected a list of summaries when reading {SUMMARIES_JSON}"
             )
     else:
-        summaries: list[SampleSummary] = []
+        summaries: list[EvalSampleSummary] = []
         for i in range(1, count):
             summary_file = _journal_summary_file(i)
             summary_path = _journal_summary_path(summary_file)
@@ -472,7 +427,7 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
             if isinstance(summary, list):
                 summaries.extend(
                     [
-                        SampleSummary.model_validate(
+                        EvalSampleSummary.model_validate(
                             value, context=DESERIALIZING_CONTEXT
                         )
                         for value in summary
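
For context, a .eval log is a zip archive, and the summaries read above come from a consolidated JSON entry (with per-journal fallback files when it is absent). A direct-inspection sketch, assuming the consolidated entry is named summaries.json (the presumed value of this module's SUMMARIES_JSON constant) and using a hypothetical path:

    import json
    from zipfile import ZipFile

    with ZipFile("logs/example_task.eval") as zf:  # hypothetical path
        if "summaries.json" in zf.namelist():  # assumed value of SUMMARIES_JSON
            summaries = json.loads(zf.read("summaries.json"))
            print(f"{len(summaries)} sample summaries")
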
inspect_ai/log/_recorders/file.py CHANGED
@@ -8,7 +8,7 @@ from inspect_ai._util.constants import MODEL_NONE
 from inspect_ai._util.file import filesystem
 from inspect_ai._util.registry import registry_unqualified_name

-from .._log import EvalLog, EvalSample, EvalSpec
+from .._log import EvalLog, EvalSample, EvalSampleSummary, EvalSpec
 from .recorder import Recorder

 logger = getLogger(__name__)
@@ -40,11 +40,7 @@ class FileRecorder(Recorder):
         cls, location: str, id: str | int, epoch: int = 1
     ) -> EvalSample:
         # establish the log to read from (might be cached)
-        if cls.__last_read_sample_log and (cls.__last_read_sample_log[0] == "location"):
-            eval_log = cls.__last_read_sample_log[1]
-        else:
-            eval_log = await cls.read_log(location)
-            cls.__last_read_sample_log = (location, eval_log)
+        eval_log = await cls._log_file_maybe_cached(location)

         # throw if no samples
         if not eval_log.samples:
@@ -66,6 +62,32 @@ class FileRecorder(Recorder):
         else:
             return eval_sample

+    @classmethod
+    @override
+    async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
+        # establish the log to read from (might be cached)
+        eval_log = await cls._log_file_maybe_cached(location)
+
+        # throw if no samples
+        if not eval_log.samples:
+            raise IndexError(f"No samples found in log {location}")
+
+        summaries: list[EvalSampleSummary] = []
+        for sample in eval_log.samples:
+            summaries.append(sample.summary())
+
+        return summaries
+
+    @classmethod
+    async def _log_file_maybe_cached(cls, location: str) -> EvalLog:
+        # establish the log to read from (might be cached)
+        if cls.__last_read_sample_log and (cls.__last_read_sample_log[0] == "location"):
+            eval_log = cls.__last_read_sample_log[1]
+        else:
+            eval_log = await cls.read_log(location)
+            cls.__last_read_sample_log = (location, eval_log)
+        return eval_log
+
     def _log_file_key(self, eval: EvalSpec) -> str:
         # clean underscores, slashes, and : from the log file key (so we can reliably parse it
         # later without worrying about underscores)
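
The extracted _log_file_maybe_cached helper is a one-entry read cache: consecutive reads against the same log reuse the last parsed EvalLog rather than re-reading it. The pattern in a self-contained sketch (names and the dict payload are illustrative, not the module's):

    _last_log: tuple[str, dict] | None = None  # (location, parsed log)

    def load_log(location: str) -> dict:
        """Return the parsed log, reusing the previous parse for a repeated location."""
        global _last_log
        if _last_log and _last_log[0] == location:
            return _last_log[1]
        parsed = {"location": location}  # stand-in for actual log parsing
        _last_log = (location, parsed)
        return parsed
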
inspect_ai/log/_recorders/recorder.py CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.log._log import (
     EvalResults,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleSummary,
     EvalSpec,
     EvalStats,
 )
@@ -57,6 +58,12 @@ class Recorder(abc.ABC):
         cls, location: str, id: str | int, epoch: int = 1
     ) -> EvalSample: ...

+    @classmethod
+    @abc.abstractmethod
+    async def read_log_sample_summaries(
+        cls, location: str
+    ) -> list[EvalSampleSummary]: ...
+
     @classmethod
     @abc.abstractmethod
     async def write_log(cls, location: str, log: EvalLog) -> None: ...
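
Since read_log_sample_summaries is abstract, third-party Recorder subclasses must add an implementation to remain instantiable. A minimal conforming sketch (other abstract methods omitted for brevity; the empty return is illustrative):

    from inspect_ai.log._log import EvalSampleSummary
    from inspect_ai.log._recorders import Recorder

    class MyRecorder(Recorder):  # other abstract methods omitted for brevity
        @classmethod
        async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
            # a real implementation would derive summaries from its backing store
            return []
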
inspect_ai/log/_recorders/types.py CHANGED
@@ -1,31 +1,9 @@
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel

 from inspect_ai.log._transcript import Event
-from inspect_ai.model._chat_message import ChatMessage
-from inspect_ai.scorer._metric import Score


 class SampleEvent(BaseModel):
     id: str | int
     epoch: int
     event: Event
-
-
-class SampleSummary(BaseModel):
-    id: int | str
-    epoch: int
-    input: str | list[ChatMessage]
-    target: str | list[str]
-    completed: bool = Field(default=False)
-    scores: dict[str, Score] | None = Field(default=None)
-    error: str | None = Field(default=None)
-    limit: str | None = Field(default=None)
-    retries: int | None = Field(default=None)
-
-    @model_validator(mode="after")
-    def thin_scores(self) -> "SampleSummary":
-        if self.scores is not None:
-            self.scores = {
-                key: Score(value=score.value) for key, score in self.scores.items()
-            }
-        return self
inspect_ai/log/_samples.py CHANGED
@@ -119,14 +119,6 @@ def sample_active() -> ActiveSample | None:
     return _sample_active.get(None)


-def active_sample_token_limit() -> int | None:
-    active = sample_active()
-    if active:
-        return active.token_limit
-    else:
-        return None
-
-
 def set_active_sample_token_limit(token_limit: int | None) -> None:
     active = sample_active()
     if active:
inspect_ai/log/_transcript.py CHANGED
@@ -14,7 +14,13 @@ from typing import (
     Union,
 )

-from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    JsonValue,
+    field_serializer,
+)
 from shortuuid import uuid

 from inspect_ai._util.constants import SAMPLE_SUBTASK