inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- inspect_ai/_cli/eval.py +31 -0
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +13 -20
- inspect_ai/_util/local_server.py +368 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +159 -146
- inspect_ai/_view/www/dist/assets/index.js +1020 -1061
- inspect_ai/_view/www/log-schema.json +4 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +3 -2
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +20 -12
- inspect_ai/agent/_as_tool.py +15 -3
- inspect_ai/agent/_handoff.py +8 -1
- inspect_ai/agent/_run.py +11 -3
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +0 -8
- inspect_ai/log/_transcript.py +7 -1
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +32 -12
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +21 -48
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_openai_responses.py +13 -1
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +241 -0
- inspect_ai/model/_providers/vllm.py +207 -400
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +2 -0
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +12 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
inspect_ai/log/_file.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.file import (
 )
 from inspect_ai._util.json import jsonable_python
 from inspect_ai.log._condense import resolve_sample_attachments
+from inspect_ai.log._log import EvalSampleSummary
 
 from ._log import EvalLog, EvalSample
 from ._recorders import recorder_type_for_format, recorder_type_for_location
@@ -393,6 +394,61 @@ async def read_eval_log_sample_async(
     return sample
 
 
+def read_eval_log_sample_summaries(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+       log_file (str | FileInfo): Log file to read.
+       format (Literal["eval", "json", "auto"]): Read from format
+          (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+       Sample summaries for eval log.
+    """
+    # don't mix trio and asyncio
+    if current_async_backend() == "trio":
+        raise RuntimeError(
+            "read_eval_log_sample_summaries cannot be called from a trio async context (please use read_eval_log_sample_summaries_async instead)"
+        )
+
+    # will use s3fs and is not called from main inspect solver/scorer/tool/sandbox
+    # flow, so force the use of asyncio
+    return run_coroutine(read_eval_log_sample_summaries_async(log_file, format))
+
+
+async def read_eval_log_sample_summaries_async(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+       log_file (str | FileInfo): Log file to read.
+       format (Literal["eval", "json", "auto"]): Read from format
+          (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+       Sample summaries for eval log.
+    """
+    # resolve to file path
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
+
+    if format == "auto":
+        recorder_type = recorder_type_for_location(log_file)
+    else:
+        recorder_type = recorder_type_for_format(format)
+    return await recorder_type.read_log_sample_summaries(log_file)
+
+
 def read_eval_log_samples(
     log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
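The two functions above are the headline addition to the log API: they read per-sample summaries without loading messages, events, or stores, so they stay fast even for very large logs. A minimal usage sketch (the log path is illustrative, and it assumes the `inspect_ai/log/__init__.py` change listed above re-exports `read_eval_log_sample_summaries`):

from inspect_ai.log import read_eval_log_sample_summaries

# loads only the summaries stored alongside each sample (id, epoch, thinned
# input/metadata, score values, usage, timing), not the full samples
summaries = read_eval_log_sample_summaries("logs/2025-05-01T12-00-00_mytask.eval")
for s in summaries:
    print(s.id, s.epoch, s.scores, s.total_time)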
inspect_ai/log/_log.py
CHANGED
@@ -30,6 +30,7 @@ from inspect_ai.util._store import Store
 from inspect_ai.util._store_model import SMT
 
 from ._transcript import Event
+from ._util import text_input_only, thin_metadata
 
 logger = getLogger(__name__)
 
@@ -42,6 +43,7 @@ class EvalConfigDefaults(TypedDict):
     fail_on_error: bool
     sandbox_cleanup: bool
     log_samples: bool
+    log_realtime: bool
    log_images: bool
    score_display: bool
 
@@ -53,6 +55,7 @@ def eval_config_defaults() -> EvalConfigDefaults:
         "fail_on_error": True,
         "sandbox_cleanup": True,
         "log_samples": True,
+        "log_realtime": True,
         "log_images": True,
         "score_display": True,
     }
@@ -120,6 +123,9 @@ class EvalConfig(BaseModel):
     log_samples: bool | None = Field(default=None)
     """Log detailed information on each sample."""
 
+    log_realtime: bool | None = Field(default=None)
+    """Log events in realtime (enables live viewing of samples in inspect view)."""
+
     log_images: bool | None = Field(default=None)
     """Log base64 encoded versions of images."""
 
@@ -161,6 +167,70 @@ class EvalSampleLimit(BaseModel):
     """The limit value"""
 
 
+class EvalSampleSummary(BaseModel):
+    """Summary information (including scoring) for a sample."""
+
+    id: int | str
+    """Unique id for sample."""
+
+    epoch: int
+    """Epoch number for sample."""
+
+    input: str | list[ChatMessage]
+    """Sample input (text inputs only)."""
+
+    target: str | list[str]
+    """Sample target value(s)"""
+
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    """Sample metadata (scalar types only, strings truncated to 1k)."""
+
+    scores: dict[str, Score] | None = Field(default=None)
+    """Scores for sample (score values only, no answers, explanations, or metadata)."""
+
+    model_usage: dict[str, ModelUsage] = Field(default_factory=dict)
+    """Model token usage for sample."""
+
+    total_time: float | None = Field(default=None)
+    """Total time that the sample was running."""
+
+    working_time: float | None = Field(default=None)
+    """Time spent working (model generation, sandbox calls, etc.)"""
+
+    uuid: str | None = Field(default=None)
+    """Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)"""
+
+    error: str | None = Field(default=None)
+    """Error that halted sample."""
+
+    limit: str | None = Field(default=None)
+    """Limit that halted the sample"""
+
+    retries: int | None = Field(default=None)
+    """Number of retries for the sample."""
+
+    completed: bool = Field(default=False)
+    """Is the sample complete."""
+
+    @model_validator(mode="after")
+    def thin_data(self) -> "EvalSampleSummary":
+        # thin input
+        self.input = text_input_only(self.input)
+
+        # thin metadata
+        self.metadata = thin_metadata(self.metadata)
+
+        # thin score explanations and metadata
+        if self.scores is not None:
+            self.scores = {
+                key: Score(value=score.value) for key, score in self.scores.items()
+            }
+        return self
+
+    # allow field model_usage
+    model_config = ConfigDict(protected_namespaces=())
+
+
 class EvalSample(BaseModel):
     """Sample from evaluation task."""
 
@@ -271,6 +341,35 @@ class EvalSample(BaseModel):
     limit: EvalSampleLimit | None = Field(default=None)
     """The limit that halted the sample"""
 
+    def summary(self) -> EvalSampleSummary:
+        """Summary of sample.
+
+        The summary excludes potentially large fields like messages, output,
+        events, store, and metadata so that it is always fast to load.
+
+        If there are images, audio, or video in the input, they are
+        replaced with a placeholder.
+
+        Returns:
+           Summary of sample.
+        """
+        return EvalSampleSummary(
+            id=self.id,
+            epoch=self.epoch,
+            input=self.input,
+            target=self.target,
+            metadata=self.metadata,
+            scores=self.scores,
+            model_usage=self.model_usage,
+            total_time=self.total_time,
+            working_time=self.working_time,
+            uuid=self.uuid,
+            error=self.error.message if self.error is not None else None,
+            limit=f"{self.limit.type}" if self.limit is not None else None,
+            retries=len(self.error_retries) if self.error_retries is not None else None,
+            completed=True,
+        )
+
     # deprecated properties
 
     @property
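Because `thin_data` runs as an "after" model validator, every `EvalSampleSummary` is thinned on construction, not only those produced by `EvalSample.summary()`. A small sketch of that behavior (assuming `EvalSampleSummary` is among the new `inspect_ai.log` exports, and using `Score` from `inspect_ai.scorer`; the field values are illustrative):

from inspect_ai.log import EvalSampleSummary
from inspect_ai.scorer import Score

summary = EvalSampleSummary(
    id=1,
    epoch=1,
    input="What is 2 + 2?",
    target="4",
    scores={"match": Score(value=1, answer="4", explanation="exact match")},
)

# the validator keeps only score values, so answer and explanation are dropped
assert summary.scores is not None
assert summary.scores["match"].explanation is None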
inspect_ai/log/_recorders/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from .._log import EvalSampleSummary
 from .create import (
     create_recorder_for_format,
     create_recorder_for_location,
@@ -7,6 +8,7 @@ from .create import (
 from .recorder import Recorder
 
 __all__ = [
+    "EvalSampleSummary",
     "Recorder",
     "create_recorder_for_format",
     "create_recorder_for_location",
inspect_ai/log/_recorders/buffer/database.py
CHANGED
@@ -26,7 +26,8 @@ from ..._condense import (
     walk_input,
     walk_json_dict,
 )
-from ..types import SampleEvent, SampleSummary
+from ..._log import EvalSampleSummary
+from ..types import SampleEvent
 from .filestore import (
     Manifest,
     SampleBufferFilestore,
@@ -141,7 +142,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
         self._sync_time = time.monotonic()
 
-    def start_sample(self, sample: SampleSummary) -> None:
+    def start_sample(self, sample: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             sample = self._consense_sample(conn, sample)
             conn.execute(
@@ -177,7 +178,7 @@ class SampleBufferDatabase(SampleBuffer):
             # Insert all rows
             conn.execute(sql, values)
 
-    def complete_sample(self, summary: SampleSummary) -> None:
+    def complete_sample(self, summary: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             summary = self._consense_sample(conn, summary)
             conn.execute(
@@ -307,9 +308,9 @@ class SampleBufferDatabase(SampleBuffer):
         conn.execute("PRAGMA foreign_keys = ON")
 
         # concurrency setup
-        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA journal_mode=MEMORY")
         conn.execute("PRAGMA busy_timeout=10000")
-        conn.execute("PRAGMA synchronous=NORMAL")
+        conn.execute("PRAGMA synchronous=OFF")
 
         # do work
         yield conn
@@ -359,7 +360,7 @@ class SampleBufferDatabase(SampleBuffer):
 
     def _get_samples(
         self, conn: Connection, resolve_attachments: bool = False
-    ) -> Iterator[SampleSummary]:
+    ) -> Iterator[EvalSampleSummary]:
         cursor = conn.execute(
             """
             SELECT s.data as sample_data
@@ -369,7 +370,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
 
         for row in cursor:
-            summary = SampleSummary.model_validate_json(row["sample_data"])
+            summary = EvalSampleSummary.model_validate_json(row["sample_data"])
             if resolve_attachments:
                 summary = self._resolve_sample_attachments(conn, summary)
             yield summary
@@ -437,8 +438,8 @@ class SampleBufferDatabase(SampleBuffer):
         )
 
     def _consense_sample(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         # alias attachments
         attachments: dict[str, str] = {}
         sample = sample.model_copy(
@@ -456,8 +457,8 @@ class SampleBufferDatabase(SampleBuffer):
         return sample
 
     def _resolve_sample_attachments(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         return sample.model_copy(
             update={
                 "input": walk_input(
inspect_ai/log/_recorders/buffer/filestore.py
CHANGED
@@ -14,7 +14,7 @@ from inspect_ai._util.file import FileSystem, basename, dirname, file, filesystem
 from inspect_ai._util.json import to_json_safe, to_json_str_safe
 from inspect_ai.log._file import read_eval_log
 
-from ..types import SampleSummary
+from ..._log import EvalSampleSummary
 from .types import SampleBuffer, SampleData, Samples
 
 logger = getLogger(__name__)
@@ -33,7 +33,7 @@ class SegmentFile(BaseModel):
 
 
 class SampleManifest(BaseModel):
-    summary: SampleSummary
+    summary: EvalSampleSummary
     segments: list[int] = Field(default_factory=list)
 
 
inspect_ai/log/_recorders/buffer/types.py
CHANGED
@@ -5,13 +5,13 @@ from pydantic import BaseModel, JsonValue
 
 from inspect_ai._display.core.display import TaskDisplayMetric
 
-from ..types import SampleSummary
+from ..._log import EvalSampleSummary
 
 JsonData: TypeAlias = dict[str, JsonValue]
 
 
 class Samples(BaseModel):
-    samples: list[SampleSummary]
+    samples: list[EvalSampleSummary]
     metrics: list[TaskDisplayMetric]
     refresh: int
     etag: str
inspect_ai/log/_recorders/eval.py
CHANGED
@@ -11,18 +11,10 @@ from pydantic_core import to_json
 from typing_extensions import override
 
 from inspect_ai._util.constants import DESERIALIZING_CONTEXT, LOG_SCHEMA_VERSION
-from inspect_ai._util.content import (
-    ContentAudio,
-    ContentImage,
-    ContentReasoning,
-    ContentText,
-    ContentVideo,
-)
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.file import FileSystem, dirname, file, filesystem
 from inspect_ai._util.json import jsonable_python
 from inspect_ai._util.trace import trace_action
-from inspect_ai.model._chat_message import ChatMessage
 
 from .._log import (
     EvalLog,
@@ -30,12 +22,12 @@ from .._log import (
     EvalResults,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleSummary,
     EvalSpec,
     EvalStats,
     sort_samples,
 )
 from .file import FileRecorder
-from .types import SampleSummary
 
 logger = getLogger(__name__)
 
@@ -222,6 +214,15 @@ class EvalRecorder(FileRecorder):
                 f"Sample id {id} for epoch {epoch} not found in log {location}"
             )
 
+    @classmethod
+    @override
+    async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
+        with file(location, "rb") as z:
+            with ZipFile(z, mode="r") as zip:
+                summary_counter = _read_summary_counter(zip)
+                summaries = _read_all_summaries(zip, summary_counter)
+        return summaries
+
     @classmethod
     @override
     async def write_log(cls, location: str, log: EvalLog) -> None:
@@ -236,36 +237,6 @@ class EvalRecorder(FileRecorder):
         )
 
 
-def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
-    # Clean the input of any images
-    if isinstance(inputs, list):
-        input: list[ChatMessage] = []
-        for message in inputs:
-            if not isinstance(message.content, str):
-                filtered_content: list[
-                    ContentText
-                    | ContentReasoning
-                    | ContentImage
-                    | ContentAudio
-                    | ContentVideo
-                ] = []
-                for content in message.content:
-                    if content.type == "text":
-                        filtered_content.append(content)
-                    else:
-                        filtered_content.append(
-                            ContentText(text=f"({content.type.capitalize()})")
-                        )
-                message.content = filtered_content
-                input.append(message)
-            else:
-                input.append(message)
-
-        return input
-    else:
-        return inputs
-
-
 class ZipLogFile:
     _zip: ZipFile | None
     _temp_file: BinaryIO
@@ -273,19 +244,20 @@ class ZipLogFile:
 
     def __init__(self, file: str) -> None:
         self._file = file
+        self._zip = None
         self._fs = filesystem(file)
         self._lock = anyio.Lock()
         self._temp_file = tempfile.TemporaryFile()
         self._samples: list[EvalSample] = []
         self._summary_counter = 0
-        self._summaries: list[SampleSummary] = []
+        self._summaries: list[EvalSampleSummary] = []
         self._log_start: LogStart | None = None
 
     async def init(
         self,
         log_start: LogStart | None,
        summary_counter: int,
-        summaries: list[SampleSummary],
+        summaries: list[EvalSampleSummary],
     ) -> None:
         async with self._lock:
             self._open()
@@ -309,31 +281,14 @@ class ZipLogFile:
     async def write_buffered_samples(self) -> None:
         async with self._lock:
             # Write the buffered samples
-            summaries: list[SampleSummary] = []
+            summaries: list[EvalSampleSummary] = []
             for sample in self._samples:
                 # Write the sample
                 self._zip_writestr(_sample_filename(sample.id, sample.epoch), sample)
 
                 # Capture the summary
-                summaries.append(
-                    SampleSummary(
-                        id=sample.id,
-                        epoch=sample.epoch,
-                        input=text_inputs(sample.input),
-                        target=sample.target,
-                        completed=True,
-                        scores=sample.scores,
-                        error=sample.error.message
-                        if sample.error is not None
-                        else None,
-                        limit=f"{sample.limit.type}"
-                        if sample.limit is not None
-                        else None,
-                        retries=len(sample.error_retries)
-                        if sample.error_retries is not None
-                        else None,
-                    )
-                )
+                summaries.append(sample.summary())
 
             self._samples.clear()
 
             # write intermediary summaries and add to master list
@@ -451,12 +406,12 @@ def _read_summary_counter(zip: ZipFile) -> int:
     return current_count
 
 
-def _read_all_summaries(zip: ZipFile, count: int) -> list[SampleSummary]:
+def _read_all_summaries(zip: ZipFile, count: int) -> list[EvalSampleSummary]:
     if SUMMARIES_JSON in zip.namelist():
         summaries_raw = _read_json(zip, SUMMARIES_JSON)
         if isinstance(summaries_raw, list):
             return [
-                SampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
+                EvalSampleSummary.model_validate(value, context=DESERIALIZING_CONTEXT)
                 for value in summaries_raw
             ]
         else:
@@ -464,7 +419,7 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[EvalSampleSummary]:
                 f"Expected a list of summaries when reading {SUMMARIES_JSON}"
             )
     else:
-        summaries: list[SampleSummary] = []
+        summaries: list[EvalSampleSummary] = []
         for i in range(1, count):
             summary_file = _journal_summary_file(i)
             summary_path = _journal_summary_path(summary_file)
@@ -472,7 +427,7 @@ def _read_all_summaries(zip: ZipFile, count: int) -> list[EvalSampleSummary]:
             if isinstance(summary, list):
                 summaries.extend(
                     [
-                        SampleSummary.model_validate(
+                        EvalSampleSummary.model_validate(
                            value, context=DESERIALIZING_CONTEXT
                        )
                        for value in summary
inspect_ai/log/_recorders/file.py
CHANGED
@@ -8,7 +8,7 @@ from inspect_ai._util.constants import MODEL_NONE
 from inspect_ai._util.file import filesystem
 from inspect_ai._util.registry import registry_unqualified_name
 
-from .._log import EvalLog, EvalSample, EvalSpec
+from .._log import EvalLog, EvalSample, EvalSampleSummary, EvalSpec
 from .recorder import Recorder
 
 logger = getLogger(__name__)
@@ -40,11 +40,7 @@ class FileRecorder(Recorder):
         cls, location: str, id: str | int, epoch: int = 1
     ) -> EvalSample:
         # establish the log to read from (might be cached)
-        if cls.__last_read_sample_log and (cls.__last_read_sample_log[0] == location):
-            eval_log = cls.__last_read_sample_log[1]
-        else:
-            eval_log = await cls.read_log(location)
-            cls.__last_read_sample_log = (location, eval_log)
+        eval_log = await cls._log_file_maybe_cached(location)
 
         # throw if no samples
         if not eval_log.samples:
@@ -66,6 +62,32 @@ class FileRecorder(Recorder):
         else:
             return eval_sample
 
+    @classmethod
+    @override
+    async def read_log_sample_summaries(cls, location: str) -> list[EvalSampleSummary]:
+        # establish the log to read from (might be cached)
+        eval_log = await cls._log_file_maybe_cached(location)
+
+        # throw if no samples
+        if not eval_log.samples:
+            raise IndexError(f"No samples found in log {location}")
+
+        summaries: list[EvalSampleSummary] = []
+        for sample in eval_log.samples:
+            summaries.append(sample.summary())
+
+        return summaries
+
+    @classmethod
+    async def _log_file_maybe_cached(cls, location: str) -> EvalLog:
+        # establish the log to read from (might be cached)
+        if cls.__last_read_sample_log and (cls.__last_read_sample_log[0] == location):
+            eval_log = cls.__last_read_sample_log[1]
+        else:
+            eval_log = await cls.read_log(location)
+            cls.__last_read_sample_log = (location, eval_log)
+        return eval_log
+
     def _log_file_key(self, eval: EvalSpec) -> str:
         # clean underscores, slashes, and : from the log file key (so we can reliably parse it
         # later without worrying about underscores)
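`FileRecorder` has no summary index to draw on, so it derives summaries by reading the whole log and calling the new `EvalSample.summary()` on each sample. The same pattern works directly against the public API, which can be useful when you already need the full `EvalLog` anyway (log path illustrative):

from inspect_ai.log import read_eval_log

log = read_eval_log("logs/example.eval")

# equivalent to what FileRecorder.read_log_sample_summaries returns
summaries = [sample.summary() for sample in (log.samples or [])]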
inspect_ai/log/_recorders/recorder.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.log._log import (
     EvalResults,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleSummary,
     EvalSpec,
     EvalStats,
 )
@@ -57,6 +58,12 @@ class Recorder(abc.ABC):
         cls, location: str, id: str | int, epoch: int = 1
     ) -> EvalSample: ...
 
+    @classmethod
+    @abc.abstractmethod
+    async def read_log_sample_summaries(
+        cls, location: str
+    ) -> list[EvalSampleSummary]: ...
+
     @classmethod
     @abc.abstractmethod
     async def write_log(cls, location: str, log: EvalLog) -> None: ...
inspect_ai/log/_recorders/types.py
CHANGED
@@ -1,31 +1,9 @@
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel
 
 from inspect_ai.log._transcript import Event
-from inspect_ai.model._chat_message import ChatMessage
-from inspect_ai.scorer._metric import Score
 
 
 class SampleEvent(BaseModel):
     id: str | int
     epoch: int
     event: Event
-
-
-class SampleSummary(BaseModel):
-    id: int | str
-    epoch: int
-    input: str | list[ChatMessage]
-    target: str | list[str]
-    completed: bool = Field(default=False)
-    scores: dict[str, Score] | None = Field(default=None)
-    error: str | None = Field(default=None)
-    limit: str | None = Field(default=None)
-    retries: int | None = Field(default=None)
-
-    @model_validator(mode="after")
-    def thin_scores(self) -> "SampleSummary":
-        if self.scores is not None:
-            self.scores = {
-                key: Score(value=score.value) for key, score in self.scores.items()
-            }
-        return self
inspect_ai/log/_samples.py
CHANGED
@@ -119,14 +119,6 @@ def sample_active() -> ActiveSample | None:
     return _sample_active.get(None)
 
 
-def active_sample_token_limit() -> int | None:
-    active = sample_active()
-    if active:
-        return active.token_limit
-    else:
-        return None
-
-
 def set_active_sample_token_limit(token_limit: int | None) -> None:
     active = sample_active()
     if active:
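`active_sample_token_limit` is removed here as part of the limits refactor: per-scope limit state now lives in the new `inspect_ai/util/_limit.py` module listed above (note also the removal of `inspect_ai/solver/_limit.py`). A sketch of the scoped-limit pattern that module appears to enable; the exact exported names (`token_limit`, `LimitExceededError`) are an assumption based on the new `util/__init__.py` exports:

from inspect_ai.util import LimitExceededError, token_limit

async def agent_loop() -> None:
    try:
        # assumed API: model generations inside this scope count against the limit
        with token_limit(50_000):
            ...
    except LimitExceededError as ex:
        print(f"sample halted by {ex.type} limit")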
inspect_ai/log/_transcript.py
CHANGED
@@ -14,7 +14,13 @@ from typing import (
     Union,
 )
 
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    JsonValue,
+    field_serializer,
+)
 from shortuuid import uuid
 
 from inspect_ai._util.constants import SAMPLE_SUBTASK