inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/eval.py +35 -2
- inspect_ai/_cli/util.py +44 -1
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +13 -4
- inspect_ai/_display/core/results.py +1 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +8 -8
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +74 -25
- inspect_ai/_eval/evalset.py +22 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +13 -15
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +55 -6
- inspect_ai/_eval/task/run.py +51 -10
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/file.py +30 -1
- inspect_ai/_util/json.py +37 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/server.py +113 -1
- inspect_ai/_view/www/App.css +7 -1
- inspect_ai/_view/www/dist/assets/index.css +813 -415
- inspect_ai/_view/www/dist/assets/index.js +54475 -32003
- inspect_ai/_view/www/eslint.config.mjs +1 -1
- inspect_ai/_view/www/log-schema.json +137 -31
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/package.json +11 -2
- inspect_ai/_view/www/src/App.tsx +161 -853
- inspect_ai/_view/www/src/api/api-browser.ts +176 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
- inspect_ai/_view/www/src/api/client-api.ts +66 -10
- inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
- inspect_ai/_view/www/src/api/types.ts +107 -2
- inspect_ai/_view/www/src/appearance/icons.ts +2 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
- inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
- inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
- inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
- inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
- inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
- inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
- inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
- inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
- inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
- inspect_ai/_view/www/src/index.tsx +26 -94
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
- inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
- inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
- inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
- inspect_ai/_view/www/src/scoring/utils.ts +87 -0
- inspect_ai/_view/www/src/state/appSlice.ts +244 -0
- inspect_ai/_view/www/src/state/hooks.ts +399 -0
- inspect_ai/_view/www/src/state/logPolling.ts +200 -0
- inspect_ai/_view/www/src/state/logSlice.ts +224 -0
- inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
- inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
- inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
- inspect_ai/_view/www/src/state/scrolling.ts +206 -0
- inspect_ai/_view/www/src/state/store.ts +168 -0
- inspect_ai/_view/www/src/state/store_filter.ts +84 -0
- inspect_ai/_view/www/src/state/utils.ts +23 -0
- inspect_ai/_view/www/src/storage/index.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +36 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/types.ts +94 -32
- inspect_ai/_view/www/src/utils/attachments.ts +58 -23
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/utils/logger.ts +52 -0
- inspect_ai/_view/www/src/utils/polling.ts +100 -0
- inspect_ai/_view/www/src/utils/react.ts +30 -0
- inspect_ai/_view/www/src/utils/vscode.ts +1 -1
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
- inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
- inspect_ai/_view/www/src/workspace/types.ts +4 -3
- inspect_ai/_view/www/src/workspace/utils.ts +4 -4
- inspect_ai/_view/www/vite.config.js +6 -0
- inspect_ai/_view/www/yarn.lock +464 -355
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_condense.py +26 -0
- inspect_ai/log/_log.py +17 -5
- inspect_ai/log/_recorders/buffer/__init__.py +14 -0
- inspect_ai/log/_recorders/buffer/buffer.py +30 -0
- inspect_ai/log/_recorders/buffer/database.py +685 -0
- inspect_ai/log/_recorders/buffer/filestore.py +259 -0
- inspect_ai/log/_recorders/buffer/types.py +84 -0
- inspect_ai/log/_recorders/eval.py +2 -11
- inspect_ai/log/_recorders/types.py +30 -0
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +257 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +114 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +4 -4
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- inspect_ai/util/_subtask.py +1 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,259 @@
|
|
1
|
+
import os
|
2
|
+
import tempfile
|
3
|
+
from logging import getLogger
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Literal
|
6
|
+
from zipfile import ZIP_DEFLATED, ZipFile
|
7
|
+
|
8
|
+
from pydantic import BaseModel, Field
|
9
|
+
from typing_extensions import override
|
10
|
+
|
11
|
+
from inspect_ai._display.core.display import TaskDisplayMetric
|
12
|
+
from inspect_ai._util.constants import DEFAULT_LOG_SHARED, EVAL_LOG_FORMAT
|
13
|
+
from inspect_ai._util.file import FileSystem, basename, dirname, file, filesystem
|
14
|
+
from inspect_ai._util.json import to_json_safe, to_json_str_safe
|
15
|
+
from inspect_ai.log._file import read_eval_log
|
16
|
+
|
17
|
+
from ..types import SampleSummary
|
18
|
+
from .types import SampleBuffer, SampleData, Samples
|
19
|
+
|
20
|
+
logger = getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class Segment(BaseModel):
    """Manifest record describing one segment of the filestore buffer."""

    # segment id; `segment_name()` turns it into the `segment.{id}.zip` file name
    id: int
    # compared with `after_event_id` in `get_sample_data()` to decide whether
    # this segment needs to be read
    last_event_id: int
    # compared with `after_attachment_id` in `get_sample_data()` in the same way
    last_attachment_id: int
|
27
|
+
|
28
|
+
|
29
|
+
class SegmentFile(BaseModel):
    """Data for one sample/epoch that is written as a member of a segment zip."""

    # id and epoch together determine the zip member name (see `segment_file_name()`)
    id: str | int
    epoch: int
    # payload serialized to JSON when the segment is written
    data: SampleData
|
33
|
+
|
34
|
+
|
35
|
+
class SampleManifest(BaseModel):
    """Manifest entry for a single sample."""

    # summary returned to callers of `get_samples()`
    summary: SampleSummary
    # ids of the segments that `get_sample_data()` consults for this sample
    segments: list[int] = Field(default_factory=list)
|
38
|
+
|
39
|
+
|
40
|
+
class Manifest(BaseModel):
    """Top-level index of the buffer (serialized by `write_manifest()`)."""

    # task display metrics, passed through to `Samples.metrics`
    metrics: list[TaskDisplayMetric] = Field(default_factory=list)
    # one entry per sample known to the buffer
    samples: list[SampleManifest] = Field(default_factory=list)
    # descriptors of the segments referenced by `SampleManifest.segments`
    segments: list[Segment] = Field(default_factory=list)
|
44
|
+
|
45
|
+
|
46
|
+
MANIFEST = "manifest.json"
|
47
|
+
|
48
|
+
|
49
|
+
class SampleBufferFilestore(SampleBuffer):
    """Sample buffer kept as a JSON manifest plus zipped segments on a filesystem.

    The buffer for a log file lives in
    `<log dir>/.buffer/<log file name without extension>/` and contains a
    `manifest.json` file together with `segment.{id}.zip` files.
    """

    def __init__(
        self,
        location: str,
        *,
        create: bool = True,
        update_interval: int = DEFAULT_LOG_SHARED,
    ) -> None:
        """Bind the buffer to the log file at `location`.

        Args:
            location: Location of the log file this buffer belongs to.
            create: When True, create the buffer directory.
            update_interval: Value reported as `refresh` by `get_samples()`.
        """
        self._fs = filesystem(location)
        buffer_root = sample_buffer_dir(dirname(location), self._fs)
        log_name = os.path.splitext(basename(location))[0]
        # note: `_dir` always ends with the filesystem separator
        self._dir = f"{buffer_root}{self._fs.sep}{log_name}{self._fs.sep}"
        self.update_interval = update_interval

        if create:
            self._fs.mkdir(self._dir, exist_ok=True)

            # place a file in the dir to force it to be created
            self._fs.touch(f"{self._dir}.keep")

    def write_manifest(self, manifest: Manifest) -> None:
        """Serialize `manifest` to the manifest file in the buffer directory."""
        with file(self._manifest_file(), "wb") as f:
            f.write(to_json_safe(manifest))

    def write_segment(self, id: int, files: list[SegmentFile]) -> None:
        """Write a segment zip holding one JSON member per entry in `files`.

        The zip is first built in a local temporary file and then copied to
        `segment.{id}.zip` in the buffer directory. The temporary file is
        removed in every case, including when building the zip fails.

        Args:
            id: Segment id (determines the target file name).
            files: Sample data to store in the segment.
        """
        name: str | None = None
        try:
            # build the zip locally (delete=False so it can be reopened by name)
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as segment_file:
                name = segment_file.name
                with ZipFile(
                    segment_file, mode="w", compression=ZIP_DEFLATED, compresslevel=5
                ) as archive:
                    for sf in files:
                        archive.writestr(
                            segment_file_name(sf.id, sf.epoch),
                            to_json_str_safe(sf.data),
                        )
                segment_file.flush()
                os.fsync(segment_file.fileno())

            # copy the completed zip into the buffer directory
            with open(name, "rb") as zf:
                with file(f"{self._dir}{segment_name(id)}", "wb") as f:
                    f.write(zf.read())
                    f.flush()
        finally:
            # the temp file was created with delete=False, so remove it ourselves
            # (previously it was leaked if writing the zip raised)
            if name is not None:
                os.unlink(name)

    def read_manifest(self) -> Manifest | None:
        """Read and validate the manifest; return None if the file does not exist."""
        try:
            with file(self._manifest_file(), "r") as f:
                contents = f.read()
                return Manifest.model_validate_json(contents)
        except FileNotFoundError:
            return None

    def read_segment_data(
        self, id: int, sample_id: str | int, epoch_id: int
    ) -> SampleData:
        """Read the data stored for one sample/epoch from segment `id`.

        Args:
            id: Segment id.
            sample_id: Sample id used to derive the zip member name.
            epoch_id: Epoch used to derive the zip member name.

        Returns:
            The validated `SampleData` stored in the zip member.
        """
        segment_file = f"{self._dir}{segment_name(id)}"
        with file(segment_file, "rb") as f:
            with ZipFile(f, mode="r") as archive:
                with archive.open(segment_file_name(sample_id, epoch_id), "r") as sf:
                    return SampleData.model_validate_json(sf.read())

    def cleanup(self) -> None:
        """Remove this buffer's directory (errors are logged, not raised)."""
        cleanup_sample_buffer_filestore(self._dir, self._fs)

    @classmethod
    @override
    def running_tasks(cls, log_dir: str) -> list[str] | None:
        """List log file names that have a buffer directory under `log_dir`.

        Returns:
            One `<name>.<EVAL_LOG_FORMAT>` entry per buffer sub-directory, or
            None if there is no buffer directory.
        """
        # NOTE(review): this goes through pathlib rather than `filesystem()`,
        # so it only inspects local paths — confirm whether remote log dirs
        # are expected to be handled here.
        buffer_dir = Path(sample_buffer_dir(log_dir))
        if buffer_dir.exists():
            return [
                f"{basename(path.name)}.{EVAL_LOG_FORMAT}"
                for path in buffer_dir.iterdir()
                if path.is_dir()
            ]
        else:
            return None

    @override
    def get_samples(
        self, etag: str | None = None
    ) -> Samples | Literal["NotModified"] | None:
        """Return sample summaries and metrics from the manifest.

        Args:
            etag: Etag from a previous call, used to detect an unchanged manifest.

        Returns:
            - `Samples` if the manifest exists and differs from `etag`
            - "NotModified" if the manifest's etag equals `etag`
            - None if the manifest file does not exist
        """
        # get the etag on the filestore (fall back to mtime + size)
        try:
            info = self._fs.info(self._manifest_file())
            fs_etag = info.etag or f"{info.mtime}{info.size}"
        except FileNotFoundError:
            return None

        # if the etag matches then return not modified
        if etag == fs_etag:
            return "NotModified"

        # read the manifest
        manifest = self.read_manifest()
        if manifest is None:
            return None

        # provide samples + etag from the manifest
        return Samples(
            samples=[sm.summary for sm in manifest.samples],
            metrics=manifest.metrics,
            refresh=self.update_interval,
            etag=fs_etag,
        )

    @override
    def get_sample_data(
        self,
        id: str | int,
        epoch: int,
        after_event_id: int | None = None,
        after_attachment_id: int | None = None,
    ) -> SampleData | None:
        """Collect event and attachment data for a sample from its segments.

        Only segments whose `last_event_id` is greater than `after_event_id`
        or whose `last_attachment_id` is greater than `after_attachment_id`
        are read; the full contents of each such segment are returned.

        Args:
            id: Sample id.
            epoch: Sample epoch.
            after_event_id: Optional. Skip segments with no events after this id.
            after_attachment_id: Optional. Skip segments with no attachments
                after this id.

        Returns:
            - `SampleData` with the events and attachments that were read
            - None if there is no manifest or the sample is not in it
        """
        # read the manifest
        manifest = self.read_manifest()
        if manifest is None:
            return None

        # find this sample in the manifest
        sample = next(
            (
                sample
                for sample in manifest.samples
                if sample.summary.id == id and sample.summary.epoch == epoch
            ),
            None,
        )
        if sample is None:
            return None

        # only substitute the default when no id was given (an explicit 0 is
        # a real id and must not be treated as "not provided")
        if after_event_id is None:
            after_event_id = -1
        if after_attachment_id is None:
            after_attachment_id = -1

        # determine which segments we need to return in order to
        # satisfy the after_event_id and after_attachment_id
        segments = [
            segment
            for segment in manifest.segments
            if segment.id in sample.segments
            and (
                segment.last_event_id > after_event_id
                or segment.last_attachment_id > after_attachment_id
            )
        ]

        # collect data from the segments
        sample_data = SampleData(events=[], attachments=[])
        for segment in segments:
            data = self.read_segment_data(segment.id, id, epoch)
            sample_data.events.extend(data.events)
            sample_data.attachments.extend(data.attachments)

        return sample_data

    def _manifest_file(self) -> str:
        """Return the location of the manifest file inside the buffer directory."""
        return f"{self._dir}{MANIFEST}"
|
206
|
+
|
207
|
+
|
208
|
+
def cleanup_sample_buffer_filestores(log_dir: str) -> None:
    """Remove buffer directories in `log_dir` that have no running eval.

    A buffer directory is removed when its `.eval` log file is missing or
    when the log header's status is anything other than "started". The
    `.buffer` directory itself is removed afterwards if it is empty.

    Args:
        log_dir: Log directory whose `.buffer` sub-directory is examined.
    """
    fs = filesystem(log_dir)
    buffer_dir = sample_buffer_dir(log_dir, fs)

    # list the per-log buffer dirs (nothing to do without a buffer dir)
    try:
        entries = fs.ls(buffer_dir)
    except FileNotFoundError:
        return
    candidates = [entry for entry in entries if entry.type == "directory"]

    for candidate in candidates:
        eval_file = f"{log_dir}{fs.sep}{basename(candidate.name)}.{EVAL_LOG_FORMAT}"
        # stale == the log is gone or is no longer running
        try:
            header = read_eval_log(eval_file, header_only=True)
            stale = header.status != "started"
        except FileNotFoundError:
            stale = True

        if stale:
            cleanup_sample_buffer_filestore(candidate.name, fs)

    # drop the .buffer dir once nothing is left in it
    try:
        remaining = fs.ls(buffer_dir)
        if len(remaining) == 0:
            fs.rm(buffer_dir, recursive=True)
    except FileNotFoundError:
        pass
|
237
|
+
|
238
|
+
|
239
|
+
def cleanup_sample_buffer_filestore(buffer_dir: str, fs: FileSystem) -> None:
    """Recursively delete `buffer_dir`, logging a warning instead of raising.

    Args:
        buffer_dir: Directory to remove.
        fs: Filesystem used to perform the removal.
    """
    try:
        fs.rm(buffer_dir, recursive=True)
    except Exception as ex:
        # best effort: a failed cleanup must not interrupt the caller
        message = f"Error cleaning up sample buffer database at {buffer_dir}: {ex}"
        logger.warning(message)
|
246
|
+
|
247
|
+
|
248
|
+
def segment_name(id: int) -> str:
    """Return the zip file name used for segment `id`."""
    return "segment.{}.zip".format(id)
|
250
|
+
|
251
|
+
|
252
|
+
def segment_file_name(id: str | int, epoch: int) -> str:
    """Return the zip member name for a sample id and epoch."""
    return "{}_{}.json".format(id, epoch)
|
254
|
+
|
255
|
+
|
256
|
+
def sample_buffer_dir(log_dir: str, fs: FileSystem | None = None) -> str:
    """Return the `.buffer` directory location for `log_dir`.

    Args:
        log_dir: Log directory (trailing `/` and `\\` characters are ignored).
        fs: Filesystem providing the path separator; resolved from `log_dir`
            when not supplied.
    """
    trimmed = log_dir.rstrip("/\\")
    if not fs:
        fs = filesystem(trimmed)
    separator = fs.sep
    return f"{trimmed}{separator}.buffer"
|
@@ -0,0 +1,84 @@
|
|
1
|
+
import abc
|
2
|
+
from typing import Literal, TypeAlias
|
3
|
+
|
4
|
+
from pydantic import BaseModel, JsonValue
|
5
|
+
|
6
|
+
from inspect_ai._display.core.display import TaskDisplayMetric
|
7
|
+
|
8
|
+
from ..types import SampleSummary
|
9
|
+
|
10
|
+
JsonData: TypeAlias = dict[str, JsonValue]
|
11
|
+
|
12
|
+
|
13
|
+
class Samples(BaseModel):
    """Payload returned by `SampleBuffer.get_samples()` when there are updates."""

    # summaries of the samples
    samples: list[SampleSummary]
    # task display metrics
    metrics: list[TaskDisplayMetric]
    # NOTE(review): presumably a refresh/polling interval hint for clients
    # (units are not visible here) — confirm
    refresh: int
    # etag that can be passed back to `get_samples()` to check for changes
    etag: str
|
18
|
+
|
19
|
+
|
20
|
+
class EventData(BaseModel):
    """A single event record, tagged with the sample id and epoch it belongs to."""

    # NOTE(review): presumably the integer id that `after_event_id` refers
    # to in `get_sample_data()` — confirm against the writer
    id: int
    # NOTE(review): presumably the event's own string identifier — confirm
    event_id: str
    sample_id: str
    epoch: int
    # event payload as a JSON object
    event: JsonData
|
26
|
+
|
27
|
+
|
28
|
+
class AttachmentData(BaseModel):
    """A single attachment record, tagged with the sample id and epoch it belongs to."""

    # NOTE(review): presumably the integer id that `after_attachment_id`
    # refers to in `get_sample_data()` — confirm against the writer
    id: int
    sample_id: str
    epoch: int
    # NOTE(review): presumably a hash of `content` — confirm
    hash: str
    content: str
|
34
|
+
|
35
|
+
|
36
|
+
class SampleData(BaseModel):
    """Event and attachment data for a sample (see `SampleBuffer.get_sample_data()`)."""

    events: list[EventData]
    attachments: list[AttachmentData]
|
39
|
+
|
40
|
+
|
41
|
+
class SampleBuffer(abc.ABC):
    """Abstract interface for reading buffered data about running samples."""

    @classmethod
    @abc.abstractmethod
    def running_tasks(cls, log_dir: str) -> list[str] | None:
        """Return the running tasks found for `log_dir` (None when unavailable)."""
        ...

    @abc.abstractmethod
    def get_samples(
        self, etag: str | None = None
    ) -> Samples | Literal["NotModified"] | None:
        """Get the manifest of all running samples.

        Args:
          etag: Optional etag (returned in `Samples`) for checking
            whether there are any changes in the database.

        Returns:
          - `Samples` if the database exists and has updates
          - "NotModified" if the database exists and has no updates.
          - None if the database no longer exists

        """
        ...

    @abc.abstractmethod
    def get_sample_data(
        self,
        id: str | int,
        epoch: int,
        after_event_id: int | None = None,
        after_attachment_id: int | None = None,
    ) -> SampleData | None:
        """Get event and attachment data for a sample.

        Args:
          id: Sample id
          epoch: Sample epoch
          after_event_id: Optional. Fetch only event data greater than this id.
          after_attachment_id: Optional. Fetch only attachment data greater than this id.

        Returns:
          - `SampleData` with event and attachment data.
          - None if the database no longer exists
        """
        ...
|
@@ -23,7 +23,6 @@ from inspect_ai._util.file import FileSystem, dirname, file, filesystem
|
|
23
23
|
from inspect_ai._util.json import jsonable_python
|
24
24
|
from inspect_ai._util.trace import trace_action
|
25
25
|
from inspect_ai.model._chat_message import ChatMessage
|
26
|
-
from inspect_ai.scorer._metric import Score
|
27
26
|
|
28
27
|
from .._log import (
|
29
28
|
EvalLog,
|
@@ -36,20 +35,11 @@ from .._log import (
|
|
36
35
|
sort_samples,
|
37
36
|
)
|
38
37
|
from .file import FileRecorder
|
38
|
+
from .types import SampleSummary
|
39
39
|
|
40
40
|
logger = getLogger(__name__)
|
41
41
|
|
42
42
|
|
43
|
-
class SampleSummary(BaseModel):
|
44
|
-
id: int | str
|
45
|
-
epoch: int
|
46
|
-
input: str | list[ChatMessage]
|
47
|
-
target: str | list[str]
|
48
|
-
scores: dict[str, Score] | None = Field(default=None)
|
49
|
-
error: str | None = Field(default=None)
|
50
|
-
limit: str | None = Field(default=None)
|
51
|
-
|
52
|
-
|
53
43
|
class LogStart(BaseModel):
|
54
44
|
version: int
|
55
45
|
eval: EvalSpec
|
@@ -331,6 +321,7 @@ class ZipLogFile:
|
|
331
321
|
epoch=sample.epoch,
|
332
322
|
input=text_inputs(sample.input),
|
333
323
|
target=sample.target,
|
324
|
+
completed=True,
|
334
325
|
scores=sample.scores,
|
335
326
|
error=sample.error.message
|
336
327
|
if sample.error is not None
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from pydantic import BaseModel, Field, model_validator
|
2
|
+
|
3
|
+
from inspect_ai.log._transcript import Event
|
4
|
+
from inspect_ai.model._chat_message import ChatMessage
|
5
|
+
from inspect_ai.scorer._metric import Score
|
6
|
+
|
7
|
+
|
8
|
+
class SampleEvent(BaseModel):
    """An event paired with an id and an epoch."""

    # NOTE(review): presumably the id of the sample that produced the event — confirm
    id: str | int
    epoch: int
    event: Event
|
12
|
+
|
13
|
+
|
14
|
+
class SampleSummary(BaseModel):
    """Condensed description of a sample: identity, input, target and outcome."""

    id: int | str
    epoch: int
    input: str | list[ChatMessage]
    target: str | list[str]
    # defaults to False; set explicitly by producers once a sample is done
    completed: bool = Field(default=False)
    scores: dict[str, Score] | None = Field(default=None)
    error: str | None = Field(default=None)
    limit: str | None = Field(default=None)

    @model_validator(mode="after")
    def thin_scores(self) -> "SampleSummary":
        """Replace every score with a new `Score` that carries only its value."""
        if self.scores is None:
            return self

        thinned: dict[str, Score] = {}
        for name, score in self.scores.items():
            thinned[name] = Score(value=score.value)
        self.scores = thinned
        return self
|
inspect_ai/log/_transcript.py
CHANGED
@@ -15,6 +15,7 @@ from typing import (
|
|
15
15
|
)
|
16
16
|
|
17
17
|
from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
|
18
|
+
from shortuuid import uuid
|
18
19
|
|
19
20
|
from inspect_ai._util.constants import SAMPLE_SUBTASK
|
20
21
|
from inspect_ai._util.error import EvalError
|
@@ -43,6 +44,13 @@ logger = getLogger(__name__)
|
|
43
44
|
|
44
45
|
|
45
46
|
class BaseEvent(BaseModel):
|
47
|
+
model_config = {
|
48
|
+
"json_schema_extra": lambda schema: schema.get("properties", {}).pop(
|
49
|
+
"id_", None
|
50
|
+
)
|
51
|
+
}
|
52
|
+
id_: str = Field(default_factory=lambda: str(uuid()), exclude=True)
|
53
|
+
|
46
54
|
timestamp: datetime = Field(default_factory=datetime.now)
|
47
55
|
"""Clock time at which event occurred."""
|
48
56
|
|
@@ -170,8 +178,8 @@ class ToolEvent(BaseEvent):
|
|
170
178
|
arguments: dict[str, JsonValue]
|
171
179
|
"""Arguments to function."""
|
172
180
|
|
173
|
-
|
174
|
-
"""
|
181
|
+
internal: JsonValue | None = Field(default=None)
|
182
|
+
"""Model provider specific payload - typically used to aid transformation back to model types."""
|
175
183
|
|
176
184
|
view: ToolCallContent | None = Field(default=None)
|
177
185
|
"""Custom view of tool call input."""
|
@@ -194,6 +202,12 @@ class ToolEvent(BaseEvent):
|
|
194
202
|
working_time: float | None = Field(default=None)
|
195
203
|
"""Working time for tool call (i.e. time not spent waiting on semaphores)."""
|
196
204
|
|
205
|
+
agent: str | None = Field(default=None)
|
206
|
+
"""Name of agent if the tool call was an agent handoff."""
|
207
|
+
|
208
|
+
failed: bool | None = Field(default=None)
|
209
|
+
"""Did the tool call fail with a hard error?"""
|
210
|
+
|
197
211
|
def _set_result(
|
198
212
|
self,
|
199
213
|
result: ToolResult,
|
@@ -201,6 +215,8 @@ class ToolEvent(BaseEvent):
|
|
201
215
|
error: ToolCallError | None,
|
202
216
|
events: list["Event"],
|
203
217
|
waiting_time: float,
|
218
|
+
agent: str | None,
|
219
|
+
failed: bool | None,
|
204
220
|
) -> None:
|
205
221
|
self.result = result
|
206
222
|
self.truncated = truncated
|
@@ -210,6 +226,8 @@ class ToolEvent(BaseEvent):
|
|
210
226
|
completed = datetime.now()
|
211
227
|
self.completed = completed
|
212
228
|
self.working_time = (completed - self.timestamp).total_seconds() - waiting_time
|
229
|
+
self.agent = agent
|
230
|
+
self.failed = failed
|
213
231
|
|
214
232
|
# mechanism for operator to cancel the tool call
|
215
233
|
|
@@ -451,8 +469,11 @@ ET = TypeVar("ET", bound=BaseEvent)
|
|
451
469
|
class Transcript:
|
452
470
|
"""Transcript of events."""
|
453
471
|
|
472
|
+
_event_logger: Callable[[Event], None] | None
|
473
|
+
|
454
474
|
def __init__(self, name: str = "") -> None:
    """Create an empty transcript.

    Args:
        name: Name stored on the transcript (defaults to the empty string).
    """
    # Start with no recorded events and no event logger attached.
    self._events: list[Event] = []
    self._event_logger = None
    self.name = name
|
457
478
|
|
458
479
|
def info(self, data: JsonValue, *, source: str | None = None) -> None:
|
@@ -493,8 +514,17 @@ class Transcript:
|
|
493
514
|
return None
|
494
515
|
|
495
516
|
def _event(self, event: Event) -> None:
    """Record `event`, notifying the event logger first if one is set.

    Args:
        event: Event to append to this transcript's event list.
    """
    notify = self._event_logger
    if notify:
        # The logger is called before the event is stored.
        notify(event)
    self._events.append(event)
|
497
520
|
|
521
|
+
def _event_updated(self, event: Event) -> None:
    """Pass `event` to the event logger, if one is set.

    Unlike recording a new event, this does not append anything to the
    transcript's event list; it only forwards the event to the logger.

    Args:
        event: Event to hand to the logger.
    """
    notify = self._event_logger
    if not notify:
        return
    notify(event)
|
524
|
+
|
525
|
+
def _subscribe(self, event_logger: Callable[[Event], None]) -> None:
    """Install `event_logger` as this transcript's event logger.

    Only one logger is held at a time: the attribute is overwritten, so a
    previously installed logger is replaced.

    Args:
        event_logger: Callable that accepts a single `Event`.
    """
    self._event_logger = event_logger
|
527
|
+
|
498
528
|
|
499
529
|
def transcript() -> Transcript:
|
500
530
|
"""Get the current `Transcript`."""
|
inspect_ai/model/__init__.py
CHANGED
@@ -18,7 +18,7 @@ from ._cache import (
|
|
18
18
|
cache_prune,
|
19
19
|
cache_size,
|
20
20
|
)
|
21
|
-
from ._call_tools import call_tools
|
21
|
+
from ._call_tools import ExecuteToolsResult, call_tools, execute_tools
|
22
22
|
from ._chat_message import (
|
23
23
|
ChatMessage,
|
24
24
|
ChatMessageAssistant,
|
@@ -27,6 +27,7 @@ from ._chat_message import (
|
|
27
27
|
ChatMessageTool,
|
28
28
|
ChatMessageUser,
|
29
29
|
)
|
30
|
+
from ._conversation import ModelConversation
|
30
31
|
from ._generate_config import GenerateConfig, GenerateConfigArgs, ResponseSchema
|
31
32
|
from ._model import (
|
32
33
|
Model,
|
@@ -34,6 +35,7 @@ from ._model import (
|
|
34
35
|
ModelName,
|
35
36
|
get_model,
|
36
37
|
)
|
38
|
+
from ._model_call import ModelCall
|
37
39
|
from ._model_output import (
|
38
40
|
ChatCompletionChoice,
|
39
41
|
Logprob,
|
@@ -64,7 +66,9 @@ __all__ = [
|
|
64
66
|
"ChatMessageAssistant",
|
65
67
|
"ChatMessageTool",
|
66
68
|
"ChatCompletionChoice",
|
69
|
+
"ModelCall",
|
67
70
|
"ModelOutput",
|
71
|
+
"ModelConversation",
|
68
72
|
"Logprobs",
|
69
73
|
"Logprob",
|
70
74
|
"TopLogprob",
|
@@ -74,6 +78,8 @@ __all__ = [
|
|
74
78
|
"ModelUsage",
|
75
79
|
"StopReason",
|
76
80
|
"call_tools",
|
81
|
+
"execute_tools",
|
82
|
+
"ExecuteToolsResult",
|
77
83
|
"cache_clear",
|
78
84
|
"cache_list_expired",
|
79
85
|
"cache_path",
|