inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/log/_log.py
CHANGED
@@ -4,11 +4,17 @@ import sys
|
|
4
4
|
import traceback
|
5
5
|
from logging import getLogger
|
6
6
|
from types import TracebackType
|
7
|
-
from typing import Any, Literal, Type
|
7
|
+
from typing import Any, Literal, Type, TypedDict
|
8
8
|
|
9
9
|
import click
|
10
10
|
import tenacity
|
11
|
-
from pydantic import
|
11
|
+
from pydantic import (
|
12
|
+
BaseModel,
|
13
|
+
ConfigDict,
|
14
|
+
Field,
|
15
|
+
PrivateAttr,
|
16
|
+
model_validator,
|
17
|
+
)
|
12
18
|
from rich.console import Console, RenderableType
|
13
19
|
from rich.traceback import Traceback
|
14
20
|
|
@@ -17,12 +23,7 @@ from inspect_ai._util.error import EvalError, exception_message
|
|
17
23
|
from inspect_ai._util.logger import warn_once
|
18
24
|
from inspect_ai.approval._policy import ApprovalPolicyConfig
|
19
25
|
from inspect_ai.dataset._dataset import MT, metadata_as
|
20
|
-
from inspect_ai.model import
|
21
|
-
ChatMessage,
|
22
|
-
GenerateConfig,
|
23
|
-
ModelOutput,
|
24
|
-
ModelUsage,
|
25
|
-
)
|
26
|
+
from inspect_ai.model import ChatMessage, GenerateConfig, ModelOutput, ModelUsage
|
26
27
|
from inspect_ai.scorer import Score
|
27
28
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
28
29
|
from inspect_ai.util._store import Store
|
@@ -35,7 +36,31 @@ logger = getLogger(__name__)
|
|
35
36
|
SCORER_PLACEHOLDER = "88F74D2C"
|
36
37
|
|
37
38
|
|
39
|
+
class EvalConfigDefaults(TypedDict):
|
40
|
+
epochs: int
|
41
|
+
epochs_reducer: list[str]
|
42
|
+
fail_on_error: bool
|
43
|
+
sandbox_cleanup: bool
|
44
|
+
log_samples: bool
|
45
|
+
log_images: bool
|
46
|
+
score_display: bool
|
47
|
+
|
48
|
+
|
49
|
+
def eval_config_defaults() -> EvalConfigDefaults:
|
50
|
+
return {
|
51
|
+
"epochs": 1,
|
52
|
+
"epochs_reducer": ["mean"],
|
53
|
+
"fail_on_error": True,
|
54
|
+
"sandbox_cleanup": True,
|
55
|
+
"log_samples": True,
|
56
|
+
"log_images": True,
|
57
|
+
"score_display": True,
|
58
|
+
}
|
59
|
+
|
60
|
+
|
38
61
|
class EvalConfig(BaseModel):
|
62
|
+
"""Configuration used for evaluation."""
|
63
|
+
|
39
64
|
limit: int | tuple[int, int] | None = Field(default=None)
|
40
65
|
"""Sample limit (number of samples or range of samples)."""
|
41
66
|
|
@@ -114,6 +139,8 @@ class EvalConfig(BaseModel):
|
|
114
139
|
|
115
140
|
|
116
141
|
class EvalSampleLimit(BaseModel):
|
142
|
+
"""Limit encountered by sample."""
|
143
|
+
|
117
144
|
type: Literal["context", "time", "message", "token", "operator", "custom"]
|
118
145
|
"""The type of limit"""
|
119
146
|
|
@@ -122,6 +149,8 @@ class EvalSampleLimit(BaseModel):
|
|
122
149
|
|
123
150
|
|
124
151
|
class EvalSample(BaseModel):
|
152
|
+
"""Sample from evaluation task."""
|
153
|
+
|
125
154
|
id: int | str
|
126
155
|
"""Unique id for sample."""
|
127
156
|
|
@@ -196,7 +225,7 @@ class EvalSample(BaseModel):
|
|
196
225
|
"""Attachments referenced from messages and events.
|
197
226
|
|
198
227
|
Resolve attachments for a sample (replacing attachment://* references with
|
199
|
-
attachment content)
|
228
|
+
attachment content) by passing `resolve_attachments=True` to log reading functions.
|
200
229
|
"""
|
201
230
|
|
202
231
|
limit: EvalSampleLimit | None = Field(default=None)
|
@@ -267,6 +296,8 @@ class EvalEvents(BaseModel):
|
|
267
296
|
|
268
297
|
|
269
298
|
class EvalPlanStep(BaseModel):
|
299
|
+
"""Solver step."""
|
300
|
+
|
270
301
|
solver: str
|
271
302
|
"""Name of solver."""
|
272
303
|
|
@@ -275,6 +306,8 @@ class EvalPlanStep(BaseModel):
|
|
275
306
|
|
276
307
|
|
277
308
|
class EvalPlan(BaseModel):
|
309
|
+
"""Plan (solvers) used in evaluation."""
|
310
|
+
|
278
311
|
name: str = Field(default="plan")
|
279
312
|
"""Plan name."""
|
280
313
|
|
@@ -289,20 +322,24 @@ class EvalPlan(BaseModel):
|
|
289
322
|
|
290
323
|
|
291
324
|
class EvalMetric(BaseModel):
|
325
|
+
"""Metric for evaluation score."""
|
326
|
+
|
292
327
|
name: str
|
293
328
|
"""Metric name."""
|
294
329
|
|
295
330
|
value: int | float
|
296
331
|
"""Metric value."""
|
297
332
|
|
298
|
-
|
299
|
-
"""
|
333
|
+
params: dict[str, Any] = Field(default_factory=dict)
|
334
|
+
"""Params specified when creating metric."""
|
300
335
|
|
301
336
|
metadata: dict[str, Any] | None = Field(default=None)
|
302
337
|
"""Additional metadata associated with metric."""
|
303
338
|
|
304
339
|
|
305
340
|
class EvalScore(BaseModel):
|
341
|
+
"""Score for evaluation task."""
|
342
|
+
|
306
343
|
name: str
|
307
344
|
"""Score name."""
|
308
345
|
|
@@ -323,10 +360,15 @@ class EvalScore(BaseModel):
|
|
323
360
|
|
324
361
|
|
325
362
|
class EvalSampleScore(Score):
|
363
|
+
"""Score and sample_id scored."""
|
364
|
+
|
326
365
|
sample_id: str | int | None = Field(default=None)
|
366
|
+
"""Sample ID."""
|
327
367
|
|
328
368
|
|
329
369
|
class EvalSampleReductions(BaseModel):
|
370
|
+
"""Score reductions."""
|
371
|
+
|
330
372
|
scorer: str
|
331
373
|
"""Name of the scorer"""
|
332
374
|
|
@@ -338,6 +380,8 @@ class EvalSampleReductions(BaseModel):
|
|
338
380
|
|
339
381
|
|
340
382
|
class EvalResults(BaseModel):
|
383
|
+
"""Scoring results from evaluation."""
|
384
|
+
|
341
385
|
total_samples: int = Field(default=0)
|
342
386
|
"""Total samples in eval (dataset samples * epochs)"""
|
343
387
|
|
@@ -404,6 +448,8 @@ class EvalResults(BaseModel):
|
|
404
448
|
if "metrics" in values:
|
405
449
|
metrics = values["metrics"]
|
406
450
|
del values["metrics"]
|
451
|
+
else:
|
452
|
+
metrics = None
|
407
453
|
# Convert the scorer to the new schema
|
408
454
|
score = values["scorer"]
|
409
455
|
if metrics:
|
@@ -418,6 +464,8 @@ class EvalResults(BaseModel):
|
|
418
464
|
|
419
465
|
|
420
466
|
class EvalDataset(BaseModel):
|
467
|
+
"""Dataset used for evaluation."""
|
468
|
+
|
421
469
|
name: str | None = Field(default=None)
|
422
470
|
"""Dataset name."""
|
423
471
|
|
@@ -434,7 +482,33 @@ class EvalDataset(BaseModel):
|
|
434
482
|
"""Was the dataset shuffled after reading."""
|
435
483
|
|
436
484
|
|
485
|
+
class EvalMetricDefinition(BaseModel):
|
486
|
+
name: str
|
487
|
+
"""Metric name"""
|
488
|
+
|
489
|
+
options: dict[str, Any] | None = Field(default=None)
|
490
|
+
|
491
|
+
|
492
|
+
class EvalScorer(BaseModel):
|
493
|
+
name: str
|
494
|
+
"""Scorer name"""
|
495
|
+
|
496
|
+
options: dict[str, Any] | None = Field(default=None)
|
497
|
+
"""Scorer arguments"""
|
498
|
+
|
499
|
+
metrics: (
|
500
|
+
list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
|
501
|
+
| dict[str, list[EvalMetricDefinition]]
|
502
|
+
| None
|
503
|
+
) = Field(default=None)
|
504
|
+
|
505
|
+
metadata: dict[str, Any] | None = Field(default=None)
|
506
|
+
"""Scorer metadata"""
|
507
|
+
|
508
|
+
|
437
509
|
class EvalRevision(BaseModel):
|
510
|
+
"""Git revision for evaluation."""
|
511
|
+
|
438
512
|
type: Literal["git"]
|
439
513
|
"""Type of revision (currently only "git")"""
|
440
514
|
|
@@ -446,6 +520,8 @@ class EvalRevision(BaseModel):
|
|
446
520
|
|
447
521
|
|
448
522
|
class EvalSpec(BaseModel):
|
523
|
+
"""Eval target and configuration."""
|
524
|
+
|
449
525
|
run_id: str = Field(default_factory=str)
|
450
526
|
"""Unique run id"""
|
451
527
|
|
@@ -506,6 +582,14 @@ class EvalSpec(BaseModel):
|
|
506
582
|
metadata: dict[str, Any] | None = Field(default=None)
|
507
583
|
"""Additional eval metadata."""
|
508
584
|
|
585
|
+
scorers: list[EvalScorer] | None = Field(default=None)
|
586
|
+
"""Scorers and args for this eval"""
|
587
|
+
|
588
|
+
metrics: (
|
589
|
+
list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None
|
590
|
+
) = Field(default=None)
|
591
|
+
"""Metrics and args for this eval"""
|
592
|
+
|
509
593
|
# allow field model_args
|
510
594
|
model_config = ConfigDict(protected_namespaces=())
|
511
595
|
|
@@ -549,6 +633,8 @@ def rich_traceback(
|
|
549
633
|
|
550
634
|
|
551
635
|
class EvalStats(BaseModel):
|
636
|
+
"""Timing and usage statistics."""
|
637
|
+
|
552
638
|
started_at: str = Field(default_factory=str)
|
553
639
|
"""Evaluation start time."""
|
554
640
|
|
@@ -563,6 +649,8 @@ class EvalStats(BaseModel):
|
|
563
649
|
|
564
650
|
|
565
651
|
class EvalLog(BaseModel):
|
652
|
+
"""Evaluation log."""
|
653
|
+
|
566
654
|
# WARNING: The order of these fields is important for the log file format.
|
567
655
|
# Do not change the order of these fields without incrementing the version number,
|
568
656
|
# updating the log file read/write functionality (such as read_eval_log),
|
@@ -578,13 +666,13 @@ class EvalLog(BaseModel):
|
|
578
666
|
eval: EvalSpec
|
579
667
|
"""Eval identity and configuration."""
|
580
668
|
|
581
|
-
plan: EvalPlan = Field(
|
669
|
+
plan: EvalPlan = Field(default_factory=EvalPlan)
|
582
670
|
"""Eval plan (solvers and config)"""
|
583
671
|
|
584
672
|
results: EvalResults | None = None
|
585
673
|
"""Eval results (scores and metrics)."""
|
586
674
|
|
587
|
-
stats: EvalStats = Field(
|
675
|
+
stats: EvalStats = Field(default_factory=EvalStats)
|
588
676
|
"""Eval stats (runtime, model usage)"""
|
589
677
|
|
590
678
|
error: EvalError | None = Field(default=None)
|
inspect_ai/log/_message.py
CHANGED
@@ -5,12 +5,14 @@ from typing import Any, Literal, Type, cast
|
|
5
5
|
from pydantic import BaseModel, Field, model_validator
|
6
6
|
|
7
7
|
LoggingLevel = Literal[
|
8
|
-
"debug", "http", "sandbox", "info", "warning", "error", "critical"
|
8
|
+
"debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
|
9
9
|
]
|
10
10
|
"""Logging level."""
|
11
11
|
|
12
12
|
|
13
13
|
class LoggingMessage(BaseModel):
|
14
|
+
"""Message written to Python log."""
|
15
|
+
|
14
16
|
name: str | None = Field(default=None)
|
15
17
|
"""Logger name (e.g. 'httpx')"""
|
16
18
|
|
@@ -33,7 +35,7 @@ class LoggingMessage(BaseModel):
|
|
33
35
|
"""Logged from line number."""
|
34
36
|
|
35
37
|
@staticmethod
|
36
|
-
def
|
38
|
+
def _from_log_record(record: LogRecord) -> "LoggingMessage":
|
37
39
|
"""Create a LoggingMessage from a LogRecord.
|
38
40
|
|
39
41
|
Args:
|
@@ -9,12 +9,7 @@ from typing_extensions import override
|
|
9
9
|
|
10
10
|
from inspect_ai._util.constants import LOG_SCHEMA_VERSION
|
11
11
|
from inspect_ai._util.error import EvalError
|
12
|
-
from inspect_ai._util.file import
|
13
|
-
absolute_file_path,
|
14
|
-
async_fileystem,
|
15
|
-
file,
|
16
|
-
filesystem,
|
17
|
-
)
|
12
|
+
from inspect_ai._util.file import absolute_file_path, async_fileystem, file, filesystem
|
18
13
|
from inspect_ai._util.trace import trace_action
|
19
14
|
|
20
15
|
from .._log import (
|
@@ -236,12 +231,13 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
236
231
|
f.seek(0)
|
237
232
|
|
238
233
|
# Parse the log file, stopping before parsing samples
|
234
|
+
status: Literal["started", "success", "cancelled", "error"] | None = None
|
239
235
|
for k, v in ijson.kvitems(f, ""):
|
240
236
|
if k == "status":
|
241
237
|
assert v in get_args(
|
242
238
|
Literal["started", "success", "cancelled", "error"]
|
243
239
|
)
|
244
|
-
status
|
240
|
+
status = v
|
245
241
|
if k == "eval":
|
246
242
|
eval = EvalSpec(**v)
|
247
243
|
elif k == "plan":
|
@@ -257,6 +253,8 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
257
253
|
error = EvalError(**v)
|
258
254
|
break
|
259
255
|
|
256
|
+
assert status, "Must encounter a 'status'"
|
257
|
+
|
260
258
|
return EvalLog(
|
261
259
|
eval=eval,
|
262
260
|
plan=plan,
|
@@ -21,6 +21,9 @@ class Recorder(abc.ABC):
|
|
21
21
|
@abc.abstractmethod
|
22
22
|
def default_log_buffer(self) -> int: ...
|
23
23
|
|
24
|
+
@abc.abstractmethod
|
25
|
+
def is_writeable(self) -> bool: ...
|
26
|
+
|
24
27
|
@abc.abstractmethod
|
25
28
|
async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...
|
26
29
|
|
inspect_ai/log/_transcript.py
CHANGED
@@ -167,7 +167,7 @@ class ToolEvent(BaseEvent):
|
|
167
167
|
events: list["Event"] = Field(default_factory=list)
|
168
168
|
"""Transcript of events for tool."""
|
169
169
|
|
170
|
-
def
|
170
|
+
def _set_result(
|
171
171
|
self,
|
172
172
|
result: ToolResult,
|
173
173
|
truncated: tuple[int, int] | None,
|
@@ -182,11 +182,11 @@ class ToolEvent(BaseEvent):
|
|
182
182
|
|
183
183
|
# mechanism for operator to cancel the tool call
|
184
184
|
|
185
|
-
def
|
185
|
+
def _set_task(self, task: asyncio.Task[Any]) -> None:
|
186
186
|
"""Set the tool task (for possible cancellation)"""
|
187
187
|
self._task = task
|
188
188
|
|
189
|
-
def
|
189
|
+
def _cancel(self) -> None:
|
190
190
|
"""Cancel the tool task."""
|
191
191
|
if self._task:
|
192
192
|
self._cancelled = True
|
@@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
|
|
264
264
|
event: Literal["info"] = Field(default="info")
|
265
265
|
"""Event type."""
|
266
266
|
|
267
|
+
source: str | None = Field(default=None)
|
268
|
+
"""Optional source for info event."""
|
269
|
+
|
267
270
|
data: JsonValue
|
268
271
|
"""Data provided with event."""
|
269
272
|
|
@@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):
|
|
279
282
|
|
280
283
|
|
281
284
|
class ScoreEvent(BaseEvent):
|
282
|
-
"""Event with
|
285
|
+
"""Event with score.
|
286
|
+
|
287
|
+
Can be the final score for a `Sample`, or can be an intermediate score
|
288
|
+
resulting from a call to `score`.
|
289
|
+
"""
|
283
290
|
|
284
291
|
event: Literal["score"] = Field(default="score")
|
285
292
|
"""Event type."""
|
286
293
|
|
287
294
|
score: Score
|
288
|
-
"""
|
295
|
+
"""Score value."""
|
289
296
|
|
290
297
|
target: str | list[str] | None = Field(default=None)
|
291
298
|
"""Sample target."""
|
292
299
|
|
300
|
+
intermediate: bool = Field(default=False)
|
301
|
+
"""Was this an intermediate scoring?"""
|
302
|
+
|
293
303
|
|
294
304
|
class StepEvent(BaseEvent):
|
295
305
|
"""Step within current sample or subtask."""
|
@@ -355,13 +365,14 @@ class Transcript:
|
|
355
365
|
self.name = name
|
356
366
|
self._events: list[Event] = []
|
357
367
|
|
358
|
-
def info(self, data: JsonValue) -> None:
|
368
|
+
def info(self, data: JsonValue, *, source: str | None = None) -> None:
|
359
369
|
"""Add an `InfoEvent` to the transcript.
|
360
370
|
|
361
371
|
Args:
|
362
|
-
data
|
372
|
+
data: Data associated with the event.
|
373
|
+
source: Optional event source.
|
363
374
|
"""
|
364
|
-
self._event(InfoEvent(data=data))
|
375
|
+
self._event(InfoEvent(source=source, data=data))
|
365
376
|
|
366
377
|
@contextlib.contextmanager
|
367
378
|
def step(self, name: str, type: str | None = None) -> Iterator[None]:
|
inspect_ai/model/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from ._call_tools import call_tools
|
|
21
21
|
from ._chat_message import (
|
22
22
|
ChatMessage,
|
23
23
|
ChatMessageAssistant,
|
24
|
+
ChatMessageBase,
|
24
25
|
ChatMessageSystem,
|
25
26
|
ChatMessageTool,
|
26
27
|
ChatMessageUser,
|
@@ -54,6 +55,7 @@ __all__ = [
|
|
54
55
|
"ContentVideo",
|
55
56
|
"Content",
|
56
57
|
"ChatMessage",
|
58
|
+
"ChatMessageBase",
|
57
59
|
"ChatMessageSystem",
|
58
60
|
"ChatMessageUser",
|
59
61
|
"ChatMessageAssistant",
|
inspect_ai/model/_cache.py
CHANGED
@@ -58,22 +58,23 @@ def _parse_expiry(period: str) -> int:
|
|
58
58
|
class CachePolicy:
|
59
59
|
"""The `CachePolicy` is used to define various criteria that impact how model calls are cached.
|
60
60
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
61
|
+
`expiry`: Default "24h". The expiry time for the cache entry.
|
62
|
+
This is a string of the format "12h" for 12 hours or "1W" for a week,
|
63
|
+
etc. This is how long we will keep the cache entry, if we access it
|
64
|
+
after this point we'll clear it. Setting to `None` will cache
|
65
|
+
indefinitely.
|
66
|
+
|
67
|
+
`per_epoch`: Default True. By default we cache responses separately
|
68
|
+
for different epochs. The general use case is that if there are
|
69
|
+
multiple epochs, we should cache each response separately because
|
70
|
+
scorers will aggregate across epochs. However, sometimes a response
|
71
|
+
can be cached regardless of epoch if the call being made isn't under
|
72
|
+
test as part of the evaluation. If False, this option allows you to
|
73
|
+
bypass that and cache independently of the epoch.
|
74
|
+
|
75
|
+
`scopes`: A dictionary of additional metadata that should
|
76
|
+
be included in the cache key. This allows for more fine-grained
|
77
|
+
control over the cache key generation.
|
77
78
|
"""
|
78
79
|
|
79
80
|
def __init__(
|
@@ -82,6 +83,14 @@ class CachePolicy:
|
|
82
83
|
per_epoch: bool = True,
|
83
84
|
scopes: dict[str, str] = {},
|
84
85
|
) -> None:
|
86
|
+
"""Create a CachePolicy.
|
87
|
+
|
88
|
+
Args:
|
89
|
+
expiry: Expiry.
|
90
|
+
per_epoch: Per epoch
|
91
|
+
scopes: Scopes
|
92
|
+
|
93
|
+
"""
|
85
94
|
self.per_epoch = per_epoch
|
86
95
|
self.scopes = scopes
|
87
96
|
|
@@ -236,7 +245,11 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:
|
|
236
245
|
|
237
246
|
|
238
247
|
def cache_clear(model: str = "") -> bool:
|
239
|
-
"""Clear the cache directory.
|
248
|
+
"""Clear the cache directory.
|
249
|
+
|
250
|
+
Args:
|
251
|
+
model: Model to clear cache for.
|
252
|
+
"""
|
240
253
|
try:
|
241
254
|
path = cache_path(model)
|
242
255
|
|
@@ -252,6 +265,11 @@ def cache_clear(model: str = "") -> bool:
|
|
252
265
|
|
253
266
|
|
254
267
|
def cache_path(model: str = "") -> Path:
|
268
|
+
"""Path to cache directory.
|
269
|
+
|
270
|
+
Args:
|
271
|
+
model: Path to cache directory for specific model.
|
272
|
+
"""
|
255
273
|
env_cache_dir = os.environ.get("INSPECT_CACHE_DIR", None)
|
256
274
|
if env_cache_dir:
|
257
275
|
generate_cache = Path(env_cache_dir) / "generate"
|
@@ -320,9 +338,9 @@ def cache_size(
|
|
320
338
|
will be calculated.
|
321
339
|
|
322
340
|
Args:
|
323
|
-
subdirs
|
341
|
+
subdirs: List of folders to filter by, which are generally
|
324
342
|
model names. Empty directories will be ignored.
|
325
|
-
files
|
343
|
+
files: List of files to filter by explicitly. Note that
|
326
344
|
the return value groups these by their parent directory
|
327
345
|
|
328
346
|
Returns:
|
@@ -344,7 +362,7 @@ def cache_list_expired(filter_by: list[str] = []) -> list[Path]:
|
|
344
362
|
"""Returns a list of all the cached files that have passed their expiry time.
|
345
363
|
|
346
364
|
Args:
|
347
|
-
filter_by
|
365
|
+
filter_by: Default []. List of model names to filter by. If
|
348
366
|
an empty list, this will search the entire cache.
|
349
367
|
"""
|
350
368
|
expired_cache_entries = []
|
@@ -384,7 +402,7 @@ def cache_prune(files: list[Path] = []) -> None:
|
|
384
402
|
"""Delete all expired cache entries.
|
385
403
|
|
386
404
|
Args:
|
387
|
-
files
|
405
|
+
files: List of files to prune. If empty, this
|
388
406
|
will search the entire cache.
|
389
407
|
"""
|
390
408
|
if not files:
|
inspect_ai/model/_call_tools.py
CHANGED
@@ -133,7 +133,8 @@ async def call_tools(
|
|
133
133
|
):
|
134
134
|
content: str | list[Content] = [result]
|
135
135
|
elif isinstance(result, list) and (
|
136
|
-
|
136
|
+
len(result) == 0
|
137
|
+
or isinstance(
|
137
138
|
result[0], ContentText | ContentImage | ContentAudio | ContentVideo
|
138
139
|
)
|
139
140
|
):
|
@@ -186,7 +187,7 @@ async def call_tools(
|
|
186
187
|
view=call.view,
|
187
188
|
pending=True,
|
188
189
|
)
|
189
|
-
event.
|
190
|
+
event._set_task(task)
|
190
191
|
transcript()._event(event)
|
191
192
|
|
192
193
|
# execute the tool call. if the operator cancelled the
|
@@ -226,7 +227,7 @@ async def call_tools(
|
|
226
227
|
conversation_tool_mesage(tool_message)
|
227
228
|
|
228
229
|
# update the event with the results
|
229
|
-
event.
|
230
|
+
event._set_result(
|
230
231
|
result=result_event.result,
|
231
232
|
truncated=result_event.truncated,
|
232
233
|
error=result_event.error,
|
@@ -13,8 +13,13 @@ logger = getLogger(__name__)
|
|
13
13
|
|
14
14
|
|
15
15
|
class ChatMessageBase(BaseModel):
|
16
|
+
"""Base class for chat messages."""
|
17
|
+
|
18
|
+
role: Literal["system", "user", "assistant", "tool"]
|
19
|
+
"""Conversation role"""
|
20
|
+
|
16
21
|
content: str | list[Content]
|
17
|
-
"""Content (simple string or list of
|
22
|
+
"""Content (simple string or list of content objects)"""
|
18
23
|
|
19
24
|
source: Literal["input", "generate"] | None = Field(default=None)
|
20
25
|
"""Source of message."""
|
@@ -31,9 +36,6 @@ class ChatMessageBase(BaseModel):
|
|
31
36
|
property returns either the plain str content, or if the
|
32
37
|
content is a list of text and images, the text items
|
33
38
|
concatenated together (separated by newline)
|
34
|
-
|
35
|
-
Returns: Text content of `ChatMessage` If this message does
|
36
|
-
not have text content then "" is returned.
|
37
39
|
"""
|
38
40
|
if isinstance(self.content, str):
|
39
41
|
return self.content
|
@@ -66,11 +68,15 @@ class ChatMessageBase(BaseModel):
|
|
66
68
|
|
67
69
|
|
68
70
|
class ChatMessageSystem(ChatMessageBase):
|
71
|
+
"""System chat message."""
|
72
|
+
|
69
73
|
role: Literal["system"] = Field(default="system")
|
70
74
|
"""Conversation role."""
|
71
75
|
|
72
76
|
|
73
77
|
class ChatMessageUser(ChatMessageBase):
|
78
|
+
"""User chat message."""
|
79
|
+
|
74
80
|
role: Literal["user"] = Field(default="user")
|
75
81
|
"""Conversation role."""
|
76
82
|
|
@@ -79,6 +85,8 @@ class ChatMessageUser(ChatMessageBase):
|
|
79
85
|
|
80
86
|
|
81
87
|
class ChatMessageAssistant(ChatMessageBase):
|
88
|
+
"""Assistant chat message."""
|
89
|
+
|
82
90
|
role: Literal["assistant"] = Field(default="assistant")
|
83
91
|
"""Conversation role."""
|
84
92
|
|
@@ -112,6 +120,8 @@ class ChatMessageAssistant(ChatMessageBase):
|
|
112
120
|
|
113
121
|
|
114
122
|
class ChatMessageTool(ChatMessageBase):
|
123
|
+
"""Tool chat message."""
|
124
|
+
|
115
125
|
role: Literal["tool"] = Field(default="tool")
|
116
126
|
"""Conversation role."""
|
117
127
|
|
@@ -80,7 +80,7 @@ class GenerateConfigArgs(TypedDict, total=False):
|
|
80
80
|
|
81
81
|
|
82
82
|
class GenerateConfig(BaseModel):
|
83
|
-
"""
|
83
|
+
"""Model generation options."""
|
84
84
|
|
85
85
|
max_retries: int | None = Field(default=None)
|
86
86
|
"""Maximum number of times to retry request (defaults to 5)."""
|