inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/score.py
CHANGED
@@ -1,15 +1,16 @@
 import asyncio
 from copy import deepcopy
-from
+from pathlib import Path
+from typing import Any, Callable, Literal, cast

 from inspect_ai._display import display
-from inspect_ai.
+from inspect_ai._eval.loader import scorer_from_spec
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
-    EvalMetric,
 )
+from inspect_ai.log._log import EvalMetricDefinition
 from inspect_ai.model import ModelName
 from inspect_ai.scorer import Metric, Scorer, Target
 from inspect_ai.scorer._metric import SampleScore
@@ -19,18 +20,19 @@ from inspect_ai.scorer._reducer import (
     create_reducers,
     reducer_log_names,
 )
-from inspect_ai.scorer._scorer import unique_scorer_name
+from inspect_ai.scorer._scorer import ScorerSpec, unique_scorer_name
 from inspect_ai.solver import TaskState

-from .task import Task
 from .task.results import eval_results
-
+
+ScoreAction = Literal["append", "overwrite"]


 def score(
     log: EvalLog,
     scorers: Scorer | list[Scorer],
     epochs_reducer: ScoreReducers | None = None,
+    action: ScoreAction | None = None,
 ) -> EvalLog:
     """Score an evaluation log.

@@ -40,6 +42,7 @@ def score(
         epochs_reducer (ScoreReducers | None):
             Reducer function(s) for aggregating scores in each sample.
             Defaults to previously used reducer(s).
+        action: Whether to append or overwrite this score

     Returns:
         Log with scores yielded by scorer.
@@ -50,13 +53,14 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers

-    return asyncio.run(score_async(log, scorers, epochs_reducer))
+    return asyncio.run(score_async(log, scorers, epochs_reducer, action))


 async def score_async(
     log: EvalLog,
     scorers: list[Scorer],
     epochs_reducer: ScoreReducers | None = None,
+    action: ScoreAction | None = None,
 ) -> EvalLog:
     """Score an evaluation log.

@@ -66,6 +70,8 @@ async def score_async(
         epochs_reducer (ScoreReducers | None):
             Reducer function(s) for aggregating scores in each sample.
             Defaults to previously used reducer(s).
+        action: Whether to append or overwrite this score
+


     Returns:
@@ -109,7 +115,22 @@ async def score_async(

     # write them back (gather ensures that they come back in the same order)
     for index, score in enumerate(scores):
-
+        if action == "overwrite":
+            log.samples[index].scores = {k: v.score for k, v in score.items()}
+        else:
+            existing_scores = log.samples[index].scores or {}
+            new_scores = {k: v.score for k, v in score.items()}
+
+            for key, value in new_scores.items():
+                if key not in existing_scores:
+                    existing_scores[key] = value
+                else:
+                    # This key already exists, dedupe its name
+                    count = 1
+                    while f"{key}-{count}" in existing_scores.keys():
+                        count = count + 1
+                    existing_scores[f"{key}-{count}"] = value
+            log.samples[index].scores = existing_scores

     # collect metrics from EvalLog (they may overlap w/ the scorer metrics,
     # that will be taken care of in eval_results)
@@ -130,30 +151,37 @@ async def score_async(
     return log


-async def task_score(
-
-
-
-
+async def task_score(
+    log: EvalLog,
+    scorer: str | None = None,
+    scorer_args: dict[str, Any] | None = None,
+    action: ScoreAction | None = None,
+) -> EvalLog:
+    # confirm we have a scorer
+    scorers = resolve_scorers(log, scorer, scorer_args)
+    if len(scorers) == 0:
+        raise ValueError(
+            "Unable to resolve any scorers for this log. Please specify a scorer using the '--scorer' param."
+        )

-
-
-
+    # confirm we have samples
+    if log.samples is None or len(log.samples) == 0:
+        raise ValueError("There are no samples to score in the log.")

-
-
+    task_name = log.eval.task
+    display().print(f"\nScoring {task_name} ({len(log.samples)} samples)")

-
-
+    # perform scoring
+    log = await score_async(log=log, scorers=scorers, action=action)

     # compute and log metrics
-
-    if task.scorer and log.samples:
+    if log.samples:
         sample_scores = [
             {
                 score_key: SampleScore(
                     score=score,
                     sample_id=sample.id,
+                    sample_metadata=sample.metadata,
                 )
                 for score_key, score in sample.scores.items()
             }
@@ -161,12 +189,15 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
             if sample.scores is not None
         ]

+        epochs_reducer = reducers_from_log(log)
+        metrics = metrics_from_log(log)
+
         log.results, log.reductions = eval_results(
             log.results.total_samples if log.results else 0,
             sample_scores,
-
-
-
+            epochs_reducer,
+            scorers,
+            metrics,
         )
     return log

@@ -185,6 +216,7 @@ async def run_score_task(
         results[scorer_name] = SampleScore(
             score=result,
             sample_id=state.sample_id,
+            sample_metadata=state.metadata,
             scorer=registry_unqualified_name(scorer),
         )

@@ -192,21 +224,78 @@ async def run_score_task(
     return results


-def metrics_from_log(log: EvalLog) -> list[Metric]:
+def metrics_from_log(log: EvalLog) -> list[Metric] | dict[str, list[Metric]] | None:
+    # See if we have metrics in the eval itself
+    if log.eval.metrics:
+        if isinstance(log.eval.metrics, list):
+            return [metric_from_log(metric) for metric in log.eval.metrics]
+        else:
+            return {
+                key: [metric_from_log(metric) for metric in metrics]
+                for key, metrics in log.eval.metrics.items()
+            }
+    return None
+
+
+def metric_from_log(metric: EvalMetricDefinition) -> Metric:
+    return cast(
+        Metric, registry_create("metric", metric.name, **(metric.options or {}))
+    )
+
+
+def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
+    return create_reducers(log.eval.config.epochs_reducer)
+
+
+def resolve_scorers(
+    log: EvalLog, scorer: str | None, scorer_args: dict[str, Any] | None
+) -> list[Scorer]:
+    """
+    Create a list of Scorer objects from an evaluation log.
+
+    Args:
+        log: EvalLog object containing evaluation configuration and results
+        scorer:: Scorer name (simple name or file.py@name).
+        scorer_args: Dictionary of scorer arguments
+
+    Returns:
+        list[Scorer]: List of initialized scorers
+    """
+    # resolve the scorer path
+    task_path = Path(log.eval.task_file) if log.eval.task_file else None
+
+    # If there is an explicit scorer
+    if scorer:
+        return [
+            scorer_from_spec(
+                spec=ScorerSpec(scorer=scorer),
+                task_path=task_path,
+                **(scorer_args or {}),
+            )
+        ]
+    # See if we can create scorers from the eval itself
+    elif log.eval.scorers is not None:
+        return (
+            [
+                scorer_from_spec(
+                    spec=ScorerSpec(scorer=score.name),
+                    task_path=task_path,
+                    **(score.options or {}),
+                )
+                for score in log.eval.scorers
+            ]
+            if log.results
+            else []
+        )
+
+    # Otherwise, perhaps we can re-create them from the results
     return (
         [
-
+            scorer_from_spec(
+                spec=ScorerSpec(scorer=score.name), task_path=task_path, **score.params
+            )
             for score in log.results.scores
-            for metric in score.metrics.values()
         ]
        if log.results
        else []
    )
-
-
-def metric_from_log(metric: EvalMetric) -> Metric:
-    return cast(Metric, registry_create("metric", metric.name, **metric.options))
-
-
-def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
-    return create_reducers(log.eval.config.epochs_reducer)
inspect_ai/_eval/task/log.py
CHANGED
@@ -4,9 +4,7 @@ from typing import Any, Literal, cast
 from shortuuid import uuid

 from inspect_ai._eval.task.util import slice_dataset
-from inspect_ai._util.constants import
-    PKG_NAME,
-)
+from inspect_ai._util.constants import PKG_NAME
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.git import git_context
 from inspect_ai._util.path import cwd_relative_path
@@ -27,7 +25,13 @@ from inspect_ai.log import (
     EvalSpec,
     EvalStats,
 )
-from inspect_ai.log._log import
+from inspect_ai.log._log import (
+    EvalLog,
+    EvalMetricDefinition,
+    EvalSampleReductions,
+    EvalScorer,
+    eval_config_defaults,
+)
 from inspect_ai.log._recorders import Recorder
 from inspect_ai.model import (
     GenerateConfig,
@@ -35,6 +39,8 @@ from inspect_ai.model import (
     ModelName,
 )
 from inspect_ai.model._model import model_usage
+from inspect_ai.scorer._metric import MetricSpec
+from inspect_ai.scorer._scorer import ScorerSpec
 from inspect_ai.solver._plan import Plan
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
@@ -52,6 +58,8 @@ class TaskLogger:
         tags: list[str] | None,
         model: Model,
         dataset: Dataset,
+        scorer: list[ScorerSpec] | None,
+        metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
         sandbox: SandboxEnvironmentSpec | None,
         task_attribs: dict[str, Any],
         task_args: dict[str, Any],
@@ -92,6 +100,17 @@ class TaskLogger:
             ],
         )

+        # write defaults for unspecified config
+        for name, value in eval_config_defaults().items():
+            if getattr(eval_config, name, None) is None:
+                setattr(eval_config, name, value)
+
+        # resolve scorers
+        eval_scorers = resolve_eval_scorers(scorer)
+
+        # resolve metrics
+        eval_metrics = resolve_eval_metrics(metrics)
+
         # create eval spec
         self.eval = EvalSpec(
             run_id=run_id,
@@ -114,6 +133,8 @@ class TaskLogger:
                 sample_ids=sample_ids,
                 shuffled=dataset.shuffled,
             ),
+            scorers=eval_scorers,
+            metrics=eval_metrics,
             sandbox=sandbox,
             model_args=model_args,
             config=eval_config,
@@ -200,3 +221,83 @@ def collect_eval_data(stats: EvalStats) -> None:
     # collect stats
     stats.completed_at = iso_now()
     stats.model_usage = model_usage()
+
+
+def resolve_eval_metrics(
+    metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
+) -> list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None:
+    if metrics is None:
+        return None
+    elif isinstance(metrics, list):
+        return [EvalMetricDefinition(name=m.metric, options=m.args) for m in metrics]
+    else:
+        return {
+            k: [
+                EvalMetricDefinition(name=v.metric, options=v.args) for v in metric_list
+            ]
+            for k, metric_list in metrics.items()
+        }
+
+
+def resolve_eval_scorers(scorers: list[ScorerSpec] | None) -> list[EvalScorer] | None:
+    if scorers is None:
+        return None
+    else:
+        results = []
+        for scorer in scorers:
+            results.append(
+                EvalScorer(
+                    name=scorer.scorer,
+                    metrics=resolve_scorer_metrics(scorer.metrics),
+                    options=scorer.args,
+                    metadata=scorer.metadata,
+                )
+            )
+        return results
+
+
+def resolve_scorer_metrics(
+    metrics: list[MetricSpec | dict[str, list[MetricSpec]]]
+    | dict[str, list[MetricSpec]]
+    | None,
+) -> (
+    list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
+    | dict[str, list[EvalMetricDefinition]]
+    | None
+):
+    if metrics is None:
+        return None
+    elif isinstance(metrics, list):
+        resolved_metrics: list[
+            EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]
+        ] = []
+        for metric_item in metrics:
+            if isinstance(metric_item, MetricSpec):
+                resolved_metrics.append(
+                    EvalMetricDefinition(
+                        name=metric_item.metric, options=metric_item.args
+                    )
+                )
+            elif isinstance(metric_item, dict):
+                resolved_metrics.append(
+                    {
+                        metric_group: [
+                            EvalMetricDefinition(
+                                name=metric_spec.metric, options=metric_spec.args
+                            )
+                            for metric_spec in metric_specs
+                        ]
+                        for metric_group, metric_specs in metric_item.items()
+                    }
+                )
+            else:
+                raise TypeError(f"Unexpected item in list: {metric_item}")
+        return resolved_metrics
+    else:
+        return {
+            metric_group: [
+                EvalMetricDefinition(name=metric_spec.metric, options=metric_spec.args)
+                for metric_spec in metric_specs
+            ]
+            for metric_group, metric_specs in metrics.items()
+        }
inspect_ai/_eval/task/results.py
CHANGED
@@ -1,10 +1,13 @@
 import fnmatch
+import inspect
+import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass, field
-from typing import Any, Tuple, cast
+from typing import Any, Tuple, TypeGuard, cast, get_args, get_origin, get_type_hints

+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.registry import (
     registry_info,
     registry_log_name,
@@ -19,7 +22,12 @@ from inspect_ai.log import (
 )
 from inspect_ai.log._log import EvalSampleReductions
 from inspect_ai.scorer import Metric, Score, Scorer
-from inspect_ai.scorer._metric import
+from inspect_ai.scorer._metric import (
+    MetricDeprecated,
+    MetricProtocol,
+    SampleScore,
+    Value,
+)
 from inspect_ai.scorer._metrics.accuracy import accuracy
 from inspect_ai.scorer._metrics.std import stderr
 from inspect_ai.scorer._reducer import ScoreReducer, mean_score, reducer_log_name
@@ -29,6 +37,8 @@ from inspect_ai.scorer._scorer import (
     unique_scorer_name,
 )

+logger = logging.getLogger(__name__)
+

 @dataclass
 class ScorerInfo:
@@ -99,12 +109,14 @@ def eval_results(
             reduced_samples = EvalSampleReductions(
                 scorer=scorer_name,
                 reducer=reducer_display_nm,
-                samples=
+                samples=[
+                    EvalSampleScore(**ss.score.__dict__, sample_id=ss.sample_id)
+                    for ss in reduced_scores
+                ],
             )
             sample_reductions.append(reduced_samples)

             # Compute metrics for this scorer
-            simple_scores = cast(list[Score], reduced_scores)
             targets = metrics if metrics is not None else scorer_info.metrics
             if isinstance(targets, list):
                 ## split the metrics into the simple metrics and any dictionary
@@ -119,7 +131,7 @@ def eval_results(
                     scorer_for_metrics(
                         scorer_name=scorer_name,
                         scorer_info=scorer_info,
-
+                        sample_scores=reduced_scores,
                         metrics=simple_metrics,
                         reducer_name=reducer_display_nm,
                     )
@@ -129,7 +141,7 @@ def eval_results(
                     scorers_from_metric_dict(
                         scorer_name=scorer_name,
                         scorer_info=scorer_info,
-
+                        sample_scores=reduced_scores,
                         metrics=dict_metric,
                         reducer_name=reducer_display_nm,
                     )
@@ -145,7 +157,7 @@ def eval_results(
                 scorers_from_metric_dict(
                     scorer_name=scorer_name,
                     scorer_info=scorer_info,
-
+                    sample_scores=reduced_scores,
                     metrics=targets,
                     reducer_name=reducer_display_nm,
                 )
@@ -184,7 +196,7 @@ def split_metrics(
 def scorer_for_metrics(
     scorer_name: str,
     scorer_info: ScorerInfo,
-
+    sample_scores: list[SampleScore],
     metrics: list[Metric],
     reducer_name: str | None = None,
 ) -> list[EvalScore]:
@@ -200,10 +212,10 @@ def scorer_for_metrics(
         key = metrics_unique_key(
             registry_unqualified_name(metric), list(list_metrics.keys())
         )
-
+        params = registry_params(metric)
         # process metric values
-        if len(
-            metric_value = metric
+        if len(sample_scores) > 0:
+            metric_value = call_metric(metric, sample_scores)
         else:
             metric_value = float("Nan")
         base_metric_name = registry_log_name(metric)
@@ -215,8 +227,7 @@ def scorer_for_metrics(
                 if value is not None:
                     name = metrics_unique_key(metric_key, list(list_metrics.keys()))
                     list_metrics[name] = EvalMetric(
-                        name=name,
-                        value=float(value),
+                        name=name, value=float(value), params=params
                     )

         # If the metric value is a list, turn each element in the list
@@ -229,13 +240,14 @@ def scorer_for_metrics(
                     with_suffix(key, count), list(list_metrics.keys())
                 )

-                list_metrics[name] = EvalMetric(
+                list_metrics[name] = EvalMetric(
+                    name=name, value=float(value), params=params
+                )

         # the metric is a float, str, or int
         else:
             list_metrics[key] = EvalMetric(
-                name=base_metric_name,
-                value=float(metric_value),
+                name=base_metric_name, value=float(metric_value), params=params
             )

     # build results
@@ -257,7 +269,7 @@ def scorer_for_metrics(
 def scorers_from_metric_dict(
     scorer_name: str,
     scorer_info: ScorerInfo,
-
+    sample_scores: list[SampleScore],
     metrics: dict[str, list[Metric]],
     reducer_name: str | None = None,
 ) -> list[EvalScore]:
@@ -265,18 +277,22 @@ def scorers_from_metric_dict(

     # Expand any metric keys
     resolved_metrics = (
-        resolve_glob_metric_keys(metrics,
+        resolve_glob_metric_keys(metrics, sample_scores[0].score)
+        if len(sample_scores) > 0
+        else metrics
     )

     for metric_key, metric_list in resolved_metrics.items():
         # filter scores to a list of scalars with the value of the metric name
-        metric_scores: list[
-        for
-            if isinstance(score.value, dict):
-                if metric_key in score.value:
+        metric_scores: list[SampleScore] = []
+        for sample_score in sample_scores:
+            if isinstance(sample_score.score.value, dict):
+                if metric_key in sample_score.score.value:
                     # Convert the score into a simple scalar value to apply metrics
-                    metric_score = deepcopy(
-                    metric_score.value = cast(
+                    metric_score = deepcopy(sample_score)
+                    metric_score.score.value = cast(
+                        float, sample_score.score.value[metric_key]
+                    )
                     metric_scores.append(metric_score)
                 else:
                     raise TypeError(
@@ -291,8 +307,9 @@ def scorers_from_metric_dict(
         for target_metric in metric_list:
             # compute the metric value
             metric_name = registry_log_name(target_metric)
+            metric_params = registry_params(target_metric)
             if len(metric_scores) > 0:
-                value = target_metric
+                value = call_metric(target_metric, metric_scores)
             else:
                 value = float("Nan")

@@ -302,20 +319,17 @@ def scorers_from_metric_dict(
                 for key, val in value.items():
                     name = f"{metric_name}_{key}"
                     result_metrics[name] = EvalMetric(
-                        name=name,
-                        value=cast(float, val),
+                        name=name, value=cast(float, val), params=metric_params
                     )
             elif isinstance(value, list):
                 for idx, item in enumerate(value):
                     name = f"{metric_name}_{idx}"
                     result_metrics[name] = EvalMetric(
-                        name=name,
-                        value=cast(float, item),
+                        name=name, value=cast(float, item), params=metric_params
                     )
             else:
                 result_metrics[metric_name] = EvalMetric(
-                    name=metric_name,
-                    value=cast(float, value),
+                    name=metric_name, value=cast(float, value), params=metric_params
                 )

         # create a scorer result for this metric
@@ -336,6 +350,48 @@ def scorers_from_metric_dict(
     return results


+def call_metric(metric: Metric, sample_scores: list[SampleScore]) -> Value:
+    if is_metric_deprecated(metric):
+        warn_once(
+            logger,
+            f"Metric {registry_log_name(metric)} should be updated to take list[SampleScore]. "
+            f"Metrics with list[Score] are deprecated.",
+        )
+        scores = [sample_score.score for sample_score in sample_scores]
+        return metric(scores)
+    else:
+        metric = cast(MetricProtocol, metric)
+        return metric(sample_scores)
+
+
+def is_metric_deprecated(metric: Metric) -> TypeGuard[MetricDeprecated]:
+    """Type guard to check if a metric follows the deprecated signature."""
+    try:
+        # signature and params
+        sig = inspect.signature(metric)
+        param_types = get_type_hints(metric)
+
+        # there should be only one param, check it
+        first_param = next(iter(sig.parameters.values()), None)
+        if first_param is None:
+            # No parameters, who knows what this is, treat it as deprecated
+            return True
+
+        expected_type: Any = param_types.get(first_param.name, None)
+
+        if expected_type is None or expected_type is Any:
+            # no helpful type info, treat it as deprecated
+            return True
+
+        # Extract generic base type and arguments to check if it matches list[Score]
+        origin = get_origin(expected_type)
+        args = get_args(expected_type)
+
+        return origin is list and args == (Score,)
+    except (AttributeError, ValueError, TypeError):
+        return False
+
+
 def resolve_glob_metric_keys(
     metrics: dict[str, list[Metric]], base_score: Score
 ) -> dict[str, list[Metric]]:
@@ -375,7 +431,7 @@ def resolve_glob_metric_keys(

 def reduce_scores(
     scores: list[SampleScore], reducer: ScoreReducer
-) -> list[
+) -> list[SampleScore]:
     # Group the scores by sample_id
     grouped_scores: dict[str, list[SampleScore]] = defaultdict(list)
     for sample_score in scores:
@@ -383,16 +439,14 @@ def reduce_scores(
         grouped_scores[str(sample_score.sample_id)].append(sample_score)

     # reduce the scores
-    reduced_scores: list[
+    reduced_scores: list[SampleScore] = []
     for scores in grouped_scores.values():
         reduced = reducer([score.score for score in scores])
         reduced_scores.append(
-
+            SampleScore(
                 sample_id=scores[0].sample_id,
-
-
-                explanation=reduced.explanation,
-                metadata=reduced.metadata,
+                sample_metadata=scores[0].sample_metadata,
+                score=reduced,
             )
         )

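
The call_metric()/is_metric_deprecated() additions above dispatch on the metric's parameter type. A short sketch contrasting the two signatures follows (the @metric decorator and the Metric, Score, Value, and SampleScore types come from inspect_ai as shown in the imports above; the metric names and bodies are illustrative and assume numeric score values):

# Hedged sketch of the deprecated vs. current metric signatures distinguished above.
from inspect_ai.scorer import Metric, Score, Value, metric
from inspect_ai.scorer._metric import SampleScore

@metric
def legacy_mean() -> Metric:
    # deprecated signature: list[Score] -- routed through the warn_once() branch
    def compute(scores: list[Score]) -> Value:
        return sum(float(s.value) for s in scores) / len(scores)  # assumes numeric values
    return compute

@metric
def current_mean() -> Metric:
    # current signature: list[SampleScore] -- called directly by call_metric()
    def compute(scores: list[SampleScore]) -> Value:
        return sum(float(s.score.value) for s in scores) / len(scores)  # assumes numeric values
    return compute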