inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/scorer/_metric.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
from dataclasses import dataclass, field
|
1
2
|
from logging import getLogger
|
2
3
|
from typing import (
|
3
4
|
Any,
|
@@ -12,11 +13,15 @@ from typing import (
|
|
12
13
|
|
13
14
|
from pydantic import BaseModel, Field
|
14
15
|
|
16
|
+
from inspect_ai._util.error import PrerequisiteError
|
15
17
|
from inspect_ai._util.registry import (
|
16
18
|
RegistryInfo,
|
19
|
+
is_registry_object,
|
17
20
|
registry_add,
|
18
21
|
registry_create,
|
22
|
+
registry_info,
|
19
23
|
registry_name,
|
24
|
+
registry_params,
|
20
25
|
registry_tag,
|
21
26
|
)
|
22
27
|
|
@@ -43,19 +48,12 @@ Value = Union[
|
|
43
48
|
"""Value provided by a score.
|
44
49
|
|
45
50
|
Use the methods of `Score` to easily treat
|
46
|
-
the Value as a simple scalar of various types.
|
51
|
+
the `Value` as a simple scalar of various types.
|
47
52
|
"""
|
48
53
|
|
49
54
|
|
50
55
|
class Score(BaseModel):
|
51
|
-
"""Score generated by a scorer.
|
52
|
-
|
53
|
-
Args:
|
54
|
-
value (Value): Score value.
|
55
|
-
answer (str | None): Answer extracted from model output (optional).
|
56
|
-
explanation (str | None): Explanation of score (optional).
|
57
|
-
metadata (dict[str,Any]): Additional metadata related to the score.
|
58
|
-
"""
|
56
|
+
"""Score generated by a scorer."""
|
59
57
|
|
60
58
|
value: Value
|
61
59
|
"""Score value."""
|
@@ -112,12 +110,7 @@ class Score(BaseModel):
|
|
112
110
|
|
113
111
|
|
114
112
|
class SampleScore(BaseModel):
|
115
|
-
"""Score for a Sample
|
116
|
-
|
117
|
-
Args:
|
118
|
-
score: Score
|
119
|
-
sample_id: (str | int | None) Unique id of a sample
|
120
|
-
"""
|
113
|
+
"""Score for a Sample."""
|
121
114
|
|
122
115
|
score: Score
|
123
116
|
"""A score"""
|
@@ -125,6 +118,9 @@ class SampleScore(BaseModel):
|
|
125
118
|
sample_id: str | int | None = Field(default=None)
|
126
119
|
"""A sample id"""
|
127
120
|
|
121
|
+
sample_metadata: dict[str, Any] | None = Field(default=None)
|
122
|
+
"""Metadata from the sample"""
|
123
|
+
|
128
124
|
scorer: str | None = Field(default=None)
|
129
125
|
"""Registry name of scorer that created this score."""
|
130
126
|
|
@@ -188,22 +184,57 @@ def value_to_float(
|
|
188
184
|
|
189
185
|
|
190
186
|
@runtime_checkable
|
191
|
-
class
|
192
|
-
|
193
|
-
|
194
|
-
Args:
|
195
|
-
scores (list[Score]): List of scores.
|
187
|
+
class MetricDeprecated(Protocol):
|
188
|
+
def __call__(self, scores: list[Score]) -> Value: ...
|
196
189
|
|
197
|
-
Returns:
|
198
|
-
Metric value
|
199
|
-
"""
|
200
190
|
|
201
|
-
|
191
|
+
@runtime_checkable
|
192
|
+
class MetricProtocol(Protocol):
|
193
|
+
def __call__(self, scores: list[SampleScore]) -> Value:
|
194
|
+
r"""Compute a metric on a list of scores.
|
195
|
+
|
196
|
+
Args:
|
197
|
+
scores: List of scores.
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
Metric value
|
201
|
+
|
202
|
+
Examples:
|
203
|
+
```python
|
204
|
+
@metric
|
205
|
+
def mean() -> Metric:
|
206
|
+
def metric(scores: list[SampleScore]) -> Value:
|
207
|
+
return np.mean([score.score.as_float() for score in scores]).item()
|
208
|
+
return metric
|
209
|
+
```
|
210
|
+
"""
|
211
|
+
...
|
212
|
+
|
213
|
+
|
214
|
+
Metric = MetricProtocol | MetricDeprecated
|
215
|
+
"""Metric protocol.
|
216
|
+
|
217
|
+
The Metric signature changed in release v0.3.64. Both
|
218
|
+
the previous and new signatures are supported -- you
|
219
|
+
should use `MetricProtocol` for new code as the
|
220
|
+
deprecated signature will eventually be removed.
|
221
|
+
"""
|
202
222
|
|
203
223
|
|
204
224
|
P = ParamSpec("P")
|
205
225
|
|
206
226
|
|
227
|
+
@dataclass(frozen=True)
|
228
|
+
class MetricSpec:
|
229
|
+
"""Metric specification used to (re-)create metrics."""
|
230
|
+
|
231
|
+
metric: str
|
232
|
+
"""Metric name"""
|
233
|
+
|
234
|
+
args: dict[str, Any] = field(default_factory=dict)
|
235
|
+
"""Metric arguments."""
|
236
|
+
|
237
|
+
|
207
238
|
def metric_register(metric: Callable[P, Metric], name: str = "") -> Callable[P, Metric]:
|
208
239
|
r"""Register a function or class as a metric.
|
209
240
|
|
@@ -237,6 +268,26 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
|
|
237
268
|
return cast(Metric, registry_create("metric", name, **kwargs))
|
238
269
|
|
239
270
|
|
271
|
+
def to_metric_specs(
|
272
|
+
metrics: list[Metric] | dict[str, list[Metric]],
|
273
|
+
) -> list[MetricSpec] | dict[str, list[MetricSpec]]:
|
274
|
+
if isinstance(metrics, list):
|
275
|
+
return [as_metric_spec(m) for m in metrics]
|
276
|
+
else:
|
277
|
+
return {
|
278
|
+
k: [as_metric_spec(v) for v in metric_list]
|
279
|
+
for k, metric_list in metrics.items()
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
def as_metric_spec(metric: Metric) -> MetricSpec:
|
284
|
+
if not is_registry_object(metric):
|
285
|
+
raise PrerequisiteError(
|
286
|
+
f"The metric {getattr(metric, '__name__', '<unknown>')} was not created by a function decorated with @metric so cannot be recorded."
|
287
|
+
)
|
288
|
+
return MetricSpec(metric=registry_info(metric).name, args=registry_params(metric))
|
289
|
+
|
290
|
+
|
240
291
|
@overload
|
241
292
|
def metric(name: str) -> Callable[[Callable[P, Metric]], Callable[P, Metric]]: ...
|
242
293
|
|
@@ -252,10 +303,18 @@ def metric(
|
|
252
303
|
r"""Decorator for registering metrics.
|
253
304
|
|
254
305
|
Args:
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
306
|
+
name: Optional name for metric. If the decorator has no name
|
307
|
+
argument then the name of the underlying MetricType
|
308
|
+
will be used to automatically assign a name.
|
309
|
+
|
310
|
+
Examples:
|
311
|
+
```python
|
312
|
+
@metric
|
313
|
+
def mean() -> Metric:
|
314
|
+
def metric(scores: list[SampleScore]) -> Value:
|
315
|
+
return np.mean([score.score.as_float() for score in scores]).item()
|
316
|
+
return metric
|
317
|
+
```
|
259
318
|
"""
|
260
319
|
|
261
320
|
# create_metric_wrapper:
|
@@ -1,12 +1,12 @@
|
|
1
1
|
from .accuracy import accuracy
|
2
|
-
from .mean import mean
|
3
|
-
from .std import bootstrap_stderr, std, stderr
|
2
|
+
from .mean import mean
|
3
|
+
from .std import bootstrap_stderr, std, stderr, var
|
4
4
|
|
5
5
|
__all__ = [
|
6
6
|
"accuracy",
|
7
7
|
"mean",
|
8
|
-
"var",
|
9
8
|
"bootstrap_stderr",
|
10
9
|
"std",
|
11
10
|
"stderr",
|
11
|
+
"var",
|
12
12
|
]
|
@@ -2,7 +2,7 @@ from logging import getLogger
|
|
2
2
|
|
3
3
|
from .._metric import (
|
4
4
|
Metric,
|
5
|
-
|
5
|
+
SampleScore,
|
6
6
|
ValueToFloat,
|
7
7
|
metric,
|
8
8
|
value_to_float,
|
@@ -16,22 +16,20 @@ def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric:
|
|
16
16
|
r"""Compute proportion of total answers which are correct.
|
17
17
|
|
18
18
|
Args:
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
float directly, and prints a warning and returns
|
25
|
-
0 if the Value is a complex object (list or dict).
|
19
|
+
to_float: Function for mapping `Value` to float for computing
|
20
|
+
metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
|
21
|
+
INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
|
22
|
+
casts numeric values to float directly, and prints a warning and returns
|
23
|
+
0 if the Value is a complex object (list or dict).
|
26
24
|
|
27
25
|
Returns:
|
28
26
|
Accuracy metric
|
29
27
|
"""
|
30
28
|
|
31
|
-
def metric(scores: list[
|
29
|
+
def metric(scores: list[SampleScore]) -> float:
|
32
30
|
total = 0.0
|
33
31
|
for item in scores:
|
34
|
-
total += to_float(item.value)
|
32
|
+
total += to_float(item.score.value)
|
35
33
|
return total / float(len(scores))
|
36
34
|
|
37
35
|
return metric
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import numpy as np
|
2
2
|
|
3
|
-
from .._metric import Metric,
|
3
|
+
from .._metric import Metric, SampleScore, metric
|
4
4
|
|
5
5
|
|
6
6
|
@metric
|
@@ -11,21 +11,7 @@ def mean() -> Metric:
|
|
11
11
|
mean metric
|
12
12
|
"""
|
13
13
|
|
14
|
-
def metric(scores: list[
|
15
|
-
return np.mean([score.as_float() for score in scores]).item()
|
16
|
-
|
17
|
-
return metric
|
18
|
-
|
19
|
-
|
20
|
-
@metric
|
21
|
-
def var() -> Metric:
|
22
|
-
"""Compute variance over all scores.
|
23
|
-
|
24
|
-
Returns:
|
25
|
-
var metric
|
26
|
-
"""
|
27
|
-
|
28
|
-
def metric(scores: list[Score]) -> float:
|
29
|
-
return np.var([score.as_float() for score in scores]).item()
|
14
|
+
def metric(scores: list[SampleScore]) -> float:
|
15
|
+
return np.mean([score.score.as_float() for score in scores]).item()
|
30
16
|
|
31
17
|
return metric
|
@@ -5,7 +5,7 @@ import numpy as np
|
|
5
5
|
|
6
6
|
from .._metric import (
|
7
7
|
Metric,
|
8
|
-
|
8
|
+
SampleScore,
|
9
9
|
ValueToFloat,
|
10
10
|
metric,
|
11
11
|
value_to_float,
|
@@ -21,21 +21,21 @@ def bootstrap_stderr(
|
|
21
21
|
"""Standard error of the mean using bootstrap.
|
22
22
|
|
23
23
|
Args:
|
24
|
-
num_samples
|
25
|
-
to_float
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
24
|
+
num_samples: Number of bootstrap samples to take.
|
25
|
+
to_float: Function for mapping
|
26
|
+
Value to float for computing metrics. The default
|
27
|
+
`value_to_float()` maps CORRECT ("C") to 1.0,
|
28
|
+
INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
|
29
|
+
NOANSWER ("N") to 0, casts numeric values to
|
30
|
+
float directly, and prints a warning and returns
|
31
|
+
0 if the Value is a complex object (list or dict).
|
32
32
|
|
33
33
|
Returns:
|
34
34
|
bootstrap_stderr metric
|
35
35
|
"""
|
36
36
|
|
37
|
-
def metric(scores: list[
|
38
|
-
values = [to_float(score.value) for score in scores]
|
37
|
+
def metric(scores: list[SampleScore]) -> float:
|
38
|
+
values = [to_float(score.score.value) for score in scores]
|
39
39
|
std = np.std(
|
40
40
|
[
|
41
41
|
np.mean(np.random.choice(values, len(values), replace=True))
|
@@ -48,24 +48,71 @@ def bootstrap_stderr(
|
|
48
48
|
|
49
49
|
|
50
50
|
@metric
|
51
|
-
def stderr(
|
51
|
+
def stderr(
|
52
|
+
to_float: ValueToFloat = value_to_float(), cluster: str | None = None
|
53
|
+
) -> Metric:
|
52
54
|
"""Standard error of the mean using Central Limit Theorem.
|
53
55
|
|
54
56
|
Args:
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
57
|
+
to_float: Function for mapping `Value` to float for computing
|
58
|
+
metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
|
59
|
+
INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
|
60
|
+
casts numeric values to float directly, and prints a warning and returns
|
61
|
+
0 if the Value is a complex object (list or dict).
|
62
|
+
cluster (str | None): The key from the Sample metadata
|
63
|
+
corresponding to a cluster identifier for computing
|
64
|
+
[clustered standard errors](https://en.wikipedia.org/wiki/Clustered_standard_errors).
|
62
65
|
|
63
66
|
Returns:
|
64
|
-
|
67
|
+
stderr metric
|
65
68
|
"""
|
66
69
|
|
67
|
-
def
|
68
|
-
|
70
|
+
def clustered_metric(scores: list[SampleScore]) -> float:
|
71
|
+
"""Computes a clustered standard error.
|
72
|
+
|
73
|
+
For details, see Appendix A of https://arxiv.org/pdf/2411.00640.
|
74
|
+
The version here uses a finite cluster correction (unlike the paper)
|
75
|
+
"""
|
76
|
+
assert cluster is not None
|
77
|
+
cluster_list = []
|
78
|
+
value_list = []
|
79
|
+
for sample_score in scores:
|
80
|
+
if (
|
81
|
+
sample_score.sample_metadata is None
|
82
|
+
or cluster not in sample_score.sample_metadata
|
83
|
+
):
|
84
|
+
raise ValueError(
|
85
|
+
f"Sample {sample_score.sample_id} has no cluster metadata. To compute `stderr` with clustering, each sample metadata must have a value for '{cluster}'"
|
86
|
+
)
|
87
|
+
cluster_list.append(sample_score.sample_metadata[cluster])
|
88
|
+
value_list.append(to_float(sample_score.score.value))
|
89
|
+
clusters = np.array(cluster_list)
|
90
|
+
values = np.array(value_list)
|
91
|
+
mean = float(np.mean(values))
|
92
|
+
|
93
|
+
# Convert to numpy arrays and get unique clusters
|
94
|
+
unique_clusters = np.unique(clusters)
|
95
|
+
cluster_count = len(unique_clusters)
|
96
|
+
|
97
|
+
# Compute clustered variance using NumPy operations
|
98
|
+
clustered_variance = 0.0
|
99
|
+
for cluster_id in unique_clusters:
|
100
|
+
# get a data vector for this cluster
|
101
|
+
cluster_data = values[clusters == cluster_id]
|
102
|
+
# this computes X' \Omega X = \sum_i \sum_j (s_{i,c} - mean) * (s_{j,c} - mean)
|
103
|
+
clustered_variance += np.outer(
|
104
|
+
cluster_data - mean, cluster_data - mean
|
105
|
+
).sum()
|
106
|
+
|
107
|
+
# Multiply by C / (C - 1) to unbias the variance estimate
|
108
|
+
standard_error = np.sqrt(
|
109
|
+
clustered_variance * cluster_count / (cluster_count - 1)
|
110
|
+
) / len(scores)
|
111
|
+
|
112
|
+
return cast(float, standard_error)
|
113
|
+
|
114
|
+
def metric(scores: list[SampleScore]) -> float:
|
115
|
+
values = [to_float(score.score.value) for score in scores]
|
69
116
|
n = len(values)
|
70
117
|
|
71
118
|
# standard deviation is calculated by dividing by n-ddof so ensure
|
@@ -81,6 +128,9 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
|
|
81
128
|
|
82
129
|
return cast(float, standard_error)
|
83
130
|
|
131
|
+
if cluster is not None:
|
132
|
+
return clustered_metric
|
133
|
+
|
84
134
|
return metric
|
85
135
|
|
86
136
|
|
@@ -88,6 +138,39 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
|
|
88
138
|
def std(to_float: ValueToFloat = value_to_float()) -> Metric:
|
89
139
|
"""Calculates the sample standard deviation of a list of scores.
|
90
140
|
|
141
|
+
Args:
|
142
|
+
to_float: Function for mapping `Value` to float for computing
|
143
|
+
metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
|
144
|
+
INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
|
145
|
+
casts numeric values to float directly, and prints a warning and returns
|
146
|
+
0 if the Value is a complex object (list or dict).
|
147
|
+
|
148
|
+
|
149
|
+
Returns:
|
150
|
+
std metric
|
151
|
+
"""
|
152
|
+
|
153
|
+
def metric(scores: list[SampleScore]) -> float:
|
154
|
+
values = [to_float(score.score.value) for score in scores]
|
155
|
+
n = len(values)
|
156
|
+
|
157
|
+
# standard deviation is calculated by dividing by n-ddof so ensure
|
158
|
+
# that we won't divide by zero
|
159
|
+
if (n - 1) < 1:
|
160
|
+
return 0
|
161
|
+
|
162
|
+
# Calculate the sample standard deviation
|
163
|
+
sample_std = np.std(values, ddof=1)
|
164
|
+
|
165
|
+
return cast(float, sample_std)
|
166
|
+
|
167
|
+
return metric
|
168
|
+
|
169
|
+
|
170
|
+
@metric
|
171
|
+
def var(to_float: ValueToFloat = value_to_float()) -> Metric:
|
172
|
+
"""Compute the sample variance of a list of scores.
|
173
|
+
|
91
174
|
Args:
|
92
175
|
to_float (ValueToFloat): Function for mapping
|
93
176
|
Value to float for computing metrics. The default
|
@@ -98,21 +181,19 @@ def std(to_float: ValueToFloat = value_to_float()) -> Metric:
|
|
98
181
|
0 if the Value is a complex object (list or dict).
|
99
182
|
|
100
183
|
Returns:
|
101
|
-
|
184
|
+
var metric
|
102
185
|
"""
|
103
186
|
|
104
|
-
def metric(scores: list[
|
105
|
-
values = [to_float(score.value) for score in scores]
|
187
|
+
def metric(scores: list[SampleScore]) -> float:
|
188
|
+
values = [to_float(score.score.value) for score in scores]
|
106
189
|
n = len(values)
|
107
|
-
|
108
|
-
# standard deviation is calculated by dividing by n-ddof so ensure
|
190
|
+
# variance is calculated by dividing by n-ddof so ensure
|
109
191
|
# that we won't divide by zero
|
110
192
|
if (n - 1) < 1:
|
111
193
|
return 0
|
112
194
|
|
113
|
-
|
114
|
-
sample_std = np.std(values, ddof=1)
|
195
|
+
variance = np.var(values, ddof=1)
|
115
196
|
|
116
|
-
return cast(float,
|
197
|
+
return cast(float, variance)
|
117
198
|
|
118
199
|
return metric
|
inspect_ai/scorer/_model.py
CHANGED
@@ -35,31 +35,31 @@ def model_graded_fact(
|
|
35
35
|
"""Score a question/answer task with a fact response using a model.
|
36
36
|
|
37
37
|
Args:
|
38
|
-
template
|
38
|
+
template: Template for grading prompt. This template uses
|
39
39
|
four variables: `question`, `criterion`, `answer`, and
|
40
40
|
`instructions` (which is fed from the `instructions` parameter).
|
41
41
|
Variables from sample `metadata` are also available in the template.
|
42
|
-
instructions
|
42
|
+
instructions: Grading instructions. This should
|
43
43
|
include a prompt for the model to answer (e.g. with
|
44
44
|
chain of thought reasoning) in a way that matches
|
45
45
|
the specified `grade_pattern`, for example, the default
|
46
46
|
`grade_pattern` looks for one of GRADE: C, GRADE: P, or
|
47
47
|
GRADE: I).
|
48
|
-
grade_pattern
|
48
|
+
grade_pattern: Regex to extract the grade from the
|
49
49
|
model response. Defaults to looking for e.g. GRADE: C
|
50
50
|
The regex should have a single capture group that
|
51
51
|
extracts exactly the letter C, P, or I.
|
52
|
-
include_history
|
52
|
+
include_history:
|
53
53
|
Whether to include the full chat history in the presented
|
54
54
|
question. Defaults to `False`, which presents only the
|
55
55
|
original sample input. Optionally provide a function to
|
56
56
|
customise how the chat history is presented.
|
57
|
-
partial_credit
|
57
|
+
partial_credit: Whether to allow for "partial" credit for
|
58
58
|
answers (by default assigned a score of 0.5). Defaults
|
59
59
|
to `False`. Note that this parameter is only used
|
60
60
|
with the default `instructions` (as custom instructions
|
61
61
|
provide their own prompts for grades).
|
62
|
-
model
|
62
|
+
model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
|
63
63
|
"""
|
64
64
|
return model_graded_qa(
|
65
65
|
template=template if template else DEFAULT_MODEL_GRADED_FACT_TEMPLATE,
|
@@ -83,32 +83,32 @@ def model_graded_qa(
|
|
83
83
|
"""Score a question/answer task using a model.
|
84
84
|
|
85
85
|
Args:
|
86
|
-
template
|
86
|
+
template: Template for grading prompt. This template has
|
87
87
|
four variables:
|
88
88
|
- `question`, `criterion`, `answer`, and
|
89
89
|
`instructions` (which is fed from the `instructions` parameter).
|
90
90
|
Variables from sample `metadata` are also available in the template.
|
91
|
-
instructions
|
91
|
+
instructions: Grading instructions. This should
|
92
92
|
include a prompt for the model to answer (e.g. with
|
93
93
|
chain of thought reasoning) in a way that matches
|
94
94
|
the specified `grade_pattern`, for example, the default
|
95
95
|
`grade_pattern` looks for one of GRADE: C, GRADE: P, or
|
96
96
|
GRADE: I.
|
97
|
-
grade_pattern
|
97
|
+
grade_pattern: Regex to extract the grade from the
|
98
98
|
model response. Defaults to looking for e.g. GRADE: C
|
99
99
|
The regex should have a single capture group that
|
100
100
|
extracts exactly the letter C, P, I.
|
101
|
-
include_history
|
101
|
+
include_history:
|
102
102
|
Whether to include the full chat history in the presented
|
103
103
|
question. Defaults to `False`, which presents only the
|
104
104
|
original sample input. Optionally provide a function to
|
105
105
|
customise how the chat history is presented.
|
106
|
-
partial_credit
|
106
|
+
partial_credit: Whether to allow for "partial" credit for
|
107
107
|
answers (by default assigned a score of 0.5). Defaults
|
108
108
|
to `False`. Note that this parameter is only used
|
109
109
|
with the default `instructions` (as custom instructions
|
110
110
|
provide their own prompts for grades).
|
111
|
-
model
|
111
|
+
model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
|
112
112
|
"""
|
113
113
|
# bind variables
|
114
114
|
get_scorer = partial(
|
inspect_ai/scorer/_pattern.py
CHANGED
@@ -55,11 +55,11 @@ def pattern(pattern: str, ignore_case: bool = True, match_all: bool = False) ->
|
|
55
55
|
to match either one or all of the extracted groups
|
56
56
|
|
57
57
|
Args:
|
58
|
-
pattern
|
58
|
+
pattern: Regular expression for extracting the
|
59
59
|
answer from model output.
|
60
|
-
ignore_case
|
60
|
+
ignore_case: Ignore case when comparing
|
61
61
|
the extracted answer to the targets. (Default: True)
|
62
|
-
match_all
|
62
|
+
match_all: With multiple captures, do all captured
|
63
63
|
values need to match the target? (Default: False)
|
64
64
|
"""
|
65
65
|
|
@@ -12,6 +12,8 @@ from .types import ScoreReducer
|
|
12
12
|
|
13
13
|
@score_reducer(name="mode")
|
14
14
|
def mode_score() -> ScoreReducer:
|
15
|
+
r"""Take the mode from a list of scores."""
|
16
|
+
|
15
17
|
def reduce(scores: list[Score]) -> Score:
|
16
18
|
r"""A utility function for the most common score in a list of scores.
|
17
19
|
|
@@ -36,12 +38,13 @@ def mode_score() -> ScoreReducer:
|
|
36
38
|
|
37
39
|
@score_reducer(name="mean")
|
38
40
|
def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
|
39
|
-
|
40
|
-
r"""A utility function for taking a mean value over a list of scores.
|
41
|
+
r"""Take the mean of a list of scores.
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
Args:
|
44
|
+
value_to_float: Function to convert the value to a float
|
45
|
+
"""
|
46
|
+
|
47
|
+
def reduce(scores: list[Score]) -> Score:
|
45
48
|
if isinstance(scores[0].value, dict):
|
46
49
|
return _compute_dict_stat(scores, value_to_float, statistics.mean)
|
47
50
|
elif isinstance(scores[0].value, list):
|
@@ -54,12 +57,13 @@ def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
|
|
54
57
|
|
55
58
|
@score_reducer(name="median")
|
56
59
|
def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
|
57
|
-
|
58
|
-
r"""A utility function for taking a median value over a list of scores.
|
60
|
+
r"""Take the median value from a list of scores.
|
59
61
|
|
60
|
-
|
61
|
-
|
62
|
-
|
62
|
+
Args:
|
63
|
+
value_to_float: Function to convert the value to a float
|
64
|
+
"""
|
65
|
+
|
66
|
+
def reduce(scores: list[Score]) -> Score:
|
63
67
|
if isinstance(scores[0].value, dict):
|
64
68
|
return _compute_dict_stat(scores, value_to_float, statistics.median)
|
65
69
|
elif isinstance(scores[0].value, list):
|
@@ -74,13 +78,15 @@ def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReduce
|
|
74
78
|
def at_least(
|
75
79
|
k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
|
76
80
|
) -> ScoreReducer:
|
77
|
-
|
78
|
-
r"""A utility function for scoring a value as correct if there are at least n score values greater than or equal to the value
|
81
|
+
r"""Score correct if there are at least k score values greater than or equal to the value.
|
79
82
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
+
Args:
|
84
|
+
k: Number of score values that must be greater than or equal to `value`.
|
85
|
+
value: Score value threshold.
|
86
|
+
value_to_float: Function to convert score values to float.
|
87
|
+
"""
|
83
88
|
|
89
|
+
def reduce(scores: list[Score]) -> Score:
|
84
90
|
def gte_n(
|
85
91
|
counter: Counter[str | int | float | bool],
|
86
92
|
) -> str | int | float | bool:
|
@@ -104,6 +110,14 @@ def at_least(
|
|
104
110
|
def pass_at(
|
105
111
|
k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
|
106
112
|
) -> ScoreReducer:
|
113
|
+
r"""Probability of at least 1 correct sample given `k` epochs (<https://arxiv.org/pdf/2107.03374>).
|
114
|
+
|
115
|
+
Args:
|
116
|
+
k: Epochs to compute probability for.
|
117
|
+
value: Score value threshold.
|
118
|
+
value_to_float: Function to convert score values to float.
|
119
|
+
"""
|
120
|
+
|
107
121
|
def reduce(scores: list[Score]) -> Score:
|
108
122
|
def pass_at_k(values: list[float]) -> float:
|
109
123
|
total = len(scores)
|
@@ -129,12 +143,13 @@ def pass_at(
|
|
129
143
|
|
130
144
|
@score_reducer(name="max")
|
131
145
|
def max_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
|
132
|
-
|
133
|
-
r"""A utility function for taking the maximum value from a list of scores
|
146
|
+
r"""Take the maximum value from a list of scores.
|
134
147
|
|
135
|
-
|
136
|
-
|
137
|
-
|
148
|
+
Args:
|
149
|
+
value_to_float: Function to convert the value to a float
|
150
|
+
"""
|
151
|
+
|
152
|
+
def reduce(scores: list[Score]) -> Score:
|
138
153
|
if isinstance(scores[0].value, dict):
|
139
154
|
dict_result: dict[str, str | int | float | bool | None] = {}
|
140
155
|
keys = scores[0].value.keys() # type: ignore
|
@@ -238,7 +253,7 @@ def _compute_dict_stat(
|
|
238
253
|
|
239
254
|
Args:
|
240
255
|
scores: a list of Scores.
|
241
|
-
value_to_float:
|
256
|
+
value_to_float: Function to convert the value to a float
|
242
257
|
statistic: the statistic to apply
|
243
258
|
"""
|
244
259
|
# Make sure these are all dictionaries before we proceed
|