PyPI - inspect-ai - Versions diffs - 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl - Mend

inspect-ai 0.3.62 (py3-none-any.whl) → 0.3.64 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (518) hide show

inspect_ai/_cli/cache.py +8 -7
inspect_ai/_cli/common.py +0 -12
inspect_ai/_cli/eval.py +32 -4
inspect_ai/_cli/info.py +1 -0
inspect_ai/_cli/list.py +1 -1
inspect_ai/_cli/log.py +2 -0
inspect_ai/_cli/main.py +1 -1
inspect_ai/_cli/sandbox.py +4 -1
inspect_ai/_cli/score.py +181 -32
inspect_ai/_cli/trace.py +10 -0
inspect_ai/_cli/view.py +4 -2
inspect_ai/_display/core/active.py +2 -3
inspect_ai/_display/core/config.py +7 -1
inspect_ai/_display/textual/widgets/samples.py +4 -3
inspect_ai/_display/textual/widgets/sandbox.py +6 -0
inspect_ai/_eval/eval.py +104 -101
inspect_ai/_eval/evalset.py +75 -75
inspect_ai/_eval/loader.py +122 -12
inspect_ai/_eval/registry.py +1 -1
inspect_ai/_eval/run.py +14 -0
inspect_ai/_eval/score.py +125 -36
inspect_ai/_eval/task/log.py +105 -4
inspect_ai/_eval/task/results.py +92 -38
inspect_ai/_eval/task/run.py +9 -2
inspect_ai/_eval/task/sandbox.py +35 -2
inspect_ai/_eval/task/task.py +49 -46
inspect_ai/_util/constants.py +1 -1
inspect_ai/_util/content.py +8 -0
inspect_ai/_util/error.py +2 -0
inspect_ai/_util/file.py +15 -1
inspect_ai/_util/hash.py +1 -1
inspect_ai/_util/logger.py +4 -2
inspect_ai/_util/registry.py +7 -1
inspect_ai/_view/view.py +1 -2
inspect_ai/_view/www/.vscode/extensions.json +3 -0
inspect_ai/_view/www/.vscode/settings.json +8 -0
inspect_ai/_view/www/App.css +97 -29
inspect_ai/_view/www/README.md +1 -1
inspect_ai/_view/www/dist/assets/index.css +16663 -14674
inspect_ai/_view/www/dist/assets/index.js +58808 -51348
inspect_ai/_view/www/dist/index.html +1 -1
inspect_ai/_view/www/index.html +2 -2
inspect_ai/_view/www/log-schema.json +87 -73
inspect_ai/_view/www/package.json +22 -4
inspect_ai/_view/www/postcss.config.cjs +8 -9
inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
inspect_ai/_view/www/src/api/api-browser.ts +2 -2
inspect_ai/_view/www/src/api/api-http.ts +3 -5
inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
inspect_ai/_view/www/src/api/client-api.ts +4 -4
inspect_ai/_view/www/src/api/index.ts +4 -4
inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
inspect_ai/_view/www/src/appearance/colors.ts +9 -0
inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
inspect_ai/_view/www/src/appearance/icons.ts +100 -0
inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
inspect_ai/_view/www/src/components/Card.css +60 -0
inspect_ai/_view/www/src/components/Card.tsx +109 -0
inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
inspect_ai/_view/www/src/components/FindBand.css +49 -0
inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
inspect_ai/_view/www/src/components/MessageBand.css +43 -0
inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
inspect_ai/_view/www/src/components/ToolButton.css +3 -0
inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
inspect_ai/_view/www/src/metadata/types.ts +18 -0
inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
inspect_ai/_view/www/src/samples/error/error.ts +15 -0
inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
inspect_ai/_view/www/src/types/log.d.ts +108 -19
inspect_ai/_view/www/src/types/prism.d.ts +11 -0
inspect_ai/_view/www/src/types.ts +71 -0
inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
inspect_ai/_view/www/src/utils/attachments.ts +42 -0
inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
inspect_ai/_view/www/src/utils/debugging.ts +28 -0
inspect_ai/_view/www/src/utils/dom.ts +30 -0
inspect_ai/_view/www/src/utils/format.ts +194 -0
inspect_ai/_view/www/src/utils/git.ts +7 -0
inspect_ai/_view/www/src/utils/html.ts +6 -0
inspect_ai/_view/www/src/utils/http.ts +14 -0
inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
inspect_ai/_view/www/src/utils/queue.ts +51 -0
inspect_ai/_view/www/src/utils/sync.ts +114 -0
inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
inspect_ai/_view/www/src/utils/vscode.ts +13 -0
inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
inspect_ai/_view/www/src/workspace/types.ts +10 -0
inspect_ai/_view/www/src/workspace/utils.ts +34 -0
inspect_ai/_view/www/tsconfig.json +23 -9
inspect_ai/_view/www/vite.config.js +8 -17
inspect_ai/_view/www/yarn.lock +627 -556
inspect_ai/approval/_approval.py +2 -0
inspect_ai/approval/_approver.py +4 -4
inspect_ai/approval/_auto.py +1 -1
inspect_ai/approval/_human/approver.py +3 -0
inspect_ai/approval/_policy.py +5 -0
inspect_ai/approval/_registry.py +2 -2
inspect_ai/dataset/_dataset.py +64 -37
inspect_ai/dataset/_sources/__init__.py +0 -0
inspect_ai/dataset/_sources/csv.py +20 -12
inspect_ai/dataset/_sources/file.py +4 -0
inspect_ai/dataset/_sources/hf.py +39 -29
inspect_ai/dataset/_sources/json.py +17 -9
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_convert.py +3 -3
inspect_ai/log/_file.py +24 -9
inspect_ai/log/_log.py +101 -13
inspect_ai/log/_message.py +4 -2
inspect_ai/log/_recorders/file.py +4 -0
inspect_ai/log/_recorders/json.py +5 -7
inspect_ai/log/_recorders/recorder.py +3 -0
inspect_ai/log/_transcript.py +19 -8
inspect_ai/model/__init__.py +2 -0
inspect_ai/model/_cache.py +39 -21
inspect_ai/model/_call_tools.py +4 -3
inspect_ai/model/_chat_message.py +14 -4
inspect_ai/model/_generate_config.py +1 -1
inspect_ai/model/_model.py +31 -24
inspect_ai/model/_model_output.py +14 -1
inspect_ai/model/_openai.py +10 -18
inspect_ai/model/_providers/anthropic.py +3 -3
inspect_ai/model/_providers/google.py +9 -5
inspect_ai/model/_providers/openai.py +5 -9
inspect_ai/model/_providers/openai_o1.py +3 -5
inspect_ai/model/_providers/openrouter.py +86 -0
inspect_ai/model/_providers/providers.py +11 -0
inspect_ai/scorer/__init__.py +6 -1
inspect_ai/scorer/_answer.py +7 -7
inspect_ai/scorer/_classification.py +38 -18
inspect_ai/scorer/_common.py +2 -8
inspect_ai/scorer/_match.py +4 -5
inspect_ai/scorer/_metric.py +87 -28
inspect_ai/scorer/_metrics/__init__.py +3 -3
inspect_ai/scorer/_metrics/accuracy.py +8 -10
inspect_ai/scorer/_metrics/mean.py +3 -17
inspect_ai/scorer/_metrics/std.py +111 -30
inspect_ai/scorer/_model.py +12 -12
inspect_ai/scorer/_pattern.py +3 -3
inspect_ai/scorer/_reducer/reducer.py +36 -21
inspect_ai/scorer/_reducer/registry.py +2 -2
inspect_ai/scorer/_reducer/types.py +7 -1
inspect_ai/scorer/_score.py +11 -1
inspect_ai/scorer/_scorer.py +110 -16
inspect_ai/solver/__init__.py +1 -1
inspect_ai/solver/_basic_agent.py +19 -22
inspect_ai/solver/_bridge/__init__.py +0 -3
inspect_ai/solver/_bridge/bridge.py +3 -3
inspect_ai/solver/_chain.py +1 -2
inspect_ai/solver/_critique.py +3 -3
inspect_ai/solver/_fork.py +2 -2
inspect_ai/solver/_human_agent/__init__.py +0 -0
inspect_ai/solver/_human_agent/agent.py +5 -8
inspect_ai/solver/_human_agent/commands/clock.py +14 -10
inspect_ai/solver/_human_agent/commands/note.py +1 -1
inspect_ai/solver/_human_agent/commands/score.py +0 -11
inspect_ai/solver/_multiple_choice.py +38 -26
inspect_ai/solver/_prompt.py +7 -7
inspect_ai/solver/_solver.py +53 -52
inspect_ai/solver/_task_state.py +80 -69
inspect_ai/solver/_use_tools.py +9 -9
inspect_ai/tool/__init__.py +4 -1
inspect_ai/tool/_tool.py +43 -14
inspect_ai/tool/_tool_call.py +6 -2
inspect_ai/tool/_tool_choice.py +3 -1
inspect_ai/tool/_tool_def.py +10 -8
inspect_ai/tool/_tool_params.py +24 -0
inspect_ai/tool/_tool_with.py +7 -7
inspect_ai/tool/_tools/__init__.py +0 -0
inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_execute.py +23 -11
inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
inspect_ai/tool/_tools/_web_search.py +7 -5
inspect_ai/tool/beta.py +3 -0
inspect_ai/util/_concurrency.py +3 -3
inspect_ai/util/_panel.py +2 -0
inspect_ai/util/_resource.py +12 -12
inspect_ai/util/_sandbox/docker/compose.py +23 -20
inspect_ai/util/_sandbox/docker/config.py +2 -1
inspect_ai/util/_sandbox/docker/docker.py +42 -86
inspect_ai/util/_sandbox/docker/service.py +100 -0
inspect_ai/util/_sandbox/environment.py +99 -96
inspect_ai/util/_sandbox/self_check.py +124 -16
inspect_ai/util/_subprocess.py +5 -3
inspect_ai/util/_subtask.py +15 -16
{inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
{inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
inspect_ai-0.3.64.dist-info/RECORD +625 -0
inspect_ai/_view/www/src/Register.mjs +0 -3
inspect_ai/_view/www/src/Types.mjs +0 -38
inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
inspect_ai/_view/www/src/components/Card.mjs +0 -126
inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
inspect_ai/_view/www/src/components/Tools.mjs +0 -376
inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
inspect_ai/_view/www/src/components/ansi-output.js +0 -932
inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
inspect_ai/_view/www/src/utils/Format.mjs +0 -260
inspect_ai/_view/www/src/utils/Git.mjs +0 -12
inspect_ai/_view/www/src/utils/Html.mjs +0 -21
inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
inspect_ai/_view/www/src/utils/http.mjs +0 -18
inspect_ai/_view/www/src/utils/queue.mjs +0 -67
inspect_ai/_view/www/src/utils/sync.mjs +0 -101
inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
inspect_ai/tool/beta/__init__.py +0 -5
inspect_ai-0.3.62.dist-info/RECORD +0 -481
/inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
/inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
/inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
/inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
{inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0

inspect_ai/scorer/_metric.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from dataclasses import dataclass, field
 from logging import getLogger
 from typing import (
     Any,
@@ -12,11 +13,15 @@ from typing import (
 from pydantic import BaseModel, Field
+from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.registry import (
     RegistryInfo,
+    is_registry_object,
     registry_add,
     registry_create,
+    registry_info,
     registry_name,
+    registry_params,
     registry_tag,
 )
@@ -43,19 +48,12 @@ Value = Union[
 """Value provided by a score.
 Use the methods of `Score` to easily treat
-the Value as a simple scalar of various types.
+the `Value` as a simple scalar of various types.
 """
 class Score(BaseModel):
-    """Score generated by a scorer.
-    Args:
-       value (Value): Score value.
-       answer (str | None): Answer extracted from model output (optional).
-       explanation (str | None): Explanation of score (optional).
-       metadata (dict[str,Any]): Additional metadata related to the score.
-    """
+    """Score generated by a scorer."""
     value: Value
     """Score value."""
@@ -112,12 +110,7 @@ class Score(BaseModel):
 class SampleScore(BaseModel):
-    """Score for a Sample
-    Args:
-       score: Score
-       sample_id: (str | int | None) Unique id of a sample
-    """
+    """Score for a Sample."""
     score: Score
     """A score"""
@@ -125,6 +118,9 @@ class SampleScore(BaseModel):
     sample_id: str | int | None = Field(default=None)
     """A sample id"""
+    sample_metadata: dict[str, Any] | None = Field(default=None)
+    """Metadata from the sample"""
     scorer: str | None = Field(default=None)
     """Registry name of scorer that created this score."""
@@ -188,22 +184,57 @@ def value_to_float(
 @runtime_checkable
-class Metric(Protocol):
-    r"""Evaluate scores using a metric.
-    Args:
-        scores (list[Score]): List of scores.
+class MetricDeprecated(Protocol):
+    def __call__(self, scores: list[Score]) -> Value: ...
-    Returns:
-        Metric value
-    """
-    def __call__(self, scores: list[Score]) -> Value: ...
+@runtime_checkable
+class MetricProtocol(Protocol):
+    def __call__(self, scores: list[SampleScore]) -> Value:
+        r"""Compute a metric on a list of scores.
+        Args:
+          scores: List of scores.
+        Returns:
+          Metric value
+        Examples:
+          ```python
+          @metric
+          def mean() -> Metric:
+              def metric(scores: list[SampleScore]) -> Value:
+                  return np.mean([score.score.as_float() for score in scores]).item()
+              return metric
+          ```
+        """
+        ...
+Metric = MetricProtocol | MetricDeprecated
+"""Metric protocol.
+The Metric signature changed in release v0.3.64. Both
+the previous and new signatures are supported -- you
+should use `MetricProtocol` for new code as the
+deprecated signature will eventually be removed.
+"""
 P = ParamSpec("P")
+@dataclass(frozen=True)
+class MetricSpec:
+    """Metric specification used to (re-)create metrics."""
+    metric: str
+    """Metric name"""
+    args: dict[str, Any] = field(default_factory=dict)
+    """Metric arguments."""
 def metric_register(metric: Callable[P, Metric], name: str = "") -> Callable[P, Metric]:
     r"""Register a function or class as a metric.
@@ -237,6 +268,26 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
     return cast(Metric, registry_create("metric", name, **kwargs))
+def to_metric_specs(
+    metrics: list[Metric] | dict[str, list[Metric]],
+) -> list[MetricSpec] | dict[str, list[MetricSpec]]:
+    if isinstance(metrics, list):
+        return [as_metric_spec(m) for m in metrics]
+    else:
+        return {
+            k: [as_metric_spec(v) for v in metric_list]
+            for k, metric_list in metrics.items()
+        }
+def as_metric_spec(metric: Metric) -> MetricSpec:
+    if not is_registry_object(metric):
+        raise PrerequisiteError(
+            f"The metric {getattr(metric, '__name__', '<unknown>')} was not created by a function decorated with @metric so cannot be recorded."
+        )
+    return MetricSpec(metric=registry_info(metric).name, args=registry_params(metric))
 @overload
 def metric(name: str) -> Callable[[Callable[P, Metric]], Callable[P, Metric]]: ...
@@ -252,10 +303,18 @@ def metric(
     r"""Decorator for registering metrics.
     Args:
-        name: (str | MetricType):
-            Optional name for metric. If the decorator has no name
-            argument then the name of the underlying MetricType
-            will be used to automatically assign a name.
+      name: Optional name for metric. If the decorator has no name
+        argument then the name of the underlying MetricType
+        will be used to automatically assign a name.
+    Examples:
+      ```python
+      @metric
+      def mean() -> Metric:
+          def metric(scores: list[SampleScore]) -> Value:
+              return np.mean([score.score.as_float() for score in scores]).item()
+          return metric
+      ```
     """
     # create_metric_wrapper:

inspect_ai/scorer/_metrics/__init__.py CHANGED Viewed

@@ -1,12 +1,12 @@
 from .accuracy import accuracy
-from .mean import mean, var
-from .std import bootstrap_stderr, std, stderr
+from .mean import mean
+from .std import bootstrap_stderr, std, stderr, var
 __all__ = [
     "accuracy",
     "mean",
-    "var",
     "bootstrap_stderr",
     "std",
     "stderr",
+    "var",
 ]

inspect_ai/scorer/_metrics/accuracy.py CHANGED Viewed

@@ -2,7 +2,7 @@ from logging import getLogger
 from .._metric import (
     Metric,
-    Score,
+    SampleScore,
     ValueToFloat,
     metric,
     value_to_float,
@@ -16,22 +16,20 @@ def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric:
     r"""Compute proportion of total answers which are correct.
     Args:
-      to_float (ValueToFloat): Function for mapping
-        Value to float for computing metrics. The default
-        `value_to_float()` maps CORRECT ("C") to 1.0,
-        INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
-        NOANSWER ("N") to 0, casts numeric values to
-        float directly, and prints a warning and returns
-        0 if the Value is a complex object (list or dict).
+       to_float: Function for mapping `Value` to float for computing
+          metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
+          INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
+          casts numeric values to float directly, and prints a warning and returns
+          0 if the Value is a complex object (list or dict).
     Returns:
        Accuracy metric
     """
-    def metric(scores: list[Score]) -> float:
+    def metric(scores: list[SampleScore]) -> float:
         total = 0.0
         for item in scores:
-            total += to_float(item.value)
+            total += to_float(item.score.value)
         return total / float(len(scores))
     return metric

inspect_ai/scorer/_metrics/mean.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import numpy as np
-from .._metric import Metric, Score, metric
+from .._metric import Metric, SampleScore, metric
 @metric
@@ -11,21 +11,7 @@ def mean() -> Metric:
        mean metric
     """
-    def metric(scores: list[Score]) -> float:
-        return np.mean([score.as_float() for score in scores]).item()
-    return metric
-@metric
-def var() -> Metric:
-    """Compute variance over all scores.
-    Returns:
-       var metric
-    """
-    def metric(scores: list[Score]) -> float:
-        return np.var([score.as_float() for score in scores]).item()
+    def metric(scores: list[SampleScore]) -> float:
+        return np.mean([score.score.as_float() for score in scores]).item()
     return metric

inspect_ai/scorer/_metrics/std.py CHANGED Viewed

@@ -5,7 +5,7 @@ import numpy as np
 from .._metric import (
     Metric,
-    Score,
+    SampleScore,
     ValueToFloat,
     metric,
     value_to_float,
@@ -21,21 +21,21 @@ def bootstrap_stderr(
     """Standard error of the mean using bootstrap.
     Args:
-       num_samples (int): Number of bootstrap samples to take.
-       to_float (ValueToFloat): Function for mapping
-         Value to float for computing metrics. The default
-         `value_to_float()` maps CORRECT ("C") to 1.0,
-         INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
-         NOANSWER ("N") to 0, casts numeric values to
-         float directly, and prints a warning and returns
-         0 if the Value is a complex object (list or dict).
+       num_samples: Number of bootstrap samples to take.
+       to_float: Function for mapping
+          Value to float for computing metrics. The default
+          `value_to_float()` maps CORRECT ("C") to 1.0,
+          INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
+          NOANSWER ("N") to 0, casts numeric values to
+          float directly, and prints a warning and returns
+          0 if the Value is a complex object (list or dict).
     Returns:
        bootstrap_stderr metric
     """
-    def metric(scores: list[Score]) -> float:
-        values = [to_float(score.value) for score in scores]
+    def metric(scores: list[SampleScore]) -> float:
+        values = [to_float(score.score.value) for score in scores]
         std = np.std(
             [
                 np.mean(np.random.choice(values, len(values), replace=True))
@@ -48,24 +48,71 @@ def bootstrap_stderr(
 @metric
-def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
+def stderr(
+    to_float: ValueToFloat = value_to_float(), cluster: str | None = None
+) -> Metric:
     """Standard error of the mean using Central Limit Theorem.
     Args:
-        to_float (ValueToFloat): Function for mapping
-            Value to float for computing metrics. The default
-            `value_to_float()` maps CORRECT ("C") to 1.0,
-            INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
-            NOANSWER ("N") to 0, casts numeric values to
-            float directly, and prints a warning and returns
-            0 if the Value is a complex object (list or dict).
+       to_float: Function for mapping `Value` to float for computing
+          metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
+          INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
+          casts numeric values to float directly, and prints a warning and returns
+          0 if the Value is a complex object (list or dict).
+       cluster (str | None): The key from the Sample metadata
+          corresponding to a cluster identifier for computing
+          [clustered standard errors](https://en.wikipedia.org/wiki/Clustered_standard_errors).
     Returns:
-        stderr metric
+       stderr metric
     """
-    def metric(scores: list[Score]) -> float:
-        values = [to_float(score.value) for score in scores]
+    def clustered_metric(scores: list[SampleScore]) -> float:
+        """Computes a clustered standard error.
+        For details, see Appendix A of https://arxiv.org/pdf/2411.00640.
+        The version here uses a finite cluster correction (unlike the paper)
+        """
+        assert cluster is not None
+        cluster_list = []
+        value_list = []
+        for sample_score in scores:
+            if (
+                sample_score.sample_metadata is None
+                or cluster not in sample_score.sample_metadata
+            ):
+                raise ValueError(
+                    f"Sample {sample_score.sample_id} has no cluster metadata. To compute `stderr` with clustering, each sample metadata must have a value for '{cluster}'"
+                )
+            cluster_list.append(sample_score.sample_metadata[cluster])
+            value_list.append(to_float(sample_score.score.value))
+        clusters = np.array(cluster_list)
+        values = np.array(value_list)
+        mean = float(np.mean(values))
+        # Convert to numpy arrays and get unique clusters
+        unique_clusters = np.unique(clusters)
+        cluster_count = len(unique_clusters)
+        # Compute clustered variance using NumPy operations
+        clustered_variance = 0.0
+        for cluster_id in unique_clusters:
+            # get a data vector for this cluster
+            cluster_data = values[clusters == cluster_id]
+            # this computes X' \Omega X = \sum_i \sum_j (s_{i,c} - mean) * (s_{j,c} - mean)
+            clustered_variance += np.outer(
+                cluster_data - mean, cluster_data - mean
+            ).sum()
+        # Multiply by C / (C - 1) to unbias the variance estimate
+        standard_error = np.sqrt(
+            clustered_variance * cluster_count / (cluster_count - 1)
+        ) / len(scores)
+        return cast(float, standard_error)
+    def metric(scores: list[SampleScore]) -> float:
+        values = [to_float(score.score.value) for score in scores]
         n = len(values)
         # standard deviation is calculated by dividing by n-ddof so ensure
@@ -81,6 +128,9 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
         return cast(float, standard_error)
+    if cluster is not None:
+        return clustered_metric
     return metric
@@ -88,6 +138,39 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
 def std(to_float: ValueToFloat = value_to_float()) -> Metric:
     """Calculates the sample standard deviation of a list of scores.
+    Args:
+       to_float: Function for mapping `Value` to float for computing
+          metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
+          INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
+          casts numeric values to float directly, and prints a warning and returns
+          0 if the Value is a complex object (list or dict).
+    Returns:
+        std metric
+    """
+    def metric(scores: list[SampleScore]) -> float:
+        values = [to_float(score.score.value) for score in scores]
+        n = len(values)
+        # standard deviation is calculated by dividing by n-ddof so ensure
+        # that we won't divide by zero
+        if (n - 1) < 1:
+            return 0
+        # Calculate the sample standard deviation
+        sample_std = np.std(values, ddof=1)
+        return cast(float, sample_std)
+    return metric
+@metric
+def var(to_float: ValueToFloat = value_to_float()) -> Metric:
+    """Compute the sample variance of a list of scores.
     Args:
         to_float (ValueToFloat): Function for mapping
             Value to float for computing metrics. The default
@@ -98,21 +181,19 @@ def std(to_float: ValueToFloat = value_to_float()) -> Metric:
             0 if the Value is a complex object (list or dict).
     Returns:
-        std metric
+       var metric
     """
-    def metric(scores: list[Score]) -> float:
-        values = [to_float(score.value) for score in scores]
+    def metric(scores: list[SampleScore]) -> float:
+        values = [to_float(score.score.value) for score in scores]
         n = len(values)
-        # standard deviation is calculated by dividing by n-ddof so ensure
+        # variance is calculated by dividing by n-ddof so ensure
         # that we won't divide by zero
         if (n - 1) < 1:
             return 0
-        # Calculate the sample standard deviation
-        sample_std = np.std(values, ddof=1)
+        variance = np.var(values, ddof=1)
-        return cast(float, sample_std)
+        return cast(float, variance)
     return metric

inspect_ai/scorer/_model.py CHANGED Viewed

@@ -35,31 +35,31 @@ def model_graded_fact(
     """Score a question/answer task with a fact response using a model.
     Args:
-      template (str): Template for grading prompt. This template uses
+      template: Template for grading prompt. This template uses
         four variables: `question`, `criterion`, `answer`, and
         `instructions` (which is fed from the `instructions` parameter).
         Variables from sample `metadata` are also available in the template.
-      instructions (str): Grading instructions. This should
+      instructions: Grading instructions. This should
         include a prompt for the model to answer (e.g. with
         chain of thought reasoning) in a way that matches
         the specified `grade_pattern`, for example, the default
         `grade_pattern` looks for one of GRADE: C, GRADE: P, or
         GRADE: I.
-      grade_pattern (str): Regex to extract the grade from the
+      grade_pattern: Regex to extract the grade from the
         model response. Defaults to looking for e.g. GRADE: C
         The regex should have a single capture group that
         extracts exactly the letter C, P, or I.
-      include_history (bool | Callable[[TaskState], str]):
+      include_history:
         Whether to include the full chat history in the presented
         question. Defaults to `False`, which presents only the
         original sample input. Optionally provide a function to
         customise how the chat history is presented.
-      partial_credit (bool): Whether to allow for "partial" credit for
+      partial_credit: Whether to allow for "partial" credit for
          answers (by default assigned a score of 0.5). Defaults
          to `False`. Note that this parameter is only used
          with the default `instructions` (as custom instructions
          provide their own prompts for grades).
-      model (list[str | Model] | str | Model | None): Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
+      model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
     """
     return model_graded_qa(
         template=template if template else DEFAULT_MODEL_GRADED_FACT_TEMPLATE,
@@ -83,32 +83,32 @@ def model_graded_qa(
     """Score a question/answer task using a model.
     Args:
-      template (str): Template for grading prompt. This template has
+      template: Template for grading prompt. This template has
         four variables:
            - `question`, `criterion`, `answer`, and
         `instructions` (which is fed from the `instructions` parameter).
         Variables from sample `metadata` are also available in the template.
-      instructions (str): Grading instructions. This should
+      instructions: Grading instructions. This should
         include a prompt for the model to answer (e.g. with
         chain of thought reasoning) in a way that matches
         the specified `grade_pattern`, for example, the default
         `grade_pattern` looks for one of GRADE: C, GRADE: P, or
         GRADE: I.
-      grade_pattern (str): Regex to extract the grade from the
+      grade_pattern: Regex to extract the grade from the
         model response. Defaults to looking for e.g. GRADE: C
         The regex should have a single capture group that
         extracts exactly the letter C, P, I.
-      include_history (bool | Callable[[TaskState], str]):
+      include_history:
         Whether to include the full chat history in the presented
         question. Defaults to `False`, which presents only the
         original sample input. Optionally provide a function to
         customise how the chat history is presented.
-      partial_credit (bool): Whether to allow for "partial" credit for
+      partial_credit: Whether to allow for "partial" credit for
         answers (by default assigned a score of 0.5). Defaults
         to `False`. Note that this parameter is only used
         with the default `instructions` (as custom instructions
         provide their own prompts for grades).
-      model (list[str | Model] | str | Model | None): Model or Models to use for grading. If     multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
+      model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
     """
     # bind variables
     get_scorer = partial(

inspect_ai/scorer/_pattern.py CHANGED Viewed

@@ -55,11 +55,11 @@ def pattern(pattern: str, ignore_case: bool = True, match_all: bool = False) ->
     to match either one or all of the extracted groups
     Args:
-       pattern (str): Regular expression for extracting the
+       pattern: Regular expression for extracting the
           answer from model output.
-       ignore_case (bool): Ignore case when comparing
+       ignore_case: Ignore case when comparing
           the extracted answer to the targets. (Default: True)
-       match_all (bool): With multiple captures, do all captured
+       match_all: With multiple captures, do all captured
           values need to match the target? (Default: False)
     """

inspect_ai/scorer/_reducer/reducer.py CHANGED Viewed

@@ -12,6 +12,8 @@ from .types import ScoreReducer
 @score_reducer(name="mode")
 def mode_score() -> ScoreReducer:
+    r"""Take the mode from a list of scores."""
     def reduce(scores: list[Score]) -> Score:
         r"""A utility function for the most common score in a list of scores.
@@ -36,12 +38,13 @@ def mode_score() -> ScoreReducer:
 @score_reducer(name="mean")
 def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
-    def reduce(scores: list[Score]) -> Score:
-        r"""A utility function for taking a mean value over a list of scores.
+    r"""Take the mean of a list of scores.
-        Args:
-            scores: a list of Scores.
-        """
+    Args:
+       value_to_float: Function to convert the value to a float
+    """
+    def reduce(scores: list[Score]) -> Score:
         if isinstance(scores[0].value, dict):
             return _compute_dict_stat(scores, value_to_float, statistics.mean)
         elif isinstance(scores[0].value, list):
@@ -54,12 +57,13 @@ def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
 @score_reducer(name="median")
 def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
-    def reduce(scores: list[Score]) -> Score:
-        r"""A utility function for taking a median value over a list of scores.
+    r"""Take the median value from a list of scores.
-        Args:
-            scores: a list of Scores.
-        """
+    Args:
+       value_to_float: Function to convert the value to a float
+    """
+    def reduce(scores: list[Score]) -> Score:
         if isinstance(scores[0].value, dict):
             return _compute_dict_stat(scores, value_to_float, statistics.median)
         elif isinstance(scores[0].value, list):
@@ -74,13 +78,15 @@ def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReduce
 def at_least(
     k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
 ) -> ScoreReducer:
-    def reduce(scores: list[Score]) -> Score:
-        r"""A utility function for scoring a value as correct if there are at least n score values greater than or equal to the value
+    r"""Score correct if there are at least k score values greater than or equal to the value.
-        Args:
-            scores: a list of Scores.
-        """
+    Args:
+       k: Number of score values that must be greater than or equal to `value`.
+       value: Score value threshold.
+       value_to_float: Function to convert score values to float.
+    """
+    def reduce(scores: list[Score]) -> Score:
         def gte_n(
             counter: Counter[str | int | float | bool],
         ) -> str | int | float | bool:
@@ -104,6 +110,14 @@ def at_least(
 def pass_at(
     k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
 ) -> ScoreReducer:
+    r"""Probability of at least 1 correct sample given `k` epochs (<https://arxiv.org/pdf/2107.03374>).
+    Args:
+       k: Epochs to compute probability for.
+       value: Score value threshold.
+       value_to_float: Function to convert score values to float.
+    """
     def reduce(scores: list[Score]) -> Score:
         def pass_at_k(values: list[float]) -> float:
             total = len(scores)
@@ -129,12 +143,13 @@ def pass_at(
 @score_reducer(name="max")
 def max_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
-    def reduce(scores: list[Score]) -> Score:
-        r"""A utility function for taking the maximum value from a list of scores
+    r"""Take the maximum value from a list of scores.
-        Args:
-            scores: a list of Scores.
-        """
+    Args:
+       value_to_float: Function to convert the value to a float
+    """
+    def reduce(scores: list[Score]) -> Score:
         if isinstance(scores[0].value, dict):
             dict_result: dict[str, str | int | float | bool | None] = {}
             keys = scores[0].value.keys()  # type: ignore
@@ -238,7 +253,7 @@ def _compute_dict_stat(
     Args:
         scores: a list of Scores.
-        value_to_float: function to convert the value to a float
+        value_to_float: Function to convert the value to a float
         statistic: the statistic to apply
     """
     # Make sure these are all dictionaries before we proceed

inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

inspect-ai 0.3.62py3-none-any.whl → 0.3.64py3-none-any.whl