inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -40,9 +40,9 @@ def score_reducer(
|
|
40
40
|
"""Decorator for registering Score Reducers.
|
41
41
|
|
42
42
|
Args:
|
43
|
-
func
|
43
|
+
func: Function returning `ScoreReducer` targeted by
|
44
44
|
plain task decorator without attributes (e.g. `@score_reducer`)
|
45
|
-
name
|
45
|
+
name: Optional name for reducer. If the decorator has no name
|
46
46
|
argument then the name of the function will be used to automatically assign a name.
|
47
47
|
|
48
48
|
Returns:
|
@@ -5,7 +5,13 @@ from .._metric import Score
|
|
5
5
|
|
6
6
|
@runtime_checkable
|
7
7
|
class ScoreReducer(Protocol):
|
8
|
-
def __call__(self, scores: list[Score]) -> Score:
|
8
|
+
def __call__(self, scores: list[Score]) -> Score:
|
9
|
+
"""Reduce a set of scores to a single score.
|
10
|
+
|
11
|
+
Args:
|
12
|
+
scores: List of scores.
|
13
|
+
"""
|
14
|
+
...
|
9
15
|
|
10
16
|
@property
|
11
17
|
def __name__(self) -> str: ...
|
inspect_ai/scorer/_score.py
CHANGED
@@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]:
|
|
23
23
|
a task that does not have a scorer.
|
24
24
|
|
25
25
|
"""
|
26
|
+
from inspect_ai.log._transcript import ScoreEvent, transcript
|
27
|
+
|
26
28
|
scorers = _scorers.get(None)
|
27
29
|
target = _target.get(None)
|
28
30
|
if scorers is None or target is None:
|
@@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]:
|
|
30
32
|
"The score() function can only be called while executing a task with a scorer."
|
31
33
|
)
|
32
34
|
|
33
|
-
|
35
|
+
scores: list[Score] = []
|
36
|
+
for scorer in scorers:
|
37
|
+
score = await scorer(state, target)
|
38
|
+
scores.append(score)
|
39
|
+
transcript()._event(
|
40
|
+
ScoreEvent(score=score, target=target.target, intermediate=True)
|
41
|
+
)
|
42
|
+
|
43
|
+
return scores
|
34
44
|
|
35
45
|
|
36
46
|
def init_scoring_context(scorers: list[Scorer], target: Target) -> None:
|
inspect_ai/scorer/_scorer.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
from copy import deepcopy
|
2
|
+
from dataclasses import dataclass, field
|
1
3
|
from functools import wraps
|
2
4
|
from typing import (
|
3
5
|
Any,
|
@@ -9,38 +11,74 @@ from typing import (
|
|
9
11
|
)
|
10
12
|
|
11
13
|
from inspect_ai._util._async import is_callable_coroutine
|
14
|
+
from inspect_ai._util.error import PrerequisiteError
|
12
15
|
from inspect_ai._util.registry import (
|
13
16
|
RegistryInfo,
|
17
|
+
is_registry_object,
|
14
18
|
registry_add,
|
15
19
|
registry_create,
|
16
20
|
registry_info,
|
17
21
|
registry_name,
|
22
|
+
registry_params,
|
18
23
|
registry_tag,
|
19
24
|
registry_unqualified_name,
|
20
25
|
)
|
21
26
|
from inspect_ai.solver._task_state import TaskState
|
22
27
|
|
23
|
-
from ._metric import Metric, Score
|
28
|
+
from ._metric import Metric, MetricSpec, Score, as_metric_spec
|
24
29
|
from ._target import Target
|
25
30
|
|
26
31
|
|
27
32
|
@runtime_checkable
|
28
33
|
class Scorer(Protocol):
|
29
|
-
r"""Score model outputs.
|
30
|
-
|
31
|
-
Evaluate the passed outputs and targets and return a
|
32
|
-
dictionary with scoring outcomes and context.
|
33
|
-
|
34
|
-
Args:
|
35
|
-
state (TaskState): Task state
|
36
|
-
target (Target): Ideal target for the output.
|
37
|
-
"""
|
38
|
-
|
39
34
|
async def __call__(
|
40
35
|
self,
|
41
36
|
state: TaskState,
|
42
37
|
target: Target,
|
43
|
-
) -> Score:
|
38
|
+
) -> Score:
|
39
|
+
r"""Score model outputs.
|
40
|
+
|
41
|
+
Evaluate the passed outputs and targets and return a
|
42
|
+
`Score` with scoring outcomes and context.
|
43
|
+
|
44
|
+
Args:
|
45
|
+
state: Task state
|
46
|
+
target: Ideal target for the output.
|
47
|
+
|
48
|
+
Examples:
|
49
|
+
```python
|
50
|
+
@scorer
|
51
|
+
def custom_scorer() -> Scorer:
|
52
|
+
async def score(state: TaskState, target: Target) -> Score:
|
53
|
+
# Compare state / model output with target
|
54
|
+
# to yield a score
|
55
|
+
return Score(value=...)
|
56
|
+
|
57
|
+
return score
|
58
|
+
```
|
59
|
+
"""
|
60
|
+
...
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass(frozen=True)
|
64
|
+
class ScorerSpec:
|
65
|
+
"""Scorer specification used to (re-)create scorers."""
|
66
|
+
|
67
|
+
scorer: str
|
68
|
+
"""Scorer name"""
|
69
|
+
|
70
|
+
args: dict[str, Any] = field(default_factory=dict)
|
71
|
+
"""Scorer arguments."""
|
72
|
+
|
73
|
+
metadata: dict[str, Any] | None = field(default=None)
|
74
|
+
"""Scorer metadata"""
|
75
|
+
|
76
|
+
metrics: (
|
77
|
+
list[MetricSpec | dict[str, list[MetricSpec]]]
|
78
|
+
| dict[str, list[MetricSpec]]
|
79
|
+
| None
|
80
|
+
) = field(default=None)
|
81
|
+
"""Scorer metrics"""
|
44
82
|
|
45
83
|
|
46
84
|
P = ParamSpec("P")
|
@@ -90,17 +128,28 @@ def scorer(
|
|
90
128
|
r"""Decorator for registering scorers.
|
91
129
|
|
92
130
|
Args:
|
93
|
-
metrics
|
131
|
+
metrics: One or more metrics to calculate
|
94
132
|
over the scores.
|
95
|
-
name
|
96
|
-
Optional name for scorer. If the decorator has no name
|
133
|
+
name: Optional name for scorer. If the decorator has no name
|
97
134
|
argument then the name of the underlying ScorerType
|
98
135
|
object will be used to automatically assign a name.
|
99
|
-
**metadata
|
136
|
+
**metadata: Additional values to serialize
|
100
137
|
in metadata.
|
101
138
|
|
102
139
|
Returns:
|
103
140
|
Scorer with registry attributes.
|
141
|
+
|
142
|
+
Examples:
|
143
|
+
```python
|
144
|
+
@scorer
|
145
|
+
def custom_scorer() -> Scorer:
|
146
|
+
async def score(state: TaskState, target: Target) -> Score:
|
147
|
+
# Compare state / model output with target
|
148
|
+
# to yield a score
|
149
|
+
return Score(value=...)
|
150
|
+
|
151
|
+
return score
|
152
|
+
```
|
104
153
|
"""
|
105
154
|
|
106
155
|
def wrapper(scorer_type: Callable[P, Scorer]) -> Callable[P, Scorer]:
|
@@ -142,6 +191,51 @@ def scorer(
|
|
142
191
|
return wrapper
|
143
192
|
|
144
193
|
|
194
|
+
def as_scorer_spec(scorer: Scorer) -> ScorerSpec:
|
195
|
+
if not is_registry_object(scorer):
|
196
|
+
raise PrerequisiteError(
|
197
|
+
f"The scorer {getattr(scorer, '__name__', '<unknown>')} was not created by a function decorated with @scorer so cannot be recorded."
|
198
|
+
)
|
199
|
+
name = registry_unqualified_name(scorer)
|
200
|
+
metrics = scorer_metrics(scorer)
|
201
|
+
resolved_metrics = resolve_metrics(metrics)
|
202
|
+
|
203
|
+
args = registry_params(scorer)
|
204
|
+
metadata = deepcopy(registry_info(scorer).metadata)
|
205
|
+
del metadata[SCORER_METRICS]
|
206
|
+
|
207
|
+
return ScorerSpec(
|
208
|
+
scorer=name, args=args, metadata=metadata, metrics=resolved_metrics
|
209
|
+
)
|
210
|
+
|
211
|
+
|
212
|
+
def resolve_metrics(
|
213
|
+
metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]],
|
214
|
+
) -> (
|
215
|
+
list[MetricSpec | dict[str, list[MetricSpec]]] | dict[str, list[MetricSpec]] | None
|
216
|
+
):
|
217
|
+
if isinstance(metrics, list):
|
218
|
+
resolved_metrics: list[MetricSpec | dict[str, list[MetricSpec]]] = []
|
219
|
+
for metric_item in metrics:
|
220
|
+
if isinstance(metric_item, Metric):
|
221
|
+
resolved_metrics.append(as_metric_spec(metric_item))
|
222
|
+
else:
|
223
|
+
resolved_metrics.append(
|
224
|
+
{
|
225
|
+
metric_group: [
|
226
|
+
as_metric_spec(metric) for metric in metrics_list
|
227
|
+
]
|
228
|
+
for metric_group, metrics_list in metric_item.items()
|
229
|
+
}
|
230
|
+
)
|
231
|
+
return resolved_metrics
|
232
|
+
else:
|
233
|
+
return {
|
234
|
+
metric_group: [as_metric_spec(metric) for metric in metrics_list]
|
235
|
+
for metric_group, metrics_list in metrics.items()
|
236
|
+
}
|
237
|
+
|
238
|
+
|
145
239
|
def scorer_metrics(
|
146
240
|
scorer: Scorer,
|
147
241
|
) -> list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]:
|
inspect_ai/solver/__init__.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from inspect_ai._util.deprecation import relocated_module_attribute
|
2
2
|
|
3
3
|
from ._basic_agent import basic_agent
|
4
|
-
from ._bridge import bridge
|
4
|
+
from ._bridge.bridge import bridge
|
5
5
|
from ._chain import chain
|
6
6
|
from ._critique import self_critique
|
7
7
|
from ._fork import fork
|
@@ -81,31 +81,28 @@ def basic_agent(
|
|
81
81
|
alternate conversion scheme as required via `score_value`.
|
82
82
|
|
83
83
|
Args:
|
84
|
-
init: (
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
max_attempts (int): Maximum number of submissions to accept before terminating.
|
91
|
-
message_limit (int | None): Limit on messages in sample before terminating agent.
|
84
|
+
init: Agent initialisation (defaults to system_message with basic ReAct prompt)
|
85
|
+
tools: Tools available for the agent. Either a list of tools or a Solver that
|
86
|
+
can yield dynamic tools per-sample.
|
87
|
+
cache: Caching behaviour for generate responses (defaults to no caching).
|
88
|
+
max_attempts: Maximum number of submissions to accept before terminating.
|
89
|
+
message_limit: Limit on messages in sample before terminating agent.
|
92
90
|
If not specified, will use limit_messages defined for the task. If there is none
|
93
91
|
defined for the task, 50 will be used as a default.
|
94
|
-
token_limit
|
95
|
-
max_tool_output
|
92
|
+
token_limit: Limit on tokens used in sample before terminating agent.
|
93
|
+
max_tool_output: Maximum output length (in bytes).
|
96
94
|
Defaults to max_tool_output from active GenerateConfig.
|
97
|
-
score_value
|
98
|
-
|
99
|
-
incorrect_message
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
**kwargs (Any): Deprecated arguments for backward compatibility.
|
95
|
+
score_value: Function used to extract float from scores (defaults
|
96
|
+
to standard value_to_float())
|
97
|
+
incorrect_message: User message reply for an incorrect submission from the model.
|
98
|
+
Alternatively, a function which returns a message (function may optionally be async)
|
99
|
+
continue_message: User message to urge the model to continue when it
|
100
|
+
doesn't make a tool call.
|
101
|
+
submit_name: Name for tool used to make submissions
|
102
|
+
(defaults to 'submit')
|
103
|
+
submit_description: Description of submit tool (defaults to
|
104
|
+
'Submit an answer for evaluation')
|
105
|
+
**kwargs: Deprecated arguments for backward compatibility.
|
109
106
|
|
110
107
|
Returns:
|
111
108
|
Plan for agent.
|
@@ -17,7 +17,7 @@ from .._task_state import TaskState
|
|
17
17
|
def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
|
18
18
|
"""Bridge an external agent into an Inspect Solver.
|
19
19
|
|
20
|
-
See documentation at https://inspect.ai-safety-institute.org.uk/agent-bridge.html
|
20
|
+
See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
|
21
21
|
|
22
22
|
Args:
|
23
23
|
agent: Callable which takes a sample `dict` and returns a result `dict`.
|
@@ -63,11 +63,11 @@ def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solv
|
|
63
63
|
else state.input
|
64
64
|
)
|
65
65
|
|
66
|
-
# create sample
|
66
|
+
# create sample (use standard gpt-4 message encoding -- i.e. no 'developer' messages)
|
67
67
|
sample = BridgeSample(
|
68
68
|
sample_id=str(state.sample_id),
|
69
69
|
epoch=state.epoch,
|
70
|
-
input=await openai_chat_messages(input,
|
70
|
+
input=await openai_chat_messages(input, model="gpt-4"),
|
71
71
|
metadata=state.metadata,
|
72
72
|
target=list(state.target),
|
73
73
|
)
|
inspect_ai/solver/_chain.py
CHANGED
@@ -15,8 +15,7 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
|
|
15
15
|
early.
|
16
16
|
|
17
17
|
Args:
|
18
|
-
solvers
|
19
|
-
or lists of solvers to chain together.
|
18
|
+
*solvers: One or more solvers or lists of solvers to chain together.
|
20
19
|
|
21
20
|
Returns:
|
22
21
|
Solver that executes the passed solvers as a chain.
|
inspect_ai/solver/_critique.py
CHANGED
@@ -25,15 +25,15 @@ def self_critique(
|
|
25
25
|
need to use the model being evaluated).
|
26
26
|
|
27
27
|
Args:
|
28
|
-
critique_template
|
28
|
+
critique_template: String or path to file
|
29
29
|
containing critique template. The template uses two
|
30
30
|
variables: `question` and `completion`.
|
31
31
|
Variables from sample `metadata` are also available
|
32
32
|
in the template.
|
33
|
-
completion_template
|
33
|
+
completion_template: String or path to file
|
34
34
|
containing completion template. The template uses
|
35
35
|
three variables: `question`, `completion`, and `critique`
|
36
|
-
model
|
36
|
+
model: Alternate model to be used
|
37
37
|
for critique (by default the model being evaluated
|
38
38
|
is used).
|
39
39
|
"""
|
inspect_ai/solver/_fork.py
CHANGED
@@ -32,8 +32,8 @@ async def fork(
|
|
32
32
|
Store that doesn't affect the Store of other subtasks or the parent).
|
33
33
|
|
34
34
|
Args:
|
35
|
-
state
|
36
|
-
solvers
|
35
|
+
state: Beginning TaskState
|
36
|
+
solvers: Solvers to apply on the TaskState.
|
37
37
|
Each Solver will get a standalone copy of the TaskState.
|
38
38
|
|
39
39
|
Returns:
|
File without changes
|
@@ -30,14 +30,11 @@ def human_agent(
|
|
30
30
|
using a VS Code Window or Terminal.
|
31
31
|
|
32
32
|
Args:
|
33
|
-
answer
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
check their score while working.
|
39
|
-
record_session (bool): Record all user commands and outputs in
|
40
|
-
the sandbox bash session.
|
33
|
+
answer: Is an explicit answer required for this task or is it scored
|
34
|
+
based on files in the container? Pass a `str` with a regex to validate
|
35
|
+
that the answer matches the expected format.
|
36
|
+
intermediate_scoring: Allow the human agent to check their score while working.
|
37
|
+
record_session: Record all user commands and outputs in the sandbox bash session.
|
41
38
|
|
42
39
|
Returns:
|
43
40
|
Solver: Human agent solver.
|
@@ -27,14 +27,10 @@ class StartCommand(HumanAgentCommand):
|
|
27
27
|
print(call_human_agent("start"))
|
28
28
|
|
29
29
|
def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
|
30
|
-
from inspect_ai.log._transcript import transcript
|
31
|
-
|
32
30
|
async def start() -> str:
|
33
31
|
if not state.running:
|
34
32
|
state.running = True
|
35
|
-
|
36
|
-
f"Task started (total time: {format_progress_time(state.time)})"
|
37
|
-
)
|
33
|
+
clock_action_event("start", state)
|
38
34
|
return render_status(state)
|
39
35
|
|
40
36
|
return start
|
@@ -57,14 +53,22 @@ class StopCommand(HumanAgentCommand):
|
|
57
53
|
print(call_human_agent("stop"))
|
58
54
|
|
59
55
|
def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
|
60
|
-
from inspect_ai.log._transcript import transcript
|
61
|
-
|
62
56
|
async def stop() -> str:
|
63
57
|
if state.running:
|
64
58
|
state.running = False
|
65
|
-
|
66
|
-
f"Task stopped (total time: {format_progress_time(state.time)})"
|
67
|
-
)
|
59
|
+
clock_action_event("stop", state)
|
68
60
|
return render_status(state)
|
69
61
|
|
70
62
|
return stop
|
63
|
+
|
64
|
+
|
65
|
+
def clock_action_event(action: str, state: HumanAgentState) -> None:
|
66
|
+
from inspect_ai.log._transcript import transcript
|
67
|
+
|
68
|
+
transcript().info(
|
69
|
+
{
|
70
|
+
"action": action,
|
71
|
+
"total_time": format_progress_time(state.time, False),
|
72
|
+
},
|
73
|
+
source="human_agent",
|
74
|
+
)
|
@@ -1,6 +1,5 @@
|
|
1
1
|
from argparse import Namespace
|
2
2
|
from copy import deepcopy
|
3
|
-
from textwrap import dedent
|
4
3
|
from typing import Awaitable, Callable, Literal
|
5
4
|
|
6
5
|
from pydantic import JsonValue
|
@@ -51,8 +50,6 @@ class ScoreCommand(HumanAgentCommand):
|
|
51
50
|
|
52
51
|
def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
|
53
52
|
async def score_task(answer: str | None) -> str:
|
54
|
-
from inspect_ai.log._transcript import transcript
|
55
|
-
|
56
53
|
# make a copy of TaskState, add the answer, then score
|
57
54
|
if answer:
|
58
55
|
task_state = deepcopy(self._state)
|
@@ -64,14 +61,6 @@ class ScoreCommand(HumanAgentCommand):
|
|
64
61
|
# record the scoring action in our state
|
65
62
|
state.scorings.append(IntermediateScoring(time=state.time, scores=result))
|
66
63
|
|
67
|
-
# record to transcript
|
68
|
-
transcript().info(
|
69
|
-
dedent(f"""
|
70
|
-
### Intermediate Score
|
71
|
-
**Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
|
72
|
-
""")
|
73
|
-
)
|
74
|
-
|
75
64
|
# notify user
|
76
65
|
return render_text(
|
77
66
|
f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
|
@@ -1,13 +1,19 @@
|
|
1
|
+
import logging
|
1
2
|
import re
|
2
3
|
from enum import Enum
|
3
4
|
from random import Random
|
4
|
-
from typing import Match
|
5
|
+
from typing import Match, TypedDict
|
5
6
|
|
7
|
+
from typing_extensions import Unpack
|
8
|
+
|
9
|
+
from inspect_ai._util.logger import warn_once
|
6
10
|
from inspect_ai.util import resource
|
7
11
|
|
8
12
|
from ._solver import Generate, Solver, solver
|
9
13
|
from ._task_state import Choices, TaskState
|
10
14
|
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
11
17
|
SINGLE_ANSWER_TEMPLATE = r"""
|
12
18
|
Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
|
13
19
|
|
@@ -201,52 +207,58 @@ class MultipleChoiceTemplate(str, Enum):
|
|
201
207
|
MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
|
202
208
|
|
203
209
|
|
210
|
+
class DeprecatedArgs(TypedDict, total=False):
|
211
|
+
shuffle: bool | Random
|
212
|
+
|
213
|
+
|
204
214
|
@solver
|
205
215
|
def multiple_choice(
|
206
216
|
*,
|
207
217
|
template: str | None = None,
|
208
218
|
cot: bool = False,
|
209
219
|
multiple_correct: bool = False,
|
210
|
-
|
220
|
+
**kwargs: Unpack[DeprecatedArgs],
|
211
221
|
) -> Solver:
|
212
|
-
"""Multiple choice question solver.
|
213
|
-
|
214
|
-
Formats a multiple choice question prompt, then calls `generate()`
|
215
|
-
|
216
|
-
### Usage
|
222
|
+
"""Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
|
217
223
|
|
218
224
|
Note that due to the way this solver works, it has some constraints:
|
219
225
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
### Shuffling
|
225
|
-
|
226
|
-
If the choices are shuffled, we will unshuffle them in the message history
|
227
|
-
after the model has been called, essentially rewriting history. It is
|
228
|
-
something to be aware of if writing custom scorers or solvers that interact
|
229
|
-
with this scorer.
|
226
|
+
1. The `Sample` must have the `choices` attribute set.
|
227
|
+
2. The only built-in compatible scorer is the `choice` scorer.
|
228
|
+
3. It calls `generate()` internally, so you don't need to call it again.
|
230
229
|
|
231
230
|
Args:
|
232
|
-
template
|
231
|
+
template: Template to use for the multiple choice question.
|
233
232
|
The defaults vary based on the options and are taken from the `MultipleChoiceTemplate` enum. The template will have questions and possible answers substituted into it before being sent to the model. Consequently it requires three specific template variables:
|
234
|
-
|
235
|
-
|
233
|
+
|
234
|
+
- `{question}`: The question to be asked.
|
235
|
+
- `{choices}`: The choices available, which will be formatted as a
|
236
236
|
list of A) ... B) ... etc. before sending to the model.
|
237
|
-
|
237
|
+
- `{letters}`: (optional) A string of letters representing the choices, e.g.
|
238
238
|
"A,B,C". Used to be explicit to the model about the possible answers.
|
239
|
-
cot
|
239
|
+
cot: Default `False`. Whether the solver should perform chain-of-thought
|
240
240
|
reasoning before answering. NOTE: this has no effect if you provide a custom template.
|
241
|
-
multiple_correct
|
241
|
+
multiple_correct: Default `False`. Whether to allow multiple
|
242
242
|
answers to the multiple choice question. For example, "What numbers are
|
243
243
|
squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
|
244
244
|
as `False` if there's exactly one correct answer from the choices
|
245
245
|
available. NOTE: this has no effect if you provide a custom template.
|
246
|
-
|
247
|
-
|
248
|
-
|
246
|
+
**kwargs: Deprecated arguments for backward compatibility.
|
247
|
+
|
248
|
+
#### Shuffling
|
249
|
+
|
250
|
+
You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
|
249
251
|
"""
|
252
|
+
shuffle: bool | Random = False
|
253
|
+
if "shuffle" in kwargs:
|
254
|
+
shuffle = kwargs["shuffle"]
|
255
|
+
|
256
|
+
if shuffle:
|
257
|
+
warn_once(
|
258
|
+
logger,
|
259
|
+
"The multiple choice shuffle parameter is deprecated. Please shuffle choices at the time your dataset is read by using the shuffle_choices method/parameter of the datasets API.",
|
260
|
+
)
|
261
|
+
|
250
262
|
if template and not valid_template(template):
|
251
263
|
raise ValueError(
|
252
264
|
"The template must contain '{question}' and '{choices}' placeholders for string substitution."
|
inspect_ai/solver/_prompt.py
CHANGED
@@ -20,8 +20,8 @@ def prompt_template(template: str, **params: Any) -> Solver:
|
|
20
20
|
`params`.
|
21
21
|
|
22
22
|
Args:
|
23
|
-
template:
|
24
|
-
**params
|
23
|
+
template: Template for prompt.
|
24
|
+
**params: Parameters to fill into the template.
|
25
25
|
|
26
26
|
Returns:
|
27
27
|
A solver that uses the specified prompt template.
|
@@ -51,8 +51,8 @@ def system_message(template: str, **params: Any) -> Solver:
|
|
51
51
|
are none it will be inserted at the beginning of the conversation).
|
52
52
|
|
53
53
|
Args:
|
54
|
-
template
|
55
|
-
**params
|
54
|
+
template: Template for system message.
|
55
|
+
**params: Parameters to fill into the template.
|
56
56
|
|
57
57
|
Returns:
|
58
58
|
A solver that inserts the parameterised system message.
|
@@ -80,8 +80,8 @@ def user_message(template: str, **params: Any) -> Solver:
|
|
80
80
|
included in the `params`.
|
81
81
|
|
82
82
|
Args:
|
83
|
-
template
|
84
|
-
**params
|
83
|
+
template: Template for user message.
|
84
|
+
**params: Parameters to fill into the template.
|
85
85
|
|
86
86
|
Returns:
|
87
87
|
A solver that inserts the parameterised user message.
|
@@ -109,7 +109,7 @@ def chain_of_thought(template: str = DEFAULT_COT_TEMPLATE) -> Solver:
|
|
109
109
|
"""Solver which modifies the user prompt to encourage chain of thought.
|
110
110
|
|
111
111
|
Args:
|
112
|
-
template
|
112
|
+
template: String or path to file containing CoT template.
|
113
113
|
The template uses a single variable: `prompt`.
|
114
114
|
"""
|
115
115
|
|