inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py
CHANGED
@@ -89,67 +89,67 @@ def eval(
|
|
89
89
|
r"""Evaluate tasks using a Model.
|
90
90
|
|
91
91
|
Args:
|
92
|
-
tasks:
|
92
|
+
tasks: Task(s) to evaluate. If None, attempt
|
93
93
|
to evaluate a task in the current working directory
|
94
|
-
model
|
94
|
+
model: Model(s) for
|
95
95
|
evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
|
96
96
|
environment variable.
|
97
|
-
model_base_url:
|
97
|
+
model_base_url: Base URL for communicating
|
98
98
|
with the model API.
|
99
|
-
model_args
|
99
|
+
model_args: Model creation args
|
100
100
|
(as a dictionary or as a path to a JSON or YAML config file)
|
101
|
-
task_args
|
101
|
+
task_args: Task creation arguments
|
102
102
|
(as a dictionary or as a path to a JSON or YAML config file)
|
103
|
-
sandbox
|
104
|
-
|
105
|
-
sandbox_cleanup
|
106
|
-
|
107
|
-
solver
|
108
|
-
|
109
|
-
tags
|
110
|
-
trace
|
111
|
-
display
|
112
|
-
approval:
|
113
|
-
|
114
|
-
|
115
|
-
log_level
|
116
|
-
|
117
|
-
log_level_transcript
|
118
|
-
log_dir
|
119
|
-
|
120
|
-
log_format
|
121
|
-
|
122
|
-
limit
|
123
|
-
|
124
|
-
sample_id
|
125
|
-
epochs
|
126
|
-
|
127
|
-
fail_on_error
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
debug_errors
|
132
|
-
|
133
|
-
message_limit
|
134
|
-
token_limit
|
135
|
-
time_limit
|
136
|
-
max_samples
|
137
|
-
|
138
|
-
max_tasks
|
139
|
-
|
140
|
-
max_subprocesses
|
141
|
-
|
142
|
-
max_sandboxes
|
143
|
-
|
144
|
-
log_samples:
|
145
|
-
log_images:
|
146
|
-
|
147
|
-
log_buffer:
|
148
|
-
|
149
|
-
|
150
|
-
score
|
151
|
-
score_display
|
152
|
-
**kwargs
|
103
|
+
sandbox: Sandbox environment type
|
104
|
+
(or optionally a str or tuple with a shorthand spec)
|
105
|
+
sandbox_cleanup: Cleanup sandbox environments after task completes
|
106
|
+
(defaults to True)
|
107
|
+
solver: Alternative solver for task(s).
|
108
|
+
Optional (uses task solver by default).
|
109
|
+
tags: Tags to associate with this evaluation run.
|
110
|
+
trace: Trace message interactions with evaluated model to terminal.
|
111
|
+
display: Task display type (defaults to 'full').
|
112
|
+
approval: Tool use approval policies.
|
113
|
+
Either a path to an approval policy config file or a list of approval policies.
|
114
|
+
Defaults to no approval policy.
|
115
|
+
log_level: Level for logging to the console: "debug", "http", "sandbox",
|
116
|
+
"info", "warning", "error", or "critical" (defaults to "warning")
|
117
|
+
log_level_transcript: Level for logging to the log file (defaults to "info")
|
118
|
+
log_dir: Output path for logging results
|
119
|
+
(defaults to file log in ./logs directory).
|
120
|
+
log_format: Format for writing log files (defaults
|
121
|
+
to "eval", the native high-performance format).
|
122
|
+
limit: Limit evaluated samples
|
123
|
+
(defaults to all samples).
|
124
|
+
sample_id: Evaluate specific sample(s) from the dataset.
|
125
|
+
epochs: Epochs to repeat samples for and optional score
|
126
|
+
reducer function(s) used to combine sample scores (defaults to "mean")
|
127
|
+
fail_on_error: `True` to fail on first sample error
|
128
|
+
(default); `False` to never fail on sample errors; Value between 0 and 1
|
129
|
+
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
130
|
+
eval if a count of samples fails.
|
131
|
+
debug_errors: Raise task errors (rather than logging them)
|
132
|
+
so they can be debugged (defaults to False).
|
133
|
+
message_limit: Limit on total messages used for each sample.
|
134
|
+
token_limit: Limit on total tokens used for each sample.
|
135
|
+
time_limit: Limit on time (in seconds) for execution of each sample.
|
136
|
+
max_samples: Maximum number of samples to run in parallel
|
137
|
+
(default is max_connections)
|
138
|
+
max_tasks: Maximum number of tasks to run in parallel
|
139
|
+
(default is 1)
|
140
|
+
max_subprocesses: Maximum number of subprocesses to
|
141
|
+
run in parallel (default is os.cpu_count())
|
142
|
+
max_sandboxes: Maximum number of sandboxes (per-provider)
|
143
|
+
to run in parallel.
|
144
|
+
log_samples: Log detailed samples and scores (defaults to True)
|
145
|
+
log_images: Log base64 encoded version of images,
|
146
|
+
even if specified as a filename or URL (defaults to False)
|
147
|
+
log_buffer: Number of samples to buffer before writing log file.
|
148
|
+
If not specified, an appropriate default for the format and filesystem is
|
149
|
+
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
150
|
+
score: Score output (defaults to True)
|
151
|
+
score_display: Show scoring metrics in realtime (defaults to True)
|
152
|
+
**kwargs: Model generation options.
|
153
153
|
|
154
154
|
Returns:
|
155
155
|
List of EvalLog (one for each task)
|
@@ -200,6 +200,10 @@ def eval(
|
|
200
200
|
)
|
201
201
|
|
202
202
|
|
203
|
+
# single call to eval_async at a time
|
204
|
+
_eval_async_running = False
|
205
|
+
|
206
|
+
|
203
207
|
async def eval_async(
|
204
208
|
tasks: Tasks,
|
205
209
|
model: str | Model | list[str] | list[Model] | None = None,
|
@@ -355,10 +359,14 @@ async def eval_async(
|
|
355
359
|
"Trace mode cannot be used when evaluating multiple models."
|
356
360
|
)
|
357
361
|
|
358
|
-
# resolve recorder
|
362
|
+
# resolve recorder (confirm writeable)
|
359
363
|
log_dir = log_dir if log_dir else os.environ.get("INSPECT_LOG_DIR", "./logs")
|
360
364
|
log_dir = absolute_file_path(log_dir)
|
361
365
|
recorder = create_recorder_for_format(log_format or DEFAULT_LOG_FORMAT, log_dir)
|
366
|
+
if not recorder.is_writeable():
|
367
|
+
raise PrerequisiteError(
|
368
|
+
f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
|
369
|
+
)
|
362
370
|
|
363
371
|
# resolve solver
|
364
372
|
solver = chain(solver) if isinstance(solver, list) else solver
|
@@ -461,10 +469,6 @@ async def eval_async(
|
|
461
469
|
return logs
|
462
470
|
|
463
471
|
|
464
|
-
# single call to eval_async at a time
|
465
|
-
_eval_async_running = False
|
466
|
-
|
467
|
-
|
468
472
|
def eval_retry(
|
469
473
|
tasks: str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog],
|
470
474
|
log_level: str | None = None,
|
@@ -492,47 +496,46 @@ def eval_retry(
|
|
492
496
|
"""Retry a previously failed evaluation task.
|
493
497
|
|
494
498
|
Args:
|
495
|
-
tasks:
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
Maximum number of concurrent connections to Model API (default is per Model API)
|
499
|
+
tasks: Log files for task(s) to retry.
|
500
|
+
log_level: Level for logging to the console: "debug", "http", "sandbox",
|
501
|
+
"info", "warning", "error", or "critical" (defaults to "warning")
|
502
|
+
log_level_transcript: Level for logging to the log file (defaults to "info")
|
503
|
+
log_dir: Output path for logging results
|
504
|
+
(defaults to file log in ./logs directory).
|
505
|
+
log_format: Format for writing log files (defaults
|
506
|
+
to "eval", the native high-performance format).
|
507
|
+
max_samples: Maximum number of samples to run in parallel
|
508
|
+
(default is max_connections)
|
509
|
+
max_tasks: Maximum number of tasks to run in parallel
|
510
|
+
(default is 1)
|
511
|
+
max_subprocesses: Maximum number of subprocesses to
|
512
|
+
run in parallel (default is os.cpu_count())
|
513
|
+
max_sandboxes: Maximum number of sandboxes (per-provider)
|
514
|
+
to run in parallel.
|
515
|
+
sandbox_cleanup: Cleanup sandbox environments after task completes
|
516
|
+
(defaults to True)
|
517
|
+
trace: Trace message interactions with evaluated model to terminal.
|
518
|
+
display: Task display type (defaults to 'full').
|
519
|
+
fail_on_error: `True` to fail on first sample error
|
520
|
+
(default); `False` to never fail on sample errors; Value between 0 and 1
|
521
|
+
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
522
|
+
eval if a count of samples fails.
|
523
|
+
debug_errors: Raise task errors (rather than logging them)
|
524
|
+
so they can be debugged (defaults to False).
|
525
|
+
log_samples: Log detailed samples and scores (defaults to True)
|
526
|
+
log_images: Log base64 encoded version of images,
|
527
|
+
even if specified as a filename or URL (defaults to False)
|
528
|
+
log_buffer: Number of samples to buffer before writing log file.
|
529
|
+
If not specified, an appropriate default for the format and filesystem is
|
530
|
+
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
531
|
+
score: Score output (defaults to True)
|
532
|
+
score_display: Show scoring metrics in realtime (defaults to True)
|
533
|
+
max_retries:
|
534
|
+
Maximum number of times to retry request.
|
535
|
+
timeout:
|
536
|
+
Request timeout (in seconds)
|
537
|
+
max_connections:
|
538
|
+
Maximum number of concurrent connections to Model API (default is per Model API)
|
536
539
|
|
537
540
|
Returns:
|
538
541
|
List of EvalLog (one for each task)
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -43,6 +43,12 @@ from .task.task import PreviousTask, Task
|
|
43
43
|
logger = logging.getLogger(__name__)
|
44
44
|
|
45
45
|
|
46
|
+
class Log(NamedTuple):
|
47
|
+
info: EvalLogInfo
|
48
|
+
header: EvalLog
|
49
|
+
task_identifier: str
|
50
|
+
|
51
|
+
|
46
52
|
def eval_set(
|
47
53
|
tasks: Tasks,
|
48
54
|
log_dir: str,
|
@@ -87,79 +93,79 @@ def eval_set(
|
|
87
93
|
r"""Evaluate a set of tasks.
|
88
94
|
|
89
95
|
Args:
|
90
|
-
tasks:
|
96
|
+
tasks: Task(s) to evaluate. If None, attempt
|
91
97
|
to evaluate a task in the current working directory
|
92
|
-
log_dir
|
93
|
-
|
94
|
-
retry_attempts:
|
95
|
-
|
96
|
-
retry_wait
|
97
|
-
|
98
|
-
|
99
|
-
retry_connections
|
100
|
-
|
101
|
-
retry_cleanup
|
102
|
-
|
103
|
-
model
|
104
|
-
|
105
|
-
|
106
|
-
model_base_url:
|
107
|
-
|
108
|
-
model_args
|
109
|
-
|
110
|
-
task_args
|
111
|
-
|
112
|
-
sandbox
|
113
|
-
|
114
|
-
sandbox_cleanup
|
115
|
-
|
116
|
-
solver
|
117
|
-
|
118
|
-
tags
|
119
|
-
trace:
|
120
|
-
display
|
121
|
-
approval:
|
122
|
-
|
123
|
-
|
124
|
-
score
|
125
|
-
log_level
|
126
|
-
|
127
|
-
log_level_transcript
|
128
|
-
log_format
|
129
|
-
|
130
|
-
limit
|
131
|
-
|
132
|
-
sample_id
|
133
|
-
epochs
|
134
|
-
|
135
|
-
fail_on_error
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
debug_errors
|
140
|
-
|
141
|
-
message_limit
|
142
|
-
token_limit
|
143
|
-
time_limit
|
144
|
-
max_samples
|
145
|
-
|
146
|
-
max_tasks
|
147
|
-
|
148
|
-
max_subprocesses
|
149
|
-
|
150
|
-
max_sandboxes
|
151
|
-
|
152
|
-
log_samples:
|
153
|
-
log_images:
|
98
|
+
log_dir: Output path for logging results
|
99
|
+
(required to ensure that a unique storage scope is assigned for the set).
|
100
|
+
retry_attempts: Maximum number of retry attempts before giving up
|
101
|
+
(defaults to 10).
|
102
|
+
retry_wait: Time to wait between attempts, increased exponentially.
|
103
|
+
(defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
|
104
|
+
per-retry will in no case be longer than 1 hour.
|
105
|
+
retry_connections: Reduce max_connections at this rate with each retry
|
106
|
+
(defaults to 0.5)
|
107
|
+
retry_cleanup: Cleanup failed log files after retries
|
108
|
+
(defaults to True)
|
109
|
+
model: Model(s) for
|
110
|
+
evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
|
111
|
+
environment variable.
|
112
|
+
model_base_url: Base URL for communicating
|
113
|
+
with the model API.
|
114
|
+
model_args: Model creation args
|
115
|
+
(as a dictionary or as a path to a JSON or YAML config file)
|
116
|
+
task_args: Task creation arguments
|
117
|
+
(as a dictionary or as a path to a JSON or YAML config file)
|
118
|
+
sandbox: Sandbox environment type
|
119
|
+
(or optionally a str or tuple with a shorthand spec)
|
120
|
+
sandbox_cleanup: Cleanup sandbox environments after task completes
|
121
|
+
(defaults to True)
|
122
|
+
solver: Alternative solver(s) for
|
123
|
+
evaluating task(s). Optional (uses task solver by default).
|
124
|
+
tags: Tags to associate with this evaluation run.
|
125
|
+
trace: Trace message interactions with evaluated model to terminal.
|
126
|
+
display: Task display type (defaults to 'full').
|
127
|
+
approval: Tool use approval policies.
|
128
|
+
Either a path to an approval policy config file or a list of approval policies.
|
129
|
+
Defaults to no approval policy.
|
130
|
+
score: Score output (defaults to True)
|
131
|
+
log_level: Level for logging to the console: "debug", "http", "sandbox",
|
132
|
+
"info", "warning", "error", or "critical" (defaults to "warning")
|
133
|
+
log_level_transcript: Level for logging to the log file (defaults to "info")
|
134
|
+
log_format: Format for writing
|
135
|
+
log files (defaults to "eval", the native high-performance format).
|
136
|
+
limit: Limit evaluated samples
|
137
|
+
(defaults to all samples).
|
138
|
+
sample_id: Evaluate specific sample(s) from the dataset.
|
139
|
+
epochs: Epochs to repeat samples for and optional score
|
140
|
+
reducer function(s) used to combine sample scores (defaults to "mean")
|
141
|
+
fail_on_error: `True` to fail on first sample error
|
142
|
+
(default); `False` to never fail on sample errors; Value between 0 and 1
|
143
|
+
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
144
|
+
eval if a count of samples fails.
|
145
|
+
debug_errors: Raise task errors (rather than logging them)
|
146
|
+
so they can be debugged (defaults to False).
|
147
|
+
message_limit: Limit on total messages used for each sample.
|
148
|
+
token_limit: Limit on total tokens used for each sample.
|
149
|
+
time_limit: Limit on time (in seconds) for execution of each sample.
|
150
|
+
max_samples: Maximum number of samples to run in parallel
|
151
|
+
(default is max_connections)
|
152
|
+
max_tasks: Maximum number of tasks to run in parallel
|
153
|
+
(default is 1)
|
154
|
+
max_subprocesses: Maximum number of subprocesses to
|
155
|
+
run in parallel (default is os.cpu_count())
|
156
|
+
max_sandboxes: Maximum number of sandboxes (per-provider)
|
157
|
+
to run in parallel.
|
158
|
+
log_samples: Log detailed samples and scores (defaults to True)
|
159
|
+
log_images: Log base64 encoded version of images,
|
154
160
|
even if specified as a filename or URL (defaults to False)
|
155
|
-
log_buffer:
|
156
|
-
|
157
|
-
|
158
|
-
bundle_dir:
|
161
|
+
log_buffer: Number of samples to buffer before writing log file.
|
162
|
+
If not specified, an appropriate default for the format and filesystem is
|
163
|
+
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
164
|
+
bundle_dir: If specified, the log viewer and logs generated
|
159
165
|
by this eval set will be bundled into this directory.
|
160
|
-
bundle_overwrite
|
166
|
+
bundle_overwrite: Whether to overwrite files in the bundle_dir.
|
161
167
|
(defaults to False).
|
162
|
-
**kwargs
|
168
|
+
**kwargs: Model generation options.
|
163
169
|
|
164
170
|
Returns:
|
165
171
|
Tuple of bool (whether all tasks completed successfully) and list of EvalLog
|
@@ -452,12 +458,6 @@ def return_last_value(retry_state: RetryCallState) -> list[EvalLog]:
|
|
452
458
|
return []
|
453
459
|
|
454
460
|
|
455
|
-
class Log(NamedTuple):
|
456
|
-
info: EvalLogInfo
|
457
|
-
header: EvalLog
|
458
|
-
task_identifier: str
|
459
|
-
|
460
|
-
|
461
461
|
# list all eval logs
|
462
462
|
def list_all_eval_logs(log_dir: str) -> list[Log]:
|
463
463
|
log_files = list_eval_logs(log_dir)
|
inspect_ai/_eval/loader.py
CHANGED
@@ -8,7 +8,7 @@ from importlib.util import module_from_spec, spec_from_loader
|
|
8
8
|
from logging import getLogger
|
9
9
|
from pathlib import Path
|
10
10
|
from types import ModuleType
|
11
|
-
from typing import Any, Callable, cast
|
11
|
+
from typing import Any, Callable, Tuple, cast
|
12
12
|
|
13
13
|
from typing_extensions import overload
|
14
14
|
|
@@ -26,6 +26,7 @@ from inspect_ai._util.registry import (
|
|
26
26
|
registry_params,
|
27
27
|
)
|
28
28
|
from inspect_ai.model import Model, ModelName
|
29
|
+
from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
|
29
30
|
from inspect_ai.solver._bridge import bridge
|
30
31
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
31
32
|
from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
|
@@ -421,16 +422,7 @@ def as_solver_spec(solver: Solver) -> SolverSpec:
|
|
421
422
|
|
422
423
|
def solver_from_spec(spec: SolverSpec) -> Solver:
|
423
424
|
# resolve @ reference
|
424
|
-
|
425
|
-
if spec_split[1] is not None:
|
426
|
-
solver_file: Path | None = Path(spec_split[0]).resolve()
|
427
|
-
solver_name: str | None = spec_split[1]
|
428
|
-
elif Path(spec_split[0]).exists():
|
429
|
-
solver_file = Path(spec_split[0]).resolve()
|
430
|
-
solver_name = None
|
431
|
-
else:
|
432
|
-
solver_file = None
|
433
|
-
solver_name = spec_split[0]
|
425
|
+
solver_file, solver_name = parse_spec_str(spec.solver)
|
434
426
|
|
435
427
|
# switch contexts if we are loading from a file
|
436
428
|
create_cm = (
|
@@ -501,7 +493,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
501
493
|
else:
|
502
494
|
agent_fn = getattr(solver_module, solver_name, None)
|
503
495
|
if inspect.isfunction(agent_fn):
|
504
|
-
return bridge(agent_fn(**spec.args))
|
496
|
+
return bridge.bridge(agent_fn(**spec.args))
|
505
497
|
elif agent_fn is not None:
|
506
498
|
raise PrerequisiteError(
|
507
499
|
f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
|
@@ -510,3 +502,121 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
510
502
|
raise PrerequisiteError(
|
511
503
|
f"The function {solver_name} was not found in file {pretty_solver_file}."
|
512
504
|
)
|
505
|
+
|
506
|
+
|
507
|
+
def scorer_from_spec(spec: ScorerSpec, task_path: Path | None, **kwargs: Any) -> Scorer:
|
508
|
+
"""
|
509
|
+
Load a scorer
|
510
|
+
|
511
|
+
Args:
|
512
|
+
spec: The scorer spec
|
513
|
+
task_path: An optional path to the task file
|
514
|
+
**kwargs: Additional keyword arguments passed to the scorer initialization
|
515
|
+
|
516
|
+
Returns:
|
517
|
+
Scorer: the loaded scorer
|
518
|
+
|
519
|
+
Raises:
|
520
|
+
PrerequisiteError: If the scorer cannot be found, loaded, or lacks required type annotations
|
521
|
+
"""
|
522
|
+
# resolve @ reference
|
523
|
+
scorer_file, scorer_name = parse_spec_str(spec.scorer)
|
524
|
+
|
525
|
+
# switch contexts if we are loading from a file
|
526
|
+
create_cm = (
|
527
|
+
chdir_python(scorer_file.parent.as_posix())
|
528
|
+
if scorer_file is not None
|
529
|
+
else contextlib.nullcontext()
|
530
|
+
)
|
531
|
+
|
532
|
+
# pretty scorer file path for error messages
|
533
|
+
pretty_scorer_file = (
|
534
|
+
cwd_relative_path(scorer_file.as_posix()) if scorer_file else None
|
535
|
+
)
|
536
|
+
|
537
|
+
with create_cm:
|
538
|
+
# is there a scorer file being provided? if not, load from registry
|
539
|
+
if scorer_file is None:
|
540
|
+
if scorer_name is None:
|
541
|
+
raise ValueError(f"Unable to resolve scorer name from {spec.scorer}")
|
542
|
+
|
543
|
+
try:
|
544
|
+
return scorer_create(scorer_name, **kwargs)
|
545
|
+
except ValueError:
|
546
|
+
# We need a valid path to a scorer file to try to load the scorer from there
|
547
|
+
if not task_path:
|
548
|
+
raise PrerequisiteError(
|
549
|
+
f"The scorer '{scorer_name}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter"
|
550
|
+
)
|
551
|
+
|
552
|
+
task_pretty_path = task_path.as_posix()
|
553
|
+
if not task_path.exists():
|
554
|
+
raise PrerequisiteError(
|
555
|
+
f"The scorer `{scorer_name}` couldn't be loaded. The file '{task_pretty_path}' was not found. Please provide a path to the file containing the scorer using the '--scorer' parameter"
|
556
|
+
)
|
557
|
+
|
558
|
+
# We have the path to a file, so load that and try again
|
559
|
+
try:
|
560
|
+
load_module(task_path)
|
561
|
+
scorer_fn = scorer_create(scorer_name, **kwargs)
|
562
|
+
|
563
|
+
# See if the scorer doesn't have type annotations. Currently the registry will not load
|
564
|
+
# the function without type annotations.
|
565
|
+
# TODO: We could consider calling this ourselves if we're certain it is what we're looking for
|
566
|
+
signature = inspect.signature(scorer_fn)
|
567
|
+
if signature.return_annotation is inspect.Signature.empty:
|
568
|
+
raise PrerequisiteError(
|
569
|
+
f"The scorer '{scorer_name}' in the file '{task_pretty_path}' requires return type annotations. Please add type annotations to load the scorer."
|
570
|
+
)
|
571
|
+
return scorer_fn
|
572
|
+
except ValueError:
|
573
|
+
# we still couldn't load this, request the user provide a path
|
574
|
+
raise PrerequisiteError(
|
575
|
+
f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
|
576
|
+
)
|
577
|
+
except ModuleNotFoundError:
|
578
|
+
# we still couldn't load this, request the user provide a path
|
579
|
+
raise PrerequisiteError(
|
580
|
+
f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
|
581
|
+
)
|
582
|
+
|
583
|
+
# scorer is a path, so load it that way
|
584
|
+
else:
|
585
|
+
load_module(scorer_file)
|
586
|
+
decorators = parse_decorators(scorer_file, "scorer")
|
587
|
+
|
588
|
+
# if there is no scorer_name see if we can discover it
|
589
|
+
if scorer_name is None:
|
590
|
+
if len(decorators) == 1:
|
591
|
+
# decorator based scorer
|
592
|
+
scorer_name = decorators[0][0]
|
593
|
+
elif len(decorators) == 0:
|
594
|
+
raise PrerequisiteError(
|
595
|
+
f"The source file {pretty_scorer_file} does not contain any @scorer functions."
|
596
|
+
)
|
597
|
+
else:
|
598
|
+
raise PrerequisiteError(
|
599
|
+
f"The source file {pretty_scorer_file} has more than one @solver function (qualify which solver using e.g. '{scorer_file.name}y@solver_fn')"
|
600
|
+
)
|
601
|
+
|
602
|
+
# create decorator based scorers using the registry
|
603
|
+
if any(solver[0] == scorer_name for solver in decorators):
|
604
|
+
return scorer_create(scorer_name, **kwargs)
|
605
|
+
else:
|
606
|
+
raise PrerequisiteError(
|
607
|
+
f"The function {scorer_name} was not found in file {pretty_scorer_file}."
|
608
|
+
)
|
609
|
+
|
610
|
+
|
611
|
+
def parse_spec_str(spec_str: str) -> Tuple[Path | None, str | None]:
|
612
|
+
spec_split = split_spec(spec_str)
|
613
|
+
if spec_split[1] is not None:
|
614
|
+
file: Path | None = Path(spec_split[0]).resolve()
|
615
|
+
name: str | None = spec_split[1]
|
616
|
+
elif Path(spec_split[0]).exists():
|
617
|
+
file = Path(spec_split[0]).resolve()
|
618
|
+
name = None
|
619
|
+
else:
|
620
|
+
file = None
|
621
|
+
name = spec_split[0]
|
622
|
+
return file, name
|
inspect_ai/_eval/registry.py
CHANGED
@@ -148,7 +148,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
148
148
|
# module import, so set its task file and run dir
|
149
149
|
if get_installed_package_name(task_type) is None:
|
150
150
|
module = inspect.getmodule(task_type)
|
151
|
-
if module and hasattr(module, "__file__"):
|
151
|
+
if module and hasattr(module, "__file__") and module.__file__:
|
152
152
|
file = Path(getattr(module, "__file__"))
|
153
153
|
setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
|
154
154
|
setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())
|
inspect_ai/_eval/run.py
CHANGED
@@ -20,8 +20,10 @@ from inspect_ai.log import EvalConfig, EvalLog
|
|
20
20
|
from inspect_ai.log._recorders import Recorder
|
21
21
|
from inspect_ai.model import GenerateConfigArgs
|
22
22
|
from inspect_ai.model._model import ModelName
|
23
|
+
from inspect_ai.scorer._metric import to_metric_specs
|
23
24
|
from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names
|
24
25
|
from inspect_ai.scorer._reducer.registry import validate_reducer
|
26
|
+
from inspect_ai.scorer._scorer import as_scorer_spec
|
25
27
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
26
28
|
from inspect_ai.util._sandbox.environment import (
|
27
29
|
SandboxEnvironmentConfigType,
|
@@ -100,6 +102,16 @@ async def eval_run(
|
|
100
102
|
eval_solver = None
|
101
103
|
eval_solver_spec = None
|
102
104
|
|
105
|
+
# resolve the task scorers
|
106
|
+
eval_scorer_specs = (
|
107
|
+
[as_scorer_spec(scorer) for scorer in task.scorer]
|
108
|
+
if task.scorer is not None
|
109
|
+
else None
|
110
|
+
)
|
111
|
+
|
112
|
+
# resolve task metrics
|
113
|
+
eval_metrics = to_metric_specs(task.metrics) if task.metrics is not None else None
|
114
|
+
|
103
115
|
try:
|
104
116
|
# create run tasks
|
105
117
|
task_run_options: list[TaskRunOptions] = []
|
@@ -168,6 +180,8 @@ async def eval_run(
|
|
168
180
|
tags=tags,
|
169
181
|
model=resolved_task.model,
|
170
182
|
dataset=task.dataset,
|
183
|
+
scorer=eval_scorer_specs,
|
184
|
+
metrics=eval_metrics,
|
171
185
|
sandbox=resolved_task.sandbox,
|
172
186
|
task_attribs=task.attribs,
|
173
187
|
task_args=resolved_task.task_args,
|