inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/cache.py
CHANGED
@@ -42,7 +42,10 @@ def _print_table(title: str, paths: list[tuple[str, int]]) -> None:
|
|
42
42
|
|
43
43
|
@click.group("cache")
|
44
44
|
def cache_command() -> None:
|
45
|
-
"""Manage the inspect cache.
|
45
|
+
"""Manage the inspect model output cache.
|
46
|
+
|
47
|
+
Learn more about model output caching at https://inspect.ai-safety-institute.org.uk/caching.html.
|
48
|
+
"""
|
46
49
|
return None
|
47
50
|
|
48
51
|
|
@@ -62,11 +65,9 @@ def cache_command() -> None:
|
|
62
65
|
type=str,
|
63
66
|
help="Clear the cache for a specific model (e.g. --model=openai/gpt-4). Can be passed multiple times.",
|
64
67
|
)
|
65
|
-
def clear(
|
66
|
-
all: bool, model: tuple[str, ...], log_level: str, log_level_transcript: str
|
67
|
-
) -> None:
|
68
|
+
def clear(all: bool, model: tuple[str, ...], log_level: str) -> None:
|
68
69
|
"""Clear all cache files. Requires either --all or --model flags."""
|
69
|
-
init_logger(log_level
|
70
|
+
init_logger(log_level)
|
70
71
|
|
71
72
|
if model:
|
72
73
|
_print_table(
|
@@ -119,14 +120,14 @@ def list_caches(pruneable: bool) -> None:
|
|
119
120
|
type=str,
|
120
121
|
help="Only prune a specific model (e.g. --model=openai/gpt-4). Can be passed multiple times.",
|
121
122
|
)
|
122
|
-
def prune(log_level: str,
|
123
|
+
def prune(log_level: str, model: tuple[str, ...]) -> None:
|
123
124
|
"""Prune all expired cache entries
|
124
125
|
|
125
126
|
Over time the cache directory can grow, but many cache entries will be
|
126
127
|
expired. This command will remove all expired cache entries for ease of
|
127
128
|
maintenance.
|
128
129
|
"""
|
129
|
-
init_logger(log_level
|
130
|
+
init_logger(log_level)
|
130
131
|
|
131
132
|
expired_cache_entries = cache_list_expired(list(model))
|
132
133
|
|
inspect_ai/_cli/common.py
CHANGED
@@ -9,14 +9,12 @@ from inspect_ai._util.constants import (
|
|
9
9
|
ALL_LOG_LEVELS,
|
10
10
|
DEFAULT_DISPLAY,
|
11
11
|
DEFAULT_LOG_LEVEL,
|
12
|
-
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
13
12
|
)
|
14
13
|
from inspect_ai.util._display import init_display_type
|
15
14
|
|
16
15
|
|
17
16
|
class CommonOptions(TypedDict):
|
18
17
|
log_level: str
|
19
|
-
log_level_transcript: str
|
20
18
|
log_dir: str
|
21
19
|
display: Literal["full", "conversation", "rich", "plain", "none"]
|
22
20
|
no_ansi: bool | None
|
@@ -36,16 +34,6 @@ def log_level_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
36
34
|
envvar="INSPECT_LOG_LEVEL",
|
37
35
|
help=f"Set the log level (defaults to '{DEFAULT_LOG_LEVEL}')",
|
38
36
|
)
|
39
|
-
@click.option(
|
40
|
-
"--log-level-transcript",
|
41
|
-
type=click.Choice(
|
42
|
-
[level.lower() for level in ALL_LOG_LEVELS],
|
43
|
-
case_sensitive=False,
|
44
|
-
),
|
45
|
-
default=DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
46
|
-
envvar="INSPECT_LOG_LEVEL_TRANSCRIPT",
|
47
|
-
help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')",
|
48
|
-
)
|
49
37
|
@functools.wraps(func)
|
50
38
|
def wrapper(*args: Any, **kwargs: Any) -> click.Context:
|
51
39
|
return cast(click.Context, func(*args, **kwargs))
|
inspect_ai/_cli/eval.py
CHANGED
@@ -7,7 +7,9 @@ from typing_extensions import Unpack
|
|
7
7
|
from inspect_ai import Epochs, eval, eval_retry
|
8
8
|
from inspect_ai._eval.evalset import eval_set
|
9
9
|
from inspect_ai._util.constants import (
|
10
|
+
ALL_LOG_LEVELS,
|
10
11
|
DEFAULT_EPOCHS,
|
12
|
+
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
11
13
|
DEFAULT_MAX_CONNECTIONS,
|
12
14
|
DEFAULT_MAX_RETRIES,
|
13
15
|
)
|
@@ -399,6 +401,16 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
399
401
|
envvar=["INSPECT_LOG_FORMAT", "INSPECT_EVAL_LOG_FORMAT"],
|
400
402
|
help="Format for writing log files.",
|
401
403
|
)
|
404
|
+
@click.option(
|
405
|
+
"--log-level-transcript",
|
406
|
+
type=click.Choice(
|
407
|
+
[level.lower() for level in ALL_LOG_LEVELS],
|
408
|
+
case_sensitive=False,
|
409
|
+
),
|
410
|
+
default=DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
411
|
+
envvar="INSPECT_LOG_LEVEL_TRANSCRIPT",
|
412
|
+
help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')",
|
413
|
+
)
|
402
414
|
@common_options
|
403
415
|
@functools.wraps(func)
|
404
416
|
def wrapper(*args: Any, **kwargs: Any) -> click.Context:
|
@@ -468,6 +480,7 @@ def eval_command(
|
|
468
480
|
no_score: bool | None,
|
469
481
|
no_score_display: bool | None,
|
470
482
|
log_format: Literal["eval", "json"] | None,
|
483
|
+
log_level_transcript: str,
|
471
484
|
**common: Unpack[CommonOptions],
|
472
485
|
) -> None:
|
473
486
|
"""Evaluate tasks."""
|
@@ -482,7 +495,7 @@ def eval_command(
|
|
482
495
|
tasks=tasks,
|
483
496
|
solver=solver,
|
484
497
|
log_level=common["log_level"],
|
485
|
-
log_level_transcript=
|
498
|
+
log_level_transcript=log_level_transcript,
|
486
499
|
log_dir=common["log_dir"],
|
487
500
|
log_format=log_format,
|
488
501
|
model=model,
|
@@ -630,9 +643,13 @@ def eval_set_command(
|
|
630
643
|
bundle_dir: str | None,
|
631
644
|
bundle_overwrite: bool | None,
|
632
645
|
log_format: Literal["eval", "json"] | None,
|
646
|
+
log_level_transcript: str,
|
633
647
|
**common: Unpack[CommonOptions],
|
634
648
|
) -> int:
|
635
|
-
"""Evaluate a set of tasks.
|
649
|
+
"""Evaluate a set of tasks with retries.
|
650
|
+
|
651
|
+
Learn more about eval sets at https://inspect.ai-safety-institute.org.uk/eval-sets.html.
|
652
|
+
"""
|
636
653
|
# read config
|
637
654
|
config = config_from_locals(dict(locals()))
|
638
655
|
|
@@ -644,7 +661,7 @@ def eval_set_command(
|
|
644
661
|
tasks=tasks,
|
645
662
|
solver=solver,
|
646
663
|
log_level=common["log_level"],
|
647
|
-
log_level_transcript=
|
664
|
+
log_level_transcript=log_level_transcript,
|
648
665
|
log_dir=common["log_dir"],
|
649
666
|
log_format=log_format,
|
650
667
|
model=model,
|
@@ -967,6 +984,16 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
|
|
967
984
|
"--max-retries", type=int, help=MAX_RETRIES_HELP, envvar="INSPECT_EVAL_MAX_RETRIES"
|
968
985
|
)
|
969
986
|
@click.option("--timeout", type=int, help=TIMEOUT_HELP, envvar="INSPECT_EVAL_TIMEOUT")
|
987
|
+
@click.option(
|
988
|
+
"--log-level-transcript",
|
989
|
+
type=click.Choice(
|
990
|
+
[level.lower() for level in ALL_LOG_LEVELS],
|
991
|
+
case_sensitive=False,
|
992
|
+
),
|
993
|
+
default=DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
994
|
+
envvar="INSPECT_LOG_LEVEL_TRANSCRIPT",
|
995
|
+
help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')",
|
996
|
+
)
|
970
997
|
@common_options
|
971
998
|
def eval_retry_command(
|
972
999
|
log_files: tuple[str],
|
@@ -986,6 +1013,7 @@ def eval_retry_command(
|
|
986
1013
|
max_connections: int | None,
|
987
1014
|
max_retries: int | None,
|
988
1015
|
timeout: int | None,
|
1016
|
+
log_level_transcript: str,
|
989
1017
|
**common: Unpack[CommonOptions],
|
990
1018
|
) -> None:
|
991
1019
|
"""Retry failed evaluation(s)"""
|
@@ -1014,7 +1042,7 @@ def eval_retry_command(
|
|
1014
1042
|
eval_retry(
|
1015
1043
|
retry_log_files,
|
1016
1044
|
log_level=common["log_level"],
|
1017
|
-
log_level_transcript=
|
1045
|
+
log_level_transcript=log_level_transcript,
|
1018
1046
|
log_dir=common["log_dir"],
|
1019
1047
|
max_samples=max_samples,
|
1020
1048
|
max_tasks=max_tasks,
|
inspect_ai/_cli/info.py
CHANGED
inspect_ai/_cli/list.py
CHANGED
inspect_ai/_cli/log.py
CHANGED
@@ -29,6 +29,8 @@ def log_command() -> None:
|
|
29
29
|
The default format is 'eval'. You can change this by setting the INSPECT_LOG_FORMAT environment variable or using the --log-format command line option.
|
30
30
|
|
31
31
|
The 'log' commands enable you to read Inspect logs uniformly as JSON no matter their physical storage format, and also enable you to read only the headers (everything but the samples) from log files, which is useful for very large logs.
|
32
|
+
|
33
|
+
Learn more about managing log files at https://inspect.ai-safety-institute.org.uk/eval-logs.html.
|
32
34
|
"""
|
33
35
|
return None
|
34
36
|
|
inspect_ai/_cli/main.py
CHANGED
inspect_ai/_cli/sandbox.py
CHANGED
@@ -7,7 +7,10 @@ from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
|
|
7
7
|
|
8
8
|
@click.group("sandbox")
|
9
9
|
def sandbox_command() -> None:
|
10
|
-
"""Manage Sandbox Environments.
|
10
|
+
"""Manage Sandbox Environments.
|
11
|
+
|
12
|
+
Learn more about sandboxing at https://inspect.ai-safety-institute.org.uk/sandboxing.html.
|
13
|
+
"""
|
11
14
|
return None
|
12
15
|
|
13
16
|
|
inspect_ai/_cli/score.py
CHANGED
@@ -2,33 +2,61 @@ import asyncio
|
|
2
2
|
import os
|
3
3
|
|
4
4
|
import click
|
5
|
+
import rich
|
6
|
+
from rich.panel import Panel
|
7
|
+
from rich.prompt import Prompt
|
8
|
+
from rich.table import Table
|
5
9
|
from typing_extensions import Unpack
|
6
10
|
|
11
|
+
from inspect_ai._cli.util import parse_cli_config
|
7
12
|
from inspect_ai._display import display
|
13
|
+
from inspect_ai._display.core.rich import rich_theme
|
8
14
|
from inspect_ai._eval.context import init_eval_context, init_task_context
|
9
|
-
from inspect_ai._eval.
|
10
|
-
from inspect_ai.
|
11
|
-
from inspect_ai.
|
15
|
+
from inspect_ai._eval.score import ScoreAction, task_score
|
16
|
+
from inspect_ai._util.file import basename, dirname, exists
|
17
|
+
from inspect_ai.log._log import EvalLog
|
12
18
|
from inspect_ai.log._recorders import create_recorder_for_location
|
13
19
|
from inspect_ai.model import get_model
|
14
20
|
|
15
21
|
from .common import CommonOptions, common_options, process_common_options
|
16
22
|
|
23
|
+
SCORES_PER_ROW = 4
|
24
|
+
|
17
25
|
|
18
26
|
@click.command("score")
|
19
|
-
@click.argument("task", type=str)
|
20
27
|
@click.argument("log-file", type=str, required=True)
|
21
28
|
@click.option(
|
22
|
-
"--
|
29
|
+
"--scorer",
|
30
|
+
type=str,
|
31
|
+
envvar="INSPECT_SCORE_SCORER",
|
32
|
+
help="Scorer to use for scoring",
|
33
|
+
)
|
34
|
+
@click.option(
|
35
|
+
"-S",
|
36
|
+
multiple=True,
|
37
|
+
type=str,
|
38
|
+
envvar="INSPECT_SCORE_SCORER_ARGS",
|
39
|
+
help="One or more scorer arguments (e.g. -S arg=value)",
|
40
|
+
)
|
41
|
+
@click.option(
|
42
|
+
"--action",
|
43
|
+
type=click.Choice(["append", "overwrite"]),
|
44
|
+
envvar="INSPECT_SCORE_SCORER_ACTION",
|
45
|
+
help="Whether to append or overwrite the existing scores.",
|
46
|
+
)
|
47
|
+
@click.option(
|
48
|
+
"--overwrite",
|
23
49
|
type=bool,
|
24
50
|
is_flag=True,
|
25
|
-
help="
|
51
|
+
help="Overwrite log file with the scored version",
|
26
52
|
)
|
27
53
|
@common_options
|
28
54
|
def score_command(
|
29
|
-
task: str,
|
30
55
|
log_file: str,
|
31
|
-
|
56
|
+
overwrite: bool | None,
|
57
|
+
scorer: str | None,
|
58
|
+
s: tuple[str] | None,
|
59
|
+
action: ScoreAction | None,
|
32
60
|
**common: Unpack[CommonOptions],
|
33
61
|
) -> None:
|
34
62
|
"""Score a previous evaluation run."""
|
@@ -38,31 +66,43 @@ def score_command(
|
|
38
66
|
# score
|
39
67
|
asyncio.run(
|
40
68
|
score(
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
69
|
+
log_dir=common["log_dir"],
|
70
|
+
log_file=log_file,
|
71
|
+
scorer=scorer,
|
72
|
+
s=s,
|
73
|
+
overwrite=False if overwrite is None else overwrite,
|
74
|
+
action=action,
|
75
|
+
log_level=common["log_level"],
|
47
76
|
)
|
48
77
|
)
|
49
78
|
|
50
79
|
|
51
80
|
async def score(
|
52
|
-
task: str,
|
53
81
|
log_dir: str,
|
54
82
|
log_file: str,
|
83
|
+
scorer: str | None,
|
84
|
+
s: tuple[str] | None,
|
55
85
|
overwrite: bool,
|
86
|
+
action: ScoreAction | None,
|
56
87
|
log_level: str | None,
|
57
|
-
|
88
|
+
output_file: str | None = None,
|
58
89
|
) -> None:
|
59
90
|
# init eval context
|
60
|
-
init_eval_context(log_level,
|
91
|
+
init_eval_context(log_level, None)
|
92
|
+
scorer_args = parse_cli_config(args=s, config=None)
|
61
93
|
|
62
94
|
# read the eval log
|
63
95
|
recorder = create_recorder_for_location(log_file, log_dir)
|
64
96
|
eval_log = await recorder.read_log(log_file)
|
65
97
|
|
98
|
+
# resolve the target output file (prompts user)
|
99
|
+
output_file = resolve_output_file(
|
100
|
+
log_file, output_file=output_file, overwrite=overwrite
|
101
|
+
)
|
102
|
+
|
103
|
+
# resolve action
|
104
|
+
action = resolve_action(eval_log, action)
|
105
|
+
|
66
106
|
# check that there are samples therein
|
67
107
|
if eval_log.samples is None or len(eval_log.samples) == 0:
|
68
108
|
raise ValueError(f"{log_file} does not include samples to score")
|
@@ -77,23 +117,132 @@ async def score(
|
|
77
117
|
# initialize active model
|
78
118
|
init_task_context(model)
|
79
119
|
|
80
|
-
# instantiate the task so we can get its scorer and metrics
|
81
|
-
score_task = load_tasks([task], model)[0]
|
82
|
-
|
83
120
|
# re-score the task
|
84
|
-
eval_log = await task_score(
|
121
|
+
eval_log = await task_score(
|
122
|
+
log=eval_log, scorer=scorer, scorer_args=scorer_args, action=action
|
123
|
+
)
|
85
124
|
|
86
|
-
# re-write the log
|
87
|
-
|
88
|
-
scored = f"{SCORED_SUFFIX}{ext}"
|
89
|
-
if not overwrite and not log_file.endswith(scored):
|
90
|
-
log_file = log_file.removesuffix(ext) + scored
|
91
|
-
await recorder.write_log(log_file, eval_log)
|
125
|
+
# re-write the log
|
126
|
+
await recorder.write_log(output_file, eval_log)
|
92
127
|
|
93
128
|
# print results
|
94
|
-
|
129
|
+
print_results(output_file, eval_log)
|
130
|
+
|
131
|
+
|
132
|
+
def print_results(output_file: str, eval_log: EvalLog) -> None:
    """Print a panel summarizing the scores in ``eval_log`` and the log location.

    Scores are laid out ``SCORES_PER_ROW`` to a row. Each score is shown as its
    name in bold followed by one line per metric, with the metric value
    formatted to three decimal places.

    Args:
        output_file: Path of the scored log, shown in the panel footer.
        eval_log: Scored log whose ``results.scores`` are rendered.
    """
    theme = rich_theme()

    # outer single-column grid, opened with a blank spacer row
    grid = Table.grid(expand=True)
    grid.add_column()
    grid.add_row("")

    if eval_log.results:
        scores = eval_log.results.scores
        for start in range(0, len(scores), SCORES_PER_ROW):
            # one borderless two-column table per score in this chunk
            cells: list[Table | str] = []
            for score in scores[start : start + SCORES_PER_ROW]:
                score_table = Table(
                    show_header=False, show_lines=False, box=None, show_edge=False
                )
                score_table.add_column()
                score_table.add_column()
                score_table.add_row(f"[bold]{score.name}[/bold]")
                for name, metric in score.metrics.items():
                    score_table.add_row(f"{name}", f"{metric.value:.3f}")
                cells.append(score_table)

            # pad a short final chunk so every row has SCORES_PER_ROW cells
            cells.extend([""] * (SCORES_PER_ROW - len(cells)))

            # lay the chunk out side by side and add it to the outer grid
            score_row = Table.grid(expand=False, padding=(0, 2, 0, 0))
            for _ in range(SCORES_PER_ROW):
                score_row.add_column()
            score_row.add_row(*cells)
            grid.add_row(score_row)

    # NOTE(review): footer rows are placed outside the `if` above; the nesting
    # was reconstructed from text without indentation -- confirm.
    grid.add_row("")
    grid.add_row(f" Log: [{theme.link}]{output_file}[/{theme.link}]")

    title = f"[bold][{theme.meta}]Results for {eval_log.eval.task}[/bold][/{theme.meta}]"
    panel = Panel(title=title, title_align="left", renderable=grid)

    # blank line through the active display, then the panel on the rich console
    display().print("")
    rich.get_console().print(panel)
|
193
|
+
|
194
|
+
|
195
|
+
def resolve_output_file(log_file: str, output_file: str | None, overwrite: bool) -> str:
|
196
|
+
# resolve the output file (we may overwrite, use the passed file name, or suggest a new name)
|
197
|
+
if output_file is None:
|
198
|
+
if overwrite:
|
199
|
+
# explicitly asked to overwrite
|
200
|
+
return log_file
|
201
|
+
else:
|
202
|
+
if exists(log_file):
|
203
|
+
# Ask if we should overwrite
|
204
|
+
file_action = Prompt.ask(
|
205
|
+
"Overwrite existing log file or create new log file?",
|
206
|
+
choices=["overwrite", "create", "o", "c"],
|
207
|
+
default="create",
|
208
|
+
)
|
209
|
+
if file_action in ["overwrite", "o"]:
|
210
|
+
return log_file
|
211
|
+
else:
|
212
|
+
file_name = basename(log_file)
|
213
|
+
base_dir = dirname(log_file)
|
214
|
+
_, ext = os.path.splitext(file_name)
|
215
|
+
|
216
|
+
count = 0
|
217
|
+
|
218
|
+
def filename() -> str:
|
219
|
+
if count > 0:
|
220
|
+
return f"{file_name.removesuffix(ext)}-scored-{count}{ext}"
|
221
|
+
else:
|
222
|
+
return f"{file_name.removesuffix(ext)}-scored{ext}"
|
223
|
+
|
224
|
+
while exists(f"{os.path.join(base_dir, filename())}"):
|
225
|
+
count = count + 1
|
226
|
+
|
227
|
+
suggested_file = filename()
|
228
|
+
user_file = Prompt.ask("Output file name?", default=suggested_file)
|
229
|
+
return os.path.join(base_dir, user_file)
|
230
|
+
else:
|
231
|
+
return log_file
|
232
|
+
else:
|
233
|
+
return output_file
|
234
|
+
|
235
|
+
|
236
|
+
def resolve_action(eval_log: EvalLog, action: ScoreAction | None) -> ScoreAction:
    """Decide whether new scores replace or extend the log's existing scores.

    Args:
        eval_log: Log about to be scored; only ``results.scores`` is inspected.
        action: Action already chosen by the caller, or ``None`` to decide here.

    Returns:
        ``action`` unchanged when one was given. Otherwise ``"overwrite"`` when
        the log has no existing scores, or the user's answer to an interactive
        prompt (default ``"append"``) when it does.
    """
    # an explicit request always wins
    if action is not None:
        return action

    # no existing scores to preserve
    if eval_log.results is None or len(eval_log.results.scores) == 0:
        return "overwrite"

    # existing scores are present: let the user decide what happens to them
    user_action = Prompt.ask(
        "Overwrite existing scores or append as additional scores?",
        choices=["overwrite", "append", "o", "a"],
        default="append",
    )
    # fix: this was compared against the misspelling "ovewrite", so answering
    # with the full word "overwrite" fell through to "append"
    return "overwrite" if user_action in ("overwrite", "o") else "append"
|
inspect_ai/_cli/trace.py
CHANGED
@@ -26,6 +26,8 @@ def trace_command() -> None:
|
|
26
26
|
"""List and read execution traces.
|
27
27
|
|
28
28
|
Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
|
29
|
+
|
30
|
+
Learn more about execution traces at https://inspect.ai-safety-institute.org.uk/tracing.html.
|
29
31
|
"""
|
30
32
|
return None
|
31
33
|
|
@@ -109,11 +111,13 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
|
|
109
111
|
canceled_actions: dict[str, ActionTraceRecord] = {}
|
110
112
|
error_actions: dict[str, ActionTraceRecord] = {}
|
111
113
|
timeout_actions: dict[str, ActionTraceRecord] = {}
|
114
|
+
start_trace: ActionTraceRecord | None = None
|
112
115
|
|
113
116
|
def action_started(trace: ActionTraceRecord) -> None:
|
114
117
|
running_actions[trace.trace_id] = trace
|
115
118
|
|
116
119
|
def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
|
120
|
+
nonlocal start_trace
|
117
121
|
start_trace = running_actions.get(trace.trace_id)
|
118
122
|
if start_trace:
|
119
123
|
del running_actions[trace.trace_id]
|
@@ -122,14 +126,20 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
|
|
122
126
|
raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
|
123
127
|
|
124
128
|
def action_failed(trace: ActionTraceRecord) -> None:
|
129
|
+
nonlocal start_trace
|
125
130
|
if all:
|
131
|
+
assert start_trace
|
126
132
|
error_actions[start_trace.trace_id] = trace
|
127
133
|
|
128
134
|
def action_canceled(trace: ActionTraceRecord) -> None:
|
135
|
+
nonlocal start_trace
|
136
|
+
assert start_trace
|
129
137
|
canceled_actions[start_trace.trace_id] = trace
|
130
138
|
|
131
139
|
def action_timeout(trace: ActionTraceRecord) -> None:
|
140
|
+
nonlocal start_trace
|
132
141
|
if all:
|
142
|
+
assert start_trace
|
133
143
|
timeout_actions[start_trace.trace_id] = trace
|
134
144
|
|
135
145
|
for trace in traces:
|
inspect_ai/_cli/view.py
CHANGED
@@ -39,7 +39,10 @@ def start_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
39
39
|
@common_options
|
40
40
|
@click.pass_context
|
41
41
|
def view_command(ctx: click.Context, **kwargs: Unpack[CommonOptions]) -> None:
|
42
|
-
"""
|
42
|
+
"""Inspect log viewer.
|
43
|
+
|
44
|
+
Learn more about using the log viewer at https://inspect.ai-safety-institute.org.uk/log-viewer.html.
|
45
|
+
"""
|
43
46
|
if ctx.invoked_subcommand is None:
|
44
47
|
ctx.invoke(start, **kwargs)
|
45
48
|
else:
|
@@ -78,7 +81,6 @@ def start(
|
|
78
81
|
port=port,
|
79
82
|
authorization=authorization,
|
80
83
|
log_level=common["log_level"],
|
81
|
-
log_level_transcript=common["log_level_transcript"],
|
82
84
|
)
|
83
85
|
|
84
86
|
|
@@ -10,6 +10,8 @@ from ..rich.display import RichDisplay
|
|
10
10
|
from ..textual.display import TextualDisplay
|
11
11
|
from .display import Display, TaskScreen
|
12
12
|
|
13
|
+
_active_display: Display | None = None
|
14
|
+
|
13
15
|
|
14
16
|
def display() -> Display:
|
15
17
|
global _active_display
|
@@ -28,9 +30,6 @@ def display() -> Display:
|
|
28
30
|
return _active_display
|
29
31
|
|
30
32
|
|
31
|
-
_active_display: Display | None = None
|
32
|
-
|
33
|
-
|
34
33
|
def task_screen() -> TaskScreen:
|
35
34
|
screen = _active_task_screen.get(None)
|
36
35
|
if screen is None:
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from inspect_ai._util.registry import is_registry_dict
|
2
|
+
from inspect_ai.log._log import eval_config_defaults
|
2
3
|
|
3
4
|
from .display import TaskProfile
|
4
5
|
|
@@ -13,7 +14,12 @@ def task_config(
|
|
13
14
|
value = task_args[key]
|
14
15
|
if is_registry_dict(value):
|
15
16
|
task_args[key] = value["name"]
|
16
|
-
|
17
|
+
# get eval_config overrides
|
18
|
+
eval_config = dict(profile.eval_config.model_dump(exclude_none=True))
|
19
|
+
for name, default_value in eval_config_defaults().items():
|
20
|
+
if eval_config.get(name, None) == default_value:
|
21
|
+
del eval_config[name]
|
22
|
+
config = eval_config | task_args
|
17
23
|
if generate_config:
|
18
24
|
config = dict(profile.generate_config.model_dump(exclude_none=True)) | config
|
19
25
|
if profile.tags:
|
@@ -347,7 +347,7 @@ class SampleLimits(Widget):
|
|
347
347
|
class SandboxesView(Vertical):
|
348
348
|
DEFAULT_CSS = """
|
349
349
|
SandboxesView {
|
350
|
-
padding: 1 0
|
350
|
+
padding: 1 0 0 0;
|
351
351
|
background: transparent;
|
352
352
|
height: auto;
|
353
353
|
}
|
@@ -358,6 +358,7 @@ class SandboxesView(Vertical):
|
|
358
358
|
background: transparent;
|
359
359
|
}
|
360
360
|
.clipboard-message {
|
361
|
+
height: auto;
|
361
362
|
margin-top: 1;
|
362
363
|
}
|
363
364
|
"""
|
@@ -372,7 +373,6 @@ class SandboxesView(Vertical):
|
|
372
373
|
async def sync_sample(self, sample: ActiveSample) -> None:
|
373
374
|
if len(sample.sandboxes) > 0:
|
374
375
|
multiple_sandboxes = len(sample.sandboxes) > 1
|
375
|
-
self.display = True
|
376
376
|
sandboxes_caption = cast(Static, self.query_one("#sandboxes-caption"))
|
377
377
|
sandboxes_caption.update(
|
378
378
|
f"[bold]sandbox container{'s' if multiple_sandboxes else ''}:[/bold]"
|
@@ -395,6 +395,7 @@ class SandboxesView(Vertical):
|
|
395
395
|
markup=True,
|
396
396
|
)
|
397
397
|
)
|
398
|
+
self.display = True
|
398
399
|
else:
|
399
400
|
self.display = False
|
400
401
|
|
@@ -473,7 +474,7 @@ class SampleToolbar(Horizontal):
|
|
473
474
|
else None
|
474
475
|
)
|
475
476
|
if isinstance(last_event, ToolEvent):
|
476
|
-
last_event.
|
477
|
+
last_event._cancel()
|
477
478
|
elif event.button.id == self.CANCEL_SCORE_OUTPUT:
|
478
479
|
self.sample.interrupt("score")
|
479
480
|
elif event.button.id == self.CANCEL_RAISE_ERROR:
|