inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/trace.py +8 -0
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/eval.py +4 -4
- inspect_ai/_eval/evalset.py +6 -6
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/run.py +13 -8
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +92 -29
- inspect_ai/_view/www/dist/assets/index.css +16636 -14674
- inspect_ai/_view/www/dist/assets/index.js +43585 -36122
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +36 -19
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +355 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +199 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +99 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +200 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -7
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +309 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +326 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +175 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +46 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +143 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +131 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +145 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +86 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +53 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +107 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +363 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +173 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +182 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +108 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +91 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +38 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +190 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +274 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/{StateEventView.mjs → StateEventView.tsx} +148 -110
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +7 -4
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +22 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +95 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +160 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +113 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +67 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +156 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +222 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +41 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +61 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +80 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/dataset/_dataset.py +36 -0
- inspect_ai/dataset/_sources/csv.py +8 -0
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +11 -1
- inspect_ai/dataset/_sources/json.py +8 -0
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_message.py +1 -1
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_chat_message.py +27 -0
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +74 -0
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -0
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +12 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +34 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/solver/_basic_agent.py +19 -9
- inspect_ai/solver/_multiple_choice.py +24 -9
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +2 -5
- inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +4 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +8 -2
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_sandbox/docker/docker.py +32 -85
- inspect_ai/util/_sandbox/self_check.py +124 -16
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/METADATA +2 -1
- inspect_ai-0.3.63.dist-info/RECORD +618 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -418
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.61.dist-info/RECORD +0 -476
- /inspect_ai/{tool/beta/_computer/_resources/tool/__init__.py → _view/www/src/components/MorePopOver.css} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _view/www/src/workspace/tabs/InfoTab.module.css} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_common.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
385
385
|
help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
|
386
386
|
envvar="INSPECT_EVAL_REASONING_EFFORT",
|
387
387
|
)
|
388
|
+
@click.option(
|
389
|
+
"--reasoning-history/--no-reasoning-history",
|
390
|
+
type=bool,
|
391
|
+
is_flag=True,
|
392
|
+
default=True,
|
393
|
+
help="Include reasoning in chat message history sent to generate.",
|
394
|
+
envvar="INSPECT_EVAL_REASONING_HISTORY",
|
395
|
+
)
|
388
396
|
@click.option(
|
389
397
|
"--log-format",
|
390
398
|
type=click.Choice(["eval", "json"], case_sensitive=False),
|
@@ -444,6 +452,7 @@ def eval_command(
|
|
444
452
|
max_tool_output: int | None,
|
445
453
|
cache_prompt: str | None,
|
446
454
|
reasoning_effort: str | None,
|
455
|
+
reasoning_history: bool | None,
|
447
456
|
message_limit: int | None,
|
448
457
|
token_limit: int | None,
|
449
458
|
time_limit: int | None,
|
@@ -603,6 +612,7 @@ def eval_set_command(
|
|
603
612
|
max_tool_output: int | None,
|
604
613
|
cache_prompt: str | None,
|
605
614
|
reasoning_effort: str | None,
|
615
|
+
reasoning_history: bool | None,
|
606
616
|
message_limit: int | None,
|
607
617
|
token_limit: int | None,
|
608
618
|
time_limit: int | None,
|
@@ -841,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
|
|
841
851
|
if key == "internal_tools":
|
842
852
|
if value is not False:
|
843
853
|
value = None
|
854
|
+
if key == "reasoning_history":
|
855
|
+
if value is not False:
|
856
|
+
value = None
|
844
857
|
config[key] = value # type: ignore
|
845
858
|
return config
|
846
859
|
|
inspect_ai/_cli/main.py
CHANGED
inspect_ai/_cli/trace.py
CHANGED
@@ -109,11 +109,13 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
|
|
109
109
|
canceled_actions: dict[str, ActionTraceRecord] = {}
|
110
110
|
error_actions: dict[str, ActionTraceRecord] = {}
|
111
111
|
timeout_actions: dict[str, ActionTraceRecord] = {}
|
112
|
+
start_trace: ActionTraceRecord | None = None
|
112
113
|
|
113
114
|
def action_started(trace: ActionTraceRecord) -> None:
|
114
115
|
running_actions[trace.trace_id] = trace
|
115
116
|
|
116
117
|
def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
|
118
|
+
nonlocal start_trace
|
117
119
|
start_trace = running_actions.get(trace.trace_id)
|
118
120
|
if start_trace:
|
119
121
|
del running_actions[trace.trace_id]
|
@@ -122,14 +124,20 @@ def anomolies_command(trace_file: str | None, filter: str | None, all: bool) ->
|
|
122
124
|
raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
|
123
125
|
|
124
126
|
def action_failed(trace: ActionTraceRecord) -> None:
|
127
|
+
nonlocal start_trace
|
125
128
|
if all:
|
129
|
+
assert start_trace
|
126
130
|
error_actions[start_trace.trace_id] = trace
|
127
131
|
|
128
132
|
def action_canceled(trace: ActionTraceRecord) -> None:
|
133
|
+
nonlocal start_trace
|
134
|
+
assert start_trace
|
129
135
|
canceled_actions[start_trace.trace_id] = trace
|
130
136
|
|
131
137
|
def action_timeout(trace: ActionTraceRecord) -> None:
|
138
|
+
nonlocal start_trace
|
132
139
|
if all:
|
140
|
+
assert start_trace
|
133
141
|
timeout_actions[start_trace.trace_id] = trace
|
134
142
|
|
135
143
|
for trace in traces:
|
inspect_ai/_cli/view.py
CHANGED
@@ -63,6 +63,10 @@ def start(
|
|
63
63
|
INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
|
64
64
|
authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
|
65
65
|
if authorization:
|
66
|
+
# this indicates we are in vscode -- we want to set the log level to HTTP
|
67
|
+
# in vscode, updated versions of the extension do this but we set it
|
68
|
+
# manually here as a temporary bridge for running against older versions
|
69
|
+
common["log_level"] = "HTTP"
|
66
70
|
del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
|
67
71
|
os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)
|
68
72
|
|
@@ -10,6 +10,8 @@ from ..rich.display import RichDisplay
|
|
10
10
|
from ..textual.display import TextualDisplay
|
11
11
|
from .display import Display, TaskScreen
|
12
12
|
|
13
|
+
_active_display: Display | None = None
|
14
|
+
|
13
15
|
|
14
16
|
def display() -> Display:
|
15
17
|
global _active_display
|
@@ -28,9 +30,6 @@ def display() -> Display:
|
|
28
30
|
return _active_display
|
29
31
|
|
30
32
|
|
31
|
-
_active_display: Display | None = None
|
32
|
-
|
33
|
-
|
34
33
|
def task_screen() -> TaskScreen:
|
35
34
|
screen = _active_task_screen.get(None)
|
36
35
|
if screen is None:
|
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
|
|
15
15
|
set_transcript_markdown_options,
|
16
16
|
transcript_function,
|
17
17
|
transcript_markdown,
|
18
|
+
transcript_reasoning,
|
18
19
|
transcript_separator,
|
19
20
|
)
|
20
21
|
from inspect_ai.log._samples import ActiveSample
|
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
|
|
33
34
|
SubtaskEvent,
|
34
35
|
ToolEvent,
|
35
36
|
)
|
36
|
-
from inspect_ai.model._chat_message import
|
37
|
+
from inspect_ai.model._chat_message import (
|
38
|
+
ChatMessage,
|
39
|
+
ChatMessageAssistant,
|
40
|
+
ChatMessageUser,
|
41
|
+
)
|
37
42
|
from inspect_ai.model._render import messages_preceding_assistant
|
38
43
|
from inspect_ai.tool._tool import ToolResult
|
39
44
|
from inspect_ai.tool._tool_transcript import transcript_tool_call
|
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
|
|
171
176
|
# content
|
172
177
|
content: list[RenderableType] = []
|
173
178
|
|
174
|
-
def append_message(message: ChatMessage
|
175
|
-
content.extend(render_message(message
|
179
|
+
def append_message(message: ChatMessage) -> None:
|
180
|
+
content.extend(render_message(message))
|
176
181
|
|
177
182
|
# render preceding messages
|
178
183
|
preceding = messages_preceding_assistant(event.input)
|
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
|
|
309
314
|
)
|
310
315
|
|
311
316
|
|
312
|
-
def render_message(
|
313
|
-
message: ChatMessage, text: str | None = None
|
314
|
-
) -> list[RenderableType]:
|
317
|
+
def render_message(message: ChatMessage) -> list[RenderableType]:
|
315
318
|
content: list[RenderableType] = [
|
316
319
|
Text(message.role.capitalize(), style="bold"),
|
317
320
|
Text(),
|
318
321
|
]
|
319
|
-
|
320
|
-
if
|
321
|
-
content.extend(
|
322
|
+
|
323
|
+
if isinstance(message, ChatMessageAssistant) and message.reasoning:
|
324
|
+
content.extend(transcript_reasoning(message.reasoning))
|
325
|
+
|
326
|
+
if message.text:
|
327
|
+
content.extend([transcript_markdown(message.text.strip(), escape=True)])
|
322
328
|
return content
|
323
329
|
|
324
330
|
|
inspect_ai/_eval/eval.py
CHANGED
@@ -200,6 +200,10 @@ def eval(
|
|
200
200
|
)
|
201
201
|
|
202
202
|
|
203
|
+
# single call to eval_async at a time
|
204
|
+
_eval_async_running = False
|
205
|
+
|
206
|
+
|
203
207
|
async def eval_async(
|
204
208
|
tasks: Tasks,
|
205
209
|
model: str | Model | list[str] | list[Model] | None = None,
|
@@ -461,10 +465,6 @@ async def eval_async(
|
|
461
465
|
return logs
|
462
466
|
|
463
467
|
|
464
|
-
# single call to eval_async at a time
|
465
|
-
_eval_async_running = False
|
466
|
-
|
467
|
-
|
468
468
|
def eval_retry(
|
469
469
|
tasks: str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog],
|
470
470
|
log_level: str | None = None,
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -43,6 +43,12 @@ from .task.task import PreviousTask, Task
|
|
43
43
|
logger = logging.getLogger(__name__)
|
44
44
|
|
45
45
|
|
46
|
+
class Log(NamedTuple):
|
47
|
+
info: EvalLogInfo
|
48
|
+
header: EvalLog
|
49
|
+
task_identifier: str
|
50
|
+
|
51
|
+
|
46
52
|
def eval_set(
|
47
53
|
tasks: Tasks,
|
48
54
|
log_dir: str,
|
@@ -452,12 +458,6 @@ def return_last_value(retry_state: RetryCallState) -> list[EvalLog]:
|
|
452
458
|
return []
|
453
459
|
|
454
460
|
|
455
|
-
class Log(NamedTuple):
|
456
|
-
info: EvalLogInfo
|
457
|
-
header: EvalLog
|
458
|
-
task_identifier: str
|
459
|
-
|
460
|
-
|
461
461
|
# list all eval logs
|
462
462
|
def list_all_eval_logs(log_dir: str) -> list[Log]:
|
463
463
|
log_files = list_eval_logs(log_dir)
|
inspect_ai/_eval/task/error.py
CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
|
|
8
8
|
self.fail_on_error = True if fail_on_error is None else fail_on_error
|
9
9
|
self.total_samples = float(total_samples)
|
10
10
|
|
11
|
-
def __call__(self, ex: BaseException) -> EvalError:
|
11
|
+
def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
|
12
12
|
# increment error count
|
13
13
|
self.error_count += 1
|
14
14
|
|
15
15
|
# create error (we may return it)
|
16
|
-
def sample_error(
|
17
|
-
|
16
|
+
def sample_error(
|
17
|
+
*, raise_error: bool
|
18
|
+
) -> tuple[EvalError, BaseException | None]:
|
19
|
+
return eval_error(
|
20
|
+
ex, type(ex), ex, ex.__traceback__
|
21
|
+
), ex if raise_error else None
|
18
22
|
|
19
23
|
# check against limits
|
20
24
|
if isinstance(self.fail_on_error, bool):
|
21
|
-
|
22
|
-
raise ex
|
23
|
-
else:
|
24
|
-
return sample_error()
|
25
|
+
return sample_error(raise_error=self.fail_on_error)
|
25
26
|
else:
|
26
27
|
if self.fail_on_error < 1:
|
27
28
|
max_errors = self.fail_on_error * self.total_samples
|
28
|
-
|
29
|
-
raise ex
|
30
|
-
else:
|
31
|
-
return sample_error()
|
32
|
-
elif self.error_count >= self.fail_on_error:
|
33
|
-
raise ex
|
29
|
+
return sample_error(raise_error=self.error_count >= max_errors)
|
34
30
|
else:
|
35
|
-
return sample_error()
|
31
|
+
return sample_error(raise_error=self.error_count >= self.fail_on_error)
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -496,7 +496,7 @@ async def task_run_sample(
|
|
496
496
|
logger: TaskLogger | None,
|
497
497
|
log_images: bool,
|
498
498
|
sample_source: EvalSampleSource | None,
|
499
|
-
sample_error:
|
499
|
+
sample_error: SampleErrorHandler,
|
500
500
|
sample_complete: Callable[[dict[str, SampleScore]], None],
|
501
501
|
fails_on_error: bool,
|
502
502
|
time_limit: int | None,
|
@@ -548,12 +548,12 @@ async def task_run_sample(
|
|
548
548
|
)
|
549
549
|
|
550
550
|
# helper to handle exceptions (will throw if we've exceeded the limit)
|
551
|
-
def handle_error(ex: BaseException) -> EvalError:
|
551
|
+
def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
|
552
552
|
err = sample_error(ex)
|
553
553
|
py_logger.warning(
|
554
554
|
f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
|
555
555
|
)
|
556
|
-
transcript()._event(ErrorEvent(error=err))
|
556
|
+
transcript()._event(ErrorEvent(error=err[0]))
|
557
557
|
return err
|
558
558
|
|
559
559
|
# solver loop
|
@@ -572,6 +572,7 @@ async def task_run_sample(
|
|
572
572
|
) as active,
|
573
573
|
):
|
574
574
|
error: EvalError | None = None
|
575
|
+
raise_error: BaseException | None = None
|
575
576
|
results: dict[str, SampleScore] = {}
|
576
577
|
try:
|
577
578
|
async with sandboxenv_cm:
|
@@ -640,7 +641,7 @@ async def task_run_sample(
|
|
640
641
|
state = sample_state() or state
|
641
642
|
case "error":
|
642
643
|
# default error handling
|
643
|
-
error = handle_error(ex)
|
644
|
+
error, raise_error = handle_error(ex)
|
644
645
|
|
645
646
|
else:
|
646
647
|
raise
|
@@ -660,7 +661,7 @@ async def task_run_sample(
|
|
660
661
|
state.completed = True
|
661
662
|
|
662
663
|
except BaseException as ex:
|
663
|
-
error = handle_error(ex)
|
664
|
+
error, raise_error = handle_error(ex)
|
664
665
|
|
665
666
|
# set timeout for scoring. if the original timeout was hit we still
|
666
667
|
# want to provide opportunity for scoring, but we don't necessarily
|
@@ -710,6 +711,9 @@ async def task_run_sample(
|
|
710
711
|
results[name] = SampleScore(
|
711
712
|
score=score, sample_id=state.sample_id
|
712
713
|
)
|
714
|
+
transcript()._event(
|
715
|
+
ScoreEvent(score=score, target=sample.target)
|
716
|
+
)
|
713
717
|
|
714
718
|
# propagate results into scores
|
715
719
|
state.scores = {k: v.score for k, v in results.items()}
|
@@ -737,11 +741,10 @@ async def task_run_sample(
|
|
737
741
|
)
|
738
742
|
|
739
743
|
# handle error (this will throw if we've exceeded the limit)
|
740
|
-
error = handle_error(ex)
|
744
|
+
error, raise_error = handle_error(ex)
|
741
745
|
|
742
|
-
# handle sandboxenv init errors
|
743
746
|
except Exception as ex:
|
744
|
-
error = handle_error(ex)
|
747
|
+
error, raise_error = handle_error(ex)
|
745
748
|
|
746
749
|
# complete the sample
|
747
750
|
progress(SAMPLE_TOTAL_PROGRESS_UNITS)
|
@@ -772,6 +775,8 @@ async def task_run_sample(
|
|
772
775
|
if results is not None:
|
773
776
|
sample_complete(results)
|
774
777
|
return results
|
778
|
+
elif raise_error:
|
779
|
+
raise raise_error
|
775
780
|
else:
|
776
781
|
return None
|
777
782
|
|
inspect_ai/_util/hash.py
CHANGED
@@ -3,7 +3,7 @@ import mmh3
|
|
3
3
|
|
4
4
|
def mm3_hash(message: str) -> str:
|
5
5
|
# Generate the 128-bit hash as two 64-bit integers
|
6
|
-
h1, h2 = mmh3.hash64(message.encode("utf-8"))
|
6
|
+
h1, h2 = mmh3.hash64(message.encode("utf-8")) # pylint: disable=E0633
|
7
7
|
|
8
8
|
# Convert to unsigned integers and then to hexadecimal
|
9
9
|
return f"{h1 & 0xFFFFFFFFFFFFFFFF:016x}{h2 & 0xFFFFFFFFFFFFFFFF:016x}"
|
inspect_ai/_util/transcript.py
CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
|
|
111
111
|
)
|
112
112
|
|
113
113
|
|
114
|
+
def transcript_reasoning(reasoning: str) -> list[RenderableType]:
|
115
|
+
content: list[RenderableType] = []
|
116
|
+
content.append(
|
117
|
+
transcript_markdown(
|
118
|
+
f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
|
119
|
+
)
|
120
|
+
)
|
121
|
+
content.append(Text())
|
122
|
+
return content
|
123
|
+
|
124
|
+
|
114
125
|
def transcript_separator(title: str, color: str) -> RenderableType:
|
115
126
|
return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")
|
116
127
|
|
inspect_ai/_view/www/App.css
CHANGED
@@ -9,12 +9,27 @@
|
|
9
9
|
--inspect-input-border: var(--bs-light-border-subtle);
|
10
10
|
--inspect-diff-add-color: #dafbe1;
|
11
11
|
--inspect-diff-remove-color: #ffebe9;
|
12
|
-
--inspect-inactive-selection-background: var(
|
13
|
-
|
12
|
+
--inspect-inactive-selection-background: var(
|
13
|
+
--vscode-editor-inactiveSelectionBackground,
|
14
|
+
#d9d9d9
|
15
|
+
);
|
16
|
+
--inspect-active-selection-background: var(
|
17
|
+
--vscode-editor-selectionBackground,
|
18
|
+
#d7d4f0
|
19
|
+
);
|
14
20
|
--inspect-focus-border-color: #86b7fe;
|
15
21
|
--inspect-focus-border-shadow: 0 0 0 0.25rem rgba(var(--bs-primary-rgb), 0.25);
|
16
22
|
--inspect-focus-border-gray-color: #808080;
|
17
23
|
--inspect-focus-border-gray-shadow: 0 0 0 0.25rem rgba(48, 48, 48, 0.25);
|
24
|
+
|
25
|
+
/* Inspect Font Sizes */
|
26
|
+
--inspect-font-size-title: 1.5rem;
|
27
|
+
--inspect-font-size-title-secondary: 1.3rem;
|
28
|
+
--inspect-font-size-larger: 1.1rem;
|
29
|
+
--inspect-font-size-large: 1rem;
|
30
|
+
--inspect-font-size-base: 0.9rem;
|
31
|
+
--inspect-font-size-small: 0.8rem;
|
32
|
+
--inspect-font-size-smaller: 0.8rem;
|
18
33
|
}
|
19
34
|
|
20
35
|
body:not([class^="vscode-"]) button {
|
@@ -47,6 +62,61 @@ body[class^="vscode-"] .app-main-grid {
|
|
47
62
|
grid-template-rows: max-content max-content 1fr;
|
48
63
|
}
|
49
64
|
|
65
|
+
/* Inspect Text Styles */
|
66
|
+
.text-style-label {
|
67
|
+
text-transform: uppercase;
|
68
|
+
}
|
69
|
+
|
70
|
+
.text-style-secondary {
|
71
|
+
color: var(--bs-secondary);
|
72
|
+
}
|
73
|
+
|
74
|
+
.text-style-tertiary {
|
75
|
+
color: var(--bs-tertiary-color);
|
76
|
+
}
|
77
|
+
|
78
|
+
/* Inspect Font Size Styles */
|
79
|
+
.text-size-title {
|
80
|
+
font-size: var(--inspect-font-size-title);
|
81
|
+
}
|
82
|
+
|
83
|
+
.text-size-title-secondary {
|
84
|
+
font-size: var(--inspect-font-size-title-secondary);
|
85
|
+
}
|
86
|
+
|
87
|
+
.text-size-larger {
|
88
|
+
font-size: var(--inspect-font-size-larger);
|
89
|
+
}
|
90
|
+
|
91
|
+
.text-size-large {
|
92
|
+
font-size: var(--inspect-font-size-large);
|
93
|
+
}
|
94
|
+
|
95
|
+
.text-size-base {
|
96
|
+
font-size: var(--inspect-font-size-base);
|
97
|
+
}
|
98
|
+
|
99
|
+
.text-size-small {
|
100
|
+
font-size: var(--inspect-font-size-small);
|
101
|
+
}
|
102
|
+
|
103
|
+
.text-size-smaller {
|
104
|
+
font-size: var(--inspect-font-size-smaller);
|
105
|
+
}
|
106
|
+
|
107
|
+
.text-truncate {
|
108
|
+
white-space: nowrap;
|
109
|
+
text-overflow: ellipsis;
|
110
|
+
overflow: hidden;
|
111
|
+
}
|
112
|
+
|
113
|
+
.three-line-clamp {
|
114
|
+
display: -webkit-box;
|
115
|
+
-webkit-line-clamp: 3;
|
116
|
+
-webkit-box-orient: vertical;
|
117
|
+
overflow: hidden;
|
118
|
+
}
|
119
|
+
|
50
120
|
body[class^="vscode-"] {
|
51
121
|
--bs-border-radius: 0;
|
52
122
|
--bs-border-radius-lg: 0;
|
@@ -87,7 +157,7 @@ html.vscode {
|
|
87
157
|
|
88
158
|
html.vscode .sample-input {
|
89
159
|
line-height: 1.3em;
|
90
|
-
-webkit-line-clamp: 4 !important
|
160
|
+
-webkit-line-clamp: 4 !important;
|
91
161
|
}
|
92
162
|
|
93
163
|
body[class^="vscode-"] .modal-backdrop {
|
@@ -276,7 +346,7 @@ body {
|
|
276
346
|
}
|
277
347
|
|
278
348
|
@media (max-width: 575px) {
|
279
|
-
.tab-tools
|
349
|
+
.tab-tools select {
|
280
350
|
width: 50px;
|
281
351
|
}
|
282
352
|
}
|
@@ -312,12 +382,6 @@ body {
|
|
312
382
|
font-size: 1.5em;
|
313
383
|
}
|
314
384
|
|
315
|
-
.sidebar {
|
316
|
-
--bs-offcanvas-width: var(--sidebar-width);
|
317
|
-
width: var(--sidebar-width);
|
318
|
-
overflow-y: auto;
|
319
|
-
}
|
320
|
-
|
321
385
|
.nav-link.active {
|
322
386
|
border-bottom-width: 0 !important;
|
323
387
|
}
|
@@ -644,7 +708,7 @@ table.table.table-sm td {
|
|
644
708
|
|
645
709
|
.tab-tools .btn {
|
646
710
|
font-size: 0.7rem;
|
647
|
-
padding: 0.
|
711
|
+
padding: 0.2em 0.8em;
|
648
712
|
}
|
649
713
|
|
650
714
|
.tab-tools {
|
@@ -724,7 +788,7 @@ table.table.table-sm td {
|
|
724
788
|
}
|
725
789
|
|
726
790
|
@keyframes moveLeftToRight {
|
727
|
-
from {
|
791
|
+
from {
|
728
792
|
margin-left: 0;
|
729
793
|
}
|
730
794
|
to {
|
@@ -760,7 +824,6 @@ pre[class*="language-"].tool-output {
|
|
760
824
|
|
761
825
|
/* lightbox styles */
|
762
826
|
|
763
|
-
|
764
827
|
.lightbox-overlay .close-button,
|
765
828
|
.lightbox-overlay .nav-button {
|
766
829
|
/* Hide by default */
|
@@ -868,38 +931,38 @@ ul.jsondiffpatch-delta {
|
|
868
931
|
vertical-align: top;
|
869
932
|
}
|
870
933
|
.jsondiffpatch-property-name:after {
|
871
|
-
content:
|
934
|
+
content: ": ";
|
872
935
|
}
|
873
936
|
.jsondiffpatch-child-node-type-array > .jsondiffpatch-property-name:after {
|
874
|
-
content:
|
937
|
+
content: ": [";
|
875
938
|
}
|
876
939
|
.jsondiffpatch-child-node-type-array:after {
|
877
|
-
content:
|
940
|
+
content: "],";
|
878
941
|
}
|
879
942
|
div.jsondiffpatch-child-node-type-array:before {
|
880
|
-
content:
|
943
|
+
content: "[";
|
881
944
|
}
|
882
945
|
div.jsondiffpatch-child-node-type-array:after {
|
883
|
-
content:
|
946
|
+
content: "]";
|
884
947
|
}
|
885
948
|
.jsondiffpatch-child-node-type-object > .jsondiffpatch-property-name:after {
|
886
|
-
content:
|
949
|
+
content: ": {";
|
887
950
|
}
|
888
951
|
.jsondiffpatch-child-node-type-object:after {
|
889
|
-
content:
|
952
|
+
content: "},";
|
890
953
|
}
|
891
954
|
div.jsondiffpatch-child-node-type-object:before {
|
892
|
-
content:
|
955
|
+
content: "{";
|
893
956
|
}
|
894
957
|
div.jsondiffpatch-child-node-type-object:after {
|
895
|
-
content:
|
958
|
+
content: "}";
|
896
959
|
}
|
897
960
|
.jsondiffpatch-value pre:after {
|
898
|
-
content:
|
961
|
+
content: ",";
|
899
962
|
}
|
900
963
|
li:last-child > .jsondiffpatch-value pre:after,
|
901
964
|
.jsondiffpatch-modified > .jsondiffpatch-left-value pre:after {
|
902
|
-
content:
|
965
|
+
content: "";
|
903
966
|
}
|
904
967
|
.jsondiffpatch-modified .jsondiffpatch-value {
|
905
968
|
display: inline-block;
|
@@ -916,7 +979,7 @@ li:last-child > .jsondiffpatch-value pre:after,
|
|
916
979
|
color: #888;
|
917
980
|
}
|
918
981
|
.jsondiffpatch-moved .jsondiffpatch-moved-destination:before {
|
919
|
-
content:
|
982
|
+
content: " => ";
|
920
983
|
}
|
921
984
|
ul.jsondiffpatch-textdiff {
|
922
985
|
padding: 0;
|
@@ -930,7 +993,7 @@ ul.jsondiffpatch-textdiff {
|
|
930
993
|
display: inline-block;
|
931
994
|
}
|
932
995
|
.jsondiffpatch-textdiff-line-number:after {
|
933
|
-
content:
|
996
|
+
content: ",";
|
934
997
|
}
|
935
998
|
.jsondiffpatch-error {
|
936
999
|
background: red;
|
@@ -976,14 +1039,14 @@ ul.jsondiffpatch-textdiff {
|
|
976
1039
|
padding: 1em;
|
977
1040
|
margin: 0.5em 0;
|
978
1041
|
overflow: auto;
|
979
|
-
border: 0.3em solid #7a6651;
|
1042
|
+
/* border: 0.3em solid #7a6651; */
|
980
1043
|
border-radius: 0.5em;
|
981
1044
|
box-shadow: 1px 1px 0.5em #000 inset;
|
982
1045
|
}
|
983
1046
|
.vscode-dark :not(pre) > code[class*="language-"] {
|
984
1047
|
padding: 0.15em 0.2em 0.05em;
|
985
1048
|
border-radius: 0.3em;
|
986
|
-
border: 0.13em solid #7a6651;
|
1049
|
+
/* border: 0.13em solid #7a6651; */
|
987
1050
|
box-shadow: 1px 1px 0.3em -0.1em #000 inset;
|
988
1051
|
white-space: normal;
|
989
1052
|
}
|
@@ -1045,4 +1108,4 @@ ul.jsondiffpatch-textdiff {
|
|
1045
1108
|
.vscode-dark .token.deleted {
|
1046
1109
|
color: red;
|
1047
1110
|
}
|
1048
|
-
/* END PrismJS */
|
1111
|
+
/* END PrismJS */
|