inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/trace.py +8 -0
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/eval.py +4 -4
- inspect_ai/_eval/evalset.py +6 -6
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/run.py +13 -8
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +92 -29
- inspect_ai/_view/www/dist/assets/index.css +16636 -14674
- inspect_ai/_view/www/dist/assets/index.js +43585 -36122
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +36 -19
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +355 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +199 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +99 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +200 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -7
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +309 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +326 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +175 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +46 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +143 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +131 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +145 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +86 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +53 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +107 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +363 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +173 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +182 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +108 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +91 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +38 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +190 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +274 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/{StateEventView.mjs → StateEventView.tsx} +148 -110
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +7 -4
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +22 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +95 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +160 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +113 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +67 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +156 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +222 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +41 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +61 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +80 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/dataset/_dataset.py +36 -0
- inspect_ai/dataset/_sources/csv.py +8 -0
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +11 -1
- inspect_ai/dataset/_sources/json.py +8 -0
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_message.py +1 -1
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_chat_message.py +27 -0
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +74 -0
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -0
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +12 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +34 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/solver/_basic_agent.py +19 -9
- inspect_ai/solver/_multiple_choice.py +24 -9
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +2 -5
- inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +4 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +8 -2
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_sandbox/docker/docker.py +32 -85
- inspect_ai/util/_sandbox/self_check.py +124 -16
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/METADATA +2 -1
- inspect_ai-0.3.63.dist-info/RECORD +618 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -418
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.61.dist-info/RECORD +0 -476
- /inspect_ai/{tool/beta/_computer/_resources/tool/__init__.py → _view/www/src/components/MorePopOver.css} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _view/www/src/workspace/tabs/InfoTab.module.css} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_common.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/top_level.txt +0 -0
inspect_ai/dataset/_dataset.py
CHANGED
@@ -144,6 +144,14 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
144
144
|
@abc.abstractmethod
|
145
145
|
def shuffled(self) -> bool: ...
|
146
146
|
|
147
|
+
@abc.abstractmethod
|
148
|
+
def shuffle_choices(self, seed: int | None = None) -> None:
|
149
|
+
"""Shuffle the order of the choices with each sample.
|
150
|
+
|
151
|
+
Args:
|
152
|
+
seed: (int | None): Random seed for shuffling (optional).
|
153
|
+
"""
|
154
|
+
|
147
155
|
@overload
|
148
156
|
def __getitem__(self, index: int) -> Sample: ...
|
149
157
|
|
@@ -315,6 +323,34 @@ class MemoryDataset(Dataset):
|
|
315
323
|
random.shuffle(self.samples)
|
316
324
|
self._shuffled = True
|
317
325
|
|
326
|
+
@override
|
327
|
+
def shuffle_choices(self, seed: int | None = None) -> None:
|
328
|
+
rand = random.Random(seed)
|
329
|
+
for sample in self.samples:
|
330
|
+
if not sample.choices:
|
331
|
+
continue
|
332
|
+
# The original positions
|
333
|
+
positions = list(range(len(sample.choices)))
|
334
|
+
|
335
|
+
# Shuffle the choices
|
336
|
+
rand.shuffle(positions)
|
337
|
+
shuffled_choices = [sample.choices[i] for i in positions]
|
338
|
+
|
339
|
+
# Map of original position / target letter
|
340
|
+
position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
|
341
|
+
|
342
|
+
# Update to the shuffled choices and target
|
343
|
+
sample.choices = shuffled_choices
|
344
|
+
sample.target = self._remap_target(sample.target, position_map=position_map)
|
345
|
+
|
346
|
+
def _remap_target(
|
347
|
+
self, target: str | list[str], position_map: dict[int, str]
|
348
|
+
) -> str | list[str]:
|
349
|
+
if isinstance(target, list):
|
350
|
+
return [position_map[ord(t) - 65] for t in target]
|
351
|
+
else:
|
352
|
+
return position_map[ord(target) - 65]
|
353
|
+
|
318
354
|
@override
|
319
355
|
def sort(
|
320
356
|
self,
|
inspect_ai/dataset/_sources/csv.py
CHANGED
@@ -23,6 +23,7 @@ def csv_dataset(
|
|
23
23
|
auto_id: bool = False,
|
24
24
|
shuffle: bool = False,
|
25
25
|
seed: int | None = None,
|
26
|
+
shuffle_choices: bool | int | None = None,
|
26
27
|
limit: int | None = None,
|
27
28
|
dialect: str = "unix",
|
28
29
|
encoding: str = "utf-8",
|
@@ -45,6 +46,7 @@ def csv_dataset(
|
|
45
46
|
auto_id (bool): Assign an auto-incrementing ID for each sample.
|
46
47
|
shuffle (bool): Randomly shuffle the dataset order.
|
47
48
|
seed: (int | None): Seed used for random shuffle.
|
49
|
+
shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
|
48
50
|
limit (int | None): Limit the number of records to read.
|
49
51
|
dialect (str): CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
|
50
52
|
encoding (str): Text encoding for file (defaults to "utf-8").
|
@@ -86,6 +88,12 @@ def csv_dataset(
|
|
86
88
|
if shuffle:
|
87
89
|
dataset.shuffle(seed=seed)
|
88
90
|
|
91
|
+
# shuffle choices, if requested
|
92
|
+
if isinstance(shuffle_choices, int):
|
93
|
+
dataset.shuffle_choices(seed=shuffle_choices)
|
94
|
+
elif shuffle_choices is True:
|
95
|
+
dataset.shuffle_choices()
|
96
|
+
|
89
97
|
# limit if requested
|
90
98
|
if limit:
|
91
99
|
return dataset[0:limit]
|
inspect_ai/dataset/_sources/file.py
CHANGED
@@ -16,6 +16,7 @@ def file_dataset(
|
|
16
16
|
auto_id: bool = False,
|
17
17
|
shuffle: bool = False,
|
18
18
|
seed: int | None = None,
|
19
|
+
shuffle_choices: bool | int | None = None,
|
19
20
|
limit: int | None = None,
|
20
21
|
dialect: str = "unix",
|
21
22
|
encoding: str = "utf-8",
|
@@ -40,6 +41,7 @@ def file_dataset(
|
|
40
41
|
auto_id (bool): Assign an auto-incrementing ID for each sample.
|
41
42
|
shuffle (bool): Randomly shuffle the dataset order.
|
42
43
|
seed (int | None): Seed used for random shuffle.
|
44
|
+
shuffle_choices (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
|
43
45
|
limit (int | None): Limit the number of records to read.
|
44
46
|
dialect (str): CSV dialect ("unix" or "excel", defaults to "unix"). Only
|
45
47
|
applies to reading CSV files.
|
@@ -66,6 +68,7 @@ def file_dataset(
|
|
66
68
|
auto_id=auto_id,
|
67
69
|
shuffle=shuffle,
|
68
70
|
seed=seed,
|
71
|
+
shuffle_choices=shuffle_choices,
|
69
72
|
limit=limit,
|
70
73
|
encoding=encoding,
|
71
74
|
name=name,
|
@@ -78,6 +81,7 @@ def file_dataset(
|
|
78
81
|
auto_id=auto_id,
|
79
82
|
shuffle=shuffle,
|
80
83
|
seed=seed,
|
84
|
+
shuffle_choices=shuffle_choices,
|
81
85
|
limit=limit,
|
82
86
|
dialect=dialect,
|
83
87
|
encoding=encoding,
|
@@ -29,6 +29,7 @@ def hf_dataset(
|
|
29
29
|
auto_id: bool = False,
|
30
30
|
shuffle: bool = False,
|
31
31
|
seed: int | None = None,
|
32
|
+
shuffle_choices: bool | int | None = None,
|
32
33
|
limit: int | None = None,
|
33
34
|
trust: bool = False,
|
34
35
|
cached: bool = True,
|
@@ -59,6 +60,7 @@ def hf_dataset(
|
|
59
60
|
auto_id (bool): Assign an auto-incrementing ID for each sample.
|
60
61
|
shuffle (bool): Randomly shuffle the dataset order.
|
61
62
|
seed (int | None): Seed used for random shuffle.
|
63
|
+
shuffle_choices (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
|
62
64
|
limit (int | None): Limit the number of records to read.
|
63
65
|
trust (bool): Whether or not to allow for datasets defined on the Hub
|
64
66
|
using a dataset script. This option should only be set to True for
|
@@ -117,8 +119,16 @@ def hf_dataset(
|
|
117
119
|
dataset = dataset.select(range(limit))
|
118
120
|
|
119
121
|
# return the dataset
|
120
|
-
|
122
|
+
memory_dataset = MemoryDataset(
|
121
123
|
samples=data_to_samples(dataset.to_list(), data_to_sample, auto_id),
|
122
124
|
name=Path(path).stem if Path(path).exists() else path,
|
123
125
|
location=path,
|
124
126
|
)
|
127
|
+
|
128
|
+
# maybe shuffle the choices
|
129
|
+
if isinstance(shuffle_choices, int):
|
130
|
+
memory_dataset.shuffle_choices(seed=shuffle_choices)
|
131
|
+
elif shuffle_choices is True:
|
132
|
+
memory_dataset.shuffle_choices()
|
133
|
+
|
134
|
+
return memory_dataset
|
@@ -25,6 +25,7 @@ def json_dataset(
|
|
25
25
|
auto_id: bool = False,
|
26
26
|
shuffle: bool = False,
|
27
27
|
seed: int | None = None,
|
28
|
+
shuffle_choices: bool | int | None = None,
|
28
29
|
limit: int | None = None,
|
29
30
|
encoding: str = "utf-8",
|
30
31
|
name: str | None = None,
|
@@ -49,6 +50,7 @@ def json_dataset(
|
|
49
50
|
auto_id (bool): Assign an auto-incrementing ID for each sample.
|
50
51
|
shuffle (bool): Randomly shuffle the dataset order.
|
51
52
|
seed (int | None): Seed used for random shuffle.
|
53
|
+
shuffle_choices (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
|
52
54
|
limit (int | None): Limit the number of records to read.
|
53
55
|
encoding (str): Text encoding for file (defaults to "utf-8").
|
54
56
|
name (str): Optional name for dataset (for logging). If not specified,
|
@@ -86,6 +88,12 @@ def json_dataset(
|
|
86
88
|
if shuffle:
|
87
89
|
dataset.shuffle(seed=seed)
|
88
90
|
|
91
|
+
# shuffle choices, if requested
|
92
|
+
if isinstance(shuffle_choices, int):
|
93
|
+
dataset.shuffle_choices(seed=shuffle_choices)
|
94
|
+
elif shuffle_choices is True:
|
95
|
+
dataset.shuffle_choices()
|
96
|
+
|
89
97
|
# limit if requested
|
90
98
|
if limit:
|
91
99
|
return dataset[0:limit]
|
inspect_ai/log/_log.py
CHANGED
@@ -17,12 +17,7 @@ from inspect_ai._util.error import EvalError, exception_message
|
|
17
17
|
from inspect_ai._util.logger import warn_once
|
18
18
|
from inspect_ai.approval._policy import ApprovalPolicyConfig
|
19
19
|
from inspect_ai.dataset._dataset import MT, metadata_as
|
20
|
-
from inspect_ai.model import
|
21
|
-
ChatMessage,
|
22
|
-
GenerateConfig,
|
23
|
-
ModelOutput,
|
24
|
-
ModelUsage,
|
25
|
-
)
|
20
|
+
from inspect_ai.model import ChatMessage, GenerateConfig, ModelOutput, ModelUsage
|
26
21
|
from inspect_ai.scorer import Score
|
27
22
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
28
23
|
from inspect_ai.util._store import Store
|
@@ -404,6 +399,8 @@ class EvalResults(BaseModel):
|
|
404
399
|
if "metrics" in values:
|
405
400
|
metrics = values["metrics"]
|
406
401
|
del values["metrics"]
|
402
|
+
else:
|
403
|
+
metrics = None
|
407
404
|
# Convert the scorer to the new schema
|
408
405
|
score = values["scorer"]
|
409
406
|
if metrics:
|
inspect_ai/log/_message.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Literal, Type, cast
|
|
5
5
|
from pydantic import BaseModel, Field, model_validator
|
6
6
|
|
7
7
|
LoggingLevel = Literal[
|
8
|
-
"debug", "http", "sandbox", "info", "warning", "error", "critical"
|
8
|
+
"debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
|
9
9
|
]
|
10
10
|
"""Logging level."""
|
11
11
|
|
@@ -203,7 +203,7 @@ class EvalRecorder(FileRecorder):
|
|
203
203
|
# of small fetches from the zip file streams)
|
204
204
|
temp_log: str | None = None
|
205
205
|
fs = filesystem(location)
|
206
|
-
if not fs.is_local():
|
206
|
+
if not fs.is_local() and header_only is False:
|
207
207
|
with tempfile.NamedTemporaryFile(delete=False) as temp:
|
208
208
|
temp_log = temp.name
|
209
209
|
fs.get_file(location, temp_log)
|
@@ -9,12 +9,7 @@ from typing_extensions import override
|
|
9
9
|
|
10
10
|
from inspect_ai._util.constants import LOG_SCHEMA_VERSION
|
11
11
|
from inspect_ai._util.error import EvalError
|
12
|
-
from inspect_ai._util.file import
|
13
|
-
absolute_file_path,
|
14
|
-
async_fileystem,
|
15
|
-
file,
|
16
|
-
filesystem,
|
17
|
-
)
|
12
|
+
from inspect_ai._util.file import absolute_file_path, async_fileystem, file, filesystem
|
18
13
|
from inspect_ai._util.trace import trace_action
|
19
14
|
|
20
15
|
from .._log import (
|
@@ -236,12 +231,13 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
236
231
|
f.seek(0)
|
237
232
|
|
238
233
|
# Parse the log file, stopping before parsing samples
|
234
|
+
status: Literal["started", "success", "cancelled", "error"] | None = None
|
239
235
|
for k, v in ijson.kvitems(f, ""):
|
240
236
|
if k == "status":
|
241
237
|
assert v in get_args(
|
242
238
|
Literal["started", "success", "cancelled", "error"]
|
243
239
|
)
|
244
|
-
status
|
240
|
+
status = v
|
245
241
|
if k == "eval":
|
246
242
|
eval = EvalSpec(**v)
|
247
243
|
elif k == "plan":
|
@@ -257,6 +253,8 @@ def _read_header_streaming(log_file: str) -> EvalLog:
|
|
257
253
|
error = EvalError(**v)
|
258
254
|
break
|
259
255
|
|
256
|
+
assert status, "Must encounter a 'status'"
|
257
|
+
|
260
258
|
return EvalLog(
|
261
259
|
eval=eval,
|
262
260
|
plan=plan,
|
inspect_ai/model/_call_tools.py
CHANGED
@@ -133,7 +133,8 @@ async def call_tools(
|
|
133
133
|
):
|
134
134
|
content: str | list[Content] = [result]
|
135
135
|
elif isinstance(result, list) and (
|
136
|
-
|
136
|
+
len(result) == 0
|
137
|
+
or isinstance(
|
137
138
|
result[0], ContentText | ContentImage | ContentAudio | ContentVideo
|
138
139
|
)
|
139
140
|
):
|
@@ -7,6 +7,8 @@ from inspect_ai._util.content import Content, ContentText
|
|
7
7
|
from inspect_ai.tool import ToolCall
|
8
8
|
from inspect_ai.tool._tool_call import ToolCallError
|
9
9
|
|
10
|
+
from ._reasoning import parse_content_with_reasoning
|
11
|
+
|
10
12
|
logger = getLogger(__name__)
|
11
13
|
|
12
14
|
|
@@ -83,6 +85,31 @@ class ChatMessageAssistant(ChatMessageBase):
|
|
83
85
|
tool_calls: list[ToolCall] | None = Field(default=None)
|
84
86
|
"""Tool calls made by the model."""
|
85
87
|
|
88
|
+
reasoning: str | None = Field(default=None)
|
89
|
+
"""Reasoning content."""
|
90
|
+
|
91
|
+
# Some OpenAI compatible REST endpoints include reasoning as a field alongside
|
92
|
+
# content, however since this field doesn't exist in the OpenAI interface,
|
93
|
+
# hosting providers (so far we've seen this with Together and Groq) may
|
94
|
+
# include the reasoning in a <think></think> tag before the main response.
|
95
|
+
# We expect this pattern to be repeated elsewhere, so include this hook to
|
96
|
+
# automatically extract the reasoning content when the response is prefaced
|
97
|
+
# with a <think> block. If this ends up being an overreach we can fall back
|
98
|
+
# to each provider manually parsing out <think> using a helper function.
|
99
|
+
# The implementation isn't important here, the critical thing to establish
|
100
|
+
# is that Inspect makes reasoning content available separately.
|
101
|
+
@model_validator(mode="before")
|
102
|
+
@classmethod
|
103
|
+
def extract_reasoning(cls, data: Any) -> Any:
|
104
|
+
if isinstance(data, dict):
|
105
|
+
content = data.get("content", None)
|
106
|
+
if isinstance(content, str):
|
107
|
+
parsed = parse_content_with_reasoning(content)
|
108
|
+
if parsed:
|
109
|
+
data["reasoning"] = parsed.reasoning
|
110
|
+
data["content"] = parsed.content
|
111
|
+
return data
|
112
|
+
|
86
113
|
|
87
114
|
class ChatMessageTool(ChatMessageBase):
|
88
115
|
role: Literal["tool"] = Field(default="tool")
|
@@ -2,7 +2,7 @@ from rich.console import RenderableType
|
|
2
2
|
from rich.text import Text
|
3
3
|
|
4
4
|
from inspect_ai._util.rich import lines_display
|
5
|
-
from inspect_ai._util.transcript import transcript_markdown
|
5
|
+
from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
|
6
6
|
from inspect_ai.util._conversation import conversation_panel
|
7
7
|
from inspect_ai.util._display import display_type
|
8
8
|
|
@@ -38,8 +38,15 @@ def conversation_assistant_message(
|
|
38
38
|
content=transcript_markdown(m.text, escape=True),
|
39
39
|
)
|
40
40
|
|
41
|
-
#
|
42
|
-
content: list[RenderableType] =
|
41
|
+
# build content
|
42
|
+
content: list[RenderableType] = []
|
43
|
+
|
44
|
+
# reasoning
|
45
|
+
if message.reasoning:
|
46
|
+
content.extend(transcript_reasoning(message.reasoning))
|
47
|
+
|
48
|
+
# message text
|
49
|
+
content.extend(
|
43
50
|
[transcript_markdown(message.text, escape=True)] if message.text else []
|
44
51
|
)
|
45
52
|
|
@@ -75,6 +75,9 @@ class GenerateConfigArgs(TypedDict, total=False):
|
|
75
75
|
reasoning_effort: Literal["low", "medium", "high"] | None
|
76
76
|
"""Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
|
77
77
|
|
78
|
+
reasoning_history: bool | None
|
79
|
+
"""Include reasoning in chat message history sent to generate."""
|
80
|
+
|
78
81
|
|
79
82
|
class GenerateConfig(BaseModel):
|
80
83
|
"""Base class for model generation configs."""
|
@@ -145,6 +148,9 @@ class GenerateConfig(BaseModel):
|
|
145
148
|
reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
|
146
149
|
"""Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
|
147
150
|
|
151
|
+
reasoning_history: bool | None = Field(default=None)
|
152
|
+
"""Include reasoning in chat message history sent to generate."""
|
153
|
+
|
148
154
|
def merge(
|
149
155
|
self, other: Union["GenerateConfig", GenerateConfigArgs]
|
150
156
|
) -> "GenerateConfig":
|
inspect_ai/model/_model.py
CHANGED
@@ -168,6 +168,10 @@ class ModelAPI(abc.ABC):
|
|
168
168
|
"""Tool results can contain images"""
|
169
169
|
return False
|
170
170
|
|
171
|
+
def has_reasoning_history(self) -> bool:
|
172
|
+
"""Assistant chat messages can include reasoning."""
|
173
|
+
return False
|
174
|
+
|
171
175
|
|
172
176
|
class Model:
|
173
177
|
"""Model interface."""
|
@@ -302,6 +306,11 @@ class Model:
|
|
302
306
|
tools = []
|
303
307
|
tool_choice = "none"
|
304
308
|
|
309
|
+
# handle reasoning history
|
310
|
+
input = resolve_reasoning_history(
|
311
|
+
input, config, self.api.has_reasoning_history()
|
312
|
+
)
|
313
|
+
|
305
314
|
# apply any tool model_input handlers
|
306
315
|
input = resolve_tool_model_input(tdefs, input)
|
307
316
|
|
@@ -726,6 +735,71 @@ def simple_input_messages(
|
|
726
735
|
return messages
|
727
736
|
|
728
737
|
|
738
|
+
def resolve_reasoning_history(
|
739
|
+
messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
|
740
|
+
) -> list[ChatMessage]:
|
741
|
+
# determine if we are including reasoning history
|
742
|
+
reasoning_history = config.reasoning_history is not False
|
743
|
+
|
744
|
+
# determine up front if we have any reasoning content
|
745
|
+
have_reasoning = any(
|
746
|
+
[
|
747
|
+
isinstance(m, ChatMessageAssistant) and m.reasoning is not None
|
748
|
+
for m in messages
|
749
|
+
]
|
750
|
+
)
|
751
|
+
if not have_reasoning:
|
752
|
+
return messages
|
753
|
+
|
754
|
+
# API assistant message format directly supports reasoning history so we will:
|
755
|
+
# (a) Remove reasoning content entirely if config says not to include it; or
|
756
|
+
# (b) Leave the messages alone if config says to include it
|
757
|
+
if api_has_reasoning_history:
|
758
|
+
# remove reasoning history as per config
|
759
|
+
if not reasoning_history:
|
760
|
+
resolved_messages: list[ChatMessage] = []
|
761
|
+
for message in messages:
|
762
|
+
if isinstance(message, ChatMessageAssistant):
|
763
|
+
resolved_messages.append(
|
764
|
+
message.model_copy(update={"reasoning": None})
|
765
|
+
)
|
766
|
+
else:
|
767
|
+
resolved_messages.append(message)
|
768
|
+
|
769
|
+
return resolved_messages
|
770
|
+
|
771
|
+
# include reasoning history as per config
|
772
|
+
else:
|
773
|
+
return messages
|
774
|
+
|
775
|
+
# API can't represent reasoning natively so include <think> tags
|
776
|
+
elif reasoning_history:
|
777
|
+
resolved_messages = []
|
778
|
+
for message in messages:
|
779
|
+
if (
|
780
|
+
isinstance(message, ChatMessageAssistant)
|
781
|
+
and message.reasoning is not None
|
782
|
+
):
|
783
|
+
message = deepcopy(message)
|
784
|
+
if isinstance(message.content, str):
|
785
|
+
message.content = (
|
786
|
+
f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
|
787
|
+
)
|
788
|
+
else:
|
789
|
+
message.content.insert(
|
790
|
+
0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
|
791
|
+
)
|
792
|
+
message.reasoning = None
|
793
|
+
|
794
|
+
resolved_messages.append(message)
|
795
|
+
|
796
|
+
return resolved_messages
|
797
|
+
|
798
|
+
# api doesn't handle reasoning and config says no reasoning_history, nothing to do
|
799
|
+
else:
|
800
|
+
return messages
|
801
|
+
|
802
|
+
|
729
803
|
def resolve_tool_model_input(
|
730
804
|
tdefs: list[ToolDef], messages: list[ChatMessage]
|
731
805
|
) -> list[ChatMessage]:
|
inspect_ai/model/_openai.py
CHANGED
@@ -43,10 +43,18 @@ from ._chat_message import (
|
|
43
43
|
from ._model_output import ModelUsage, StopReason, as_stop_reason
|
44
44
|
|
45
45
|
|
46
|
+
def is_o_series(name: str) -> bool:
|
47
|
+
return is_o1(name) or is_o3(name)
|
48
|
+
|
49
|
+
|
46
50
|
def is_o1(name: str) -> bool:
|
47
51
|
return name.startswith("o1")
|
48
52
|
|
49
53
|
|
54
|
+
def is_o3(name: str) -> bool:
|
55
|
+
return name.startswith("o3")
|
56
|
+
|
57
|
+
|
50
58
|
def is_o1_full(name: str) -> bool:
|
51
59
|
return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
|
52
60
|
|
@@ -55,10 +63,18 @@ def is_o1_mini(name: str) -> bool:
|
|
55
63
|
return name.startswith("o1-mini")
|
56
64
|
|
57
65
|
|
66
|
+
def is_o3_mini(name: str) -> bool:
|
67
|
+
return name.startswith("o3-mini")
|
68
|
+
|
69
|
+
|
58
70
|
def is_o1_preview(name: str) -> bool:
|
59
71
|
return name.startswith("o1-preview")
|
60
72
|
|
61
73
|
|
74
|
+
def is_gpt(name: str) -> bool:
|
75
|
+
return name.startswith("gpt")
|
76
|
+
|
77
|
+
|
62
78
|
def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
|
63
79
|
return ChatCompletionMessageToolCall(
|
64
80
|
type="function",
|
@@ -296,6 +312,14 @@ def chat_messages_from_openai(
|
|
296
312
|
else:
|
297
313
|
content = [content_from_openai(c) for c in asst_content]
|
298
314
|
|
315
|
+
# resolve reasoning (OpenAI doesn't support this; however, OpenAI-compatible
|
316
|
+
# interfaces e.g. DeepSeek do include this field so we pluck it out)
|
317
|
+
reasoning = message.get("reasoning_content", None) or message.get(
|
318
|
+
"reasoning", None
|
319
|
+
)
|
320
|
+
if reasoning is not None:
|
321
|
+
reasoning = str(reasoning)
|
322
|
+
|
299
323
|
# return message
|
300
324
|
if "tool_calls" in message:
|
301
325
|
tool_calls: list[ToolCall] = []
|
@@ -306,7 +330,11 @@ def chat_messages_from_openai(
|
|
306
330
|
else:
|
307
331
|
tool_calls = []
|
308
332
|
chat_messages.append(
|
309
|
-
ChatMessageAssistant(
|
333
|
+
ChatMessageAssistant(
|
334
|
+
content=content,
|
335
|
+
tool_calls=tool_calls or None,
|
336
|
+
reasoning=reasoning,
|
337
|
+
)
|
310
338
|
)
|
311
339
|
elif message["role"] == "tool":
|
312
340
|
tool_content = message.get("content", None) or ""
|
@@ -357,10 +385,14 @@ def chat_message_assistant_from_openai(
|
|
357
385
|
message: ChatCompletionMessage, tools: list[ToolInfo]
|
358
386
|
) -> ChatMessageAssistant:
|
359
387
|
refusal = getattr(message, "refusal", None)
|
388
|
+
reasoning = getattr(message, "reasoning_content", None) or getattr(
|
389
|
+
message, "reasoning", None
|
390
|
+
)
|
360
391
|
return ChatMessageAssistant(
|
361
392
|
content=refusal or message.content or "",
|
362
393
|
source="generate",
|
363
394
|
tool_calls=chat_tool_calls_from_openai(message, tools),
|
395
|
+
reasoning=reasoning,
|
364
396
|
)
|
365
397
|
|
366
398
|
|
@@ -12,6 +12,7 @@ else:
|
|
12
12
|
|
13
13
|
from anthropic import (
|
14
14
|
APIConnectionError,
|
15
|
+
APIStatusError,
|
15
16
|
AsyncAnthropic,
|
16
17
|
AsyncAnthropicBedrock,
|
17
18
|
AsyncAnthropicVertex,
|
@@ -218,6 +219,17 @@ class AnthropicAPI(ModelAPI):
|
|
218
219
|
except BadRequestError as ex:
|
219
220
|
return self.handle_bad_request(ex), model_call()
|
220
221
|
|
222
|
+
except APIStatusError as ex:
|
223
|
+
if ex.status_code == 413:
|
224
|
+
return ModelOutput.from_content(
|
225
|
+
model=self.model_name,
|
226
|
+
content=ex.message,
|
227
|
+
stop_reason="model_length",
|
228
|
+
error=ex.message,
|
229
|
+
), model_call()
|
230
|
+
else:
|
231
|
+
raise ex
|
232
|
+
|
221
233
|
def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
|
222
234
|
params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
|
223
235
|
if config.temperature is not None:
|
@@ -294,8 +294,12 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
|
|
294
294
|
|
295
295
|
|
296
296
|
def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
|
297
|
+
reasoning = getattr(message, "reasoning", None)
|
298
|
+
if reasoning is not None:
|
299
|
+
reasoning = str(reasoning)
|
297
300
|
return ChatMessageAssistant(
|
298
301
|
content=message.content or "",
|
299
302
|
source="generate",
|
300
303
|
tool_calls=chat_tool_calls(message, tools),
|
304
|
+
reasoning=reasoning,
|
301
305
|
)
|
@@ -35,10 +35,12 @@ from .._model_output import (
|
|
35
35
|
StopReason,
|
36
36
|
)
|
37
37
|
from .._openai import (
|
38
|
-
|
38
|
+
is_gpt,
|
39
39
|
is_o1_full,
|
40
40
|
is_o1_mini,
|
41
41
|
is_o1_preview,
|
42
|
+
is_o3,
|
43
|
+
is_o_series,
|
42
44
|
openai_chat_messages,
|
43
45
|
openai_chat_tool_choice,
|
44
46
|
openai_chat_tools,
|
@@ -140,8 +142,8 @@ class OpenAIAPI(ModelAPI):
|
|
140
142
|
def is_azure(self) -> bool:
|
141
143
|
return self.service == "azure"
|
142
144
|
|
143
|
-
def
|
144
|
-
return
|
145
|
+
def is_o_series(self) -> bool:
|
146
|
+
return is_o_series(self.model_name)
|
145
147
|
|
146
148
|
def is_o1_full(self) -> bool:
|
147
149
|
return is_o1_full(self.model_name)
|
@@ -149,9 +151,15 @@ class OpenAIAPI(ModelAPI):
|
|
149
151
|
def is_o1_mini(self) -> bool:
|
150
152
|
return is_o1_mini(self.model_name)
|
151
153
|
|
154
|
+
def is_o3(self) -> bool:
|
155
|
+
return is_o3(self.model_name)
|
156
|
+
|
152
157
|
def is_o1_preview(self) -> bool:
|
153
158
|
return is_o1_preview(self.model_name)
|
154
159
|
|
160
|
+
def is_gpt(self) -> bool:
|
161
|
+
return is_gpt(self.model_name)
|
162
|
+
|
155
163
|
async def generate(
|
156
164
|
self,
|
157
165
|
input: list[ChatMessage],
|
@@ -258,7 +266,7 @@ class OpenAIAPI(ModelAPI):
|
|
258
266
|
model=self.model_name,
|
259
267
|
)
|
260
268
|
if config.max_tokens is not None:
|
261
|
-
if self.
|
269
|
+
if self.is_o_series():
|
262
270
|
params["max_completion_tokens"] = config.max_tokens
|
263
271
|
else:
|
264
272
|
params["max_tokens"] = config.max_tokens
|
@@ -273,10 +281,10 @@ class OpenAIAPI(ModelAPI):
|
|
273
281
|
if config.seed is not None:
|
274
282
|
params["seed"] = config.seed
|
275
283
|
if config.temperature is not None:
|
276
|
-
if self.
|
284
|
+
if self.is_o_series():
|
277
285
|
warn_once(
|
278
286
|
logger,
|
279
|
-
"
|
287
|
+
"o series models do not support the 'temperature' parameter (temperature is always 1).",
|
280
288
|
)
|
281
289
|
else:
|
282
290
|
params["temperature"] = config.temperature
|
@@ -293,9 +301,9 @@ class OpenAIAPI(ModelAPI):
|
|
293
301
|
params["logprobs"] = config.logprobs
|
294
302
|
if config.top_logprobs is not None:
|
295
303
|
params["top_logprobs"] = config.top_logprobs
|
296
|
-
if tools and config.parallel_tool_calls is not None and not self.
|
304
|
+
if tools and config.parallel_tool_calls is not None and not self.is_o_series():
|
297
305
|
params["parallel_tool_calls"] = config.parallel_tool_calls
|
298
|
-
if config.reasoning_effort is not None and self.
|
306
|
+
if config.reasoning_effort is not None and not self.is_gpt():
|
299
307
|
params["reasoning_effort"] = config.reasoning_effort
|
300
308
|
|
301
309
|
return params
|
@@ -312,7 +320,11 @@ class OpenAIAPI(ModelAPI):
|
|
312
320
|
stop_reason: StopReason | None = None
|
313
321
|
if e.code == "context_length_exceeded":
|
314
322
|
stop_reason = "model_length"
|
315
|
-
elif
|
323
|
+
elif (
|
324
|
+
e.code == "invalid_prompt" # seems to happen for o1/o3
|
325
|
+
or e.code == "content_policy_violation" # seems to happen for vision
|
326
|
+
or e.code == "content_filter" # seems to happen on azure
|
327
|
+
):
|
316
328
|
stop_reason = "content_filter"
|
317
329
|
|
318
330
|
if stop_reason:
|