inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/trace.py +8 -0
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/eval.py +4 -4
- inspect_ai/_eval/evalset.py +6 -6
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/run.py +13 -8
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +92 -29
- inspect_ai/_view/www/dist/assets/index.css +16636 -14674
- inspect_ai/_view/www/dist/assets/index.js +43585 -36122
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +36 -19
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +355 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +199 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +99 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +200 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -7
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +309 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +326 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +175 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +46 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +143 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +131 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +145 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +86 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +53 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +107 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +363 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +173 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +182 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +108 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +91 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +38 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +190 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +274 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/{StateEventView.mjs → StateEventView.tsx} +148 -110
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +7 -4
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +22 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +95 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +160 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +113 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +67 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +156 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +222 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +41 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +61 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +80 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/dataset/_dataset.py +36 -0
- inspect_ai/dataset/_sources/csv.py +8 -0
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +11 -1
- inspect_ai/dataset/_sources/json.py +8 -0
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_message.py +1 -1
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_chat_message.py +27 -0
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +74 -0
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -0
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +12 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +34 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/solver/_basic_agent.py +19 -9
- inspect_ai/solver/_multiple_choice.py +24 -9
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +2 -5
- inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +4 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +8 -2
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_sandbox/docker/docker.py +32 -85
- inspect_ai/util/_sandbox/self_check.py +124 -16
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/METADATA +2 -1
- inspect_ai-0.3.63.dist-info/RECORD +618 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -418
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.61.dist-info/RECORD +0 -476
- /inspect_ai/{tool/beta/_computer/_resources/tool/__init__.py → _view/www/src/components/MorePopOver.css} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _view/www/src/workspace/tabs/InfoTab.module.css} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_common.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/top_level.txt +0 -0
@@ -27,11 +27,7 @@ from inspect_ai.tool import ToolCall, ToolInfo
|
|
27
27
|
from .._call_tools import parse_tool_call, tool_parse_error_message
|
28
28
|
from .._model_call import ModelCall
|
29
29
|
from .._model_output import ModelUsage, StopReason, as_stop_reason
|
30
|
-
from .._providers.util import (
|
31
|
-
ChatAPIHandler,
|
32
|
-
ChatAPIMessage,
|
33
|
-
chat_api_input,
|
34
|
-
)
|
30
|
+
from .._providers.util import ChatAPIHandler, ChatAPIMessage, chat_api_input
|
35
31
|
|
36
32
|
logger = getLogger(__name__)
|
37
33
|
|
@@ -85,6 +81,8 @@ def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput | Excepti
|
|
85
81
|
stop_reason: StopReason | None = "model_length"
|
86
82
|
elif ex.code == "invalid_prompt":
|
87
83
|
stop_reason = "content_filter"
|
84
|
+
else:
|
85
|
+
stop_reason = None
|
88
86
|
|
89
87
|
if stop_reason:
|
90
88
|
return ModelOutput.from_content(
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import os
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
from typing_extensions import override
|
5
|
+
|
6
|
+
from inspect_ai._util.error import PrerequisiteError
|
7
|
+
from inspect_ai.model._providers.util import model_base_url
|
8
|
+
from inspect_ai.model._providers.util.util import environment_prerequisite_error
|
9
|
+
|
10
|
+
from .._generate_config import GenerateConfig
|
11
|
+
from .openai import OpenAIAPI
|
12
|
+
|
13
|
+
OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
|
14
|
+
|
15
|
+
|
16
|
+
class OpenRouterAPI(OpenAIAPI):
|
17
|
+
def __init__(
|
18
|
+
self,
|
19
|
+
model_name: str,
|
20
|
+
base_url: str | None = None,
|
21
|
+
api_key: str | None = None,
|
22
|
+
config: GenerateConfig = GenerateConfig(),
|
23
|
+
**model_args: Any,
|
24
|
+
) -> None:
|
25
|
+
# api_key
|
26
|
+
if not api_key:
|
27
|
+
api_key = os.environ.get(OPENROUTER_API_KEY, None)
|
28
|
+
if not api_key:
|
29
|
+
raise environment_prerequisite_error("OpenRouter", OPENROUTER_API_KEY)
|
30
|
+
|
31
|
+
# base_url
|
32
|
+
base_url = model_base_url(base_url, "OPENROUTER_BASE_URL")
|
33
|
+
base_url = base_url if base_url else "https://openrouter.ai/api/v1"
|
34
|
+
|
35
|
+
# collect known model args that we forward to generate
|
36
|
+
def collect_model_arg(name: str) -> Any | None:
|
37
|
+
nonlocal model_args
|
38
|
+
value = model_args.get(name, None)
|
39
|
+
if value is not None:
|
40
|
+
model_args.pop(name)
|
41
|
+
return value
|
42
|
+
|
43
|
+
# models arg
|
44
|
+
self.models = collect_model_arg("models")
|
45
|
+
if self.models is not None:
|
46
|
+
if not isinstance(self.models, list):
|
47
|
+
raise PrerequisiteError("models must be a list of strings")
|
48
|
+
|
49
|
+
# providers arg
|
50
|
+
self.provider = collect_model_arg("provider")
|
51
|
+
if self.provider is not None:
|
52
|
+
if not isinstance(self.provider, dict):
|
53
|
+
raise PrerequisiteError("provider must be a dict")
|
54
|
+
|
55
|
+
# transforms arg
|
56
|
+
self.transforms = collect_model_arg("transforms")
|
57
|
+
if self.transforms is not None:
|
58
|
+
if not isinstance(self.transforms, list):
|
59
|
+
raise PrerequisiteError("transforms must be a list of strings")
|
60
|
+
|
61
|
+
# call super
|
62
|
+
super().__init__(
|
63
|
+
model_name=model_name,
|
64
|
+
base_url=base_url,
|
65
|
+
api_key=api_key,
|
66
|
+
config=config,
|
67
|
+
**model_args,
|
68
|
+
)
|
69
|
+
|
70
|
+
@override
|
71
|
+
def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, Any]:
|
72
|
+
# default params
|
73
|
+
params = super().completion_params(config, tools)
|
74
|
+
|
75
|
+
# pass args if specifed
|
76
|
+
EXTRA_BODY = "extra_body"
|
77
|
+
if self.models or self.provider or self.transforms:
|
78
|
+
params[EXTRA_BODY] = params.get(EXTRA_BODY, {})
|
79
|
+
if self.models:
|
80
|
+
params[EXTRA_BODY]["models"] = self.models
|
81
|
+
if self.provider:
|
82
|
+
params[EXTRA_BODY]["provider"] = self.provider
|
83
|
+
if self.transforms:
|
84
|
+
params[EXTRA_BODY]["tranforms"] = self.transforms
|
85
|
+
|
86
|
+
return params
|
@@ -16,7 +16,7 @@ from .._registry import modelapi
|
|
16
16
|
def groq() -> type[ModelAPI]:
|
17
17
|
FEATURE = "Groq API"
|
18
18
|
PACKAGE = "groq"
|
19
|
-
MIN_VERSION = "0.
|
19
|
+
MIN_VERSION = "0.16.0"
|
20
20
|
|
21
21
|
# verify we have the package
|
22
22
|
try:
|
@@ -198,6 +198,17 @@ def ollama() -> type[ModelAPI]:
|
|
198
198
|
return OllamaAPI
|
199
199
|
|
200
200
|
|
201
|
+
@modelapi(name="openrouter")
def openrouter() -> type[ModelAPI]:
    """Return the ModelAPI class registered under the name "openrouter".

    Returns:
        The `OpenRouterAPI` class (not an instance).
    """
    # presumably checks that a usable openai client package is installed;
    # the label is what the check reports on failure -- verify against helper
    validate_openai_client("OpenRouter API")

    # import only after validation so a missing dependency is reported by
    # the check above rather than by this import
    from .openrouter import OpenRouterAPI as api_class

    return api_class
|
210
|
+
|
211
|
+
|
201
212
|
@modelapi(name="llama-cpp-python")
|
202
213
|
def llama_cpp_python() -> type[ModelAPI]:
|
203
214
|
# validate
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import re
|
2
|
+
from typing import NamedTuple
|
3
|
+
|
4
|
+
|
5
|
+
class ContentWithReasoning(NamedTuple):
|
6
|
+
content: str
|
7
|
+
reasoning: str
|
8
|
+
|
9
|
+
|
10
|
+
def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
|
11
|
+
match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
|
12
|
+
if match:
|
13
|
+
return ContentWithReasoning(
|
14
|
+
content=match.group(2).strip(), reasoning=match.group(1).strip()
|
15
|
+
)
|
16
|
+
else:
|
17
|
+
return None
|
inspect_ai/scorer/_answer.py
CHANGED
@@ -8,7 +8,7 @@ from inspect_ai._util.pattern import (
|
|
8
8
|
)
|
9
9
|
|
10
10
|
from ._metrics import accuracy, stderr
|
11
|
-
from ._pattern import pattern
|
11
|
+
from ._pattern import pattern as make_pattern
|
12
12
|
from ._scorer import Scorer, scorer
|
13
13
|
|
14
14
|
|
@@ -33,7 +33,7 @@ class AnswerPattern(str, Enum):
|
|
33
33
|
|
34
34
|
|
35
35
|
@scorer(metrics=[accuracy(), stderr()])
|
36
|
-
def answer(
|
36
|
+
def answer(pattern: Literal["letter", "word", "line"]) -> Scorer:
|
37
37
|
"""Scorer for model output that preceded answers with ANSWER:.
|
38
38
|
|
39
39
|
Some solvers including multiple_choice solicit answers from
|
@@ -43,7 +43,7 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
|
|
43
43
|
Note that you must specify a `pattern` for the answer scorer.
|
44
44
|
|
45
45
|
Args:
|
46
|
-
|
46
|
+
pattern: (Literal["letter", "word", "line"]): Type of answer
|
47
47
|
to extract. "letter" is used with multiple choice and
|
48
48
|
extracts a single letter; "word" will extract the next
|
49
49
|
word (often used for yes/no answers); "line" will take
|
@@ -53,10 +53,10 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
|
|
53
53
|
with a separate line at the end.
|
54
54
|
|
55
55
|
"""
|
56
|
-
match
|
56
|
+
match pattern:
|
57
57
|
case "letter":
|
58
|
-
return
|
58
|
+
return make_pattern(AnswerPattern.LETTER)
|
59
59
|
case "word":
|
60
|
-
return
|
60
|
+
return make_pattern(AnswerPattern.WORD)
|
61
61
|
case "line":
|
62
|
-
return
|
62
|
+
return make_pattern(AnswerPattern.LINE)
|
@@ -12,7 +12,7 @@ from ._target import Target
|
|
12
12
|
|
13
13
|
@scorer(metrics=[mean(), stderr()])
|
14
14
|
def f1(
|
15
|
-
answer_fn: Callable[[str], str] | None = None,
|
15
|
+
answer_fn: Callable[[str], str] | None = None, stop_words: list[str] | None = None
|
16
16
|
) -> Scorer:
|
17
17
|
"""Scorer which produces an F1 score
|
18
18
|
|
@@ -26,7 +26,7 @@ def f1(
|
|
26
26
|
)
|
27
27
|
targets = target.target
|
28
28
|
|
29
|
-
f1_score = max_f1_score(answer, targets)
|
29
|
+
f1_score = max_f1_score(answer, targets, stop_words=stop_words)
|
30
30
|
return Score(
|
31
31
|
value=f1_score,
|
32
32
|
answer=answer,
|
@@ -53,12 +53,14 @@ def exact() -> Scorer:
|
|
53
53
|
return score
|
54
54
|
|
55
55
|
|
56
|
-
def max_f1_score(
|
56
|
+
def max_f1_score(
|
57
|
+
answer: str, targets: List[str], stop_words: list[str] | None = None
|
58
|
+
) -> float:
|
57
59
|
# Find the maximum F1 score for this answer
|
58
60
|
max_f1 = 0.0
|
59
61
|
for target in targets:
|
60
62
|
if target[0].strip():
|
61
|
-
f1_score = compute_f1(answer, target)
|
63
|
+
f1_score = compute_f1(answer, target, stop_words)
|
62
64
|
max_f1 = max(max_f1, f1_score)
|
63
65
|
return round(max_f1, 2)
|
64
66
|
|
@@ -75,18 +77,16 @@ def max_exact_score(answer: str, targets: List[str]) -> float:
|
|
75
77
|
return max_exact
|
76
78
|
|
77
79
|
|
78
|
-
def compute_f1(answer: str, target: str) -> float:
|
80
|
+
def compute_f1(answer: str, target: str, stop_words: list[str] | None = None) -> float:
    """Score a predicted answer against a single gold target.

    Both strings are reduced to word sets with `_to_words` (using the same
    stop words) and the two sets are then scored by `_f1`.

    Args:
        answer: Predicted answer text.
        target: Gold answer text.
        stop_words: Optional stop words, forwarded to `_to_words`.

    Returns:
        The value computed by `_f1` for the two word sets.
    """
    return _f1(
        answer_words=_to_words(answer, stop_words),
        target_words=_to_words(target, stop_words),
    )
|
84
86
|
|
85
87
|
|
86
|
-
def _to_words(
|
87
|
-
answer
|
88
|
-
) -> set[str]:
|
89
|
-
normalized = _normalize(answer)
|
88
|
+
def _to_words(answer: str, stop_words: list[str] | None = None) -> set[str]:
    """Return the distinct whitespace-separated words of the normalized text.

    Args:
        answer: Text to break into words.
        stop_words: Optional stop words, forwarded to `_normalize`.

    Returns:
        The set of words in the normalized text (duplicates collapse).
    """
    return set(_normalize(answer, stop_words).split())
|
92
92
|
|
@@ -147,16 +147,32 @@ def _tokenize(text: str) -> List[str]:
|
|
147
147
|
return re.split(" |-", text)
|
148
148
|
|
149
149
|
|
150
|
-
def _normalize(
|
150
|
+
def _normalize(text: str, stop_words: list[str] | None = None) -> str:
    """Normalize text to remove extraneous characters and words.

    The text is split with `_tokenize` and each token is passed through
    `_normalize_token`. Tokens that come out blank, or that match a stop
    word, are dropped; the rest are joined with single spaces.

    Args:
        text: Text to normalize.
        stop_words: Optional words to drop. They are run through
            `_normalize_token` too, so they are compared in the same form
            as the text's tokens.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word set once: membership tests are then constant-time,
    # and None simply yields an empty set (no separate None check needed).
    folded_stop_words = {_normalize_token(word) for word in stop_words or ()}

    tokens = [
        token
        for token in map(_normalize_token, _tokenize(text))
        if token.strip() and token not in folded_stop_words
    ]

    # re-join the tokens into a normalized string
    return " ".join(tokens).strip()
|
171
|
+
|
172
|
+
|
173
|
+
def _normalize_token(token: str) -> str:
    """Apply the per-token normalization steps in a fixed order.

    The token is case-folded, then passed in turn through `_remove_punc`,
    `_normalize_number`, `_remove_articles` and `_normalize_whitespace`.

    Args:
        token: A single token.

    Returns:
        The token after all steps have been applied.
    """
    folded = _remove_punc(token.casefold())
    return _normalize_whitespace(_remove_articles(_normalize_number(folded)))
|
inspect_ai/scorer/_common.py
CHANGED
@@ -25,19 +25,13 @@ def str_match_scorer(match: Callable[[str, str], tuple[str, bool]]) -> Scorer:
|
|
25
25
|
for value in target:
|
26
26
|
answer, matched = match(state.output.completion, value)
|
27
27
|
if matched:
|
28
|
-
explanation = (
|
29
|
-
state.output.completion
|
30
|
-
if state.output.completion != answer
|
31
|
-
else None
|
32
|
-
)
|
33
28
|
return Score(
|
34
29
|
value=CORRECT, answer=answer, explanation=state.output.completion
|
35
30
|
)
|
36
31
|
|
37
|
-
|
38
|
-
|
32
|
+
return Score(
|
33
|
+
value=INCORRECT, answer=answer, explanation=state.output.completion
|
39
34
|
)
|
40
|
-
return Score(value=INCORRECT, answer=answer, explanation=explanation)
|
41
35
|
|
42
36
|
return score
|
43
37
|
|
@@ -1,8 +1,9 @@
|
|
1
1
|
from logging import getLogger
|
2
|
-
from typing import Callable, cast
|
2
|
+
from typing import Awaitable, Callable, cast
|
3
3
|
|
4
4
|
from typing_extensions import TypedDict, Unpack
|
5
5
|
|
6
|
+
from inspect_ai._util._async import is_callable_coroutine
|
6
7
|
from inspect_ai.model._cache import CachePolicy
|
7
8
|
from inspect_ai.model._call_tools import call_tools
|
8
9
|
from inspect_ai.model._chat_message import ChatMessageTool, ChatMessageUser
|
@@ -58,7 +59,9 @@ def basic_agent(
|
|
58
59
|
max_tool_output: int | None = None,
|
59
60
|
score_value: ValueToFloat | None = None,
|
60
61
|
incorrect_message: str
|
61
|
-
| Callable[
|
62
|
+
| Callable[
|
63
|
+
[TaskState, list[Score]], str | Awaitable[str]
|
64
|
+
] = DEFAULT_INCORRECT_MESSAGE,
|
62
65
|
continue_message: str = DEFAULT_CONTINUE_MESSAGE,
|
63
66
|
submit_name: str = DEFAULT_SUBMIT_NAME,
|
64
67
|
submit_description: str = DEFAULT_SUBMIT_DESCRIPTION,
|
@@ -93,8 +96,9 @@ def basic_agent(
|
|
93
96
|
Defaults to max_tool_output from active GenerateConfig.
|
94
97
|
score_value (ValueToFloat): Function used to extract float from scores (defaults
|
95
98
|
to standard value_to_float())
|
96
|
-
incorrect_message (str | Callable[[TaskState, list[Score]], str]):
|
97
|
-
incorrect submission from the model. Alternatively,
|
99
|
+
incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
|
100
|
+
User message reply for an incorrect submission from the model. Alternatively,
|
101
|
+
a function which returns a message (function may optionally be async)
|
98
102
|
continue_message (str): User message to urge the model to continue when it
|
99
103
|
doesn't make a tool call.
|
100
104
|
submit_name (str): Name for tool used to make submissions
|
@@ -216,11 +220,17 @@ def basic_agent(
|
|
216
220
|
|
217
221
|
# otherwise notify the model that it was incorrect and continue
|
218
222
|
else:
|
219
|
-
|
220
|
-
incorrect_message(
|
221
|
-
|
222
|
-
|
223
|
-
)
|
223
|
+
if is_callable_coroutine(incorrect_message):
|
224
|
+
response_message: str = await incorrect_message(
|
225
|
+
state, answer_scores
|
226
|
+
) # type: ignore[misc,operator]
|
227
|
+
elif callable(incorrect_message):
|
228
|
+
response_message = cast(
|
229
|
+
str, incorrect_message(state, answer_scores)
|
230
|
+
)
|
231
|
+
else:
|
232
|
+
response_message = incorrect_message
|
233
|
+
|
224
234
|
state.messages.append(
|
225
235
|
ChatMessageUser(content=response_message)
|
226
236
|
)
|
@@ -1,13 +1,19 @@
|
|
1
|
+
import logging
|
1
2
|
import re
|
2
3
|
from enum import Enum
|
3
4
|
from random import Random
|
4
|
-
from typing import Match
|
5
|
+
from typing import Match, TypedDict
|
5
6
|
|
7
|
+
from typing_extensions import Unpack
|
8
|
+
|
9
|
+
from inspect_ai._util.logger import warn_once
|
6
10
|
from inspect_ai.util import resource
|
7
11
|
|
8
12
|
from ._solver import Generate, Solver, solver
|
9
13
|
from ._task_state import Choices, TaskState
|
10
14
|
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
11
17
|
SINGLE_ANSWER_TEMPLATE = r"""
|
12
18
|
Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
|
13
19
|
|
@@ -201,13 +207,17 @@ class MultipleChoiceTemplate(str, Enum):
|
|
201
207
|
MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
|
202
208
|
|
203
209
|
|
210
|
+
class DeprecatedArgs(TypedDict, total=False):
|
211
|
+
shuffle: bool | Random
|
212
|
+
|
213
|
+
|
204
214
|
@solver
|
205
215
|
def multiple_choice(
|
206
216
|
*,
|
207
217
|
template: str | None = None,
|
208
218
|
cot: bool = False,
|
209
219
|
multiple_correct: bool = False,
|
210
|
-
|
220
|
+
**kwargs: Unpack[DeprecatedArgs],
|
211
221
|
) -> Solver:
|
212
222
|
"""Multiple choice question solver.
|
213
223
|
|
@@ -223,10 +233,7 @@ def multiple_choice(
|
|
223
233
|
|
224
234
|
### Shuffling
|
225
235
|
|
226
|
-
|
227
|
-
after the model has been called, essentially rewriting history. It is
|
228
|
-
something to be aware of if writing custom scorers or solvers that interact
|
229
|
-
with this scorer.
|
236
|
+
You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
|
230
237
|
|
231
238
|
Args:
|
232
239
|
template (str | None): Template to use for the multiple choice question.
|
@@ -243,10 +250,18 @@ def multiple_choice(
|
|
243
250
|
squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
|
244
251
|
as `False` if there's exactly one correct answer from the choices
|
245
252
|
available. NOTE: this has no effect if you provide a custom template.
|
246
|
-
|
247
|
-
in the multiple. Passing a `Random` instance will use that for shuffling,
|
248
|
-
if `True` a new `Random` instance will be created.
|
253
|
+
**kwargs (Any): Deprecated arguments for backward compatibility.
|
249
254
|
"""
|
255
|
+
shuffle: bool | Random = False
|
256
|
+
if "shuffle" in kwargs:
|
257
|
+
shuffle = kwargs["shuffle"]
|
258
|
+
|
259
|
+
if shuffle:
|
260
|
+
warn_once(
|
261
|
+
logger,
|
262
|
+
"The multiple choice shuffle parameter is deprecated. Please shuffle choices at the time your dataset is read by using the shuffle_choices method/parameter of the datasets API.",
|
263
|
+
)
|
264
|
+
|
250
265
|
if template and not valid_template(template):
|
251
266
|
raise ValueError(
|
252
267
|
"The template must contain '{question}' and '{choices}' placeholders for string substitution."
|
inspect_ai/tool/__init__.py
CHANGED
@@ -21,12 +21,14 @@ from ._tool_def import ToolDef
|
|
21
21
|
from ._tool_info import ToolInfo
|
22
22
|
from ._tool_params import ToolParam, ToolParams
|
23
23
|
from ._tool_with import tool_with
|
24
|
+
from ._tools._computer import computer
|
24
25
|
from ._tools._execute import bash, python
|
25
26
|
from ._tools._web_browser import web_browser
|
26
27
|
from ._tools._web_search import web_search
|
27
28
|
|
28
29
|
__all__ = [
|
29
30
|
"bash",
|
31
|
+
"computer",
|
30
32
|
"python",
|
31
33
|
"web_browser",
|
32
34
|
"web_search",
|
@@ -2,10 +2,7 @@ from typing import Awaitable, Callable
|
|
2
2
|
|
3
3
|
from inspect_ai._util.content import Content, ContentImage, ContentText
|
4
4
|
from inspect_ai.tool import Tool, ToolResult, tool
|
5
|
-
from inspect_ai.tool._tool import
|
6
|
-
TOOL_INIT_MODEL_INPUT,
|
7
|
-
ToolParsingError,
|
8
|
-
)
|
5
|
+
from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
|
9
6
|
from inspect_ai.tool._tool_call import ToolCallModelInput
|
10
7
|
|
11
8
|
from . import _common as common
|
@@ -84,7 +81,7 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
|
|
84
81
|
if coordinate is not None:
|
85
82
|
raise ToolParsingError(f"coordinate is not accepted for {action}")
|
86
83
|
if not isinstance(text, str):
|
87
|
-
raise ToolParsingError(
|
84
|
+
raise ToolParsingError(f"{text} must be a string")
|
88
85
|
|
89
86
|
if action == "key":
|
90
87
|
return await common.press_key(text, timeout=timeout)
|
@@ -60,6 +60,10 @@ RUN apt-get install -y \
|
|
60
60
|
# configure noVNC
|
61
61
|
RUN ln -s /usr/share/novnc/vnc.html /usr/share/novnc/index.html
|
62
62
|
|
63
|
+
# configure python alias
|
64
|
+
RUN ln -s /usr/bin/python3 /usr/bin/python
|
65
|
+
|
66
|
+
|
63
67
|
# We copy requirements.txt by itself so that changes to the scripts will be in a later layer
|
64
68
|
# and we only pip install if requirements.txt changes
|
65
69
|
COPY tool/requirements.txt /opt/inspect/tool/requirements.txt
|
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
|
3
|
+
<channel name="xfce4-panel" version="1.0">
|
4
|
+
<property name="configver" type="int" value="2"/>
|
5
|
+
<property name="panels" type="array">
|
6
|
+
<value type="int" value="1"/>
|
7
|
+
<property name="dark-mode" type="bool" value="true"/>
|
8
|
+
<property name="panel-1" type="empty">
|
9
|
+
<property name="position" type="string" value="p=6;x=0;y=0"/>
|
10
|
+
<property name="length" type="uint" value="100"/>
|
11
|
+
<property name="position-locked" type="bool" value="true"/>
|
12
|
+
<property name="icon-size" type="uint" value="16"/>
|
13
|
+
<property name="size" type="uint" value="26"/>
|
14
|
+
<property name="plugin-ids" type="array">
|
15
|
+
<value type="int" value="1"/>
|
16
|
+
<value type="int" value="2"/>
|
17
|
+
<value type="int" value="3"/>
|
18
|
+
<value type="int" value="4"/>
|
19
|
+
<value type="int" value="5"/>
|
20
|
+
<value type="int" value="6"/>
|
21
|
+
<value type="int" value="8"/>
|
22
|
+
<value type="int" value="10"/>
|
23
|
+
<value type="int" value="11"/>
|
24
|
+
<value type="int" value="12"/>
|
25
|
+
<value type="int" value="13"/>
|
26
|
+
<value type="int" value="14"/>
|
27
|
+
</property>
|
28
|
+
</property>
|
29
|
+
</property>
|
30
|
+
<property name="plugins" type="empty">
|
31
|
+
<property name="plugin-1" type="string" value="applicationsmenu"/>
|
32
|
+
<property name="plugin-2" type="string" value="tasklist">
|
33
|
+
<property name="grouping" type="uint" value="1"/>
|
34
|
+
</property>
|
35
|
+
<property name="plugin-3" type="string" value="separator">
|
36
|
+
<property name="expand" type="bool" value="true"/>
|
37
|
+
<property name="style" type="uint" value="0"/>
|
38
|
+
</property>
|
39
|
+
<property name="plugin-4" type="string" value="pager"/>
|
40
|
+
<property name="plugin-5" type="string" value="separator">
|
41
|
+
<property name="style" type="uint" value="0"/>
|
42
|
+
</property>
|
43
|
+
<property name="plugin-6" type="string" value="systray">
|
44
|
+
<property name="square-icons" type="bool" value="true"/>
|
45
|
+
</property>
|
46
|
+
<property name="plugin-8" type="string" value="pulseaudio">
|
47
|
+
<property name="enable-keyboard-shortcuts" type="bool" value="true"/>
|
48
|
+
<property name="show-notifications" type="bool" value="true"/>
|
49
|
+
</property>
|
50
|
+
<property name="plugin-9" type="string" value="power-manager-plugin"/>
|
51
|
+
<property name="plugin-10" type="string" value="notification-plugin"/>
|
52
|
+
<property name="plugin-11" type="string" value="separator">
|
53
|
+
<property name="style" type="uint" value="0"/>
|
54
|
+
</property>
|
55
|
+
<property name="plugin-12" type="string" value="clock"/>
|
56
|
+
<property name="plugin-13" type="string" value="separator">
|
57
|
+
<property name="style" type="uint" value="0"/>
|
58
|
+
</property>
|
59
|
+
<property name="plugin-14" type="string" value="actions"/>
|
60
|
+
</property>
|
61
|
+
</channel>
|
File without changes
|
@@ -138,7 +138,7 @@ class X11Client:
|
|
138
138
|
if coordinate is not None:
|
139
139
|
raise ToolError(f"coordinate is not accepted for {action}")
|
140
140
|
if not isinstance(text, str):
|
141
|
-
raise ToolError(
|
141
|
+
raise ToolError(f"{text} must be a string")
|
142
142
|
|
143
143
|
if action == "key":
|
144
144
|
return await self.shell(
|
File without changes
|
@@ -74,8 +74,14 @@ def python(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
74
74
|
"""
|
75
75
|
Use the python function to execute Python code.
|
76
76
|
|
77
|
-
The
|
78
|
-
|
77
|
+
The Python tool executes single-run Python scripts. Important notes:
|
78
|
+
1. Each execution is independent - no state is preserved between runs
|
79
|
+
2. You must explicitly use print() statements to see any output
|
80
|
+
3. Simply writing expressions (like in notebooks) will not display results
|
81
|
+
4. The script cannot accept interactive input during execution
|
82
|
+
5. Return statements alone won't produce visible output
|
83
|
+
6. All variables and imports are cleared between executions
|
84
|
+
7. Standard output (via print()) is the only way to see results
|
79
85
|
|
80
86
|
Args:
|
81
87
|
code (str): The python code to execute.
|
inspect_ai/tool/beta.py
ADDED