inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/model/_model.py
CHANGED
@@ -149,7 +149,11 @@ class ModelAPI(abc.ABC):
|
|
149
149
|
return "default"
|
150
150
|
|
151
151
|
def is_rate_limit(self, ex: BaseException) -> bool:
|
152
|
-
"""Is this exception a rate limit error.
|
152
|
+
"""Is this exception a rate limit error.
|
153
|
+
|
154
|
+
Args:
|
155
|
+
ex: Exception to check for rate limit.
|
156
|
+
"""
|
153
157
|
return False
|
154
158
|
|
155
159
|
def collapse_user_messages(self) -> bool:
|
@@ -176,12 +180,18 @@ class ModelAPI(abc.ABC):
|
|
176
180
|
class Model:
|
177
181
|
"""Model interface."""
|
178
182
|
|
183
|
+
api: ModelAPI
|
184
|
+
"""Model API."""
|
185
|
+
|
186
|
+
config: GenerateConfig
|
187
|
+
"""Generation config."""
|
188
|
+
|
179
189
|
def __init__(self, api: ModelAPI, config: GenerateConfig) -> None:
|
180
190
|
"""Create a model.
|
181
191
|
|
182
192
|
Args:
|
183
|
-
api
|
184
|
-
config
|
193
|
+
api: Model API provider.
|
194
|
+
config: Model configuration.
|
185
195
|
"""
|
186
196
|
self.api = api
|
187
197
|
self.config = config
|
@@ -212,16 +222,12 @@ class Model:
|
|
212
222
|
"""Generate output from the model.
|
213
223
|
|
214
224
|
Args:
|
215
|
-
input (str
|
216
|
-
input (if a `str` is passed it is converted
|
225
|
+
input: Chat message input (if a `str` is passed it is converted
|
217
226
|
to a `ChatMessageUser`).
|
218
|
-
tools
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
cache (bool | CachePolicy): Caching behavior for
|
223
|
-
generate responses (defaults to no caching).
|
224
|
-
config (GenerateConfig): Model configuration.
|
227
|
+
tools: Tools available for the model to call.
|
228
|
+
tool_choice: Directives to the model as to which tools to prefer.
|
229
|
+
config: Model configuration.
|
230
|
+
cache: Caching behavior for generate responses (defaults to no caching).
|
225
231
|
|
226
232
|
Returns:
|
227
233
|
ModelOutput
|
@@ -517,7 +523,8 @@ class Model:
|
|
517
523
|
) -> None:
|
518
524
|
# trace
|
519
525
|
if isinstance(result, ModelOutput):
|
520
|
-
|
526
|
+
if result.choices:
|
527
|
+
conversation_assistant_message(input, result.choices[0].message)
|
521
528
|
event.output = result
|
522
529
|
else:
|
523
530
|
conversation_assistant_error(result)
|
@@ -550,7 +557,7 @@ class ModelName:
|
|
550
557
|
"""Create a ModelName.
|
551
558
|
|
552
559
|
Args:
|
553
|
-
model:
|
560
|
+
model: Model to create name for.
|
554
561
|
"""
|
555
562
|
if isinstance(model, str):
|
556
563
|
(api, name) = self._parse_model(model)
|
@@ -596,16 +603,16 @@ def get_model(
|
|
596
603
|
"""Get an instance of a model.
|
597
604
|
|
598
605
|
Args:
|
599
|
-
model
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
config
|
605
|
-
base_url
|
606
|
-
api_key
|
607
|
-
**model_args
|
608
|
-
|
606
|
+
model: Model specification.
|
607
|
+
If `Model` is passed it is returned unmodified,
|
608
|
+
if `None` is passed then the model currently being
|
609
|
+
evaluated is returned (or if there is no evaluation
|
610
|
+
then the model referred to by `INSPECT_EVAL_MODEL`).
|
611
|
+
config: Configuration for model.
|
612
|
+
base_url: Optional. Alternate base URL for model.
|
613
|
+
api_key: Optional. API key for model.
|
614
|
+
**model_args: Additional args to
|
615
|
+
pass to model constructor.
|
609
616
|
|
610
617
|
Returns:
|
611
618
|
Model instance.
|
@@ -9,6 +9,8 @@ from ._chat_message import ChatMessageAssistant
|
|
9
9
|
|
10
10
|
|
11
11
|
class ModelUsage(BaseModel):
|
12
|
+
"""Token usage for completion."""
|
13
|
+
|
12
14
|
input_tokens: int = Field(default=0)
|
13
15
|
"""Total input tokens used."""
|
14
16
|
|
@@ -73,6 +75,8 @@ class Logprobs(BaseModel):
|
|
73
75
|
|
74
76
|
|
75
77
|
class ChatCompletionChoice(BaseModel):
|
78
|
+
"""Choice generated for completion."""
|
79
|
+
|
76
80
|
message: ChatMessageAssistant
|
77
81
|
"""Assistant message."""
|
78
82
|
|
@@ -96,6 +100,8 @@ class ChatCompletionChoice(BaseModel):
|
|
96
100
|
|
97
101
|
|
98
102
|
class ModelOutput(BaseModel):
|
103
|
+
"""Output from model generation."""
|
104
|
+
|
99
105
|
model: str = Field(default_factory=str)
|
100
106
|
"""Model used for generation."""
|
101
107
|
|
@@ -155,7 +161,14 @@ class ModelOutput(BaseModel):
|
|
155
161
|
stop_reason: StopReason = "stop",
|
156
162
|
error: str | None = None,
|
157
163
|
) -> "ModelOutput":
|
158
|
-
"""
|
164
|
+
"""Create ModelOutput from simple text content.
|
165
|
+
|
166
|
+
Args:
|
167
|
+
model: Model name.
|
168
|
+
content: Text content from generation.
|
169
|
+
stop_reason: Stop reason for generation.
|
170
|
+
error: Error message.
|
171
|
+
"""
|
159
172
|
return ModelOutput(
|
160
173
|
model=model,
|
161
174
|
choices=[
|
inspect_ai/model/_openai.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import re
|
2
3
|
from typing import Literal
|
3
4
|
|
4
5
|
from openai.types.chat import (
|
@@ -44,29 +45,13 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
|
|
44
45
|
|
45
46
|
|
46
47
|
def is_o_series(name: str) -> bool:
|
47
|
-
return
|
48
|
-
|
49
|
-
|
50
|
-
def is_o1(name: str) -> bool:
|
51
|
-
return name.startswith("o1")
|
52
|
-
|
53
|
-
|
54
|
-
def is_o3(name: str) -> bool:
|
55
|
-
return name.startswith("o3")
|
56
|
-
|
57
|
-
|
58
|
-
def is_o1_full(name: str) -> bool:
|
59
|
-
return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
|
48
|
+
return bool(re.match(r"^o\d+", name))
|
60
49
|
|
61
50
|
|
62
51
|
def is_o1_mini(name: str) -> bool:
|
63
52
|
return name.startswith("o1-mini")
|
64
53
|
|
65
54
|
|
66
|
-
def is_o3_mini(name: str) -> bool:
|
67
|
-
return name.startswith("o3-mini")
|
68
|
-
|
69
|
-
|
70
55
|
def is_o1_preview(name: str) -> bool:
|
71
56
|
return name.startswith("o1-preview")
|
72
57
|
|
@@ -132,10 +117,17 @@ async def openai_chat_message(
|
|
132
117
|
message: ChatMessage, model: str
|
133
118
|
) -> ChatCompletionMessageParam:
|
134
119
|
if message.role == "system":
|
135
|
-
|
120
|
+
# o1-mini does not support developer or system messages
|
121
|
+
# (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
|
122
|
+
if is_o1_mini(model):
|
123
|
+
return ChatCompletionUserMessageParam(role="user", content=message.text)
|
124
|
+
# other o-series models use 'developer' rather than 'system' messages
|
125
|
+
# https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
|
126
|
+
elif is_o_series(model):
|
136
127
|
return ChatCompletionDeveloperMessageParam(
|
137
128
|
role="developer", content=message.text
|
138
129
|
)
|
130
|
+
# gpt models use standard 'system' messages
|
139
131
|
else:
|
140
132
|
return ChatCompletionSystemMessageParam(
|
141
133
|
role=message.role, content=message.text
|
@@ -216,6 +216,9 @@ class AnthropicAPI(ModelAPI):
|
|
216
216
|
# return output and call
|
217
217
|
return output, model_call()
|
218
218
|
|
219
|
+
except BadRequestError as ex:
|
220
|
+
return self.handle_bad_request(ex), model_call()
|
221
|
+
|
219
222
|
except APIStatusError as ex:
|
220
223
|
if ex.status_code == 413:
|
221
224
|
return ModelOutput.from_content(
|
@@ -227,9 +230,6 @@ class AnthropicAPI(ModelAPI):
|
|
227
230
|
else:
|
228
231
|
raise ex
|
229
232
|
|
230
|
-
except BadRequestError as ex:
|
231
|
-
return self.handle_bad_request(ex), model_call()
|
232
|
-
|
233
233
|
def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
|
234
234
|
params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
|
235
235
|
if config.temperature is not None:
|
@@ -5,7 +5,7 @@ import json
|
|
5
5
|
from copy import copy
|
6
6
|
from io import BytesIO
|
7
7
|
from logging import getLogger
|
8
|
-
from typing import Any, cast
|
8
|
+
from typing import Any, MutableSequence, cast
|
9
9
|
|
10
10
|
import proto # type: ignore
|
11
11
|
from google.ai.generativelanguage import (
|
@@ -553,11 +553,15 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
|
|
553
553
|
|
554
554
|
|
555
555
|
def completion_choices_from_candidates(
|
556
|
-
candidates:
|
556
|
+
candidates: MutableSequence[Candidate],
|
557
557
|
) -> list[ChatCompletionChoice]:
|
558
|
-
|
559
|
-
|
560
|
-
|
558
|
+
if candidates:
|
559
|
+
candidates_list = sorted(candidates, key=lambda c: c.index)
|
560
|
+
return [
|
561
|
+
completion_choice_from_candidate(candidate) for candidate in candidates_list
|
562
|
+
]
|
563
|
+
else:
|
564
|
+
return []
|
561
565
|
|
562
566
|
|
563
567
|
# google doesn't export FinishReason (it's in a sub-namespace with a beta
|
@@ -36,10 +36,8 @@ from .._model_output import (
|
|
36
36
|
)
|
37
37
|
from .._openai import (
|
38
38
|
is_gpt,
|
39
|
-
is_o1_full,
|
40
39
|
is_o1_mini,
|
41
40
|
is_o1_preview,
|
42
|
-
is_o3,
|
43
41
|
is_o_series,
|
44
42
|
openai_chat_messages,
|
45
43
|
openai_chat_tool_choice,
|
@@ -145,15 +143,9 @@ class OpenAIAPI(ModelAPI):
|
|
145
143
|
def is_o_series(self) -> bool:
|
146
144
|
return is_o_series(self.model_name)
|
147
145
|
|
148
|
-
def is_o1_full(self) -> bool:
|
149
|
-
return is_o1_full(self.model_name)
|
150
|
-
|
151
146
|
def is_o1_mini(self) -> bool:
|
152
147
|
return is_o1_mini(self.model_name)
|
153
148
|
|
154
|
-
def is_o3(self) -> bool:
|
155
|
-
return is_o3(self.model_name)
|
156
|
-
|
157
149
|
def is_o1_preview(self) -> bool:
|
158
150
|
return is_o1_preview(self.model_name)
|
159
151
|
|
@@ -303,7 +295,11 @@ class OpenAIAPI(ModelAPI):
|
|
303
295
|
params["top_logprobs"] = config.top_logprobs
|
304
296
|
if tools and config.parallel_tool_calls is not None and not self.is_o_series():
|
305
297
|
params["parallel_tool_calls"] = config.parallel_tool_calls
|
306
|
-
if
|
298
|
+
if (
|
299
|
+
config.reasoning_effort is not None
|
300
|
+
and not self.is_gpt()
|
301
|
+
and not self.is_o1_mini()
|
302
|
+
):
|
307
303
|
params["reasoning_effort"] = config.reasoning_effort
|
308
304
|
|
309
305
|
return params
|
@@ -27,11 +27,7 @@ from inspect_ai.tool import ToolCall, ToolInfo
|
|
27
27
|
from .._call_tools import parse_tool_call, tool_parse_error_message
|
28
28
|
from .._model_call import ModelCall
|
29
29
|
from .._model_output import ModelUsage, StopReason, as_stop_reason
|
30
|
-
from .._providers.util import
|
31
|
-
ChatAPIHandler,
|
32
|
-
ChatAPIMessage,
|
33
|
-
chat_api_input,
|
34
|
-
)
|
30
|
+
from .._providers.util import ChatAPIHandler, ChatAPIMessage, chat_api_input
|
35
31
|
|
36
32
|
logger = getLogger(__name__)
|
37
33
|
|
@@ -85,6 +81,8 @@ def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput | Excepti
|
|
85
81
|
stop_reason: StopReason | None = "model_length"
|
86
82
|
elif ex.code == "invalid_prompt":
|
87
83
|
stop_reason = "content_filter"
|
84
|
+
else:
|
85
|
+
stop_reason = None
|
88
86
|
|
89
87
|
if stop_reason:
|
90
88
|
return ModelOutput.from_content(
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import os
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
from typing_extensions import override
|
5
|
+
|
6
|
+
from inspect_ai._util.error import PrerequisiteError
|
7
|
+
from inspect_ai.model._providers.util import model_base_url
|
8
|
+
from inspect_ai.model._providers.util.util import environment_prerequisite_error
|
9
|
+
|
10
|
+
from .._generate_config import GenerateConfig
|
11
|
+
from .openai import OpenAIAPI
|
12
|
+
|
13
|
+
OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
|
14
|
+
|
15
|
+
|
16
|
+
class OpenRouterAPI(OpenAIAPI):
|
17
|
+
def __init__(
|
18
|
+
self,
|
19
|
+
model_name: str,
|
20
|
+
base_url: str | None = None,
|
21
|
+
api_key: str | None = None,
|
22
|
+
config: GenerateConfig = GenerateConfig(),
|
23
|
+
**model_args: Any,
|
24
|
+
) -> None:
|
25
|
+
# api_key
|
26
|
+
if not api_key:
|
27
|
+
api_key = os.environ.get(OPENROUTER_API_KEY, None)
|
28
|
+
if not api_key:
|
29
|
+
raise environment_prerequisite_error("OpenRouter", OPENROUTER_API_KEY)
|
30
|
+
|
31
|
+
# base_url
|
32
|
+
base_url = model_base_url(base_url, "OPENROUTER_BASE_URL")
|
33
|
+
base_url = base_url if base_url else "https://openrouter.ai/api/v1"
|
34
|
+
|
35
|
+
# collect known model args that we forward to generate
|
36
|
+
def collect_model_arg(name: str) -> Any | None:
|
37
|
+
nonlocal model_args
|
38
|
+
value = model_args.get(name, None)
|
39
|
+
if value is not None:
|
40
|
+
model_args.pop(name)
|
41
|
+
return value
|
42
|
+
|
43
|
+
# models arg
|
44
|
+
self.models = collect_model_arg("models")
|
45
|
+
if self.models is not None:
|
46
|
+
if not isinstance(self.models, list):
|
47
|
+
raise PrerequisiteError("models must be a list of strings")
|
48
|
+
|
49
|
+
# providers arg
|
50
|
+
self.provider = collect_model_arg("provider")
|
51
|
+
if self.provider is not None:
|
52
|
+
if not isinstance(self.provider, dict):
|
53
|
+
raise PrerequisiteError("provider must be a dict")
|
54
|
+
|
55
|
+
# transforms arg
|
56
|
+
self.transforms = collect_model_arg("transforms")
|
57
|
+
if self.transforms is not None:
|
58
|
+
if not isinstance(self.transforms, list):
|
59
|
+
raise PrerequisiteError("transforms must be a list of strings")
|
60
|
+
|
61
|
+
# call super
|
62
|
+
super().__init__(
|
63
|
+
model_name=model_name,
|
64
|
+
base_url=base_url,
|
65
|
+
api_key=api_key,
|
66
|
+
config=config,
|
67
|
+
**model_args,
|
68
|
+
)
|
69
|
+
|
70
|
+
@override
|
71
|
+
def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, Any]:
|
72
|
+
# default params
|
73
|
+
params = super().completion_params(config, tools)
|
74
|
+
|
75
|
+
# pass args if specifed
|
76
|
+
EXTRA_BODY = "extra_body"
|
77
|
+
if self.models or self.provider or self.transforms:
|
78
|
+
params[EXTRA_BODY] = params.get(EXTRA_BODY, {})
|
79
|
+
if self.models:
|
80
|
+
params[EXTRA_BODY]["models"] = self.models
|
81
|
+
if self.provider:
|
82
|
+
params[EXTRA_BODY]["provider"] = self.provider
|
83
|
+
if self.transforms:
|
84
|
+
params[EXTRA_BODY]["transforms"] = self.transforms
|
85
|
+
|
86
|
+
return params
|
@@ -198,6 +198,17 @@ def ollama() -> type[ModelAPI]:
|
|
198
198
|
return OllamaAPI
|
199
199
|
|
200
200
|
|
201
|
+
@modelapi(name="openrouter")
|
202
|
+
def openrouter() -> type[ModelAPI]:
|
203
|
+
# validate
|
204
|
+
validate_openai_client("OpenRouter API")
|
205
|
+
|
206
|
+
# in the clear
|
207
|
+
from .openrouter import OpenRouterAPI
|
208
|
+
|
209
|
+
return OpenRouterAPI
|
210
|
+
|
211
|
+
|
201
212
|
@modelapi(name="llama-cpp-python")
|
202
213
|
def llama_cpp_python() -> type[ModelAPI]:
|
203
214
|
# validate
|
inspect_ai/scorer/__init__.py
CHANGED
@@ -10,6 +10,8 @@ from ._metric import (
|
|
10
10
|
NOANSWER,
|
11
11
|
PARTIAL,
|
12
12
|
Metric,
|
13
|
+
MetricProtocol,
|
14
|
+
SampleScore,
|
13
15
|
Score,
|
14
16
|
Value,
|
15
17
|
ValueToFloat,
|
@@ -18,7 +20,7 @@ from ._metric import (
|
|
18
20
|
)
|
19
21
|
from ._metrics.accuracy import accuracy
|
20
22
|
from ._metrics.mean import mean
|
21
|
-
from ._metrics.std import bootstrap_stderr, std, stderr
|
23
|
+
from ._metrics.std import bootstrap_stderr, std, stderr, var
|
22
24
|
from ._model import model_graded_fact, model_graded_qa
|
23
25
|
from ._multi import multi_scorer
|
24
26
|
from ._pattern import pattern
|
@@ -56,9 +58,12 @@ __all__ = [
|
|
56
58
|
"std",
|
57
59
|
"stderr",
|
58
60
|
"mean",
|
61
|
+
"var",
|
59
62
|
"Metric",
|
63
|
+
"MetricProtocol",
|
60
64
|
"metric",
|
61
65
|
"Score",
|
66
|
+
"SampleScore",
|
62
67
|
"score",
|
63
68
|
"Value",
|
64
69
|
"ValueToFloat",
|
inspect_ai/scorer/_answer.py
CHANGED
@@ -8,7 +8,7 @@ from inspect_ai._util.pattern import (
|
|
8
8
|
)
|
9
9
|
|
10
10
|
from ._metrics import accuracy, stderr
|
11
|
-
from ._pattern import pattern
|
11
|
+
from ._pattern import pattern as make_pattern
|
12
12
|
from ._scorer import Scorer, scorer
|
13
13
|
|
14
14
|
|
@@ -33,7 +33,7 @@ class AnswerPattern(str, Enum):
|
|
33
33
|
|
34
34
|
|
35
35
|
@scorer(metrics=[accuracy(), stderr()])
|
36
|
-
def answer(
|
36
|
+
def answer(pattern: Literal["letter", "word", "line"]) -> Scorer:
|
37
37
|
"""Scorer for model output that preceded answers with ANSWER:.
|
38
38
|
|
39
39
|
Some solvers including multiple_choice solicit answers from
|
@@ -43,7 +43,7 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
|
|
43
43
|
Note that you must specify a `type` for the answer scorer.
|
44
44
|
|
45
45
|
Args:
|
46
|
-
|
46
|
+
pattern: Type of answer
|
47
47
|
to extract. "letter" is used with multiple choice and
|
48
48
|
extracts a single letter; "word" will extract the next
|
49
49
|
word (often used for yes/no answers); "line" will take
|
@@ -53,10 +53,10 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
|
|
53
53
|
with a separate line at the end.
|
54
54
|
|
55
55
|
"""
|
56
|
-
match
|
56
|
+
match pattern:
|
57
57
|
case "letter":
|
58
|
-
return
|
58
|
+
return make_pattern(AnswerPattern.LETTER)
|
59
59
|
case "word":
|
60
|
-
return
|
60
|
+
return make_pattern(AnswerPattern.WORD)
|
61
61
|
case "line":
|
62
|
-
return
|
62
|
+
return make_pattern(AnswerPattern.LINE)
|
@@ -12,11 +12,15 @@ from ._target import Target
|
|
12
12
|
|
13
13
|
@scorer(metrics=[mean(), stderr()])
|
14
14
|
def f1(
|
15
|
-
answer_fn: Callable[[str], str] | None = None,
|
15
|
+
answer_fn: Callable[[str], str] | None = None, stop_words: list[str] | None = None
|
16
16
|
) -> Scorer:
|
17
17
|
"""Scorer which produces an F1 score
|
18
18
|
|
19
19
|
Computes the `F1` score for the answer (which balances recall precision by taking the harmonic mean between recall and precision).
|
20
|
+
|
21
|
+
Args:
|
22
|
+
answer_fn: Custom function to extract the answer from the completion (defaults to using the completion).
|
23
|
+
stop_words: Stop words to include in answer tokenization.
|
20
24
|
"""
|
21
25
|
|
22
26
|
async def score(state: TaskState, target: Target) -> Score:
|
@@ -26,7 +30,7 @@ def f1(
|
|
26
30
|
)
|
27
31
|
targets = target.target
|
28
32
|
|
29
|
-
f1_score = max_f1_score(answer, targets)
|
33
|
+
f1_score = max_f1_score(answer, targets, stop_words=stop_words)
|
30
34
|
return Score(
|
31
35
|
value=f1_score,
|
32
36
|
answer=answer,
|
@@ -53,12 +57,14 @@ def exact() -> Scorer:
|
|
53
57
|
return score
|
54
58
|
|
55
59
|
|
56
|
-
def max_f1_score(
|
60
|
+
def max_f1_score(
|
61
|
+
answer: str, targets: List[str], stop_words: list[str] | None = None
|
62
|
+
) -> float:
|
57
63
|
# Find the maximum F1 score for this answer
|
58
64
|
max_f1 = 0.0
|
59
65
|
for target in targets:
|
60
66
|
if target[0].strip():
|
61
|
-
f1_score = compute_f1(answer, target)
|
67
|
+
f1_score = compute_f1(answer, target, stop_words)
|
62
68
|
max_f1 = max(max_f1, f1_score)
|
63
69
|
return round(max_f1, 2)
|
64
70
|
|
@@ -75,18 +81,16 @@ def max_exact_score(answer: str, targets: List[str]) -> float:
|
|
75
81
|
return max_exact
|
76
82
|
|
77
83
|
|
78
|
-
def compute_f1(answer: str, target: str) -> float:
|
84
|
+
def compute_f1(answer: str, target: str, stop_words: list[str] | None = None) -> float:
|
79
85
|
"""Takes a predicted answer and a gold answer (that are both either a string or a list of strings), and returns exact match and the SQuAD F1 metric for the prediction."""
|
80
|
-
answer_words = _to_words(answer)
|
81
|
-
target_words = _to_words(target)
|
86
|
+
answer_words = _to_words(answer, stop_words)
|
87
|
+
target_words = _to_words(target, stop_words)
|
82
88
|
|
83
89
|
return _f1(answer_words=answer_words, target_words=target_words)
|
84
90
|
|
85
91
|
|
86
|
-
def _to_words(
|
87
|
-
answer
|
88
|
-
) -> set[str]:
|
89
|
-
normalized = _normalize(answer)
|
92
|
+
def _to_words(answer: str, stop_words: list[str] | None = None) -> set[str]:
|
93
|
+
normalized = _normalize(answer, stop_words)
|
90
94
|
token_bag = set(normalized.split())
|
91
95
|
return token_bag
|
92
96
|
|
@@ -147,16 +151,32 @@ def _tokenize(text: str) -> List[str]:
|
|
147
151
|
return re.split(" |-", text)
|
148
152
|
|
149
153
|
|
150
|
-
def _normalize(
|
154
|
+
def _normalize(text: str, stop_words: list[str] | None = None) -> str:
|
151
155
|
"""Normalize text to remove extraneous characters and words."""
|
152
156
|
tokens = []
|
153
|
-
tokenized_answer = _tokenize(
|
157
|
+
tokenized_answer = _tokenize(text)
|
158
|
+
|
159
|
+
# Process stop words, if present
|
160
|
+
if stop_words is not None:
|
161
|
+
folded_stop_words = [_normalize_token(word) for word in stop_words]
|
162
|
+
else:
|
163
|
+
folded_stop_words = []
|
164
|
+
|
165
|
+
# Now process the text
|
154
166
|
for token in tokenized_answer:
|
155
|
-
token =
|
156
|
-
token
|
157
|
-
|
158
|
-
|
159
|
-
|
167
|
+
token = _normalize_token(token)
|
168
|
+
if folded_stop_words is None or token not in folded_stop_words:
|
169
|
+
tokens.append(token)
|
170
|
+
|
171
|
+
# re-join the tokens into a normalized string
|
160
172
|
tokens = [token for token in tokens if token.strip()]
|
161
173
|
normalized = " ".join(tokens).strip()
|
162
174
|
return normalized
|
175
|
+
|
176
|
+
|
177
|
+
def _normalize_token(token: str) -> str:
|
178
|
+
token = _remove_punc(token.casefold())
|
179
|
+
token = _normalize_number(token)
|
180
|
+
token = _remove_articles(token)
|
181
|
+
token = _normalize_whitespace(token)
|
182
|
+
return token
|
inspect_ai/scorer/_common.py
CHANGED
@@ -25,19 +25,13 @@ def str_match_scorer(match: Callable[[str, str], tuple[str, bool]]) -> Scorer:
|
|
25
25
|
for value in target:
|
26
26
|
answer, matched = match(state.output.completion, value)
|
27
27
|
if matched:
|
28
|
-
explanation = (
|
29
|
-
state.output.completion
|
30
|
-
if state.output.completion != answer
|
31
|
-
else None
|
32
|
-
)
|
33
28
|
return Score(
|
34
29
|
value=CORRECT, answer=answer, explanation=state.output.completion
|
35
30
|
)
|
36
31
|
|
37
|
-
|
38
|
-
|
32
|
+
return Score(
|
33
|
+
value=INCORRECT, answer=answer, explanation=state.output.completion
|
39
34
|
)
|
40
|
-
return Score(value=INCORRECT, answer=answer, explanation=explanation)
|
41
35
|
|
42
36
|
return score
|
43
37
|
|
inspect_ai/scorer/_match.py
CHANGED
@@ -15,12 +15,11 @@ def match(
|
|
15
15
|
"""Scorer which matches text or a number.
|
16
16
|
|
17
17
|
Args:
|
18
|
-
location
|
19
|
-
Location to match at. "any" matches anywhere in the
|
18
|
+
location: Location to match at. "any" matches anywhere in the
|
20
19
|
output; "exact" requires the output be exactly
|
21
20
|
equal to the target (module whitespace, etc.)
|
22
|
-
ignore_case
|
23
|
-
numeric
|
21
|
+
ignore_case: Do case insensitive comparison.
|
22
|
+
numeric: Is this a numeric match? (in this
|
24
23
|
case different punctuation removal rules are
|
25
24
|
used and numbers are normalized before comparison).
|
26
25
|
"""
|
@@ -42,7 +41,7 @@ def includes(ignore_case: bool = True) -> Scorer:
|
|
42
41
|
"""Check whether the specified text is included in the model output.
|
43
42
|
|
44
43
|
Args:
|
45
|
-
ignore_case
|
44
|
+
ignore_case: Use a case insensitive comparison.
|
46
45
|
|
47
46
|
"""
|
48
47
|
|