inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/solver/_solver.py
CHANGED
@@ -29,36 +29,32 @@ from ._task_state import TaskState, set_sample_state
|
|
29
29
|
|
30
30
|
@runtime_checkable
|
31
31
|
class Generate(Protocol):
|
32
|
-
"""Generate using the model and add the assistant message to the task state.
|
33
|
-
|
34
|
-
Args:
|
35
|
-
state (TaskState): Beginning task state.
|
36
|
-
|
37
|
-
tool_calls (Literal["loop", "single", "none"]): Resolve tool calls:
|
38
|
-
- `"loop"` resolves tools calls and then invokes `generate()`,
|
39
|
-
proceeding in a loop which terminates when there are no more
|
40
|
-
tool calls, or `message_limit` or `token_limit` is exceeded.
|
41
|
-
This is the default behavior.
|
42
|
-
- `"single"` resolves at most a single set of tool calls and then returns.
|
43
|
-
- `"none"` does not resolve tool calls at all (in this
|
44
|
-
case you will need to invoke `call_tools()` directly).
|
45
|
-
|
46
|
-
cache: (bool | CachePolicy):
|
47
|
-
Caching behaviour for generate responses (defaults to no caching).
|
48
|
-
|
49
|
-
**kwargs: Optional generation config arguments.
|
50
|
-
|
51
|
-
Returns:
|
52
|
-
Updated TaskState.
|
53
|
-
"""
|
54
|
-
|
55
32
|
async def __call__(
|
56
33
|
self,
|
57
34
|
state: TaskState,
|
58
35
|
tool_calls: Literal["loop", "single", "none"] = "loop",
|
59
36
|
cache: bool | CachePolicy = False,
|
60
37
|
**kwargs: Unpack[GenerateConfigArgs],
|
61
|
-
) -> TaskState:
|
38
|
+
) -> TaskState:
|
39
|
+
"""Generate using the model and add the assistant message to the task state.
|
40
|
+
|
41
|
+
Args:
|
42
|
+
state: Beginning task state.
|
43
|
+
tool_calls:
|
44
|
+
- `"loop"` resolves tool calls and then invokes `generate()`,
|
45
|
+
proceeding in a loop which terminates when there are no more
|
46
|
+
tool calls, or `message_limit` or `token_limit` is exceeded.
|
47
|
+
This is the default behavior.
|
48
|
+
- `"single"` resolves at most a single set of tool calls and then returns.
|
49
|
+
- `"none"` does not resolve tool calls at all (in this
|
50
|
+
case you will need to invoke `call_tools()` directly).
|
51
|
+
cache: Caching behaviour for generate responses (defaults to no caching).
|
52
|
+
**kwargs: Optional generation config arguments.
|
53
|
+
|
54
|
+
Returns:
|
55
|
+
Updated TaskState.
|
56
|
+
"""
|
57
|
+
...
|
62
58
|
|
63
59
|
|
64
60
|
@dataclass(frozen=True)
|
@@ -74,28 +70,37 @@ class SolverSpec:
|
|
74
70
|
|
75
71
|
@runtime_checkable
|
76
72
|
class Solver(Protocol):
|
77
|
-
|
73
|
+
async def __call__(
|
74
|
+
self,
|
75
|
+
state: TaskState,
|
76
|
+
generate: Generate,
|
77
|
+
) -> TaskState:
|
78
|
+
r"""Contribute to solving an evaluation task.
|
78
79
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
to generate output (and a new TaskState with that output).
|
80
|
+
Transform a `TaskState`, returning the new state. Solvers may
|
81
|
+
optionally call the `generate()` function to create a new
|
82
|
+
state resulting from model generation. Solvers may also do
|
83
|
+
prompt engineering or other types of elicitation.
|
84
84
|
|
85
|
+
Args:
|
86
|
+
state: State for tasks being evaluated.
|
87
|
+
generate: Function for generating outputs.
|
85
88
|
|
86
|
-
|
87
|
-
|
88
|
-
generate (Generate): Function for generating outputs.
|
89
|
+
Returns:
|
90
|
+
Updated TaskState.
|
89
91
|
|
90
|
-
|
91
|
-
|
92
|
-
|
92
|
+
Examples:
|
93
|
+
```python
|
94
|
+
@solver
|
95
|
+
def prompt_cot(template: str) -> Solver:
|
96
|
+
def solve(state: TaskState, generate: Generate) -> TaskState:
|
97
|
+
# insert chain of thought prompt
|
98
|
+
return state
|
93
99
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
) -> TaskState: ...
|
100
|
+
return solve
|
101
|
+
```
|
102
|
+
"""
|
103
|
+
...
|
99
104
|
|
100
105
|
|
101
106
|
P = ParamSpec("P")
|
@@ -144,7 +149,7 @@ def solver(
|
|
144
149
|
r"""Decorator for registering solvers.
|
145
150
|
|
146
151
|
Args:
|
147
|
-
name:
|
152
|
+
name:
|
148
153
|
Optional name for solver. If the decorator has no name
|
149
154
|
argument then the name of the underlying Callable[P, Solver]
|
150
155
|
object will be used to automatically assign a name.
|
@@ -153,19 +158,15 @@ def solver(
|
|
153
158
|
Solver with registry attributes.
|
154
159
|
|
155
160
|
Examples:
|
156
|
-
|
157
|
-
def prompt_cot(state: TaskState, generate: Generate) -> None:
|
158
|
-
...
|
159
|
-
|
160
|
-
@solver(name = "prompt_cot")
|
161
|
-
def cot(state: TaskState, generate: Generate) -> None:
|
162
|
-
...
|
163
|
-
|
161
|
+
```python
|
164
162
|
@solver
|
165
163
|
def prompt_cot(template: str) -> Solver:
|
166
|
-
def solve(state: TaskState, generate: Generate) ->
|
167
|
-
|
164
|
+
def solve(state: TaskState, generate: Generate) -> TaskState:
|
165
|
+
# insert chain of thought prompt
|
166
|
+
return state
|
167
|
+
|
168
168
|
return solve
|
169
|
+
```
|
169
170
|
"""
|
170
171
|
|
171
172
|
# create_solver_wrapper:
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -31,17 +31,20 @@ class Choice:
|
|
31
31
|
"""
|
32
32
|
A `Choice` represents a single choice in a multiple choice question.
|
33
33
|
|
34
|
-
It is only relevant for the `multiple_choice` solver and corresponding
|
34
|
+
It is only relevant for the `multiple_choice` solver and corresponding
|
35
|
+
`choice` scorer.
|
35
36
|
"""
|
36
37
|
|
37
38
|
value: str
|
38
39
|
"""The original value of the choice from the `Sample`."""
|
39
40
|
|
40
41
|
correct: bool | None
|
41
|
-
"""Did the model think this choice satisfies the question? `None`
|
42
|
+
"""Did the model think this choice satisfies the question? `None`
|
43
|
+
indicates this has not been set yet"""
|
42
44
|
|
43
45
|
original_position: int
|
44
|
-
"""Choices may be re-ordered during processing, this represents the
|
46
|
+
"""Choices may be re-ordered during processing, this represents the
|
47
|
+
original position in the sample's list of choices"""
|
45
48
|
|
46
49
|
|
47
50
|
class Choices(Sequence[Choice]):
|
@@ -127,10 +130,10 @@ class TaskState:
|
|
127
130
|
"""
|
128
131
|
The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
|
129
132
|
|
130
|
-
|
131
|
-
evaluation. It allows us to
|
132
|
-
|
133
|
-
|
133
|
+
The `TaskState` is passed to and returned from each solver during a sample's
|
134
|
+
evaluation. It allows us to manipulate the message history, the tools
|
135
|
+
available to the model, the final output of the model, and whether the task
|
136
|
+
is completed or has hit a limit.
|
134
137
|
"""
|
135
138
|
|
136
139
|
def __init__(
|
@@ -149,73 +152,39 @@ class TaskState:
|
|
149
152
|
metadata: dict[str, Any] = {},
|
150
153
|
) -> None:
|
151
154
|
self._model = model
|
152
|
-
|
153
|
-
|
154
|
-
self.sample_id = sample_id
|
155
|
-
"""Unique id for sample."""
|
156
|
-
|
157
|
-
self.epoch = epoch
|
158
|
-
"""Epoch number for sample."""
|
159
|
-
|
155
|
+
self._sample_id = sample_id
|
156
|
+
self._epoch = epoch
|
160
157
|
self._input = input
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
Should be treated as immutable and not changed during the run, so that
|
165
|
-
it can be referenced or checked wherever needed. Access through `input`
|
166
|
-
or `input_text` only
|
167
|
-
"""
|
168
|
-
|
169
|
-
self.target = target
|
170
|
-
"""The scoring target for this `Sample`."""
|
171
|
-
|
172
|
-
self.metadata = metadata
|
173
|
-
"""Metadata from the `Sample` for this `TaskState`"""
|
174
|
-
|
158
|
+
self._target = target
|
159
|
+
self._metadata = metadata
|
175
160
|
self._messages: list[ChatMessage] = ChatMessageList(messages, self)
|
176
|
-
"""
|
177
|
-
Chat conversation history for sample.
|
178
|
-
|
179
|
-
This will generally get appended to every time a `generate` call is made
|
180
|
-
to the model. Useful for both debug and for solvers/scorers to assess
|
181
|
-
model performance or choose the next step.
|
182
|
-
"""
|
183
|
-
|
184
161
|
self._tools: list[Tool] = []
|
185
|
-
|
186
|
-
|
187
|
-
self.tool_choice: ToolChoice | None = None
|
188
|
-
"""Tool choice directive."""
|
189
|
-
|
190
|
-
self.output = output if output else ModelOutput(model=str(model), choices=[])
|
191
|
-
"""
|
192
|
-
The 'final' model output once we've completed all solving.
|
193
|
-
|
194
|
-
For simple evals this may just be the last `message` from the
|
195
|
-
conversation history, but more complex solvers may generate this in
|
196
|
-
different ways depending on what solvers are used..
|
197
|
-
"""
|
198
|
-
|
162
|
+
self._output = output if output else ModelOutput(model=str(model))
|
199
163
|
self._message_limit = message_limit
|
200
164
|
self._token_limit = token_limit
|
201
165
|
self._completed = completed
|
202
|
-
|
203
|
-
"""Store for shared data"""
|
204
|
-
self.store = Store()
|
166
|
+
self._store = Store()
|
205
167
|
|
206
168
|
if choices:
|
207
169
|
self.choices = Choices(choices)
|
208
170
|
else:
|
209
171
|
self.choices = Choices([])
|
210
172
|
|
211
|
-
self.scores: dict[str, Score] | None = None
|
212
|
-
"""Scores yielded by running task."""
|
213
|
-
|
214
173
|
@property
|
215
174
|
def model(self) -> ModelName:
|
216
175
|
"""Name of model being evaluated."""
|
217
176
|
return self._model
|
218
177
|
|
178
|
+
@property
|
179
|
+
def sample_id(self) -> int | str:
|
180
|
+
"""Unique id for sample."""
|
181
|
+
return self._sample_id
|
182
|
+
|
183
|
+
@property
|
184
|
+
def epoch(self) -> int:
|
185
|
+
"""Epoch number for sample."""
|
186
|
+
return self._epoch
|
187
|
+
|
219
188
|
@property
|
220
189
|
def input(self) -> str | list[ChatMessage]:
|
221
190
|
"""Input from the `Sample`, should be considered immutable."""
|
@@ -253,9 +222,6 @@ class TaskState:
|
|
253
222
|
engineering solvers). This property enables easy read and
|
254
223
|
write access to the user chat prompt. Raises an
|
255
224
|
exception if there is no user prompt
|
256
|
-
|
257
|
-
Returns:
|
258
|
-
First user `ChatMessage` in the task state.
|
259
225
|
"""
|
260
226
|
prompt = next((m for m in self.messages if m.role == "user"), None)
|
261
227
|
if prompt:
|
@@ -263,16 +229,63 @@ class TaskState:
|
|
263
229
|
else:
|
264
230
|
raise ValueError("user_prompt requested from TaskState but none available")
|
265
231
|
|
232
|
+
@property
|
233
|
+
def metadata(self) -> dict[str, Any]:
|
234
|
+
"""Metadata from the `Sample` for this `TaskState`"""
|
235
|
+
return self._metadata
|
236
|
+
|
237
|
+
@metadata.setter
|
238
|
+
def metadata(self, metadata: dict[str, Any]) -> None:
|
239
|
+
self._metadata = metadata
|
240
|
+
|
266
241
|
@property
|
267
242
|
def messages(self) -> list[ChatMessage]:
|
268
|
-
"""
|
243
|
+
"""
|
244
|
+
Chat conversation history for sample.
|
245
|
+
|
246
|
+
This will generally get appended to every time a `generate` call is made
|
247
|
+
to the model. Useful for both debug and for solvers/scorers to assess
|
248
|
+
model performance or choose the next step.
|
249
|
+
"""
|
269
250
|
return self._messages
|
270
251
|
|
271
252
|
@messages.setter
|
272
253
|
def messages(self, messages: list[ChatMessage]) -> None:
|
273
|
-
"""Set messages in chat history."""
|
274
254
|
self._messages = ChatMessageList(messages, self)
|
275
255
|
|
256
|
+
@property
|
257
|
+
def output(self) -> ModelOutput:
|
258
|
+
"""
|
259
|
+
The 'final' model output once we've completed all solving.
|
260
|
+
|
261
|
+
For simple evals this may just be the last `message` from the
|
262
|
+
conversation history, but more complex solvers may set this directly.
|
263
|
+
"""
|
264
|
+
return self._output
|
265
|
+
|
266
|
+
@output.setter
|
267
|
+
def output(self, output: ModelOutput) -> None:
|
268
|
+
self._output = output
|
269
|
+
|
270
|
+
@property
|
271
|
+
def store(self) -> Store:
|
272
|
+
"""Store for shared data"""
|
273
|
+
return self._store
|
274
|
+
|
275
|
+
@property
|
276
|
+
def tools(self) -> list[Tool]:
|
277
|
+
"""Tools available to the model."""
|
278
|
+
return self._tools
|
279
|
+
|
280
|
+
@tools.setter
|
281
|
+
def tools(self, tools: list[Tool | ToolDef]) -> None:
|
282
|
+
self._tools.clear()
|
283
|
+
for tool in tools:
|
284
|
+
self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
|
285
|
+
|
286
|
+
tool_choice: ToolChoice | None = None
|
287
|
+
"""Tool choice directive."""
|
288
|
+
|
276
289
|
@property
|
277
290
|
def max_messages(self) -> int | None:
|
278
291
|
"""Deprecated (use message_limit)."""
|
@@ -351,14 +364,12 @@ class TaskState:
|
|
351
364
|
self._completed = completed
|
352
365
|
|
353
366
|
@property
|
354
|
-
def
|
355
|
-
|
367
|
+
def target(self) -> Target:
|
368
|
+
"""The scoring target for this `Sample`."""
|
369
|
+
return self._target
|
356
370
|
|
357
|
-
|
358
|
-
|
359
|
-
self._tools.clear()
|
360
|
-
for tool in tools:
|
361
|
-
self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
|
371
|
+
scores: dict[str, Score] | None = None
|
372
|
+
"""Scores yielded by running task."""
|
362
373
|
|
363
374
|
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
|
364
375
|
"""Pydantic model interface to metadata.
|
inspect_ai/solver/_use_tools.py
CHANGED
@@ -15,15 +15,15 @@ def use_tools(
|
|
15
15
|
Inject tools into the task state to be used in generate().
|
16
16
|
|
17
17
|
Args:
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
18
|
+
*tools: One or more tools or lists of tools
|
19
|
+
to make available to the model. If no tools are passed, then
|
20
|
+
no change to the currently available set of `tools` is made.
|
21
|
+
tool_choice: Directive indicating which
|
22
|
+
tools the model should use. If `None` is passed, then no
|
23
|
+
change to `tool_choice` is made.
|
24
|
+
append: If `True`, then the passed-in tools are appended
|
25
|
+
to the existing tools; otherwise any existing tools are
|
26
|
+
replaced (the default)
|
27
27
|
|
28
28
|
Returns:
|
29
29
|
A solver that injects the tools and tool_choice into the task state.
|
inspect_ai/tool/__init__.py
CHANGED
@@ -19,14 +19,16 @@ from ._tool_call import (
|
|
19
19
|
from ._tool_choice import ToolChoice, ToolFunction
|
20
20
|
from ._tool_def import ToolDef
|
21
21
|
from ._tool_info import ToolInfo
|
22
|
-
from ._tool_params import ToolParam, ToolParams
|
22
|
+
from ._tool_params import JSONType, ToolParam, ToolParams
|
23
23
|
from ._tool_with import tool_with
|
24
|
+
from ._tools._computer import computer
|
24
25
|
from ._tools._execute import bash, python
|
25
26
|
from ._tools._web_browser import web_browser
|
26
27
|
from ._tools._web_search import web_search
|
27
28
|
|
28
29
|
__all__ = [
|
29
30
|
"bash",
|
31
|
+
"computer",
|
30
32
|
"python",
|
31
33
|
"web_browser",
|
32
34
|
"web_search",
|
@@ -52,6 +54,7 @@ __all__ = [
|
|
52
54
|
"ToolInfo",
|
53
55
|
"ToolParam",
|
54
56
|
"ToolParams",
|
57
|
+
"JSONType",
|
55
58
|
]
|
56
59
|
|
57
60
|
_UTIL_MODULE_VERSION = "0.3.19"
|
inspect_ai/tool/_tool.py
CHANGED
@@ -40,10 +40,25 @@ ToolResult = (
|
|
40
40
|
| ContentVideo
|
41
41
|
| list[ContentText | ContentImage | ContentAudio | ContentVideo]
|
42
42
|
)
|
43
|
+
"""Valid types for results from tool calls."""
|
43
44
|
|
44
45
|
|
45
46
|
class ToolError(Exception):
|
47
|
+
"""Exception thrown from tool call.
|
48
|
+
|
49
|
+
If you throw a `ToolError` from within a tool call,
|
50
|
+
the error will be reported to the model for further
|
51
|
+
processing (rather than ending the sample). If you want
|
52
|
+
to raise a fatal error from a tool call use an appropriate
|
53
|
+
standard exception type (e.g. `RuntimeError`, `ValueError`, etc.)
|
54
|
+
"""
|
55
|
+
|
46
56
|
def __init__(self, message: str) -> None:
|
57
|
+
"""Create a ToolError.
|
58
|
+
|
59
|
+
Args:
|
60
|
+
message: Error message to report to the model.
|
61
|
+
"""
|
47
62
|
super().__init__(message)
|
48
63
|
self.message = message
|
49
64
|
|
@@ -68,11 +83,21 @@ class Tool(Protocol):
|
|
68
83
|
r"""Additional tool that an agent can use to solve a task.
|
69
84
|
|
70
85
|
Args:
|
71
|
-
|
72
|
-
|
86
|
+
*args: Arguments for the tool.
|
87
|
+
**kwargs: Keyword arguments for the tool.
|
73
88
|
|
74
89
|
Returns:
|
75
90
|
Result of tool call.
|
91
|
+
|
92
|
+
Examples:
|
93
|
+
```python
|
94
|
+
@tool
|
95
|
+
def add() -> Tool:
|
96
|
+
async def execute(x: int, y: int) -> int:
|
97
|
+
return x + y
|
98
|
+
|
99
|
+
return execute
|
100
|
+
```
|
76
101
|
"""
|
77
102
|
...
|
78
103
|
|
@@ -130,25 +155,29 @@ def tool(
|
|
130
155
|
r"""Decorator for registering tools.
|
131
156
|
|
132
157
|
Args:
|
133
|
-
func
|
134
|
-
name
|
135
|
-
Optional name for tool. If the decorator has no name
|
158
|
+
func: Tool function
|
159
|
+
name: Optional name for tool. If the decorator has no name
|
136
160
|
argument then the name of the tool creation function
|
137
161
|
will be used as the name of the tool.
|
138
|
-
viewer
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
parallel (bool):
|
143
|
-
Does this tool support parallel execution?
|
144
|
-
(defaults to True).
|
145
|
-
prompt (str):
|
146
|
-
Deprecated (provide all descriptive information about
|
162
|
+
viewer: Provide a custom view of tool call and context.
|
163
|
+
model_input: Provide a custom function for playing back tool results as model input.
|
164
|
+
parallel: Does this tool support parallel execution? (defaults to `True`).
|
165
|
+
prompt: Deprecated (provide all descriptive information about
|
147
166
|
the tool within the tool function's doc comment)
|
148
167
|
|
149
168
|
|
150
169
|
Returns:
|
151
170
|
Tool with registry attributes.
|
171
|
+
|
172
|
+
Examples:
|
173
|
+
```python
|
174
|
+
@tool
|
175
|
+
def add() -> Tool:
|
176
|
+
async def execute(x: int, y: int) -> int:
|
177
|
+
return x + y
|
178
|
+
|
179
|
+
return execute
|
180
|
+
```
|
152
181
|
"""
|
153
182
|
if prompt:
|
154
183
|
from inspect_ai._util.logger import warn_once
|
inspect_ai/tool/_tool_call.py
CHANGED
@@ -13,10 +13,10 @@ class ToolCallContent(BaseModel):
|
|
13
13
|
"""Optional (plain text) title for tool call content."""
|
14
14
|
|
15
15
|
format: Literal["text", "markdown"]
|
16
|
-
"""Format."""
|
16
|
+
"""Format (text or markdown)."""
|
17
17
|
|
18
18
|
content: str
|
19
|
-
"""
|
19
|
+
"""Text or markdown content."""
|
20
20
|
|
21
21
|
|
22
22
|
class ToolCallView(BaseModel):
|
@@ -56,6 +56,8 @@ class ToolCall:
|
|
56
56
|
|
57
57
|
@dataclass
|
58
58
|
class ToolCallError:
|
59
|
+
"""Error raised by a tool call."""
|
60
|
+
|
59
61
|
type: Literal[
|
60
62
|
"parsing",
|
61
63
|
"timeout",
|
@@ -67,8 +69,10 @@ class ToolCallError:
|
|
67
69
|
"approval",
|
68
70
|
"unknown",
|
69
71
|
]
|
72
|
+
"""Error type."""
|
70
73
|
|
71
74
|
message: str
|
75
|
+
"""Error message."""
|
72
76
|
|
73
77
|
|
74
78
|
ToolCallViewer = Callable[[ToolCall], ToolCallView]
|
inspect_ai/tool/_tool_choice.py
CHANGED
@@ -4,8 +4,10 @@ from typing import Literal, Union
|
|
4
4
|
|
5
5
|
@dataclass
|
6
6
|
class ToolFunction:
|
7
|
+
"""Indicate that a specific tool function should be called."""
|
8
|
+
|
7
9
|
name: str
|
8
|
-
"""The name of the function to call."""
|
10
|
+
"""The name of the tool function to call."""
|
9
11
|
|
10
12
|
|
11
13
|
ToolChoice = Union[Literal["auto", "any", "none"], ToolFunction]
|
inspect_ai/tool/_tool_def.py
CHANGED
@@ -25,6 +25,8 @@ from ._tool_params import ToolParams
|
|
25
25
|
|
26
26
|
|
27
27
|
class ToolDef:
|
28
|
+
"""Tool definition."""
|
29
|
+
|
28
30
|
def __init__(
|
29
31
|
self,
|
30
32
|
tool: Callable[..., Any],
|
@@ -35,19 +37,19 @@ class ToolDef:
|
|
35
37
|
viewer: ToolCallViewer | None = None,
|
36
38
|
model_input: ToolCallModelInput | None = None,
|
37
39
|
) -> None:
|
38
|
-
"""
|
40
|
+
"""Create a tool definition.
|
39
41
|
|
40
42
|
Args:
|
41
|
-
tool
|
42
|
-
name
|
43
|
-
description
|
43
|
+
tool: Callable to execute tool.
|
44
|
+
name: Name of tool. Discovered automatically if not specified.
|
45
|
+
description: Description of tool. Discovered automatically
|
44
46
|
by parsing doc comments if not specified.
|
45
|
-
parameters
|
47
|
+
parameters: Tool parameter descriptions and types.
|
46
48
|
Discovered automatically by parsing doc comments if not specified.
|
47
|
-
parallel
|
49
|
+
parallel: Does the tool support parallel execution
|
48
50
|
(defaults to True if not specified)
|
49
|
-
viewer
|
50
|
-
model_input
|
51
|
+
viewer: Optional tool call viewer implementation.
|
52
|
+
model_input: Optional function that determines how
|
51
53
|
tool call results are played back as model input.
|
52
54
|
|
53
55
|
Returns:
|
inspect_ai/tool/_tool_params.py
CHANGED
@@ -14,20 +14,44 @@ class ToolParam(BaseModel):
|
|
14
14
|
"""Description of tool parameter in JSON Schema format."""
|
15
15
|
|
16
16
|
type: JSONType | None = Field(default=None)
|
17
|
+
"""JSON type of tool parameter."""
|
18
|
+
|
17
19
|
description: str | None = Field(default=None)
|
20
|
+
"""Parameter description."""
|
21
|
+
|
18
22
|
default: Any = Field(default=None)
|
23
|
+
"""Default value for parameter."""
|
24
|
+
|
19
25
|
enum: list[Any] | None = Field(default=None)
|
26
|
+
"""Valid values for enum parameters."""
|
27
|
+
|
20
28
|
items: Optional["ToolParam"] = Field(default=None)
|
29
|
+
"""Valid type for array parameters."""
|
30
|
+
|
21
31
|
properties: dict[str, "ToolParam"] | None = Field(default=None)
|
32
|
+
"""Valid fields for object parameters."""
|
33
|
+
|
22
34
|
additionalProperties: Optional["ToolParam"] | bool | None = Field(default=None)
|
35
|
+
"""Are additional properties allowed?"""
|
36
|
+
|
23
37
|
anyOf: list["ToolParam"] | None = Field(default=None)
|
38
|
+
"""Valid types for union parameters."""
|
39
|
+
|
24
40
|
required: list[str] | None = Field(default=None)
|
41
|
+
"""Required fields for object parameters."""
|
25
42
|
|
26
43
|
|
27
44
|
class ToolParams(BaseModel):
|
28
45
|
"""Description of tool parameters object in JSON Schema format."""
|
29
46
|
|
30
47
|
type: Literal["object"] = Field(default="object")
|
48
|
+
"""Params type (always 'object')"""
|
49
|
+
|
31
50
|
properties: dict[str, ToolParam] = Field(default_factory=dict)
|
51
|
+
"""Tool function parameters."""
|
52
|
+
|
32
53
|
required: list[str] = Field(default_factory=list)
|
54
|
+
"""List of required fields."""
|
55
|
+
|
33
56
|
additionalProperties: bool = Field(default=False)
|
57
|
+
"""Are additional object properties allowed? (always `False`)"""
|
inspect_ai/tool/_tool_with.py
CHANGED
@@ -25,14 +25,14 @@ def tool_with(
|
|
25
25
|
"""Tool with modifications to name and descriptions.
|
26
26
|
|
27
27
|
Args:
|
28
|
-
tool
|
29
|
-
name
|
30
|
-
description
|
31
|
-
parameters
|
32
|
-
parallel
|
28
|
+
tool: Tool instance to copy and add descriptions to.
|
29
|
+
name: Tool name (optional).
|
30
|
+
description: Tool description (optional).
|
31
|
+
parameters: Parameter descriptions (optional)
|
32
|
+
parallel: Does the tool support parallel execution
|
33
33
|
(defaults to True if not specified)
|
34
|
-
viewer
|
35
|
-
model_input
|
34
|
+
viewer: Optional tool call viewer implementation.
|
35
|
+
model_input: Optional function that determines how
|
36
36
|
tool call results are played back as model input.
|
37
37
|
|
38
38
|
Returns:
|
File without changes
|