inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/main.py +1 -1
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +10 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/active.py +2 -3
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/textual/widgets/samples.py +4 -3
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/eval.py +104 -101
- inspect_ai/_eval/evalset.py +75 -75
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +9 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/hash.py +1 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/.vscode/extensions.json +3 -0
- inspect_ai/_view/www/.vscode/settings.json +8 -0
- inspect_ai/_view/www/App.css +97 -29
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +16663 -14674
- inspect_ai/_view/www/dist/assets/index.js +58808 -51348
- inspect_ai/_view/www/dist/index.html +1 -1
- inspect_ai/_view/www/index.html +2 -2
- inspect_ai/_view/www/log-schema.json +87 -73
- inspect_ai/_view/www/package.json +22 -4
- inspect_ai/_view/www/postcss.config.cjs +8 -9
- inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
- inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
- inspect_ai/_view/www/src/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/api/api-http.ts +3 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
- inspect_ai/_view/www/src/api/client-api.ts +4 -4
- inspect_ai/_view/www/src/api/index.ts +4 -4
- inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
- inspect_ai/_view/www/src/appearance/colors.ts +9 -0
- inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
- inspect_ai/_view/www/src/appearance/icons.ts +100 -0
- inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
- inspect_ai/_view/www/src/components/Card.css +60 -0
- inspect_ai/_view/www/src/components/Card.tsx +109 -0
- inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
- inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
- inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
- inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
- inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
- inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
- inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
- inspect_ai/_view/www/src/components/FindBand.css +49 -0
- inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
- inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
- inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
- inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
- inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
- inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
- inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
- inspect_ai/_view/www/src/components/MessageBand.css +43 -0
- inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
- inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
- inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
- inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
- inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
- inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
- inspect_ai/_view/www/src/components/ToolButton.css +3 -0
- inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
- inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
- inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
- inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
- inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
- inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
- inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
- inspect_ai/_view/www/src/metadata/types.ts +18 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
- inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
- inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
- inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
- inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
- inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
- inspect_ai/_view/www/src/samples/error/error.ts +15 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
- inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
- inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
- inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
- inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
- inspect_ai/_view/www/src/types/log.d.ts +108 -19
- inspect_ai/_view/www/src/types/prism.d.ts +11 -0
- inspect_ai/_view/www/src/types.ts +71 -0
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
- inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
- inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
- inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
- inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
- inspect_ai/_view/www/src/utils/attachments.ts +42 -0
- inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
- inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
- inspect_ai/_view/www/src/utils/debugging.ts +28 -0
- inspect_ai/_view/www/src/utils/dom.ts +30 -0
- inspect_ai/_view/www/src/utils/format.ts +194 -0
- inspect_ai/_view/www/src/utils/git.ts +7 -0
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/_view/www/src/utils/http.ts +14 -0
- inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
- inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
- inspect_ai/_view/www/src/utils/queue.ts +51 -0
- inspect_ai/_view/www/src/utils/sync.ts +114 -0
- inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
- inspect_ai/_view/www/src/utils/vscode.ts +13 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
- inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
- inspect_ai/_view/www/src/workspace/types.ts +10 -0
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/_view/www/tsconfig.json +23 -9
- inspect_ai/_view/www/vite.config.js +8 -17
- inspect_ai/_view/www/yarn.lock +627 -556
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +64 -37
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +20 -12
- inspect_ai/dataset/_sources/file.py +4 -0
- inspect_ai/dataset/_sources/hf.py +39 -29
- inspect_ai/dataset/_sources/json.py +17 -9
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +101 -13
- inspect_ai/log/_message.py +4 -2
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/json.py +5 -7
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +4 -3
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/anthropic.py +3 -3
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openai_o1.py +3 -5
- inspect_ai/model/_providers/openrouter.py +86 -0
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +7 -7
- inspect_ai/scorer/_classification.py +38 -18
- inspect_ai/scorer/_common.py +2 -8
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +38 -26
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +4 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
- inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
- inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_execute.py +23 -11
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/tool/beta.py +3 -0
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +42 -86
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_sandbox/self_check.py +124 -16
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
- inspect_ai-0.3.64.dist-info/RECORD +625 -0
- inspect_ai/_view/www/src/Register.mjs +0 -3
- inspect_ai/_view/www/src/Types.mjs +0 -38
- inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
- inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
- inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
- inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
- inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
- inspect_ai/_view/www/src/components/Card.mjs +0 -126
- inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
- inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
- inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
- inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
- inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
- inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
- inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
- inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
- inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
- inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
- inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
- inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
- inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
- inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
- inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
- inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
- inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
- inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
- inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
- inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
- inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
- inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
- inspect_ai/_view/www/src/components/Tools.mjs +0 -376
- inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
- inspect_ai/_view/www/src/components/ansi-output.js +0 -932
- inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
- inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
- inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
- inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
- inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
- inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
- inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
- inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
- inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
- inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
- inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
- inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
- inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
- inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
- inspect_ai/_view/www/src/utils/Format.mjs +0 -260
- inspect_ai/_view/www/src/utils/Git.mjs +0 -12
- inspect_ai/_view/www/src/utils/Html.mjs +0 -21
- inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
- inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
- inspect_ai/_view/www/src/utils/http.mjs +0 -18
- inspect_ai/_view/www/src/utils/queue.mjs +0 -67
- inspect_ai/_view/www/src/utils/sync.mjs +0 -101
- inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
- inspect_ai/tool/beta/__init__.py +0 -5
- inspect_ai-0.3.62.dist-info/RECORD +0 -481
- /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
- /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
- /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
- /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
@@ -190,7 +190,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
|
|
190
190
|
if task.setup:
|
191
191
|
plan.steps = unroll(task.setup) + plan.steps
|
192
192
|
|
193
|
-
#
|
193
|
+
# resolve the scorer
|
194
194
|
score = score and task.scorer is not None
|
195
195
|
scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None
|
196
196
|
scorer_profiles = (
|
@@ -519,6 +519,7 @@ async def task_run_sample(
|
|
519
519
|
key: SampleScore(
|
520
520
|
score=score,
|
521
521
|
sample_id=previous_sample.id,
|
522
|
+
sample_metadata=previous_sample.metadata,
|
522
523
|
)
|
523
524
|
for key, score in previous_sample.scores.items()
|
524
525
|
}
|
@@ -696,6 +697,7 @@ async def task_run_sample(
|
|
696
697
|
sample_score = SampleScore(
|
697
698
|
score=score_result,
|
698
699
|
sample_id=sample.id,
|
700
|
+
sample_metadata=sample.metadata,
|
699
701
|
scorer=registry_unqualified_name(scorer),
|
700
702
|
)
|
701
703
|
transcript()._event(
|
@@ -709,7 +711,12 @@ async def task_run_sample(
|
|
709
711
|
if state.scores is not None:
|
710
712
|
for name, score in state.scores.items():
|
711
713
|
results[name] = SampleScore(
|
712
|
-
score=score,
|
714
|
+
score=score,
|
715
|
+
sample_id=state.sample_id,
|
716
|
+
sample_metadata=state.metadata,
|
717
|
+
)
|
718
|
+
transcript()._event(
|
719
|
+
ScoreEvent(score=score, target=sample.target)
|
713
720
|
)
|
714
721
|
|
715
722
|
# propagate results into scores
|
inspect_ai/_eval/task/sandbox.py
CHANGED
@@ -5,11 +5,20 @@ from random import random
|
|
5
5
|
from typing import AsyncGenerator, Callable, NamedTuple, cast
|
6
6
|
|
7
7
|
import httpx
|
8
|
+
from tenacity import (
|
9
|
+
retry,
|
10
|
+
retry_if_exception,
|
11
|
+
stop_after_attempt,
|
12
|
+
stop_after_delay,
|
13
|
+
wait_exponential_jitter,
|
14
|
+
)
|
8
15
|
|
9
16
|
from inspect_ai._eval.task.task import Task
|
10
17
|
from inspect_ai._eval.task.util import task_run_dir
|
18
|
+
from inspect_ai._util.constants import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT
|
11
19
|
from inspect_ai._util.file import file, filesystem
|
12
20
|
from inspect_ai._util.registry import registry_unqualified_name
|
21
|
+
from inspect_ai._util.retry import httpx_should_retry, log_retry_attempt
|
13
22
|
from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
|
14
23
|
from inspect_ai.dataset import Sample
|
15
24
|
from inspect_ai.util._concurrency import concurrency
|
@@ -115,8 +124,7 @@ async def read_sandboxenv_file(contents: str) -> bytes:
|
|
115
124
|
contents_base64 = data_uri_to_base64(contents)
|
116
125
|
file_bytes = base64.b64decode(contents_base64)
|
117
126
|
elif is_http_url(contents):
|
118
|
-
|
119
|
-
file_bytes = (await client.get(contents, follow_redirects=True)).content
|
127
|
+
file_bytes = await _retrying_httpx_get(contents)
|
120
128
|
else:
|
121
129
|
# try to read as a file (if it doesn't exist or has a path not cool w/
|
122
130
|
# the filesystem then we fall back to contents)
|
@@ -172,3 +180,28 @@ def resolve_sandbox(
|
|
172
180
|
return sample.sandbox
|
173
181
|
else:
|
174
182
|
return None
|
183
|
+
|
184
|
+
|
185
|
+
async def _retrying_httpx_get(
|
186
|
+
url: str,
|
187
|
+
client: httpx.AsyncClient = httpx.AsyncClient(),
|
188
|
+
timeout: int = 30, # per-attempt timeout
|
189
|
+
max_retries: int = DEFAULT_MAX_RETRIES,
|
190
|
+
total_timeout: int = DEFAULT_TIMEOUT, # timeout for the whole retry loop. not for an individual attempt
|
191
|
+
) -> bytes:
|
192
|
+
@retry(
|
193
|
+
wait=wait_exponential_jitter(),
|
194
|
+
stop=(stop_after_attempt(max_retries) | stop_after_delay(total_timeout)),
|
195
|
+
retry=retry_if_exception(httpx_should_retry),
|
196
|
+
before_sleep=log_retry_attempt(url),
|
197
|
+
)
|
198
|
+
async def do_get() -> bytes:
|
199
|
+
response = await client.get(
|
200
|
+
url=url,
|
201
|
+
follow_redirects=True,
|
202
|
+
timeout=(timeout, timeout, timeout, timeout),
|
203
|
+
)
|
204
|
+
response.raise_for_status()
|
205
|
+
return response.content
|
206
|
+
|
207
|
+
return await do_get()
|
inspect_ai/_eval/task/task.py
CHANGED
@@ -39,38 +39,6 @@ class Task:
|
|
39
39
|
r"""Evaluation task.
|
40
40
|
|
41
41
|
Tasks are the basis for defining and running evaluations.
|
42
|
-
|
43
|
-
Args:
|
44
|
-
dataset (Dataset | Sequence[Sample]): Dataset to evaluate
|
45
|
-
setup: (Solver | list[Solver] | None): Setup step (always run
|
46
|
-
even when the main `solver` is replaced).
|
47
|
-
solver: (Solver | list[Solver]): Solver or list of solvers.
|
48
|
-
Defaults to generate(), a normal call to the model.
|
49
|
-
scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
|
50
|
-
metrics (list[Metric] | dict[str, list[Metric]] | None):
|
51
|
-
Alternative metrics (overrides the metrics provided by the specified scorer).
|
52
|
-
config (GenerateConfig): Model generation config.
|
53
|
-
sandbox (SandboxEnvironmentType | None): Sandbox environment type
|
54
|
-
(or optionally a str or tuple with a shorthand spec)
|
55
|
-
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
56
|
-
Either a path to an approval policy config file or a list of approval policies.
|
57
|
-
Defaults to no approval policy.
|
58
|
-
epochs (int | Epochs | None): Epochs to repeat samples for and optional score
|
59
|
-
reducer function(s) used to combine sample scores (defaults to "mean")
|
60
|
-
fail_on_error (bool | float | None): `True` to fail on first sample error
|
61
|
-
(default); `False` to never fail on sample errors; Value between 0 and 1
|
62
|
-
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
63
|
-
eval if a count of samples fails.
|
64
|
-
message_limit (int | None): Limit on total messages used for each sample.
|
65
|
-
token_limit (int | None): Limit on total tokens used for each sample.
|
66
|
-
time_limit (int | None): Limit on time (in seconds) for execution of each sample.
|
67
|
-
name: (str | None): Task name. If not specified is automatically
|
68
|
-
determined based on the name of the task directory (or "task")
|
69
|
-
if its anonymous task (e.g. created in a notebook and passed to
|
70
|
-
eval() directly)
|
71
|
-
version: (int): Version of task (to distinguish evolutions
|
72
|
-
of the task spec or breaking changes to it)
|
73
|
-
metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
|
74
42
|
"""
|
75
43
|
|
76
44
|
def __init__(
|
@@ -93,6 +61,41 @@ class Task:
|
|
93
61
|
metadata: dict[str, Any] | None = None,
|
94
62
|
**kwargs: Unpack[TaskDeprecatedArgs],
|
95
63
|
) -> None:
|
64
|
+
"""Create a task.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
dataset (Dataset | Sequence[Sample]): Dataset to evaluate
|
68
|
+
setup: (Solver | list[Solver] | None): Setup step (always run
|
69
|
+
even when the main `solver` is replaced).
|
70
|
+
solver: (Solver | list[Solver]): Solver or list of solvers.
|
71
|
+
Defaults to generate(), a normal call to the model.
|
72
|
+
scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
|
73
|
+
metrics (list[Metric] | dict[str, list[Metric]] | None):
|
74
|
+
Alternative metrics (overrides the metrics provided by the specified scorer).
|
75
|
+
config (GenerateConfig): Model generation config.
|
76
|
+
sandbox (SandboxEnvironmentType | None): Sandbox environment type
|
77
|
+
(or optionally a str or tuple with a shorthand spec)
|
78
|
+
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
79
|
+
Either a path to an approval policy config file or a list of approval policies.
|
80
|
+
Defaults to no approval policy.
|
81
|
+
epochs (int | Epochs | None): Epochs to repeat samples for and optional score
|
82
|
+
reducer function(s) used to combine sample scores (defaults to "mean")
|
83
|
+
fail_on_error (bool | float | None): `True` to fail on first sample error
|
84
|
+
(default); `False` to never fail on sample errors; Value between 0 and 1
|
85
|
+
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
86
|
+
eval if a count of samples fails.
|
87
|
+
message_limit (int | None): Limit on total messages used for each sample.
|
88
|
+
token_limit (int | None): Limit on total tokens used for each sample.
|
89
|
+
time_limit (int | None): Limit on time (in seconds) for execution of each sample.
|
90
|
+
name: (str | None): Task name. If not specified is automatically
|
91
|
+
determined based on the name of the task directory (or "task")
|
92
|
+
if its anonymous task (e.g. created in a notebook and passed to
|
93
|
+
eval() directly)
|
94
|
+
version: (int): Version of task (to distinguish evolutions
|
95
|
+
of the task spec or breaking changes to it)
|
96
|
+
metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
|
97
|
+
**kwargs: Deprecated arguments.
|
98
|
+
"""
|
96
99
|
# handle deprecated args
|
97
100
|
for arg, value in kwargs.items():
|
98
101
|
newarg = ""
|
@@ -179,33 +182,33 @@ def task_with(
|
|
179
182
|
task (Task): Task to adapt (it is deep copied prior to mutating options)
|
180
183
|
dataset (Dataset | Sequence[Sample]): Dataset to evaluate
|
181
184
|
setup: (Solver | list[Solver] | None): Setup step (always run
|
182
|
-
|
185
|
+
even when the main `solver` is replaced).
|
183
186
|
solver: (Solver | list[Solver]): Solver or list of solvers.
|
184
|
-
|
187
|
+
Defaults to generate(), a normal call to the model.
|
185
188
|
scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
|
186
189
|
metrics (list[Metric] | dict[str, list[Metric]] | None):
|
187
|
-
|
190
|
+
Alternative metrics (overrides the metrics provided by the specified scorer).
|
188
191
|
config (GenerateConfig): Model generation config.
|
189
192
|
sandbox (SandboxEnvironmentType | None): Sandbox environment type
|
190
|
-
|
193
|
+
(or optionally a str or tuple with a shorthand spec)
|
191
194
|
approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
|
192
|
-
|
193
|
-
|
195
|
+
Either a path to an approval policy config file or a list of approval policies.
|
196
|
+
Defaults to no approval policy.
|
194
197
|
epochs (int | Epochs | None): Epochs to repeat samples for and optional score
|
195
|
-
|
198
|
+
reducer function(s) used to combine sample scores (defaults to "mean")
|
196
199
|
fail_on_error (bool | float | None): `True` to fail on first sample error
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
+
(default); `False` to never fail on sample errors; Value between 0 and 1
|
201
|
+
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
202
|
+
eval if a count of samples fails.
|
200
203
|
message_limit (int | None): Limit on total messages used for each sample.
|
201
204
|
token_limit (int | None): Limit on total tokens used for each sample.
|
202
205
|
time_limit (int | None): Limit on time (in seconds) for execution of each sample.
|
203
206
|
name: (str | None): Task name. If not specified is automatically
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
+
determined based on the name of the task directory (or "task")
|
208
|
+
if its anonymous task (e.g. created in a notebook and passed to
|
209
|
+
eval() directly)
|
207
210
|
version: (int): Version of task (to distinguish evolutions
|
208
|
-
|
211
|
+
of the task spec or breaking changes to it)
|
209
212
|
metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
|
210
213
|
|
211
214
|
Returns:
|
inspect_ai/_util/constants.py
CHANGED
inspect_ai/_util/content.py
CHANGED
@@ -4,6 +4,8 @@ from pydantic import BaseModel, Field
|
|
4
4
|
|
5
5
|
|
6
6
|
class ContentText(BaseModel):
|
7
|
+
"""Text content."""
|
8
|
+
|
7
9
|
type: Literal["text"] = Field(default="text")
|
8
10
|
"""Type."""
|
9
11
|
|
@@ -12,6 +14,8 @@ class ContentText(BaseModel):
|
|
12
14
|
|
13
15
|
|
14
16
|
class ContentImage(BaseModel):
|
17
|
+
"""Image content."""
|
18
|
+
|
15
19
|
type: Literal["image"] = Field(default="image")
|
16
20
|
"""Type."""
|
17
21
|
|
@@ -26,6 +30,8 @@ class ContentImage(BaseModel):
|
|
26
30
|
|
27
31
|
|
28
32
|
class ContentAudio(BaseModel):
|
33
|
+
"""Audio content."""
|
34
|
+
|
29
35
|
type: Literal["audio"] = Field(default="audio")
|
30
36
|
"""Type."""
|
31
37
|
|
@@ -37,6 +43,8 @@ class ContentAudio(BaseModel):
|
|
37
43
|
|
38
44
|
|
39
45
|
class ContentVideo(BaseModel):
|
46
|
+
"""Video content."""
|
47
|
+
|
40
48
|
type: Literal["video"] = Field(default="video")
|
41
49
|
"""Type."""
|
42
50
|
|
inspect_ai/_util/error.py
CHANGED
inspect_ai/_util/file.py
CHANGED
@@ -18,6 +18,7 @@ from fsspec.core import split_protocol # type: ignore # type: ignore
|
|
18
18
|
from fsspec.implementations.local import make_path_posix # type: ignore
|
19
19
|
from pydantic import BaseModel
|
20
20
|
from s3fs import S3FileSystem # type: ignore
|
21
|
+
from shortuuid import uuid
|
21
22
|
|
22
23
|
# https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem
|
23
24
|
# https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.generic.GenericFileSystem
|
@@ -169,6 +170,9 @@ class FileSystem:
|
|
169
170
|
def exists(self, path: str) -> bool:
|
170
171
|
return self.fs.exists(path) is True
|
171
172
|
|
173
|
+
def touch(self, path: str) -> None:
|
174
|
+
self.fs.touch(path)
|
175
|
+
|
172
176
|
def rm(
|
173
177
|
self, path: str, recursive: bool = False, maxdepth: int | None = None
|
174
178
|
) -> None:
|
@@ -218,6 +222,16 @@ class FileSystem:
|
|
218
222
|
def is_local(self) -> bool:
|
219
223
|
return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem)
|
220
224
|
|
225
|
+
def is_writeable(self, path: str) -> bool:
|
226
|
+
try:
|
227
|
+
path = path.rstrip("/\\")
|
228
|
+
touch_file = f"{path}{self.fs.sep}{uuid()}"
|
229
|
+
self.touch(touch_file)
|
230
|
+
self.rm(touch_file)
|
231
|
+
return True
|
232
|
+
except PermissionError:
|
233
|
+
return False
|
234
|
+
|
221
235
|
def is_async(self) -> bool:
|
222
236
|
return isinstance(self.fs, fsspec.asyn.AsyncFileSystem)
|
223
237
|
|
@@ -354,7 +368,7 @@ def safe_filename(s: str, max_length: int = 255) -> str:
|
|
354
368
|
Returns:
|
355
369
|
str: A safe filename string
|
356
370
|
|
357
|
-
|
371
|
+
Examples:
|
358
372
|
>>> safe_filename("Hello/World?.txt")
|
359
373
|
'Hello_World.txt'
|
360
374
|
"""
|
inspect_ai/_util/hash.py
CHANGED
@@ -3,7 +3,7 @@ import mmh3
|
|
3
3
|
|
4
4
|
def mm3_hash(message: str) -> str:
|
5
5
|
# Generate the 128-bit hash as two 64-bit integers
|
6
|
-
h1, h2 = mmh3.hash64(message.encode("utf-8"))
|
6
|
+
h1, h2 = mmh3.hash64(message.encode("utf-8")) # pylint: disable=E0633
|
7
7
|
|
8
8
|
# Convert to unsigned integers and then to hexadecimal
|
9
9
|
return f"{h1 & 0xFFFFFFFFFFFFFFFF:016x}{h2 & 0xFFFFFFFFFFFFFFFF:016x}"
|
inspect_ai/_util/logger.py
CHANGED
@@ -161,7 +161,7 @@ def init_logger(
|
|
161
161
|
getLogger().addHandler(_logHandler)
|
162
162
|
|
163
163
|
# establish default capture level
|
164
|
-
capture_level = min(TRACE, levelno)
|
164
|
+
capture_level = min(TRACE, levelno, transcript_levelno)
|
165
165
|
|
166
166
|
# see all the messages (we won't actually display/write all of them)
|
167
167
|
getLogger().setLevel(capture_level)
|
@@ -181,7 +181,9 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
|
|
181
181
|
from inspect_ai.log._transcript import LoggerEvent, transcript
|
182
182
|
|
183
183
|
if write:
|
184
|
-
transcript()._event(
|
184
|
+
transcript()._event(
|
185
|
+
LoggerEvent(message=LoggingMessage._from_log_record(record))
|
186
|
+
)
|
185
187
|
global _rate_limit_count
|
186
188
|
if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
|
187
189
|
record.levelno == DEBUG
|
inspect_ai/_util/registry.py
CHANGED
@@ -209,7 +209,13 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object:
|
|
209
209
|
if isclass(obj):
|
210
210
|
return with_registry_info(obj(**kwargs))
|
211
211
|
elif callable(obj):
|
212
|
-
return_type =
|
212
|
+
return_type = get_annotations(obj).get("return")
|
213
|
+
# Until we remove the MetricDeprecated symbol we need this extra
|
214
|
+
# bit to map the Metric union back to Metric
|
215
|
+
if "_metric.Metric" in str(return_type):
|
216
|
+
return_type = "Metric"
|
217
|
+
else:
|
218
|
+
return_type = getattr(return_type, "__name__", None)
|
213
219
|
if return_type and return_type.lower() == type:
|
214
220
|
return with_registry_info(obj(**kwargs))
|
215
221
|
else:
|
inspect_ai/_view/view.py
CHANGED
@@ -28,11 +28,10 @@ def view(
|
|
28
28
|
port: int = DEFAULT_VIEW_PORT,
|
29
29
|
authorization: str | None = None,
|
30
30
|
log_level: str | None = None,
|
31
|
-
log_level_transcript: str | None = None,
|
32
31
|
fs_options: dict[str, Any] = {},
|
33
32
|
) -> None:
|
34
33
|
init_dotenv()
|
35
|
-
init_logger(log_level
|
34
|
+
init_logger(log_level)
|
36
35
|
|
37
36
|
# initialize the log_dir
|
38
37
|
log_dir = log_dir if log_dir else os.getenv("INSPECT_LOG_DIR", "./logs")
|
inspect_ai/_view/www/App.css
CHANGED
@@ -9,12 +9,28 @@
|
|
9
9
|
--inspect-input-border: var(--bs-light-border-subtle);
|
10
10
|
--inspect-diff-add-color: #dafbe1;
|
11
11
|
--inspect-diff-remove-color: #ffebe9;
|
12
|
-
--inspect-inactive-selection-background: var(
|
13
|
-
|
12
|
+
--inspect-inactive-selection-background: var(
|
13
|
+
--vscode-editor-inactiveSelectionBackground,
|
14
|
+
#d9d9d9
|
15
|
+
);
|
16
|
+
--inspect-active-selection-background: var(
|
17
|
+
--vscode-editor-selectionBackground,
|
18
|
+
#d7d4f0
|
19
|
+
);
|
14
20
|
--inspect-focus-border-color: #86b7fe;
|
15
21
|
--inspect-focus-border-shadow: 0 0 0 0.25rem rgba(var(--bs-primary-rgb), 0.25);
|
16
22
|
--inspect-focus-border-gray-color: #808080;
|
17
23
|
--inspect-focus-border-gray-shadow: 0 0 0 0.25rem rgba(48, 48, 48, 0.25);
|
24
|
+
|
25
|
+
/* Inspect Font Sizes */
|
26
|
+
--inspect-font-size-title: 1.5rem;
|
27
|
+
--inspect-font-size-title-secondary: 1.3rem;
|
28
|
+
--inspect-font-size-largest: 1.2rem;
|
29
|
+
--inspect-font-size-larger: 1.1rem;
|
30
|
+
--inspect-font-size-large: 1rem;
|
31
|
+
--inspect-font-size-base: 0.9rem;
|
32
|
+
--inspect-font-size-small: 0.8rem;
|
33
|
+
--inspect-font-size-smaller: 0.8rem;
|
18
34
|
}
|
19
35
|
|
20
36
|
body:not([class^="vscode-"]) button {
|
@@ -47,6 +63,65 @@ body[class^="vscode-"] .app-main-grid {
|
|
47
63
|
grid-template-rows: max-content max-content 1fr;
|
48
64
|
}
|
49
65
|
|
66
|
+
/* Inspect Text Styles */
|
67
|
+
.text-style-label {
|
68
|
+
text-transform: uppercase;
|
69
|
+
}
|
70
|
+
|
71
|
+
.text-style-secondary {
|
72
|
+
color: var(--bs-secondary);
|
73
|
+
}
|
74
|
+
|
75
|
+
.text-style-tertiary {
|
76
|
+
color: var(--bs-tertiary-color);
|
77
|
+
}
|
78
|
+
|
79
|
+
/* Inspect Font Size Styles */
|
80
|
+
.text-size-title {
|
81
|
+
font-size: var(--inspect-font-size-title);
|
82
|
+
}
|
83
|
+
|
84
|
+
.text-size-title-secondary {
|
85
|
+
font-size: var(--inspect-font-size-title-secondary);
|
86
|
+
}
|
87
|
+
|
88
|
+
.text-size-largest {
|
89
|
+
font-size: var(--inspect-font-size-largest);
|
90
|
+
}
|
91
|
+
|
92
|
+
.text-size-larger {
|
93
|
+
font-size: var(--inspect-font-size-larger);
|
94
|
+
}
|
95
|
+
|
96
|
+
.text-size-large {
|
97
|
+
font-size: var(--inspect-font-size-large);
|
98
|
+
}
|
99
|
+
|
100
|
+
.text-size-base {
|
101
|
+
font-size: var(--inspect-font-size-base);
|
102
|
+
}
|
103
|
+
|
104
|
+
.text-size-small {
|
105
|
+
font-size: var(--inspect-font-size-small);
|
106
|
+
}
|
107
|
+
|
108
|
+
.text-size-smaller {
|
109
|
+
font-size: var(--inspect-font-size-smaller);
|
110
|
+
}
|
111
|
+
|
112
|
+
.text-truncate {
|
113
|
+
white-space: nowrap;
|
114
|
+
text-overflow: ellipsis;
|
115
|
+
overflow: hidden;
|
116
|
+
}
|
117
|
+
|
118
|
+
.three-line-clamp {
|
119
|
+
display: -webkit-box;
|
120
|
+
-webkit-line-clamp: 3;
|
121
|
+
-webkit-box-orient: vertical;
|
122
|
+
overflow: hidden;
|
123
|
+
}
|
124
|
+
|
50
125
|
body[class^="vscode-"] {
|
51
126
|
--bs-border-radius: 0;
|
52
127
|
--bs-border-radius-lg: 0;
|
@@ -87,7 +162,7 @@ html.vscode {
|
|
87
162
|
|
88
163
|
html.vscode .sample-input {
|
89
164
|
line-height: 1.3em;
|
90
|
-
-webkit-line-clamp: 4 !important
|
165
|
+
-webkit-line-clamp: 4 !important;
|
91
166
|
}
|
92
167
|
|
93
168
|
body[class^="vscode-"] .modal-backdrop {
|
@@ -276,7 +351,7 @@ body {
|
|
276
351
|
}
|
277
352
|
|
278
353
|
@media (max-width: 575px) {
|
279
|
-
.tab-tools
|
354
|
+
.tab-tools select {
|
280
355
|
width: 50px;
|
281
356
|
}
|
282
357
|
}
|
@@ -312,12 +387,6 @@ body {
|
|
312
387
|
font-size: 1.5em;
|
313
388
|
}
|
314
389
|
|
315
|
-
.sidebar {
|
316
|
-
--bs-offcanvas-width: var(--sidebar-width);
|
317
|
-
width: var(--sidebar-width);
|
318
|
-
overflow-y: auto;
|
319
|
-
}
|
320
|
-
|
321
390
|
.nav-link.active {
|
322
391
|
border-bottom-width: 0 !important;
|
323
392
|
}
|
@@ -644,7 +713,7 @@ table.table.table-sm td {
|
|
644
713
|
|
645
714
|
.tab-tools .btn {
|
646
715
|
font-size: 0.7rem;
|
647
|
-
padding: 0.
|
716
|
+
padding: 0.2em 0.8em;
|
648
717
|
}
|
649
718
|
|
650
719
|
.tab-tools {
|
@@ -724,7 +793,7 @@ table.table.table-sm td {
|
|
724
793
|
}
|
725
794
|
|
726
795
|
@keyframes moveLeftToRight {
|
727
|
-
from {
|
796
|
+
from {
|
728
797
|
margin-left: 0;
|
729
798
|
}
|
730
799
|
to {
|
@@ -760,7 +829,6 @@ pre[class*="language-"].tool-output {
|
|
760
829
|
|
761
830
|
/* lightbox styles */
|
762
831
|
|
763
|
-
|
764
832
|
.lightbox-overlay .close-button,
|
765
833
|
.lightbox-overlay .nav-button {
|
766
834
|
/* Hide by default */
|
@@ -868,38 +936,38 @@ ul.jsondiffpatch-delta {
|
|
868
936
|
vertical-align: top;
|
869
937
|
}
|
870
938
|
.jsondiffpatch-property-name:after {
|
871
|
-
content:
|
939
|
+
content: ": ";
|
872
940
|
}
|
873
941
|
.jsondiffpatch-child-node-type-array > .jsondiffpatch-property-name:after {
|
874
|
-
content:
|
942
|
+
content: ": [";
|
875
943
|
}
|
876
944
|
.jsondiffpatch-child-node-type-array:after {
|
877
|
-
content:
|
945
|
+
content: "],";
|
878
946
|
}
|
879
947
|
div.jsondiffpatch-child-node-type-array:before {
|
880
|
-
content:
|
948
|
+
content: "[";
|
881
949
|
}
|
882
950
|
div.jsondiffpatch-child-node-type-array:after {
|
883
|
-
content:
|
951
|
+
content: "]";
|
884
952
|
}
|
885
953
|
.jsondiffpatch-child-node-type-object > .jsondiffpatch-property-name:after {
|
886
|
-
content:
|
954
|
+
content: ": {";
|
887
955
|
}
|
888
956
|
.jsondiffpatch-child-node-type-object:after {
|
889
|
-
content:
|
957
|
+
content: "},";
|
890
958
|
}
|
891
959
|
div.jsondiffpatch-child-node-type-object:before {
|
892
|
-
content:
|
960
|
+
content: "{";
|
893
961
|
}
|
894
962
|
div.jsondiffpatch-child-node-type-object:after {
|
895
|
-
content:
|
963
|
+
content: "}";
|
896
964
|
}
|
897
965
|
.jsondiffpatch-value pre:after {
|
898
|
-
content:
|
966
|
+
content: ",";
|
899
967
|
}
|
900
968
|
li:last-child > .jsondiffpatch-value pre:after,
|
901
969
|
.jsondiffpatch-modified > .jsondiffpatch-left-value pre:after {
|
902
|
-
content:
|
970
|
+
content: "";
|
903
971
|
}
|
904
972
|
.jsondiffpatch-modified .jsondiffpatch-value {
|
905
973
|
display: inline-block;
|
@@ -916,7 +984,7 @@ li:last-child > .jsondiffpatch-value pre:after,
|
|
916
984
|
color: #888;
|
917
985
|
}
|
918
986
|
.jsondiffpatch-moved .jsondiffpatch-moved-destination:before {
|
919
|
-
content:
|
987
|
+
content: " => ";
|
920
988
|
}
|
921
989
|
ul.jsondiffpatch-textdiff {
|
922
990
|
padding: 0;
|
@@ -930,7 +998,7 @@ ul.jsondiffpatch-textdiff {
|
|
930
998
|
display: inline-block;
|
931
999
|
}
|
932
1000
|
.jsondiffpatch-textdiff-line-number:after {
|
933
|
-
content:
|
1001
|
+
content: ",";
|
934
1002
|
}
|
935
1003
|
.jsondiffpatch-error {
|
936
1004
|
background: red;
|
@@ -976,14 +1044,14 @@ ul.jsondiffpatch-textdiff {
|
|
976
1044
|
padding: 1em;
|
977
1045
|
margin: 0.5em 0;
|
978
1046
|
overflow: auto;
|
979
|
-
border: 0.3em solid #7a6651;
|
1047
|
+
/* border: 0.3em solid #7a6651; */
|
980
1048
|
border-radius: 0.5em;
|
981
1049
|
box-shadow: 1px 1px 0.5em #000 inset;
|
982
1050
|
}
|
983
1051
|
.vscode-dark :not(pre) > code[class*="language-"] {
|
984
1052
|
padding: 0.15em 0.2em 0.05em;
|
985
1053
|
border-radius: 0.3em;
|
986
|
-
border: 0.13em solid #7a6651;
|
1054
|
+
/* border: 0.13em solid #7a6651; */
|
987
1055
|
box-shadow: 1px 1px 0.3em -0.1em #000 inset;
|
988
1056
|
white-space: normal;
|
989
1057
|
}
|
@@ -1045,4 +1113,4 @@ ul.jsondiffpatch-textdiff {
|
|
1045
1113
|
.vscode-dark .token.deleted {
|
1046
1114
|
color: red;
|
1047
1115
|
}
|
1048
|
-
/* END PrismJS */
|
1116
|
+
/* END PrismJS */
|
inspect_ai/_view/www/README.md
CHANGED