inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +13 -0
- inspect_ai/_cli/eval.py +44 -0
- inspect_ai/_display/textual/widgets/samples.py +49 -4
- inspect_ai/_display/textual/widgets/vscode.py +4 -2
- inspect_ai/_eval/eval.py +41 -28
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/loader.py +4 -5
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +6 -3
- inspect_ai/_eval/task/log.py +6 -0
- inspect_ai/_eval/task/run.py +108 -41
- inspect_ai/_eval/task/sandbox.py +19 -5
- inspect_ai/_util/_async.py +1 -1
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/environ.py +32 -0
- inspect_ai/_util/file.py +8 -1
- inspect_ai/_util/httpx.py +105 -22
- inspect_ai/_util/registry.py +83 -9
- inspect_ai/_util/text.py +81 -17
- inspect_ai/_util/transcript.py +9 -6
- inspect_ai/_util/vscode.py +7 -2
- inspect_ai/_view/schema.py +1 -1
- inspect_ai/_view/www/babel.config.js +11 -0
- inspect_ai/_view/www/dist/assets/index.css +3640 -3563
- inspect_ai/_view/www/dist/assets/index.js +59204 -52519
- inspect_ai/_view/www/eslint.config.mjs +10 -1
- inspect_ai/_view/www/jest.config.mjs +21 -0
- inspect_ai/_view/www/log-schema.json +111 -2
- inspect_ai/_view/www/package.json +19 -5
- inspect_ai/_view/www/src/{types → @types}/log.d.ts +95 -32
- inspect_ai/_view/www/{App.css → src/app/App.css} +22 -14
- inspect_ai/_view/www/src/app/App.tsx +168 -0
- inspect_ai/_view/www/src/{AppErrorBoundary.tsx → app/AppErrorBoundary.tsx} +1 -1
- inspect_ai/_view/www/src/{appearance → app/appearance}/icons.ts +1 -0
- inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.tsx +5 -5
- inspect_ai/_view/www/src/{workspace/WorkSpaceView.tsx → app/log-view/LogView.tsx} +59 -40
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +159 -0
- inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +109 -0
- inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.tsx +3 -3
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.tsx +4 -4
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.tsx +8 -8
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.tsx +6 -6
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.tsx +8 -8
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.tsx +35 -6
- inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +136 -0
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/SamplesTab.tsx +82 -73
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/grouping.ts +3 -3
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/types.ts +1 -1
- inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.tsx +2 -2
- inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.tsx +1 -1
- inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.tsx +4 -4
- inspect_ai/_view/www/src/{plan → app/plan}/PlanCard.tsx +2 -2
- inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.tsx +5 -5
- inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/AppRouter.tsx +58 -0
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +182 -0
- inspect_ai/_view/www/src/app/routing/url.ts +43 -0
- inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.tsx +11 -27
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDialog.tsx +36 -40
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.tsx +116 -49
- inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.module.css +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.tsx +29 -26
- inspect_ai/_view/www/src/{samples → app/samples}/SamplesTools.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.module.css +5 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.tsx +12 -4
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRenderer.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.tsx +6 -1
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatView.tsx +4 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.tsx +5 -3
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +12 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContent.tsx +11 -10
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContents.tsx +14 -8
- inspect_ai/_view/www/src/{samples → app/samples}/chat/messages.ts +2 -2
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolCallView.tsx +26 -27
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +19 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolInput.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.module.css +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/tool.ts +1 -1
- inspect_ai/_view/www/src/app/samples/chat/types.ts +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/samplesDescriptor.tsx +38 -15
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/CategoricalScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/NumericScoreDescriptor.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/OtherScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/types.ts +4 -3
- inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.module.css +2 -1
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.tsx +3 -0
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.tsx +47 -33
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.module.css +16 -0
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.tsx +47 -20
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/filters.ts +8 -6
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.tsx +4 -3
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/completions.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/language.ts +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/sampleDataAdapter.ts +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/sampleLimit.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.tsx +12 -11
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.tsx +6 -6
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ApprovalEventView.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ErrorEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/InputEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.module.css +13 -7
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.tsx +49 -21
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.tsx +11 -9
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleLimitEventView.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.tsx +8 -6
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/StepEventView.tsx +11 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.module.css +8 -7
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.tsx +32 -114
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.module.css +6 -5
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.tsx +14 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/utils.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenderers.tsx +23 -21
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenders.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.tsx +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +142 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +39 -0
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/types.ts +1 -1
- inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.tsx +1 -1
- inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.module.css +16 -0
- inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.tsx +70 -0
- inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.module.css +8 -0
- inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.tsx +35 -17
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.tsx +2 -2
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/{types.ts → app/types.ts} +18 -11
- inspect_ai/_view/www/src/{usage → app/usage}/ModelTokenTable.tsx +1 -1
- inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.tsx +2 -2
- inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.tsx +1 -1
- inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.tsx +6 -6
- inspect_ai/_view/www/src/{api → client/api}/api-browser.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/api-http.ts +3 -3
- inspect_ai/_view/www/src/{api → client/api}/api-vscode.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/client-api.ts +6 -5
- inspect_ai/_view/www/src/{api → client/api}/index.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/types.ts +4 -1
- inspect_ai/_view/www/src/{logfile → client/remote}/remoteLogFile.ts +3 -3
- inspect_ai/_view/www/src/{storage → client/storage}/index.ts +11 -5
- inspect_ai/_view/www/src/components/Card.tsx +1 -1
- inspect_ai/_view/www/src/components/CopyButton.tsx +1 -1
- inspect_ai/_view/www/src/components/DownloadButton.tsx +1 -1
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/{ExpandablePanel.css → ExpandablePanel.module.css} +14 -11
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +16 -10
- inspect_ai/_view/www/src/components/FindBand.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.css +2 -2
- inspect_ai/_view/www/src/components/LargeModal.tsx +12 -1
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -1
- inspect_ai/_view/www/src/components/MessageBand.tsx +1 -1
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +1 -1
- inspect_ai/_view/www/src/constants.ts +10 -9
- inspect_ai/_view/www/src/index.tsx +27 -11
- inspect_ai/_view/www/src/state/appSlice.ts +44 -5
- inspect_ai/_view/www/src/state/hooks.ts +30 -7
- inspect_ai/_view/www/src/state/logSlice.ts +7 -5
- inspect_ai/_view/www/src/state/logsPolling.ts +1 -1
- inspect_ai/_view/www/src/state/logsSlice.ts +18 -13
- inspect_ai/_view/www/src/state/samplePolling.ts +12 -12
- inspect_ai/_view/www/src/state/sampleSlice.ts +3 -5
- inspect_ai/_view/www/src/state/sampleUtils.ts +1 -1
- inspect_ai/_view/www/src/{scoring/utils.ts → state/scoring.ts} +2 -2
- inspect_ai/_view/www/src/state/store.ts +9 -7
- inspect_ai/_view/www/src/state/utils.ts +1 -1
- inspect_ai/_view/www/src/tests/README.md +49 -0
- inspect_ai/_view/www/src/tests/__mocks__/fileMock.js +1 -0
- inspect_ai/_view/www/src/tests/__mocks__/styleMock.js +1 -0
- inspect_ai/_view/www/src/tests/setupTests.mjs +1 -0
- inspect_ai/_view/www/src/tests/utils/base64.test.ts +23 -0
- inspect_ai/_view/www/src/tests/utils/format.test.ts +127 -0
- inspect_ai/_view/www/src/tests/utils/path.test.ts +54 -0
- inspect_ai/_view/www/src/utils/format.ts +8 -2
- inspect_ai/_view/www/src/utils/path.ts +14 -2
- inspect_ai/_view/www/src/utils/polling.ts +1 -2
- inspect_ai/_view/www/src/utils/uri.ts +32 -0
- inspect_ai/_view/www/yarn.lock +3310 -382
- inspect_ai/agent/_handoff.py +6 -3
- inspect_ai/agent/_human/agent.py +5 -3
- inspect_ai/agent/_human/install.py +16 -7
- inspect_ai/agent/_human/panel.py +14 -1
- inspect_ai/agent/_human/service.py +5 -1
- inspect_ai/agent/_react.py +161 -128
- inspect_ai/agent/_types.py +15 -4
- inspect_ai/approval/_policy.py +2 -2
- inspect_ai/log/_file.py +30 -11
- inspect_ai/log/_log.py +7 -1
- inspect_ai/log/_recorders/eval.py +3 -0
- inspect_ai/log/_recorders/types.py +1 -0
- inspect_ai/log/_samples.py +4 -0
- inspect_ai/model/_call_tools.py +33 -17
- inspect_ai/model/_generate_config.py +10 -2
- inspect_ai/model/_model.py +41 -21
- inspect_ai/model/_model_output.py +2 -1
- inspect_ai/model/_openai.py +10 -8
- inspect_ai/model/_openai_responses.py +95 -42
- inspect_ai/model/_providers/anthropic.py +14 -12
- inspect_ai/model/_providers/google.py +191 -95
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/mistral.py +2 -3
- inspect_ai/model/_providers/openai.py +54 -17
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/openai_responses.py +28 -16
- inspect_ai/model/_providers/openrouter.py +14 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +17 -7
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/scorer/_metric.py +17 -1
- inspect_ai/scorer/_model.py +51 -6
- inspect_ai/scorer/_scorer.py +1 -1
- inspect_ai/solver/_human_agent.py +3 -0
- inspect_ai/solver/_plan.py +1 -1
- inspect_ai/solver/_solver.py +1 -1
- inspect_ai/solver/_use_tools.py +14 -8
- inspect_ai/tool/__init__.py +16 -1
- inspect_ai/tool/_json_rpc_helpers.py +285 -0
- inspect_ai/tool/_mcp/__init__.py +13 -0
- inspect_ai/tool/_mcp/_context.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +293 -0
- inspect_ai/tool/_mcp/_sandbox.py +104 -0
- inspect_ai/tool/_mcp/_types.py +31 -0
- inspect_ai/tool/_mcp/connection.py +60 -0
- inspect_ai/tool/_mcp/sampling.py +118 -0
- inspect_ai/tool/_mcp/server.py +112 -0
- inspect_ai/tool/_mcp/tools.py +34 -0
- inspect_ai/tool/_tool.py +13 -0
- inspect_ai/tool/_tool_def.py +24 -7
- inspect_ai/tool/_tool_support_helpers.py +129 -153
- inspect_ai/tool/_tools/_bash_session.py +11 -11
- inspect_ai/tool/_tools/_text_editor.py +6 -6
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +8 -8
- inspect_ai/util/_anyio.py +31 -20
- inspect_ai/util/_json.py +20 -2
- inspect_ai/util/_sandbox/context.py +18 -7
- inspect_ai/util/_sandbox/docker/compose.py +1 -1
- inspect_ai/util/_sandbox/docker/docker.py +92 -21
- inspect_ai/util/_sandbox/environment.py +33 -2
- inspect_ai/util/_sandbox/events.py +2 -2
- inspect_ai/util/_sandbox/service.py +13 -3
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/METADATA +6 -2
- inspect_ai-0.3.92.dist-info/RECORD +732 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/App.tsx +0 -316
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +0 -4
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +0 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +0 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +0 -14
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +0 -292
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +0 -5
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +0 -57
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +0 -43
- inspect_ai-0.3.90.dist-info/RECORD +0 -705
- /inspect_ai/_view/www/src/{types → @types}/asciicinema-player.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/jsondiffpatch.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/markdown-it-katex.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/prism.d.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/colors.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/fonts.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/styles.ts +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataGrid.tsx +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.tsx +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetadataGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/types.ts +0 -0
- /inspect_ai/_view/www/src/{workspace/WorkSpaceView.module.css → app/log-view/LogView.module.css} +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.tsx +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/JsonTab.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.tsx +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/types.ts +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/utils.ts +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.tsx +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/error.ts +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/tokenize.ts +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateDiffView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.module.css +0 -0
- /inspect_ai/_view/www/src/{api → client/api}/api-shared.ts +0 -0
- /inspect_ai/_view/www/src/{api → client/api}/jsonrpc.ts +0 -0
- /inspect_ai/_view/www/src/{logfile → client/remote}/remoteZipFile.ts +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import functools
|
2
|
+
import os
|
2
3
|
from typing import Any, Callable, Literal, cast
|
3
4
|
|
4
5
|
import click
|
@@ -21,6 +22,7 @@ class CommonOptions(TypedDict):
|
|
21
22
|
log_dir: str
|
22
23
|
display: Literal["full", "conversation", "rich", "plain", "none"]
|
23
24
|
no_ansi: bool | None
|
25
|
+
traceback_locals: bool
|
24
26
|
env: tuple[str] | None
|
25
27
|
debug: bool
|
26
28
|
debug_port: int
|
@@ -72,6 +74,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
72
74
|
help="Do not print ANSI control characters.",
|
73
75
|
envvar="INSPECT_NO_ANSI",
|
74
76
|
)
|
77
|
+
@click.option(
|
78
|
+
"--traceback-locals",
|
79
|
+
type=bool,
|
80
|
+
is_flag=True,
|
81
|
+
envvar="INSPECT_TRACEBACK_LOCALS",
|
82
|
+
help="Include values of local variables in tracebacks (note that this can leak private data e.g. API keys so should typically only be enabled for targeted debugging).",
|
83
|
+
)
|
75
84
|
@click.option(
|
76
85
|
"--env",
|
77
86
|
multiple=True,
|
@@ -107,6 +116,10 @@ def process_common_options(options: CommonOptions) -> None:
|
|
107
116
|
env_args = parse_cli_args(options["env"])
|
108
117
|
init_cli_env(env_args)
|
109
118
|
|
119
|
+
# set traceback locals env var
|
120
|
+
if options.get("traceback_locals", False):
|
121
|
+
os.environ["INSPECT_TRACEBACK_LOCALS"] = "1"
|
122
|
+
|
110
123
|
# propagate display
|
111
124
|
if options["no_ansi"]:
|
112
125
|
display = "rich"
|
inspect_ai/_cli/eval.py
CHANGED
@@ -12,6 +12,7 @@ from inspect_ai._util.constants import (
|
|
12
12
|
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
13
13
|
DEFAULT_LOG_SHARED,
|
14
14
|
DEFAULT_MAX_CONNECTIONS,
|
15
|
+
DEFAULT_RETRY_ON_ERROR,
|
15
16
|
)
|
16
17
|
from inspect_ai._util.file import filesystem
|
17
18
|
from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
|
@@ -43,6 +44,7 @@ NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task comple
|
|
43
44
|
FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
|
44
45
|
NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
|
45
46
|
NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
|
47
|
+
RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
|
46
48
|
LOG_IMAGES_HELP = (
|
47
49
|
"Include base64 encoded versions of filename or URL based images in the log file."
|
48
50
|
)
|
@@ -263,6 +265,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
263
265
|
help=NO_FAIL_ON_ERROR_HELP,
|
264
266
|
envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
|
265
267
|
)
|
268
|
+
@click.option(
|
269
|
+
"--retry-on-error",
|
270
|
+
is_flag=False,
|
271
|
+
flag_value="true",
|
272
|
+
default=None,
|
273
|
+
callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
|
274
|
+
help=RETRY_ON_ERROR_HELP,
|
275
|
+
envvar="INSPECT_EVAL_RETRY_ON_ERROR",
|
276
|
+
)
|
266
277
|
@click.option(
|
267
278
|
"--no-log-samples",
|
268
279
|
type=bool,
|
@@ -428,6 +439,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
428
439
|
help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
|
429
440
|
envvar="INSPECT_EVAL_REASONING_TOKENS",
|
430
441
|
)
|
442
|
+
@click.option(
|
443
|
+
"--reasoning-summary",
|
444
|
+
type=click.Choice(["concise", "detailed", "auto"]),
|
445
|
+
help="Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only.",
|
446
|
+
envvar="INSPECT_EVAL_REASONING_SUMMARY",
|
447
|
+
)
|
431
448
|
@click.option(
|
432
449
|
"--reasoning-history",
|
433
450
|
type=click.Choice(["none", "all", "last", "auto"]),
|
@@ -512,6 +529,7 @@ def eval_command(
|
|
512
529
|
cache_prompt: str | None,
|
513
530
|
reasoning_effort: str | None,
|
514
531
|
reasoning_tokens: int | None,
|
532
|
+
reasoning_summary: Literal["concise", "detailed", "auto"] | None,
|
515
533
|
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
516
534
|
response_schema: ResponseSchema | None,
|
517
535
|
message_limit: int | None,
|
@@ -524,6 +542,7 @@ def eval_command(
|
|
524
542
|
max_sandboxes: int | None,
|
525
543
|
fail_on_error: bool | float | None,
|
526
544
|
no_fail_on_error: bool | None,
|
545
|
+
retry_on_error: int | None,
|
527
546
|
no_log_samples: bool | None,
|
528
547
|
log_images: bool | None,
|
529
548
|
log_buffer: int | None,
|
@@ -578,6 +597,7 @@ def eval_command(
|
|
578
597
|
max_sandboxes=max_sandboxes,
|
579
598
|
fail_on_error=fail_on_error,
|
580
599
|
no_fail_on_error=no_fail_on_error,
|
600
|
+
retry_on_error=retry_on_error,
|
581
601
|
debug_errors=common["debug_errors"],
|
582
602
|
no_log_samples=no_log_samples,
|
583
603
|
log_images=log_images,
|
@@ -683,6 +703,7 @@ def eval_set_command(
|
|
683
703
|
cache_prompt: str | None,
|
684
704
|
reasoning_effort: str | None,
|
685
705
|
reasoning_tokens: int | None,
|
706
|
+
reasoning_summary: Literal["concise", "detailed", "auto"] | None,
|
686
707
|
reasoning_history: Literal["none", "all", "last", "auto"] | None,
|
687
708
|
response_schema: ResponseSchema | None,
|
688
709
|
message_limit: int | None,
|
@@ -695,6 +716,7 @@ def eval_set_command(
|
|
695
716
|
max_sandboxes: int | None,
|
696
717
|
fail_on_error: bool | float | None,
|
697
718
|
no_fail_on_error: bool | None,
|
719
|
+
retry_on_error: int | None,
|
698
720
|
no_log_samples: bool | None,
|
699
721
|
log_images: bool | None,
|
700
722
|
log_buffer: int | None,
|
@@ -754,6 +776,7 @@ def eval_set_command(
|
|
754
776
|
max_sandboxes=max_sandboxes,
|
755
777
|
fail_on_error=fail_on_error,
|
756
778
|
no_fail_on_error=no_fail_on_error,
|
779
|
+
retry_on_error=retry_on_error,
|
757
780
|
debug_errors=common["debug_errors"],
|
758
781
|
no_log_samples=no_log_samples,
|
759
782
|
log_images=log_images,
|
@@ -811,6 +834,7 @@ def eval_exec(
|
|
811
834
|
max_sandboxes: int | None,
|
812
835
|
fail_on_error: bool | float | None,
|
813
836
|
no_fail_on_error: bool | None,
|
837
|
+
retry_on_error: int | None,
|
814
838
|
debug_errors: bool | None,
|
815
839
|
no_log_samples: bool | None,
|
816
840
|
log_images: bool | None,
|
@@ -858,6 +882,10 @@ def eval_exec(
|
|
858
882
|
elif fail_on_error == 0.0:
|
859
883
|
fail_on_error = True
|
860
884
|
|
885
|
+
# resolve retry_on_error
|
886
|
+
if retry_on_error == 0:
|
887
|
+
retry_on_error = None
|
888
|
+
|
861
889
|
# resolve negating options
|
862
890
|
sandbox_cleanup = False if no_sandbox_cleanup else None
|
863
891
|
log_samples = False if no_log_samples else None
|
@@ -890,6 +918,7 @@ def eval_exec(
|
|
890
918
|
sample_id=eval_sample_id,
|
891
919
|
epochs=eval_epochs,
|
892
920
|
fail_on_error=fail_on_error,
|
921
|
+
retry_on_error=retry_on_error,
|
893
922
|
debug_errors=debug_errors,
|
894
923
|
message_limit=message_limit,
|
895
924
|
token_limit=token_limit,
|
@@ -1024,6 +1053,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
|
|
1024
1053
|
help=NO_FAIL_ON_ERROR_HELP,
|
1025
1054
|
envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
|
1026
1055
|
)
|
1056
|
+
@click.option(
|
1057
|
+
"--retry-on-error",
|
1058
|
+
is_flag=False,
|
1059
|
+
flag_value="true",
|
1060
|
+
default=None,
|
1061
|
+
callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
|
1062
|
+
help=RETRY_ON_ERROR_HELP,
|
1063
|
+
envvar="INSPECT_EVAL_RETRY_ON_ERROR",
|
1064
|
+
)
|
1027
1065
|
@click.option(
|
1028
1066
|
"--no-log-samples",
|
1029
1067
|
type=bool,
|
@@ -1096,6 +1134,7 @@ def eval_retry_command(
|
|
1096
1134
|
trace: bool | None,
|
1097
1135
|
fail_on_error: bool | float | None,
|
1098
1136
|
no_fail_on_error: bool | None,
|
1137
|
+
retry_on_error: int | None,
|
1099
1138
|
no_log_samples: bool | None,
|
1100
1139
|
log_images: bool | None,
|
1101
1140
|
log_buffer: int | None,
|
@@ -1125,6 +1164,10 @@ def eval_retry_command(
|
|
1125
1164
|
elif fail_on_error == 0.0:
|
1126
1165
|
fail_on_error = True
|
1127
1166
|
|
1167
|
+
# resolve retry on error
|
1168
|
+
if retry_on_error == 0:
|
1169
|
+
retry_on_error = None
|
1170
|
+
|
1128
1171
|
# resolve log file
|
1129
1172
|
retry_log_files = [
|
1130
1173
|
log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
|
@@ -1143,6 +1186,7 @@ def eval_retry_command(
|
|
1143
1186
|
sandbox_cleanup=sandbox_cleanup,
|
1144
1187
|
trace=trace,
|
1145
1188
|
fail_on_error=fail_on_error,
|
1189
|
+
retry_on_error=retry_on_error,
|
1146
1190
|
debug_errors=common["debug_errors"],
|
1147
1191
|
log_samples=log_samples,
|
1148
1192
|
log_images=log_images,
|
@@ -1,11 +1,18 @@
|
|
1
1
|
import time
|
2
2
|
from typing import cast
|
3
|
+
from urllib.parse import urlencode, urlparse, urlunparse
|
3
4
|
|
4
5
|
from rich.console import RenderableType
|
5
6
|
from rich.table import Table
|
6
7
|
from rich.text import Text
|
7
8
|
from textual.app import ComposeResult
|
8
|
-
from textual.containers import
|
9
|
+
from textual.containers import (
|
10
|
+
Horizontal,
|
11
|
+
HorizontalGroup,
|
12
|
+
Right,
|
13
|
+
Vertical,
|
14
|
+
VerticalGroup,
|
15
|
+
)
|
9
16
|
from textual.css.query import NoMatches
|
10
17
|
from textual.reactive import reactive
|
11
18
|
from textual.widget import Widget
|
@@ -20,9 +27,12 @@ from textual.widgets import (
|
|
20
27
|
from textual.widgets.option_list import Option, OptionDoesNotExist
|
21
28
|
|
22
29
|
from inspect_ai._display.textual.widgets.port_mappings import get_url
|
30
|
+
from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
|
31
|
+
from inspect_ai._util.file import to_uri
|
23
32
|
from inspect_ai._util.format import format_progress_time
|
24
33
|
from inspect_ai._util.port_names import get_service_by_port
|
25
34
|
from inspect_ai._util.registry import registry_unqualified_name
|
35
|
+
from inspect_ai._util.vscode import EXTENSION_COMMAND_OPEN_SAMPLE, VSCodeCommand
|
26
36
|
from inspect_ai.log._samples import ActiveSample
|
27
37
|
from inspect_ai.log._transcript import ToolEvent
|
28
38
|
|
@@ -272,6 +282,16 @@ class SampleInfo(Vertical):
|
|
272
282
|
background: $surface;
|
273
283
|
color: $secondary;
|
274
284
|
}
|
285
|
+
SampleInfo #sample-link {
|
286
|
+
height: auto;
|
287
|
+
width: 11;
|
288
|
+
margin-left: 1;
|
289
|
+
background: $background;
|
290
|
+
}
|
291
|
+
SampleInfo #sample-link Link {
|
292
|
+
color: $accent;
|
293
|
+
background: $background;
|
294
|
+
}
|
275
295
|
"""
|
276
296
|
|
277
297
|
def __init__(self) -> None:
|
@@ -280,9 +300,12 @@ class SampleInfo(Vertical):
|
|
280
300
|
self._sandbox_count: int | None = None
|
281
301
|
|
282
302
|
def compose(self) -> ComposeResult:
|
283
|
-
with
|
284
|
-
|
285
|
-
|
303
|
+
with Horizontal():
|
304
|
+
with Collapsible(title=""):
|
305
|
+
yield SampleLimits()
|
306
|
+
yield SandboxesView()
|
307
|
+
yield Right(id="sample-link")
|
308
|
+
|
286
309
|
yield SampleVNC()
|
287
310
|
|
288
311
|
async def sync_sample(self, sample: ActiveSample | None) -> None:
|
@@ -311,6 +334,28 @@ class SampleInfo(Vertical):
|
|
311
334
|
await sandboxes.sync_sample(sample)
|
312
335
|
await self.query_one(SampleVNC).sync_sample(sample)
|
313
336
|
|
337
|
+
# View Log Link
|
338
|
+
base_uri = sample.log_location
|
339
|
+
query_params = {
|
340
|
+
"sample_id": sample.sample.id,
|
341
|
+
"epoch": sample.epoch,
|
342
|
+
}
|
343
|
+
|
344
|
+
parsed = urlparse(to_uri(base_uri))
|
345
|
+
view_link = urlunparse(parsed._replace(query=urlencode(query_params)))
|
346
|
+
|
347
|
+
link_container = self.query_one("#sample-link")
|
348
|
+
link_container.remove_children()
|
349
|
+
link = conditional_vscode_link(
|
350
|
+
"[View Log]",
|
351
|
+
VSCodeCommand(
|
352
|
+
command="inspect.openLogViewer",
|
353
|
+
args=[view_link] if sample.log_location else [],
|
354
|
+
),
|
355
|
+
EXTENSION_COMMAND_OPEN_SAMPLE,
|
356
|
+
)
|
357
|
+
link_container.mount(link)
|
358
|
+
|
314
359
|
|
315
360
|
class SampleLimits(Widget):
|
316
361
|
DEFAULT_CSS = """
|
@@ -8,8 +8,10 @@ from inspect_ai._util.vscode import (
|
|
8
8
|
)
|
9
9
|
|
10
10
|
|
11
|
-
def conditional_vscode_link(
|
12
|
-
|
11
|
+
def conditional_vscode_link(
|
12
|
+
text: str, command: VSCodeCommand, context: str | None = None
|
13
|
+
) -> Widget:
|
14
|
+
if can_execute_vscode_command(command.command, context=context):
|
13
15
|
vscode_link = VSCodeLink(text)
|
14
16
|
vscode_link.commands = [command]
|
15
17
|
return vscode_link
|
inspect_ai/_eval/eval.py
CHANGED
@@ -90,6 +90,7 @@ def eval(
|
|
90
90
|
sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
|
91
91
|
epochs: int | Epochs | None = None,
|
92
92
|
fail_on_error: bool | float | None = None,
|
93
|
+
retry_on_error: int | None = None,
|
93
94
|
debug_errors: bool | None = None,
|
94
95
|
message_limit: int | None = None,
|
95
96
|
token_limit: int | None = None,
|
@@ -151,6 +152,8 @@ def eval(
|
|
151
152
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
152
153
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
153
154
|
eval if a count of samples fails.
|
155
|
+
retry_on_error: Number of times to retry samples if they encounter errors
|
156
|
+
(by default, no retries occur).
|
154
157
|
debug_errors: Raise task errors (rather than logging them)
|
155
158
|
so they can be debugged (defaults to False).
|
156
159
|
message_limit: Limit on total messages used for each sample.
|
@@ -214,6 +217,7 @@ def eval(
|
|
214
217
|
sample_id=sample_id,
|
215
218
|
epochs=epochs,
|
216
219
|
fail_on_error=fail_on_error,
|
220
|
+
retry_on_error=retry_on_error,
|
217
221
|
debug_errors=debug_errors,
|
218
222
|
message_limit=message_limit,
|
219
223
|
token_limit=token_limit,
|
@@ -266,6 +270,7 @@ async def eval_async(
|
|
266
270
|
sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
|
267
271
|
epochs: int | Epochs | None = None,
|
268
272
|
fail_on_error: bool | float | None = None,
|
273
|
+
retry_on_error: int | None = None,
|
269
274
|
debug_errors: bool | None = None,
|
270
275
|
message_limit: int | None = None,
|
271
276
|
token_limit: int | None = None,
|
@@ -315,6 +320,8 @@ async def eval_async(
|
|
315
320
|
fail_on_error: `True` to fail on first sample error
|
316
321
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
317
322
|
to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
|
323
|
+
retry_on_error: Number of times to retry samples if they encounter errors
|
324
|
+
(by default, no retries occur).
|
318
325
|
debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
|
319
326
|
message_limit: Limit on total messages used for each sample.
|
320
327
|
token_limit: Limit on total tokens used for each sample.
|
@@ -455,6 +462,7 @@ async def eval_async(
|
|
455
462
|
else None,
|
456
463
|
approval=config_from_approval_policies(approval) if approval else None,
|
457
464
|
fail_on_error=fail_on_error,
|
465
|
+
retry_on_error=retry_on_error,
|
458
466
|
message_limit=message_limit,
|
459
467
|
token_limit=token_limit,
|
460
468
|
time_limit=time_limit,
|
@@ -551,6 +559,7 @@ def eval_retry(
|
|
551
559
|
trace: bool | None = None,
|
552
560
|
display: DisplayType | None = None,
|
553
561
|
fail_on_error: bool | float | None = None,
|
562
|
+
retry_on_error: int | None = None,
|
554
563
|
debug_errors: bool | None = None,
|
555
564
|
log_samples: bool | None = None,
|
556
565
|
log_images: bool | None = None,
|
@@ -589,6 +598,8 @@ def eval_retry(
|
|
589
598
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
590
599
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
591
600
|
eval if a count of samples fails.
|
601
|
+
retry_on_error: Number of times to retry samples if they encounter errors
|
602
|
+
(by default, no retries occur).
|
592
603
|
debug_errors: Raise task errors (rather than logging them)
|
593
604
|
so they can be debugged (defaults to False).
|
594
605
|
log_samples: Log detailed samples and scores (defaults to True)
|
@@ -631,6 +642,7 @@ def eval_retry(
|
|
631
642
|
max_sandboxes=max_sandboxes,
|
632
643
|
sandbox_cleanup=sandbox_cleanup,
|
633
644
|
fail_on_error=fail_on_error,
|
645
|
+
retry_on_error=retry_on_error,
|
634
646
|
debug_errors=debug_errors,
|
635
647
|
log_samples=log_samples,
|
636
648
|
log_images=log_images,
|
@@ -658,6 +670,7 @@ async def eval_retry_async(
|
|
658
670
|
max_sandboxes: int | None = None,
|
659
671
|
sandbox_cleanup: bool | None = None,
|
660
672
|
fail_on_error: bool | float | None = None,
|
673
|
+
retry_on_error: int | None = None,
|
661
674
|
debug_errors: bool | None = None,
|
662
675
|
log_samples: bool | None = None,
|
663
676
|
log_images: bool | None = None,
|
@@ -672,46 +685,40 @@ async def eval_retry_async(
|
|
672
685
|
"""Retry a previously failed evaluation task.
|
673
686
|
|
674
687
|
Args:
|
675
|
-
tasks:
|
676
|
-
|
677
|
-
log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
|
688
|
+
tasks: Log files for task(s) to retry.
|
689
|
+
log_level: Level for logging to the console: "debug", "http", "sandbox",
|
678
690
|
"info", "warning", "error", or "critical" (defaults to "warning")
|
679
|
-
log_level_transcript
|
680
|
-
log_dir
|
681
|
-
|
682
|
-
|
683
|
-
to "eval", the native high-performance format).
|
684
|
-
max_samples (int | None): Maximum number of samples to run in parallel
|
691
|
+
log_level_transcript: Level for logging to the log file (defaults to "info")
|
692
|
+
log_dir: Output path for logging results (defaults to file log in ./logs directory).
|
693
|
+
log_format: Format for writing log files (defaults to "eval", the native high-performance format).
|
694
|
+
max_samples: Maximum number of samples to run in parallel
|
685
695
|
(default is max_connections)
|
686
|
-
max_tasks
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
|
691
|
-
sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
|
696
|
+
max_tasks: Maximum number of tasks to run in parallel (default is 1)
|
697
|
+
max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
|
698
|
+
max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
|
699
|
+
sandbox_cleanup: Cleanup sandbox environments after task completes
|
692
700
|
(defaults to True)
|
693
|
-
fail_on_error
|
701
|
+
fail_on_error: `True` to fail on first sample error
|
694
702
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
695
703
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
696
704
|
eval if a count of samples fails.
|
697
|
-
|
705
|
+
retry_on_error: Number of times to retry samples if they encounter errors
|
706
|
+
(by default, no retries occur).
|
707
|
+
debug_errors: Raise task errors (rather than logging them)
|
698
708
|
so they can be debugged (defaults to False).
|
699
|
-
log_samples:
|
700
|
-
log_images:
|
709
|
+
log_samples: Log detailed samples and scores (defaults to True)
|
710
|
+
log_images: Log base64 encoded version of images,
|
701
711
|
even if specified as a filename or URL (defaults to False)
|
702
|
-
log_buffer:
|
712
|
+
log_buffer: Number of samples to buffer before writing log file.
|
703
713
|
If not specified, an appropriate default for the format and filesystem is
|
704
714
|
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
|
705
715
|
log_shared: Indicate that the log directory is shared, which results in
|
706
716
|
additional syncing of realtime log data for Inspect View.
|
707
|
-
score
|
708
|
-
score_display
|
709
|
-
max_retries
|
710
|
-
|
711
|
-
|
712
|
-
Request timeout (in seconds)
|
713
|
-
max_connections (int | None):
|
714
|
-
Maximum number of concurrent connections to Model API (default is per Model API)
|
717
|
+
score: Score output (defaults to True)
|
718
|
+
score_display: Show scoring metrics in realtime (defaults to True)
|
719
|
+
max_retries: Maximum number of times to retry request.
|
720
|
+
timeout: Request timeout (in seconds)
|
721
|
+
max_connections: Maximum number of concurrent connections to Model API (default is per Model API)
|
715
722
|
|
716
723
|
Returns:
|
717
724
|
List of EvalLog (one for each task)
|
@@ -802,6 +809,11 @@ async def eval_retry_async(
|
|
802
809
|
if fail_on_error is not None
|
803
810
|
else eval_log.eval.config.fail_on_error
|
804
811
|
)
|
812
|
+
retry_on_error = (
|
813
|
+
retry_on_error
|
814
|
+
if retry_on_error is not None
|
815
|
+
else eval_log.eval.config.retry_on_error
|
816
|
+
)
|
805
817
|
log_samples = (
|
806
818
|
log_samples if log_samples is not None else eval_log.eval.config.log_samples
|
807
819
|
)
|
@@ -852,6 +864,7 @@ async def eval_retry_async(
|
|
852
864
|
sample_id=sample_id,
|
853
865
|
epochs=epochs,
|
854
866
|
fail_on_error=fail_on_error,
|
867
|
+
retry_on_error=retry_on_error,
|
855
868
|
debug_errors=debug_errors,
|
856
869
|
message_limit=message_limit,
|
857
870
|
token_limit=token_limit,
|
inspect_ai/_eval/evalset.py
CHANGED
@@ -82,6 +82,7 @@ def eval_set(
|
|
82
82
|
sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
|
83
83
|
epochs: int | Epochs | None = None,
|
84
84
|
fail_on_error: bool | float | None = None,
|
85
|
+
retry_on_error: int | None = None,
|
85
86
|
debug_errors: bool | None = None,
|
86
87
|
message_limit: int | None = None,
|
87
88
|
token_limit: int | None = None,
|
@@ -153,6 +154,8 @@ def eval_set(
|
|
153
154
|
(default); `False` to never fail on sample errors; Value between 0 and 1
|
154
155
|
to fail if a proportion of total samples fails. Value greater than 1 to fail
|
155
156
|
eval if a count of samples fails.
|
157
|
+
retry_on_error: Number of times to retry samples if they encounter errors
|
158
|
+
(by default, no retries occur).
|
156
159
|
debug_errors: Raise task errors (rather than logging them)
|
157
160
|
so they can be debugged (defaults to False).
|
158
161
|
message_limit: Limit on total messages used for each sample.
|
@@ -215,6 +218,7 @@ def eval_set(
|
|
215
218
|
sample_id=sample_id,
|
216
219
|
epochs=epochs,
|
217
220
|
fail_on_error=fail_on_error,
|
221
|
+
retry_on_error=retry_on_error,
|
218
222
|
debug_errors=debug_errors,
|
219
223
|
message_limit=message_limit,
|
220
224
|
token_limit=token_limit,
|
inspect_ai/_eval/loader.py
CHANGED
@@ -25,7 +25,6 @@ from inspect_ai._util.registry import (
|
|
25
25
|
registry_lookup,
|
26
26
|
registry_params,
|
27
27
|
)
|
28
|
-
from inspect_ai.agent._agent import Agent
|
29
28
|
from inspect_ai.agent._as_solver import as_solver
|
30
29
|
from inspect_ai.model import Model
|
31
30
|
from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
|
@@ -423,9 +422,9 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
423
422
|
if solver_name is None:
|
424
423
|
raise ValueError(f"Unable to resolve solver name from {spec.solver}")
|
425
424
|
elif registry_lookup("solver", solver_name) is not None:
|
426
|
-
return
|
425
|
+
return registry_create("solver", solver_name, **spec.args)
|
427
426
|
elif registry_lookup("agent", solver_name) is not None:
|
428
|
-
agent =
|
427
|
+
agent = registry_create("agent", solver_name, **spec.args)
|
429
428
|
return as_solver(agent)
|
430
429
|
else:
|
431
430
|
raise ValueError(
|
@@ -484,11 +483,11 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
484
483
|
|
485
484
|
# create decorator based solvers using the registry
|
486
485
|
if any(solver[0] == solver_name for solver in solver_decorators):
|
487
|
-
return
|
486
|
+
return registry_create("solver", solver_name, **spec.args)
|
488
487
|
|
489
488
|
# create decorator based agents using the registry
|
490
489
|
elif any(agent[0] == solver_name for agent in agent_decorators):
|
491
|
-
agent =
|
490
|
+
agent = registry_create("agent", solver_name, **spec.args)
|
492
491
|
return as_solver(agent)
|
493
492
|
|
494
493
|
# create bridge based solvers by calling the function and wrapping it in bridge()
|
inspect_ai/_eval/registry.py
CHANGED
@@ -80,7 +80,7 @@ def task_create(name: str, **kwargs: Any) -> Task:
|
|
80
80
|
else:
|
81
81
|
logger.warning(f"param '{param}' not used by task '{name}'")
|
82
82
|
|
83
|
-
return
|
83
|
+
return registry_create("task", name, **task_args)
|
84
84
|
|
85
85
|
|
86
86
|
@overload
|
inspect_ai/_eval/run.py
CHANGED
@@ -4,6 +4,7 @@ import sys
|
|
4
4
|
from typing import Any, Awaitable, Callable, Set, cast
|
5
5
|
|
6
6
|
from inspect_ai._eval.task.task import Task
|
7
|
+
from inspect_ai._util.environ import environ_vars
|
7
8
|
from inspect_ai._util.trace import trace_action
|
8
9
|
|
9
10
|
if sys.version_info < (3, 11):
|
@@ -49,7 +50,7 @@ from .loader import (
|
|
49
50
|
from .task.log import TaskLogger
|
50
51
|
from .task.resolved import ResolvedTask
|
51
52
|
from .task.run import TaskRunOptions, task_run
|
52
|
-
from .task.sandbox import TaskSandboxEnvironment,
|
53
|
+
from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task_and_sample
|
53
54
|
from .task.util import slice_dataset, task_run_dir
|
54
55
|
|
55
56
|
log = logging.getLogger(__name__)
|
@@ -435,7 +436,9 @@ async def startup_sandbox_environments(
|
|
435
436
|
# resolve each sample and add to sandboxenvs
|
436
437
|
dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
|
437
438
|
for sample in dataset:
|
438
|
-
sandbox =
|
439
|
+
sandbox = await resolve_sandbox_for_task_and_sample(
|
440
|
+
eval_sandbox, task.task, sample
|
441
|
+
)
|
439
442
|
if sandbox is not None and sandbox not in sandboxenvs:
|
440
443
|
sandboxenvs.add(sandbox)
|
441
444
|
|
@@ -448,7 +451,7 @@ async def startup_sandbox_environments(
|
|
448
451
|
|
449
452
|
# run startup
|
450
453
|
task_init = cast(TaskInit, getattr(sandboxenv_type, "task_init"))
|
451
|
-
with chdir(sandboxenv.run_dir):
|
454
|
+
with chdir(sandboxenv.run_dir), environ_vars(dict(sandboxenv.env)):
|
452
455
|
await task_init("startup", sandboxenv.sandbox.config)
|
453
456
|
|
454
457
|
# append cleanup method
|
inspect_ai/_eval/task/log.py
CHANGED
@@ -187,6 +187,9 @@ class TaskLogger:
|
|
187
187
|
# log the sample event
|
188
188
|
self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
|
189
189
|
|
190
|
+
def remove_sample(self, id: str | int, epoch: int) -> None:
|
191
|
+
self._buffer_db.remove_samples([(id, epoch)])
|
192
|
+
|
190
193
|
async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
|
191
194
|
# log the sample
|
192
195
|
await self.recorder.log_sample(self.eval, sample)
|
@@ -202,6 +205,9 @@ class TaskLogger:
|
|
202
205
|
scores=sample.scores,
|
203
206
|
error=sample.error.message if sample.error is not None else None,
|
204
207
|
limit=f"{sample.limit.type}" if sample.limit is not None else None,
|
208
|
+
retries=len(sample.error_retries)
|
209
|
+
if sample.error_retries is not None
|
210
|
+
else None,
|
205
211
|
)
|
206
212
|
)
|
207
213
|
|