inspect-ai 0.3.90__py3-none-any.whl → 0.3.91__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +13 -0
- inspect_ai/_cli/eval.py +40 -0
- inspect_ai/_display/textual/widgets/samples.py +49 -4
- inspect_ai/_display/textual/widgets/vscode.py +4 -2
- inspect_ai/_eval/eval.py +41 -28
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/loader.py +4 -5
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +6 -3
- inspect_ai/_eval/task/log.py +6 -0
- inspect_ai/_eval/task/run.py +108 -41
- inspect_ai/_eval/task/sandbox.py +19 -5
- inspect_ai/_util/_async.py +1 -1
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/environ.py +32 -0
- inspect_ai/_util/file.py +8 -1
- inspect_ai/_util/httpx.py +105 -22
- inspect_ai/_util/registry.py +83 -9
- inspect_ai/_util/text.py +81 -17
- inspect_ai/_util/transcript.py +9 -6
- inspect_ai/_util/vscode.py +7 -2
- inspect_ai/_view/schema.py +1 -1
- inspect_ai/_view/www/babel.config.js +11 -0
- inspect_ai/_view/www/dist/assets/index.css +3640 -3563
- inspect_ai/_view/www/dist/assets/index.js +59204 -52519
- inspect_ai/_view/www/eslint.config.mjs +10 -1
- inspect_ai/_view/www/jest.config.mjs +21 -0
- inspect_ai/_view/www/log-schema.json +111 -2
- inspect_ai/_view/www/package.json +19 -5
- inspect_ai/_view/www/src/{types → @types}/log.d.ts +95 -32
- inspect_ai/_view/www/{App.css → src/app/App.css} +22 -14
- inspect_ai/_view/www/src/app/App.tsx +168 -0
- inspect_ai/_view/www/src/{AppErrorBoundary.tsx → app/AppErrorBoundary.tsx} +1 -1
- inspect_ai/_view/www/src/{appearance → app/appearance}/icons.ts +1 -0
- inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.tsx +5 -5
- inspect_ai/_view/www/src/{workspace/WorkSpaceView.tsx → app/log-view/LogView.tsx} +59 -40
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +159 -0
- inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +109 -0
- inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.tsx +3 -3
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.tsx +4 -4
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.tsx +8 -8
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.tsx +6 -6
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.tsx +8 -8
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.tsx +35 -6
- inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +136 -0
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/SamplesTab.tsx +82 -73
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/grouping.ts +3 -3
- inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/types.ts +1 -1
- inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.tsx +2 -2
- inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.tsx +1 -1
- inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.tsx +4 -4
- inspect_ai/_view/www/src/{plan → app/plan}/PlanCard.tsx +2 -2
- inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.tsx +5 -5
- inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/AppRouter.tsx +58 -0
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +182 -0
- inspect_ai/_view/www/src/app/routing/url.ts +43 -0
- inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.tsx +11 -27
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDialog.tsx +36 -40
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.tsx +116 -49
- inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.module.css +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.tsx +29 -26
- inspect_ai/_view/www/src/{samples → app/samples}/SamplesTools.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.module.css +5 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.tsx +12 -4
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRenderer.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.tsx +6 -1
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatView.tsx +4 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.tsx +5 -3
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +12 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContent.tsx +11 -10
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContents.tsx +14 -8
- inspect_ai/_view/www/src/{samples → app/samples}/chat/messages.ts +2 -2
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolCallView.tsx +26 -27
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +19 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolInput.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.module.css +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.module.css +4 -0
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/tool.ts +1 -1
- inspect_ai/_view/www/src/app/samples/chat/types.ts +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/samplesDescriptor.tsx +38 -15
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/CategoricalScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/NumericScoreDescriptor.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/OtherScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/descriptor/types.ts +4 -3
- inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.module.css +2 -1
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.tsx +3 -0
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.tsx +47 -33
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.module.css +16 -0
- inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.tsx +47 -20
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/filters.ts +8 -6
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.tsx +4 -3
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/completions.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/language.ts +1 -0
- inspect_ai/_view/www/src/{samples → app/samples}/sampleDataAdapter.ts +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/sampleLimit.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.tsx +12 -11
- inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.tsx +6 -6
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ApprovalEventView.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ErrorEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/InputEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.tsx +3 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.module.css +13 -7
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.tsx +49 -21
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.tsx +11 -9
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleLimitEventView.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.tsx +8 -6
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.tsx +4 -4
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/StepEventView.tsx +11 -3
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.module.css +8 -7
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.tsx +32 -114
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.module.css +6 -5
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.tsx +14 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.tsx +2 -2
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.tsx +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/utils.ts +1 -1
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenderers.tsx +23 -21
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenders.module.css +7 -0
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.tsx +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +142 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +39 -0
- inspect_ai/_view/www/src/{samples → app/samples}/transcript/types.ts +1 -1
- inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.tsx +1 -1
- inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.module.css +16 -0
- inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.tsx +70 -0
- inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.module.css +8 -0
- inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.tsx +35 -17
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.tsx +1 -1
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.tsx +2 -2
- inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/{types.ts → app/types.ts} +18 -11
- inspect_ai/_view/www/src/{usage → app/usage}/ModelTokenTable.tsx +1 -1
- inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.tsx +2 -2
- inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.tsx +1 -1
- inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.tsx +6 -6
- inspect_ai/_view/www/src/{api → client/api}/api-browser.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/api-http.ts +3 -3
- inspect_ai/_view/www/src/{api → client/api}/api-vscode.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/client-api.ts +6 -5
- inspect_ai/_view/www/src/{api → client/api}/index.ts +2 -2
- inspect_ai/_view/www/src/{api → client/api}/types.ts +4 -1
- inspect_ai/_view/www/src/{logfile → client/remote}/remoteLogFile.ts +3 -3
- inspect_ai/_view/www/src/{storage → client/storage}/index.ts +11 -5
- inspect_ai/_view/www/src/components/Card.tsx +1 -1
- inspect_ai/_view/www/src/components/CopyButton.tsx +1 -1
- inspect_ai/_view/www/src/components/DownloadButton.tsx +1 -1
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/{ExpandablePanel.css → ExpandablePanel.module.css} +14 -11
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +16 -10
- inspect_ai/_view/www/src/components/FindBand.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.css +2 -2
- inspect_ai/_view/www/src/components/LargeModal.tsx +12 -1
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -1
- inspect_ai/_view/www/src/components/MessageBand.tsx +1 -1
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +1 -1
- inspect_ai/_view/www/src/constants.ts +10 -9
- inspect_ai/_view/www/src/index.tsx +27 -11
- inspect_ai/_view/www/src/state/appSlice.ts +44 -5
- inspect_ai/_view/www/src/state/hooks.ts +30 -7
- inspect_ai/_view/www/src/state/logSlice.ts +7 -5
- inspect_ai/_view/www/src/state/logsPolling.ts +1 -1
- inspect_ai/_view/www/src/state/logsSlice.ts +18 -13
- inspect_ai/_view/www/src/state/samplePolling.ts +12 -12
- inspect_ai/_view/www/src/state/sampleSlice.ts +3 -5
- inspect_ai/_view/www/src/state/sampleUtils.ts +1 -1
- inspect_ai/_view/www/src/{scoring/utils.ts → state/scoring.ts} +2 -2
- inspect_ai/_view/www/src/state/store.ts +9 -7
- inspect_ai/_view/www/src/state/utils.ts +1 -1
- inspect_ai/_view/www/src/tests/README.md +49 -0
- inspect_ai/_view/www/src/tests/__mocks__/fileMock.js +1 -0
- inspect_ai/_view/www/src/tests/__mocks__/styleMock.js +1 -0
- inspect_ai/_view/www/src/tests/setupTests.mjs +1 -0
- inspect_ai/_view/www/src/tests/utils/base64.test.ts +23 -0
- inspect_ai/_view/www/src/tests/utils/format.test.ts +127 -0
- inspect_ai/_view/www/src/tests/utils/path.test.ts +54 -0
- inspect_ai/_view/www/src/utils/format.ts +8 -2
- inspect_ai/_view/www/src/utils/path.ts +14 -2
- inspect_ai/_view/www/src/utils/polling.ts +1 -2
- inspect_ai/_view/www/src/utils/uri.ts +32 -0
- inspect_ai/_view/www/yarn.lock +3310 -382
- inspect_ai/agent/_handoff.py +6 -3
- inspect_ai/agent/_human/agent.py +5 -3
- inspect_ai/agent/_human/install.py +16 -7
- inspect_ai/agent/_human/panel.py +14 -1
- inspect_ai/agent/_human/service.py +5 -1
- inspect_ai/agent/_react.py +161 -128
- inspect_ai/agent/_types.py +15 -4
- inspect_ai/approval/_policy.py +2 -2
- inspect_ai/log/_file.py +30 -11
- inspect_ai/log/_log.py +7 -1
- inspect_ai/log/_recorders/eval.py +3 -0
- inspect_ai/log/_recorders/types.py +1 -0
- inspect_ai/log/_samples.py +4 -0
- inspect_ai/model/_call_tools.py +33 -17
- inspect_ai/model/_generate_config.py +10 -2
- inspect_ai/model/_model.py +41 -21
- inspect_ai/model/_model_output.py +2 -1
- inspect_ai/model/_openai.py +10 -8
- inspect_ai/model/_openai_responses.py +83 -42
- inspect_ai/model/_providers/anthropic.py +14 -12
- inspect_ai/model/_providers/google.py +191 -95
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/mistral.py +2 -3
- inspect_ai/model/_providers/openai.py +54 -17
- inspect_ai/model/_providers/openai_o1.py +1 -1
- inspect_ai/model/_providers/openai_responses.py +28 -16
- inspect_ai/model/_providers/openrouter.py +14 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/chatapi.py +17 -7
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/scorer/_metric.py +17 -1
- inspect_ai/scorer/_model.py +51 -6
- inspect_ai/scorer/_scorer.py +1 -1
- inspect_ai/solver/_human_agent.py +3 -0
- inspect_ai/solver/_plan.py +1 -1
- inspect_ai/solver/_solver.py +1 -1
- inspect_ai/solver/_use_tools.py +14 -8
- inspect_ai/tool/__init__.py +16 -1
- inspect_ai/tool/_json_rpc_helpers.py +285 -0
- inspect_ai/tool/_mcp/__init__.py +13 -0
- inspect_ai/tool/_mcp/_context.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +293 -0
- inspect_ai/tool/_mcp/_sandbox.py +104 -0
- inspect_ai/tool/_mcp/_types.py +31 -0
- inspect_ai/tool/_mcp/connection.py +60 -0
- inspect_ai/tool/_mcp/sampling.py +118 -0
- inspect_ai/tool/_mcp/server.py +112 -0
- inspect_ai/tool/_mcp/tools.py +34 -0
- inspect_ai/tool/_tool.py +13 -0
- inspect_ai/tool/_tool_def.py +24 -7
- inspect_ai/tool/_tool_support_helpers.py +129 -153
- inspect_ai/tool/_tools/_bash_session.py +11 -11
- inspect_ai/tool/_tools/_text_editor.py +6 -6
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +8 -8
- inspect_ai/util/_anyio.py +31 -20
- inspect_ai/util/_json.py +20 -2
- inspect_ai/util/_sandbox/context.py +18 -7
- inspect_ai/util/_sandbox/docker/compose.py +1 -1
- inspect_ai/util/_sandbox/docker/docker.py +92 -21
- inspect_ai/util/_sandbox/environment.py +33 -2
- inspect_ai/util/_sandbox/events.py +2 -2
- inspect_ai/util/_sandbox/service.py +13 -3
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.91.dist-info}/METADATA +6 -2
- inspect_ai-0.3.91.dist-info/RECORD +732 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.91.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/App.tsx +0 -316
- inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +0 -4
- inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +0 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +0 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +0 -14
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +0 -292
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +0 -5
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +0 -57
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +0 -43
- inspect_ai-0.3.90.dist-info/RECORD +0 -705
- /inspect_ai/_view/www/src/{types → @types}/asciicinema-player.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/jsondiffpatch.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/markdown-it-katex.d.ts +0 -0
- /inspect_ai/_view/www/src/{types → @types}/prism.d.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/colors.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/fonts.ts +0 -0
- /inspect_ai/_view/www/src/{appearance → app/appearance}/styles.ts +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataGrid.tsx +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.tsx +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/MetadataGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.module.css +0 -0
- /inspect_ai/_view/www/src/{metadata → app/content}/types.ts +0 -0
- /inspect_ai/_view/www/src/{workspace/WorkSpaceView.module.css → app/log-view/LogView.module.css} +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.tsx +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/JsonTab.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.tsx +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/types.ts +0 -0
- /inspect_ai/_view/www/src/{workspace → app/log-view}/utils.ts +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.tsx +0 -0
- /inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/error/error.ts +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/tokenize.ts +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.module.css +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateDiffView.tsx +0 -0
- /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.module.css +0 -0
- /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.module.css +0 -0
- /inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.module.css +0 -0
- /inspect_ai/_view/www/src/{api → client/api}/api-shared.ts +0 -0
- /inspect_ai/_view/www/src/{api → client/api}/jsonrpc.ts +0 -0
- /inspect_ai/_view/www/src/{logfile → client/remote}/remoteZipFile.ts +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.91.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.91.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.91.dist-info}/top_level.txt +0 -0
@@ -29,9 +29,9 @@ from .._openai import (
|
|
29
29
|
OpenAIAsyncHttpxClient,
|
30
30
|
is_computer_use_preview,
|
31
31
|
is_gpt,
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
is_o1,
|
33
|
+
is_o1_early,
|
34
|
+
is_o3_mini,
|
35
35
|
is_o_series,
|
36
36
|
model_output_from_openai,
|
37
37
|
openai_chat_messages,
|
@@ -62,6 +62,9 @@ class OpenAIAPI(ModelAPI):
|
|
62
62
|
api_key: str | None = None,
|
63
63
|
config: GenerateConfig = GenerateConfig(),
|
64
64
|
responses_api: bool | None = None,
|
65
|
+
responses_store: Literal["auto"] | bool = "auto",
|
66
|
+
service_tier: str | None = None,
|
67
|
+
client_timeout: float | None = None,
|
65
68
|
**model_args: Any,
|
66
69
|
) -> None:
|
67
70
|
# extract azure service prefix from model name (other providers
|
@@ -82,9 +85,25 @@ class OpenAIAPI(ModelAPI):
|
|
82
85
|
config=config,
|
83
86
|
)
|
84
87
|
|
85
|
-
#
|
86
|
-
|
87
|
-
|
88
|
+
# is this a model we use responses api by default for?
|
89
|
+
responses_model = (
|
90
|
+
self.is_o_series() and not self.is_o1_early()
|
91
|
+
) or self.is_computer_use_preview()
|
92
|
+
|
93
|
+
# resolve whether we are forcing the responses api
|
94
|
+
self.responses_api = responses_api or responses_model
|
95
|
+
|
96
|
+
# resolve whether we are using the responses store
|
97
|
+
self.responses_store = (
|
98
|
+
responses_store if isinstance(responses_store, bool) else responses_model
|
99
|
+
)
|
100
|
+
|
101
|
+
# set service tier if specified
|
102
|
+
self.service_tier = service_tier
|
103
|
+
|
104
|
+
# bump up default client timeout to 15 minutes for service_tier=="flex"
|
105
|
+
self.client_timeout = client_timeout or (
|
106
|
+
900.0 if self.service_tier == "flex" else None
|
88
107
|
)
|
89
108
|
|
90
109
|
# resolve api_key
|
@@ -140,6 +159,7 @@ class OpenAIAPI(ModelAPI):
|
|
140
159
|
api_version=api_version,
|
141
160
|
azure_endpoint=base_url,
|
142
161
|
http_client=http_client,
|
162
|
+
timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
|
143
163
|
**model_args,
|
144
164
|
)
|
145
165
|
else:
|
@@ -147,6 +167,7 @@ class OpenAIAPI(ModelAPI):
|
|
147
167
|
api_key=self.api_key,
|
148
168
|
base_url=model_base_url(base_url, "OPENAI_BASE_URL"),
|
149
169
|
http_client=http_client,
|
170
|
+
timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
|
150
171
|
**model_args,
|
151
172
|
)
|
152
173
|
|
@@ -159,14 +180,14 @@ class OpenAIAPI(ModelAPI):
|
|
159
180
|
def is_o_series(self) -> bool:
|
160
181
|
return is_o_series(self.service_model_name())
|
161
182
|
|
162
|
-
def
|
163
|
-
return
|
183
|
+
def is_o1(self) -> bool:
|
184
|
+
return is_o1(self.service_model_name())
|
164
185
|
|
165
|
-
def
|
166
|
-
return
|
186
|
+
def is_o1_early(self) -> bool:
|
187
|
+
return is_o1_early(self.service_model_name())
|
167
188
|
|
168
|
-
def
|
169
|
-
return
|
189
|
+
def is_o3_mini(self) -> bool:
|
190
|
+
return is_o3_mini(self.service_model_name())
|
170
191
|
|
171
192
|
def is_computer_use_preview(self) -> bool:
|
172
193
|
return is_computer_use_preview(self.service_model_name())
|
@@ -184,8 +205,18 @@ class OpenAIAPI(ModelAPI):
|
|
184
205
|
|
185
206
|
@override
|
186
207
|
def tool_result_images(self) -> bool:
|
187
|
-
# o1-pro, o1, and computer_use_preview support image inputs
|
188
|
-
|
208
|
+
# o1-pro, o1, and computer_use_preview support image inputs
|
209
|
+
if self.is_computer_use_preview():
|
210
|
+
return True
|
211
|
+
elif self.is_o_series():
|
212
|
+
if self.is_o1_early():
|
213
|
+
return False
|
214
|
+
elif self.is_o3_mini():
|
215
|
+
return False
|
216
|
+
else:
|
217
|
+
return True
|
218
|
+
else:
|
219
|
+
return False
|
189
220
|
|
190
221
|
@override
|
191
222
|
def disable_computer_screenshot_truncation(self) -> bool:
|
@@ -203,7 +234,7 @@ class OpenAIAPI(ModelAPI):
|
|
203
234
|
config: GenerateConfig,
|
204
235
|
) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
|
205
236
|
# short-circuit to call o1- models that are text only
|
206
|
-
if self.
|
237
|
+
if self.is_o1_early():
|
207
238
|
return await generate_o1(
|
208
239
|
client=self.client,
|
209
240
|
input=input,
|
@@ -219,6 +250,8 @@ class OpenAIAPI(ModelAPI):
|
|
219
250
|
tools=tools,
|
220
251
|
tool_choice=tool_choice,
|
221
252
|
config=config,
|
253
|
+
service_tier=self.service_tier,
|
254
|
+
store=self.responses_store,
|
222
255
|
)
|
223
256
|
|
224
257
|
# allocate request_id (so we can see it from ModelCall)
|
@@ -248,7 +281,7 @@ class OpenAIAPI(ModelAPI):
|
|
248
281
|
# determine system role
|
249
282
|
# o1-mini does not support developer or system messages
|
250
283
|
# (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
|
251
|
-
if self.
|
284
|
+
if self.is_o1_early():
|
252
285
|
system_role: Literal["user", "system", "developer"] = "user"
|
253
286
|
# other o-series models use 'developer' rather than 'system' messages
|
254
287
|
# https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
|
@@ -309,6 +342,10 @@ class OpenAIAPI(ModelAPI):
|
|
309
342
|
# first call the default processing
|
310
343
|
params = openai_completion_params(self.service_model_name(), config, tools)
|
311
344
|
|
345
|
+
# add service_tier if specified
|
346
|
+
if self.service_tier is not None:
|
347
|
+
params["service_tier"] = self.service_tier
|
348
|
+
|
312
349
|
# now tailor to current model
|
313
350
|
if config.max_tokens is not None:
|
314
351
|
if self.is_o_series():
|
@@ -329,7 +366,7 @@ class OpenAIAPI(ModelAPI):
|
|
329
366
|
|
330
367
|
# remove reasoning_effort if not supported
|
331
368
|
if "reasoning_effort" in params.keys() and (
|
332
|
-
self.is_gpt() or self.
|
369
|
+
self.is_gpt() or self.is_o1_early()
|
333
370
|
):
|
334
371
|
del params["reasoning_effort"]
|
335
372
|
|
@@ -212,7 +212,7 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
|
|
212
212
|
prompt that asks the model to use the <tool_call>...</tool_call> syntax)
|
213
213
|
"""
|
214
214
|
# extract tool calls
|
215
|
-
tool_call_regex = rf"<{TOOL_CALL}
|
215
|
+
tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
|
216
216
|
tool_calls_content: list[str] = re.findall(tool_call_regex, response)
|
217
217
|
|
218
218
|
# if there are tool calls proceed with parsing
|
@@ -15,9 +15,7 @@ from .._model_output import ModelOutput, ModelUsage
|
|
15
15
|
from .._openai import (
|
16
16
|
OpenAIResponseError,
|
17
17
|
is_computer_use_preview,
|
18
|
-
|
19
|
-
is_o1_mini,
|
20
|
-
is_o1_preview,
|
18
|
+
is_o1_early,
|
21
19
|
is_o_series,
|
22
20
|
openai_handle_bad_request,
|
23
21
|
openai_media_filter,
|
@@ -41,6 +39,8 @@ async def generate_responses(
|
|
41
39
|
tools: list[ToolInfo],
|
42
40
|
tool_choice: ToolChoice,
|
43
41
|
config: GenerateConfig,
|
42
|
+
service_tier: str | None,
|
43
|
+
store: bool,
|
44
44
|
) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
|
45
45
|
# allocate request_id (so we can see it from ModelCall)
|
46
46
|
request_id = http_hooks.start_request()
|
@@ -61,14 +61,20 @@ async def generate_responses(
|
|
61
61
|
# prepare request (we do this so we can log the ModelCall)
|
62
62
|
tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
|
63
63
|
request = dict(
|
64
|
-
input=await openai_responses_inputs(input, model_name),
|
64
|
+
input=await openai_responses_inputs(input, model_name, store),
|
65
65
|
tools=tool_params,
|
66
66
|
tool_choice=openai_responses_tool_choice(tool_choice, tool_params)
|
67
67
|
if isinstance(tool_params, list) and tool_choice != "auto"
|
68
68
|
else NOT_GIVEN,
|
69
69
|
truncation="auto" if is_computer_use_preview(model_name) else NOT_GIVEN,
|
70
70
|
extra_headers={HttpxHooks.REQUEST_ID_HEADER: request_id},
|
71
|
-
**completion_params_responses(
|
71
|
+
**completion_params_responses(
|
72
|
+
model_name,
|
73
|
+
config=config,
|
74
|
+
service_tier=service_tier,
|
75
|
+
tools=len(tools) > 0,
|
76
|
+
store=store,
|
77
|
+
),
|
72
78
|
)
|
73
79
|
|
74
80
|
try:
|
@@ -110,7 +116,12 @@ async def generate_responses(
|
|
110
116
|
|
111
117
|
|
112
118
|
def completion_params_responses(
|
113
|
-
model_name: str,
|
119
|
+
model_name: str,
|
120
|
+
*,
|
121
|
+
config: GenerateConfig,
|
122
|
+
service_tier: str | None,
|
123
|
+
tools: bool,
|
124
|
+
store: bool,
|
114
125
|
) -> dict[str, Any]:
|
115
126
|
# TODO: we'll need a computer_use_preview bool for the 'include'
|
116
127
|
# and 'reasoning' parameters
|
@@ -120,9 +131,9 @@ def completion_params_responses(
|
|
120
131
|
f"OpenAI Responses API does not support the '{param}' parameter.",
|
121
132
|
)
|
122
133
|
|
123
|
-
params: dict[str, Any] = dict(
|
124
|
-
|
125
|
-
|
134
|
+
params: dict[str, Any] = dict(model=model_name, store=store)
|
135
|
+
if service_tier is not None:
|
136
|
+
params["service_tier"] = service_tier
|
126
137
|
if config.max_tokens is not None:
|
127
138
|
params["max_output_tokens"] = config.max_tokens
|
128
139
|
if config.frequency_penalty is not None:
|
@@ -153,13 +164,14 @@ def completion_params_responses(
|
|
153
164
|
unsupported_warning("top_logprobs")
|
154
165
|
if tools and config.parallel_tool_calls is not None and not is_o_series(model_name):
|
155
166
|
params["parallel_tool_calls"] = config.parallel_tool_calls
|
156
|
-
if (
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
167
|
+
if is_o_series(model_name) and not is_o1_early(model_name):
|
168
|
+
reasoning: dict[str, str] = {}
|
169
|
+
if config.reasoning_effort is not None:
|
170
|
+
reasoning["effort"] = config.reasoning_effort
|
171
|
+
if config.reasoning_summary is not None:
|
172
|
+
reasoning["summary"] = config.reasoning_summary
|
173
|
+
if len(reasoning) > 0:
|
174
|
+
params["reasoning"] = reasoning
|
163
175
|
if config.response_schema is not None:
|
164
176
|
params["text"] = dict(
|
165
177
|
format=ResponseFormatTextJSONSchemaConfigParam(
|
@@ -111,6 +111,20 @@ class OpenRouterAPI(OpenAICompatibleAPI):
|
|
111
111
|
# default params
|
112
112
|
params = super().completion_params(config, tools)
|
113
113
|
|
114
|
+
# remove reasoning_effort it is exists
|
115
|
+
if "reasoning_effort" in params:
|
116
|
+
del params["reasoning_effort"]
|
117
|
+
|
118
|
+
# provide openrouter standard reasoning options
|
119
|
+
# https://openrouter.ai/docs/use-cases/reasoning-tokens
|
120
|
+
if config.reasoning_effort is not None or config.reasoning_tokens is not None:
|
121
|
+
reasoning: dict[str, str | int] = dict()
|
122
|
+
if config.reasoning_effort is not None:
|
123
|
+
reasoning["effort"] = config.reasoning_effort
|
124
|
+
if config.reasoning_tokens is not None:
|
125
|
+
reasoning["max_tokens"] = config.reasoning_tokens
|
126
|
+
params["reasoning"] = reasoning
|
127
|
+
|
114
128
|
# pass args if specifed
|
115
129
|
EXTRA_BODY = "extra_body"
|
116
130
|
if self.models or self.provider or self.transforms:
|
@@ -105,7 +105,7 @@ def vertex() -> type[ModelAPI]:
|
|
105
105
|
def google() -> type[ModelAPI]:
|
106
106
|
FEATURE = "Google API"
|
107
107
|
PACKAGE = "google-genai"
|
108
|
-
MIN_VERSION = "1.
|
108
|
+
MIN_VERSION = "1.12.1"
|
109
109
|
|
110
110
|
# verify we have the package
|
111
111
|
try:
|
@@ -267,7 +267,7 @@ def none() -> type[ModelAPI]:
|
|
267
267
|
def validate_openai_client(feature: str) -> None:
|
268
268
|
FEATURE = feature
|
269
269
|
PACKAGE = "openai"
|
270
|
-
MIN_VERSION = "1.
|
270
|
+
MIN_VERSION = "1.75.0"
|
271
271
|
|
272
272
|
# verify we have the package
|
273
273
|
try:
|
@@ -100,10 +100,20 @@ async def chat_api_request(
|
|
100
100
|
# look at its `__cause__`. we've observed Cloudflare giving transient 500
|
101
101
|
# status as well as a ReadTimeout, so we count these as rate limit errors
|
102
102
|
def should_retry_chat_api_error(ex: BaseException) -> bool:
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
103
|
+
# not a tenacity RetryError
|
104
|
+
if not isinstance(ex, RetryError):
|
105
|
+
return False
|
106
|
+
|
107
|
+
cause = ex.__cause__
|
108
|
+
|
109
|
+
if cause is None:
|
110
|
+
raise RuntimeError(f"Tenacity RetryError with no __cause__: {ex}")
|
111
|
+
|
112
|
+
if isinstance(cause, httpx.HTTPStatusError):
|
113
|
+
if is_retryable_http_status(cause.response.status_code):
|
114
|
+
return True
|
115
|
+
|
116
|
+
if httpx_should_retry(cause):
|
117
|
+
return True
|
118
|
+
|
119
|
+
return False
|
@@ -104,7 +104,7 @@ class VLLMAPI(ModelAPI):
|
|
104
104
|
|
105
105
|
# set which GPUs are available to use
|
106
106
|
if device is not None:
|
107
|
-
os.environ["CUDA_VISIBLE_DEVICES"] =
|
107
|
+
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(device)
|
108
108
|
|
109
109
|
# tell vllm how many GPUs to use
|
110
110
|
if "tensor_parallel_size" not in model_args:
|
inspect_ai/scorer/_metric.py
CHANGED
@@ -5,6 +5,7 @@ from typing import (
|
|
5
5
|
Callable,
|
6
6
|
ParamSpec,
|
7
7
|
Protocol,
|
8
|
+
Type,
|
8
9
|
Union,
|
9
10
|
cast,
|
10
11
|
overload,
|
@@ -24,6 +25,7 @@ from inspect_ai._util.registry import (
|
|
24
25
|
registry_params,
|
25
26
|
registry_tag,
|
26
27
|
)
|
28
|
+
from inspect_ai.dataset._dataset import MT, metadata_as
|
27
29
|
|
28
30
|
logger = getLogger(__name__)
|
29
31
|
|
@@ -121,6 +123,20 @@ class SampleScore(BaseModel):
|
|
121
123
|
sample_metadata: dict[str, Any] | None = Field(default=None)
|
122
124
|
"""Metadata from the sample"""
|
123
125
|
|
126
|
+
def sample_metadata_as(self, metadata_cls: Type[MT]) -> MT | None:
|
127
|
+
"""Pydantic model interface to sample metadata.
|
128
|
+
|
129
|
+
Args:
|
130
|
+
metadata_cls: Pydantic model type
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
BaseModel: Instance of metadata_cls bound to sample metadata.
|
134
|
+
"""
|
135
|
+
if self.sample_metadata is not None:
|
136
|
+
return metadata_as(self.sample_metadata, metadata_cls)
|
137
|
+
else:
|
138
|
+
return None
|
139
|
+
|
124
140
|
scorer: str | None = Field(default=None)
|
125
141
|
"""Registry name of scorer that created this score."""
|
126
142
|
|
@@ -265,7 +281,7 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
|
|
265
281
|
Returns:
|
266
282
|
Metric with registry info attribute
|
267
283
|
"""
|
268
|
-
return
|
284
|
+
return registry_create("metric", name, **kwargs)
|
269
285
|
|
270
286
|
|
271
287
|
def to_metric_specs(
|
inspect_ai/scorer/_model.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
import re
|
2
2
|
from functools import partial
|
3
|
-
from typing import Callable
|
3
|
+
from typing import Any, Callable
|
4
4
|
|
5
|
+
from inspect_ai._util.content import Content, ContentText
|
5
6
|
from inspect_ai._util.dict import omit
|
6
7
|
from inspect_ai._util.format import format_function_call
|
7
8
|
from inspect_ai._util.list import remove_last_match_and_after
|
@@ -13,6 +14,7 @@ from inspect_ai.model._chat_message import (
|
|
13
14
|
ChatMessageUser,
|
14
15
|
)
|
15
16
|
from inspect_ai.model._model import Model, get_model
|
17
|
+
from inspect_ai.model._model_output import ModelOutput
|
16
18
|
from inspect_ai.solver._task_state import TaskState
|
17
19
|
from inspect_ai.util import resource
|
18
20
|
|
@@ -166,16 +168,17 @@ def _model_graded_qa_single(
|
|
166
168
|
question = state.input_text
|
167
169
|
|
168
170
|
# format the scoring template
|
169
|
-
|
171
|
+
scoring_prompt = model_scoring_prompt(
|
172
|
+
template=grading_template,
|
170
173
|
question=question,
|
171
|
-
|
174
|
+
output=state.output,
|
172
175
|
criterion=target.text,
|
173
176
|
instructions=instructions,
|
174
|
-
|
177
|
+
metadata=metadata,
|
175
178
|
)
|
176
179
|
|
177
180
|
# query the model for the score
|
178
|
-
result = await model.generate(
|
181
|
+
result = await model.generate([scoring_prompt])
|
179
182
|
|
180
183
|
# extract the grade
|
181
184
|
match = re.search(grade_pattern or DEFAULT_GRADE_PATTERN, result.completion)
|
@@ -186,7 +189,7 @@ def _model_graded_qa_single(
|
|
186
189
|
explanation=result.completion,
|
187
190
|
metadata=dict(
|
188
191
|
grading=[
|
189
|
-
|
192
|
+
scoring_prompt,
|
190
193
|
result.message,
|
191
194
|
]
|
192
195
|
),
|
@@ -300,3 +303,45 @@ def chat_history(state: TaskState) -> str:
|
|
300
303
|
)
|
301
304
|
|
302
305
|
return "\n\n".join(history)
|
306
|
+
|
307
|
+
|
308
|
+
def model_scoring_prompt(
|
309
|
+
*,
|
310
|
+
template: str,
|
311
|
+
question: str,
|
312
|
+
output: ModelOutput,
|
313
|
+
criterion: str,
|
314
|
+
instructions: str,
|
315
|
+
metadata: dict[str, Any],
|
316
|
+
) -> ChatMessageUser:
|
317
|
+
# we need to remove media objects from output and reference them as attachements in the answer
|
318
|
+
answer = output.completion
|
319
|
+
media: list[Content] = (
|
320
|
+
[
|
321
|
+
content
|
322
|
+
for content in output.message.content
|
323
|
+
if content.type in ["image", "audio", "video"]
|
324
|
+
]
|
325
|
+
if len(output.choices) > 0 and isinstance(output.message.content, list)
|
326
|
+
else []
|
327
|
+
)
|
328
|
+
if len(media) > 0:
|
329
|
+
if len(answer) > 0:
|
330
|
+
answer = f"{answer} (see also attached media)"
|
331
|
+
else:
|
332
|
+
answer = "See attached media"
|
333
|
+
|
334
|
+
# format the prompt
|
335
|
+
prompt = template.format(
|
336
|
+
question=question,
|
337
|
+
answer=answer,
|
338
|
+
criterion=criterion,
|
339
|
+
instructions=instructions,
|
340
|
+
**metadata,
|
341
|
+
)
|
342
|
+
|
343
|
+
# return with media if necessary
|
344
|
+
if len(media) > 0:
|
345
|
+
return ChatMessageUser(content=[ContentText(text=prompt)] + media)
|
346
|
+
else:
|
347
|
+
return ChatMessageUser(content=prompt)
|
inspect_ai/scorer/_scorer.py
CHANGED
@@ -117,7 +117,7 @@ def scorer_create(name: str, **kwargs: Any) -> Scorer:
|
|
117
117
|
Returns:
|
118
118
|
Scorer with registry info attribute
|
119
119
|
"""
|
120
|
-
return
|
120
|
+
return registry_create("scorer", name, **kwargs)
|
121
121
|
|
122
122
|
|
123
123
|
def scorer(
|
@@ -13,6 +13,7 @@ def human_agent(
|
|
13
13
|
answer: bool | str = True,
|
14
14
|
intermediate_scoring: bool = False,
|
15
15
|
record_session: bool = True,
|
16
|
+
user: str | None = None,
|
16
17
|
) -> Solver:
|
17
18
|
"""Human solver for agentic tasks that run in a Linux environment.
|
18
19
|
|
@@ -32,6 +33,7 @@ def human_agent(
|
|
32
33
|
that the answer matches the expected format.
|
33
34
|
intermediate_scoring: Allow the human agent to check their score while working.
|
34
35
|
record_session: Record all user commands and outputs in the sandbox bash session.
|
36
|
+
user: User to login as. Defaults to the sandbox environment's default user.
|
35
37
|
|
36
38
|
Returns:
|
37
39
|
Solver: Human agent solver.
|
@@ -48,5 +50,6 @@ def human_agent(
|
|
48
50
|
answer=answer,
|
49
51
|
intermediate_scoring=intermediate_scoring,
|
50
52
|
record_session=record_session,
|
53
|
+
user=user,
|
51
54
|
)
|
52
55
|
)
|
inspect_ai/solver/_plan.py
CHANGED
inspect_ai/solver/_solver.py
CHANGED
@@ -136,7 +136,7 @@ def solver_create(name: str, **kwargs: Any) -> Solver:
|
|
136
136
|
Returns:
|
137
137
|
Solver with registry info attribute
|
138
138
|
"""
|
139
|
-
return
|
139
|
+
return registry_create("solver", name, **kwargs)
|
140
140
|
|
141
141
|
|
142
142
|
SolverType: TypeAlias = Solver | Agent
|
inspect_ai/solver/_use_tools.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
+
from typing import Sequence
|
2
|
+
|
1
3
|
from inspect_ai.tool import Tool, ToolChoice
|
4
|
+
from inspect_ai.tool._tool import ToolSource
|
2
5
|
from inspect_ai.tool._tool_def import ToolDef
|
3
6
|
|
4
7
|
from ._solver import Generate, Solver, solver
|
@@ -7,7 +10,7 @@ from ._task_state import TaskState
|
|
7
10
|
|
8
11
|
@solver
|
9
12
|
def use_tools(
|
10
|
-
*tools: Tool |
|
13
|
+
*tools: Tool | ToolDef | ToolSource | Sequence[Tool | ToolDef | ToolSource],
|
11
14
|
tool_choice: ToolChoice | None = "auto",
|
12
15
|
append: bool = False,
|
13
16
|
) -> Solver:
|
@@ -34,17 +37,20 @@ def use_tools(
|
|
34
37
|
tools_update: list[Tool] = []
|
35
38
|
|
36
39
|
# add tool function to take care of tool/tool_def
|
37
|
-
def
|
38
|
-
if isinstance(tool,
|
39
|
-
|
40
|
-
|
40
|
+
async def add_tools(tool: Tool | ToolDef | ToolSource) -> None:
|
41
|
+
if isinstance(tool, ToolSource):
|
42
|
+
tools_update.extend(await tool.tools())
|
43
|
+
else:
|
44
|
+
if isinstance(tool, ToolDef):
|
45
|
+
tool = tool.as_tool()
|
46
|
+
tools_update.append(tool)
|
41
47
|
|
42
48
|
for tool in tools:
|
43
|
-
if isinstance(tool,
|
49
|
+
if isinstance(tool, Sequence):
|
44
50
|
for t in tool:
|
45
|
-
|
51
|
+
await add_tools(t)
|
46
52
|
else:
|
47
|
-
|
53
|
+
await add_tools(tool)
|
48
54
|
if len(tools_update) > 0:
|
49
55
|
if append:
|
50
56
|
existing_tools = state.tools
|
inspect_ai/tool/__init__.py
CHANGED
@@ -8,7 +8,15 @@ from inspect_ai._util.content import (
|
|
8
8
|
)
|
9
9
|
from inspect_ai._util.deprecation import relocated_module_attribute
|
10
10
|
|
11
|
-
from .
|
11
|
+
from ._mcp import (
|
12
|
+
MCPServer,
|
13
|
+
mcp_connection,
|
14
|
+
mcp_server_sandbox,
|
15
|
+
mcp_server_sse,
|
16
|
+
mcp_server_stdio,
|
17
|
+
mcp_tools,
|
18
|
+
)
|
19
|
+
from ._tool import Tool, ToolError, ToolResult, ToolSource, tool
|
12
20
|
from ._tool_call import (
|
13
21
|
ToolCall,
|
14
22
|
ToolCallContent,
|
@@ -45,6 +53,13 @@ __all__ = [
|
|
45
53
|
"ToolCallError",
|
46
54
|
"ToolError",
|
47
55
|
"ToolResult",
|
56
|
+
"ToolSource",
|
57
|
+
"mcp_tools",
|
58
|
+
"mcp_connection",
|
59
|
+
"mcp_server_stdio",
|
60
|
+
"mcp_server_sse",
|
61
|
+
"mcp_server_sandbox",
|
62
|
+
"MCPServer",
|
48
63
|
"Content",
|
49
64
|
"ContentAudio",
|
50
65
|
"ContentImage",
|