testmcpy 0.7.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {testmcpy-0.7.0/testmcpy.egg-info → testmcpy-0.7.2}/PKG-INFO +1 -1
- {testmcpy-0.7.0 → testmcpy-0.7.2}/pyproject.toml +1 -1
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/__init__.py +1 -1
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/llm_integration.py +173 -2
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/test_runner.py +58 -10
- {testmcpy-0.7.0 → testmcpy-0.7.2/testmcpy.egg-info}/PKG-INFO +1 -1
- {testmcpy-0.7.0 → testmcpy-0.7.2}/LICENSE +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/MANIFEST.in +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/NOTICE +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/README.md +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/setup.cfg +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/hooks.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/models.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/orchestrator.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/prompts.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/tools.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/auth_debugger.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/auth_flow_recorder.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/app.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/agent.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/baseline.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/export_db.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/mcp.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/metamorphic.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/multi_env.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/mutate.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/push.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/run.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/server.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/tools.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/tui.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/wizard.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/config.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/chat_session.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/docs_optimizer.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/mcp_manager.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/tool_comparison.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/tool_discovery.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/db.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/error_handlers.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/auth_evaluators.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/base_evaluators.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/evaluator_packs.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/base.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/curl.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/graphql.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/javascript_client.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/json_yaml.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/protobuf.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/python.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/python_client.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/thrift.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/typescript.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/typescript_client.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/llm_profiles.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/mcp_profiles.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/migrate_json.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/models.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_detailed_exploration.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_poc.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_working_poc.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/test_ollama_tools.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/api.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/api.py.bak +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/auth_middleware.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/helpers/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/helpers/mcp_config.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/models.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/agent.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/auth.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/compare.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/compatibility.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/generation_logs.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/health.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/llm.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/mcp_profiles.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/metrics.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/results.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/search.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/security.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/smoke_reports.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/test_profiles.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/tests.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/tools.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/state.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/websocket.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/smoke_test.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/__init__.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/baseline.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/ci_gate.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/comparison_runner.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/coverage_analyzer.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/html_report.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/mcp_client.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/metamorphic.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/model_registry.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/models.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/multi_env.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/oauth_flows.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/prompt_mutation.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/report_generator.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/runner_tools.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/schema_diff.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/token_manager.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/storage.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/test_profiles.py +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/README.md +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/assets/index-30Ed2JCz.css +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/assets/index-6JiH0p1L.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/index.html +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/index.html +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/package-lock.json +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/package.json +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/postcss.config.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/App.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/CommandPalette.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/CompareToolsTab.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/EditorStatusBar.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/EditorTabStrip.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ErrorAlert.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ErrorBoundary.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/LLMProfileSelector.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/LoadingSpinner.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/MCPProfileSelector.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/NotificationProvider.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/OptimizeDocsModal.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/OutputDiff.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ParameterCard.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/SchemaCodeViewer.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/SkeletonLoader.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/StreamingLogViewer.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestGenerationModal.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestProfileSelector.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestResultPanel.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestStatusIndicator.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolCallTimeline.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolComparison.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolDebugModal.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TraceView.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TypeBadge.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/Wizard.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/contexts/TestRunContext.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/contexts/ThemeContext.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useEditorTheme.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useKeyboardShortcuts.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useSafeFetch.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/index.css +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/main.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/AuthDebugger.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/ChatInterface.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/CompatibilityMatrix.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/Configuration.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/GenerationHistory.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/LLMProfiles.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPExplorer.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPHealth.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPProfiles.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MetricsDashboard.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/ProfilesManager.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/Reports.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/RunComparison.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/SecurityDashboard.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/TestManager.jsx +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/utils/__tests__/formatConverters.test.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/utils/formatConverters.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/tailwind.config.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/vite.config.js +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/SOURCES.txt +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/dependency_links.txt +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/entry_points.txt +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/requires.txt +0 -0
- {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/top_level.txt +0 -0
|
@@ -93,7 +93,7 @@ testmcpy = [
|
|
|
93
93
|
|
|
94
94
|
[project]
|
|
95
95
|
name = "testmcpy"
|
|
96
|
-
version = "0.7.0"
|
|
96
|
+
version = "0.7.2"
|
|
97
97
|
description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
|
|
98
98
|
authors = [{name = "Amin Ghadersohi"}]
|
|
99
99
|
license = "Apache-2.0"
|
|
@@ -1356,6 +1356,30 @@ class BedrockProvider(LLMProvider):
|
|
|
1356
1356
|
_claude_sdk_logger = logging.getLogger(__name__ + ".ClaudeSDKProvider")
|
|
1357
1357
|
|
|
1358
1358
|
|
|
1359
|
+
# Substrings that strongly indicate a tool-result is an error, even when
|
|
1360
|
+
# the SDK didn't flag is_error=True. Used by the retry-budget guard to
|
|
1361
|
+
# detect cases like MCP-side validation errors that come back as "successful"
|
|
1362
|
+
# tool results with error text in the body.
|
|
1363
|
+
_ERROR_PAYLOAD_MARKERS = (
|
|
1364
|
+
"validation error",
|
|
1365
|
+
"Unexpected keyword argument",
|
|
1366
|
+
"Missing required argument",
|
|
1367
|
+
"missing_argument",
|
|
1368
|
+
"unexpected_keyword_argument",
|
|
1369
|
+
'"error":',
|
|
1370
|
+
"'error':",
|
|
1371
|
+
"Error:",
|
|
1372
|
+
)
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
def _looks_like_error_payload(content: Any) -> bool:
|
|
1376
|
+
"""Heuristic: does this tool-result content look like an error?"""
|
|
1377
|
+
text = str(content) if content is not None else ""
|
|
1378
|
+
if not text:
|
|
1379
|
+
return False
|
|
1380
|
+
return any(marker in text for marker in _ERROR_PAYLOAD_MARKERS)
|
|
1381
|
+
|
|
1382
|
+
|
|
1359
1383
|
class ClaudeSDKProvider(LLMProvider):
|
|
1360
1384
|
"""Claude Agent SDK provider with native MCP integration.
|
|
1361
1385
|
|
|
@@ -1712,10 +1736,21 @@ class ClaudeSDKProvider(LLMProvider):
|
|
|
1712
1736
|
cost = 0.0
|
|
1713
1737
|
raw_events = []
|
|
1714
1738
|
|
|
1739
|
+
# Retry budget: if the model keeps calling the SAME tool with the
|
|
1740
|
+
# SAME arguments and getting the SAME error, abort the query
|
|
1741
|
+
# rather than letting it spin until the wall-clock timeout fires.
|
|
1742
|
+
# Counts how many times each (tool_name, args, error) signature
|
|
1743
|
+
# has been seen; we abort when any signature crosses the
|
|
1744
|
+
# threshold below.
|
|
1745
|
+
error_signature_counts: dict[tuple[str, str, str], int] = {}
|
|
1746
|
+
max_repeats_per_signature = 3
|
|
1747
|
+
retry_budget_aborted = False
|
|
1748
|
+
|
|
1715
1749
|
log(f"[ClaudeSDK] Starting query (model={self.model}, timeout={timeout}s)...")
|
|
1716
1750
|
|
|
1717
1751
|
async def execute_query():
|
|
1718
1752
|
nonlocal response_text, thinking_text, token_usage, cost
|
|
1753
|
+
nonlocal retry_budget_aborted
|
|
1719
1754
|
message_count = 0
|
|
1720
1755
|
# Track all text blocks per AssistantMessage so we can
|
|
1721
1756
|
# identify the FINAL text response (after all tool calls)
|
|
@@ -1805,6 +1840,50 @@ class ClaudeSDKProvider(LLMProvider):
|
|
|
1805
1840
|
f"[ClaudeSDK] Tool Result ({status}): {content_preview}"
|
|
1806
1841
|
)
|
|
1807
1842
|
|
|
1843
|
+
# Retry-budget enforcement: if the model
|
|
1844
|
+
# keeps making the same call with the
|
|
1845
|
+
# same args and getting the same error,
|
|
1846
|
+
# abort to break out of the loop.
|
|
1847
|
+
if is_error or _looks_like_error_payload(content):
|
|
1848
|
+
matching_call = next(
|
|
1849
|
+
(
|
|
1850
|
+
tc
|
|
1851
|
+
for tc in tool_calls
|
|
1852
|
+
if tc.get("id") == tool_use_id
|
|
1853
|
+
),
|
|
1854
|
+
None,
|
|
1855
|
+
)
|
|
1856
|
+
if matching_call:
|
|
1857
|
+
sig_args = json.dumps(
|
|
1858
|
+
matching_call.get("arguments", {}),
|
|
1859
|
+
sort_keys=True,
|
|
1860
|
+
default=str,
|
|
1861
|
+
)[:200]
|
|
1862
|
+
# Use a normalized prefix of the
|
|
1863
|
+
# error text — exact byte match
|
|
1864
|
+
# would be too brittle.
|
|
1865
|
+
sig_err = str(content)[:120]
|
|
1866
|
+
sig = (
|
|
1867
|
+
matching_call.get("name", ""),
|
|
1868
|
+
sig_args,
|
|
1869
|
+
sig_err,
|
|
1870
|
+
)
|
|
1871
|
+
error_signature_counts[sig] = (
|
|
1872
|
+
error_signature_counts.get(sig, 0) + 1
|
|
1873
|
+
)
|
|
1874
|
+
if (
|
|
1875
|
+
error_signature_counts[sig]
|
|
1876
|
+
>= max_repeats_per_signature
|
|
1877
|
+
):
|
|
1878
|
+
log(
|
|
1879
|
+
f"[ClaudeSDK] Retry budget exhausted: "
|
|
1880
|
+
f"same call+error repeated "
|
|
1881
|
+
f"{max_repeats_per_signature}× — aborting "
|
|
1882
|
+
f"(tool={sig[0]}, error={sig_err[:80]!r})"
|
|
1883
|
+
)
|
|
1884
|
+
retry_budget_aborted = True
|
|
1885
|
+
return
|
|
1886
|
+
|
|
1808
1887
|
elif isinstance(message, ResultMessage):
|
|
1809
1888
|
if message.usage:
|
|
1810
1889
|
usage = message.usage
|
|
@@ -1882,9 +1961,25 @@ class ClaudeSDKProvider(LLMProvider):
|
|
|
1882
1961
|
mcp_tool_results.append(mcp_result)
|
|
1883
1962
|
|
|
1884
1963
|
duration = time.time() - start_time
|
|
1964
|
+
|
|
1965
|
+
# If we aborted via the retry budget, surface that in the
|
|
1966
|
+
# response so evaluators see a clear, actionable error
|
|
1967
|
+
# rather than an empty / partial response.
|
|
1968
|
+
if retry_budget_aborted and not response_text:
|
|
1969
|
+
response_text = (
|
|
1970
|
+
f"Error: aborted after the model repeated the same tool call "
|
|
1971
|
+
f"and got the same error {max_repeats_per_signature}× in a row. "
|
|
1972
|
+
f"This usually means the prompt is priming the model toward a "
|
|
1973
|
+
f"wrong parameter name, the tool's schema mismatches the model's "
|
|
1974
|
+
f"expectation, or the resource being queried doesn't exist. See "
|
|
1975
|
+
f"the log lines marked '[ClaudeSDK] Tool Result (Error)' for the "
|
|
1976
|
+
f"specific error pattern."
|
|
1977
|
+
)
|
|
1978
|
+
|
|
1885
1979
|
log(
|
|
1886
1980
|
f"[ClaudeSDK] Done: {len(response_text)} chars, "
|
|
1887
1981
|
f"{len(tool_calls)} tool calls, {len(mcp_tool_results)} results"
|
|
1982
|
+
+ (" [retry budget aborted]" if retry_budget_aborted else "")
|
|
1888
1983
|
)
|
|
1889
1984
|
|
|
1890
1985
|
# Estimate cost from tokens if SDK didn't provide it (subscription billing)
|
|
@@ -1955,6 +2050,21 @@ class ClaudeSDKProvider(LLMProvider):
|
|
|
1955
2050
|
_assistant_logger = logging.getLogger(__name__ + ".AssistantProvider")
|
|
1956
2051
|
|
|
1957
2052
|
|
|
2053
|
+
def _format_seconds(seconds: float) -> str:
|
|
2054
|
+
"""Format a duration so sub-second values aren't rounded to ``0s``.
|
|
2055
|
+
|
|
2056
|
+
The SSE idle threshold is configurable down to fractions of a second
|
|
2057
|
+
(the unit tests override it to 0.3s). Using ``f"{x:.0f}s"`` would
|
|
2058
|
+
produce a misleading ``0s`` in those messages — switch to ms below
|
|
2059
|
+
1s, decimals below 10s, and integer seconds otherwise.
|
|
2060
|
+
"""
|
|
2061
|
+
if seconds < 1.0:
|
|
2062
|
+
return f"{seconds * 1000:.0f}ms"
|
|
2063
|
+
if seconds < 10.0:
|
|
2064
|
+
return f"{seconds:.1f}s"
|
|
2065
|
+
return f"{seconds:.0f}s"
|
|
2066
|
+
|
|
2067
|
+
|
|
1958
2068
|
@dataclass
|
|
1959
2069
|
class _SSEStreamState:
|
|
1960
2070
|
"""Mutable state accumulated as we parse a chatbot SSE response."""
|
|
@@ -2016,6 +2126,13 @@ class AssistantProvider(LLMProvider):
|
|
|
2016
2126
|
_DEFAULT_CONVERSATIONS_PATH = "/api/v1/copilot/conversations"
|
|
2017
2127
|
_DEFAULT_COMPLETIONS_PATH = "/api/v1/copilot/completions"
|
|
2018
2128
|
|
|
2129
|
+
# If the SSE stream emits no recognized event for this many seconds,
|
|
2130
|
+
# abort the stream. Defends against a chatbot backend that keeps the
|
|
2131
|
+
# connection open (preventing httpx's per-event read timeout from
|
|
2132
|
+
# firing) but stops emitting real progress. Observed in c29
|
|
2133
|
+
# (SC-105915). Class-level so subclasses / tests can override.
|
|
2134
|
+
SSE_IDLE_ABORT_SECONDS: float = 90.0
|
|
2135
|
+
|
|
2019
2136
|
def __init__(
|
|
2020
2137
|
self,
|
|
2021
2138
|
model: str = "default",
|
|
@@ -2140,6 +2257,16 @@ class AssistantProvider(LLMProvider):
|
|
|
2140
2257
|
|
|
2141
2258
|
log(f"[Assistant] POST {completions_url} (conversation={self._conversation_id})")
|
|
2142
2259
|
|
|
2260
|
+
# Idle abort: if the SSE stream emits NO recognized event within
|
|
2261
|
+
# this many seconds, give up. This catches a chatbot backend that
|
|
2262
|
+
# keeps the connection open (so httpx's per-event read timeout
|
|
2263
|
+
# never fires) but stops sending real progress — observed in
|
|
2264
|
+
# eval cycle c29 (SC-105915) where C00_9, C01_9, C02_7 hung
|
|
2265
|
+
# despite the per-test wall-clock added in v0.7.1.
|
|
2266
|
+
sse_idle_abort_seconds = self.SSE_IDLE_ABORT_SECONDS
|
|
2267
|
+
last_event_at = time.time()
|
|
2268
|
+
idle_aborted = False
|
|
2269
|
+
|
|
2143
2270
|
state = _SSEStreamState()
|
|
2144
2271
|
try:
|
|
2145
2272
|
async with self._client.stream(
|
|
@@ -2153,7 +2280,40 @@ class AssistantProvider(LLMProvider):
|
|
|
2153
2280
|
)
|
|
2154
2281
|
|
|
2155
2282
|
current_event: str | None = None
|
|
2156
|
-
|
|
2283
|
+
# Drive the line iterator manually so we can wrap each
|
|
2284
|
+
# await in asyncio.wait_for(...). httpx's aiter_lines()
|
|
2285
|
+
# blocks inside __anext__ when no bytes arrive — a plain
|
|
2286
|
+
# `async for` would be suspended forever. The wait_for
|
|
2287
|
+
# catches the case where the SSE connection stays open
|
|
2288
|
+
# but never sends another byte (real-world c29 hang).
|
|
2289
|
+
line_iter = resp.aiter_lines().__aiter__()
|
|
2290
|
+
budget_str = _format_seconds(sse_idle_abort_seconds)
|
|
2291
|
+
while True:
|
|
2292
|
+
elapsed = time.time() - last_event_at
|
|
2293
|
+
remaining = sse_idle_abort_seconds - elapsed
|
|
2294
|
+
if remaining <= 0:
|
|
2295
|
+
log(
|
|
2296
|
+
f"[Assistant] SSE idle abort: no recognized event for "
|
|
2297
|
+
f"{budget_str} — closing stream"
|
|
2298
|
+
)
|
|
2299
|
+
idle_aborted = True
|
|
2300
|
+
break
|
|
2301
|
+
try:
|
|
2302
|
+
raw_line = await asyncio.wait_for(line_iter.__anext__(), timeout=remaining)
|
|
2303
|
+
except StopAsyncIteration:
|
|
2304
|
+
break
|
|
2305
|
+
except asyncio.TimeoutError:
|
|
2306
|
+
# Budget is measured since the last *recognized* event.
|
|
2307
|
+
# Unrecognized noise (keepalives, malformed events) does
|
|
2308
|
+
# NOT reset last_event_at, so this fires correctly even
|
|
2309
|
+
# if bytes are arriving without real progress.
|
|
2310
|
+
log(
|
|
2311
|
+
f"[Assistant] SSE idle abort: no recognized event for "
|
|
2312
|
+
f"{budget_str} — closing stream"
|
|
2313
|
+
)
|
|
2314
|
+
idle_aborted = True
|
|
2315
|
+
break
|
|
2316
|
+
|
|
2157
2317
|
line = raw_line.strip()
|
|
2158
2318
|
if not line:
|
|
2159
2319
|
current_event = None
|
|
@@ -2178,6 +2338,8 @@ class AssistantProvider(LLMProvider):
|
|
|
2178
2338
|
continue
|
|
2179
2339
|
|
|
2180
2340
|
self._handle_sse_event(current_event, data, state, log)
|
|
2341
|
+
# A real event arrived — reset the idle timer.
|
|
2342
|
+
last_event_at = time.time()
|
|
2181
2343
|
|
|
2182
2344
|
except httpx.TimeoutException:
|
|
2183
2345
|
duration = time.time() - start_time
|
|
@@ -2201,13 +2363,22 @@ class AssistantProvider(LLMProvider):
|
|
|
2201
2363
|
duration = time.time() - start_time
|
|
2202
2364
|
if state.got_error and not state.response_text:
|
|
2203
2365
|
state.response_text = f"Error: {state.error_message}"
|
|
2366
|
+
elif idle_aborted and not state.response_text:
|
|
2367
|
+
# Surface the idle abort cleanly so evaluators don't see an
|
|
2368
|
+
# empty response with no explanation.
|
|
2369
|
+
state.response_text = (
|
|
2370
|
+
f"Error: SSE stream went idle for "
|
|
2371
|
+
f"{_format_seconds(sse_idle_abort_seconds)} without sending a "
|
|
2372
|
+
"final / error event. The chatbot backend kept the connection "
|
|
2373
|
+
"open but stopped emitting progress. Aborted to free the runner."
|
|
2374
|
+
)
|
|
2204
2375
|
|
|
2205
2376
|
log(
|
|
2206
2377
|
f"[Assistant] Done: {len(state.response_text)} chars, "
|
|
2207
2378
|
f"{len(state.tool_calls)} tool calls, {state.token_event_count} tokens, "
|
|
2208
2379
|
f"final={'yes' if state.got_final else 'no'}, "
|
|
2209
2380
|
f"error={'yes' if state.got_error else 'no'}, "
|
|
2210
|
-
f"{duration:.2f}s"
|
|
2381
|
+
f"{duration:.2f}s" + (" [SSE idle aborted]" if idle_aborted else "")
|
|
2211
2382
|
)
|
|
2212
2383
|
|
|
2213
2384
|
return LLMResult(
|
|
@@ -1360,20 +1360,68 @@ class TestRunner:
|
|
|
1360
1360
|
|
|
1361
1361
|
return results
|
|
1362
1362
|
|
|
1363
|
+
# Safety margin added to each test's per-call timeout to compute the
|
|
1364
|
+
# wall-clock budget. Covers auth + conversation create + tool execution
|
|
1365
|
+
# + evaluator runtime. Class-level so tests can shorten it.
|
|
1366
|
+
WALL_CLOCK_SLACK_SECONDS: float = 60.0
|
|
1367
|
+
|
|
1363
1368
|
async def _run_test_with_retry(
|
|
1364
1369
|
self, test_case: TestCase, max_test_retries: int = 2
|
|
1365
1370
|
) -> TestResult:
|
|
1366
|
-
"""Run a test with retry logic for rate limit failures.
|
|
1367
|
-
|
|
1368
|
-
|
|
1371
|
+
"""Run a test with retry logic for rate limit failures.
|
|
1372
|
+
|
|
1373
|
+
Each test is wrapped in a wall-clock timeout so it can never hang
|
|
1374
|
+
the runner indefinitely. The wall-clock timeout is the test's
|
|
1375
|
+
declared ``timeout`` plus :attr:`WALL_CLOCK_SLACK_SECONDS` to
|
|
1376
|
+
absorb provider-side overhead (auth, conversation creation,
|
|
1377
|
+
evaluators); for CLI providers we also bump it up like
|
|
1378
|
+
:meth:`run_test` does for the inner LLM call.
|
|
1379
|
+
|
|
1380
|
+
Without this, providers that stream events (e.g. the assistant
|
|
1381
|
+
chatbot endpoint) can keep the test alive forever — the per-event
|
|
1382
|
+
httpx timeout resets on every received chunk, so a chatbot stuck
|
|
1383
|
+
in an infinite tool-call retry loop keeps streaming until the
|
|
1384
|
+
runner is killed externally.
|
|
1385
|
+
"""
|
|
1386
|
+
# Compute the wall-clock budget once per test (used across retries).
|
|
1387
|
+
cli_providers = ("claude-sdk", "claude-cli", "claude-code", "codex-cli", "codex")
|
|
1388
|
+
per_call_timeout = test_case.timeout
|
|
1389
|
+
if self.provider in cli_providers:
|
|
1390
|
+
per_call_timeout = max(per_call_timeout, 120.0)
|
|
1391
|
+
wall_clock_timeout = per_call_timeout + self.WALL_CLOCK_SLACK_SECONDS
|
|
1392
|
+
|
|
1393
|
+
async def _dispatch() -> TestResult:
|
|
1369
1394
|
if test_case.is_auth_only:
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1395
|
+
return await self.run_auth_only_test(test_case)
|
|
1396
|
+
if test_case.is_load_test:
|
|
1397
|
+
return await self.run_load_test(test_case)
|
|
1398
|
+
if test_case.is_multi_turn:
|
|
1399
|
+
return await self.run_multi_turn_test(test_case)
|
|
1400
|
+
return await self.run_test(test_case)
|
|
1401
|
+
|
|
1402
|
+
for attempt in range(max_test_retries + 1):
|
|
1403
|
+
try:
|
|
1404
|
+
result = await asyncio.wait_for(_dispatch(), timeout=wall_clock_timeout)
|
|
1405
|
+
except asyncio.TimeoutError:
|
|
1406
|
+
if self.verbose:
|
|
1407
|
+
self._log(
|
|
1408
|
+
f" Test wall-clock timeout after {wall_clock_timeout:.1f}s — "
|
|
1409
|
+
"the provider was streaming or retrying without making progress"
|
|
1410
|
+
)
|
|
1411
|
+
result = TestResult(
|
|
1412
|
+
test_name=test_case.name,
|
|
1413
|
+
passed=False,
|
|
1414
|
+
score=0.0,
|
|
1415
|
+
duration=wall_clock_timeout,
|
|
1416
|
+
response=(
|
|
1417
|
+
f"Error: test wall-clock timeout after {wall_clock_timeout:.1f}s. "
|
|
1418
|
+
"The provider did not return a final result in time. This is a "
|
|
1419
|
+
"hard cap independent of the per-call timeout, used to break out "
|
|
1420
|
+
"of provider-side retry loops or stuck SSE streams."
|
|
1421
|
+
),
|
|
1422
|
+
reason="wall-clock timeout",
|
|
1423
|
+
error="wall-clock timeout",
|
|
1424
|
+
)
|
|
1377
1425
|
|
|
1378
1426
|
# Check if this was a rate limit failure
|
|
1379
1427
|
is_rate_limit_failure = (
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|