testmcpy 0.7.0__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. {testmcpy-0.7.0/testmcpy.egg-info → testmcpy-0.7.2}/PKG-INFO +1 -1
  2. {testmcpy-0.7.0 → testmcpy-0.7.2}/pyproject.toml +1 -1
  3. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/__init__.py +1 -1
  4. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/llm_integration.py +173 -2
  5. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/test_runner.py +58 -10
  6. {testmcpy-0.7.0 → testmcpy-0.7.2/testmcpy.egg-info}/PKG-INFO +1 -1
  7. {testmcpy-0.7.0 → testmcpy-0.7.2}/LICENSE +0 -0
  8. {testmcpy-0.7.0 → testmcpy-0.7.2}/MANIFEST.in +0 -0
  9. {testmcpy-0.7.0 → testmcpy-0.7.2}/NOTICE +0 -0
  10. {testmcpy-0.7.0 → testmcpy-0.7.2}/README.md +0 -0
  11. {testmcpy-0.7.0 → testmcpy-0.7.2}/setup.cfg +0 -0
  12. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/__init__.py +0 -0
  13. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/hooks.py +0 -0
  14. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/models.py +0 -0
  15. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/orchestrator.py +0 -0
  16. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/prompts.py +0 -0
  17. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/agent/tools.py +0 -0
  18. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/auth_debugger.py +0 -0
  19. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/auth_flow_recorder.py +0 -0
  20. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/__init__.py +0 -0
  21. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/app.py +0 -0
  22. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/__init__.py +0 -0
  23. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/agent.py +0 -0
  24. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/baseline.py +0 -0
  25. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/export_db.py +0 -0
  26. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/mcp.py +0 -0
  27. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/metamorphic.py +0 -0
  28. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/multi_env.py +0 -0
  29. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/mutate.py +0 -0
  30. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/push.py +0 -0
  31. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/run.py +0 -0
  32. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/server.py +0 -0
  33. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/tools.py +0 -0
  34. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/tui.py +0 -0
  35. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/cli/commands/wizard.py +0 -0
  36. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/config.py +0 -0
  37. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/__init__.py +0 -0
  38. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/chat_session.py +0 -0
  39. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/docs_optimizer.py +0 -0
  40. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/mcp_manager.py +0 -0
  41. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/tool_comparison.py +0 -0
  42. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/core/tool_discovery.py +0 -0
  43. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/db.py +0 -0
  44. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/error_handlers.py +0 -0
  45. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/__init__.py +0 -0
  46. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/auth_evaluators.py +0 -0
  47. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/base_evaluators.py +0 -0
  48. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/evals/evaluator_packs.py +0 -0
  49. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/__init__.py +0 -0
  50. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/base.py +0 -0
  51. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/curl.py +0 -0
  52. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/graphql.py +0 -0
  53. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/javascript_client.py +0 -0
  54. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/json_yaml.py +0 -0
  55. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/protobuf.py +0 -0
  56. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/python.py +0 -0
  57. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/python_client.py +0 -0
  58. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/thrift.py +0 -0
  59. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/typescript.py +0 -0
  60. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/formatters/typescript_client.py +0 -0
  61. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/llm_profiles.py +0 -0
  62. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/mcp_profiles.py +0 -0
  63. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/migrate_json.py +0 -0
  64. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/models.py +0 -0
  65. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_detailed_exploration.py +0 -0
  66. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_poc.py +0 -0
  67. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/claude_sdk_working_poc.py +0 -0
  68. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/research/test_ollama_tools.py +0 -0
  69. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/__init__.py +0 -0
  70. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/api.py +0 -0
  71. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/api.py.bak +0 -0
  72. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/auth_middleware.py +0 -0
  73. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/helpers/__init__.py +0 -0
  74. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/helpers/mcp_config.py +0 -0
  75. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/models.py +0 -0
  76. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/__init__.py +0 -0
  77. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/agent.py +0 -0
  78. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/auth.py +0 -0
  79. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/compare.py +0 -0
  80. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/compatibility.py +0 -0
  81. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/generation_logs.py +0 -0
  82. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/health.py +0 -0
  83. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/llm.py +0 -0
  84. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/mcp_profiles.py +0 -0
  85. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/metrics.py +0 -0
  86. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/results.py +0 -0
  87. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/search.py +0 -0
  88. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/security.py +0 -0
  89. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/smoke_reports.py +0 -0
  90. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/test_profiles.py +0 -0
  91. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/tests.py +0 -0
  92. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/routers/tools.py +0 -0
  93. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/state.py +0 -0
  94. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/server/websocket.py +0 -0
  95. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/smoke_test.py +0 -0
  96. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/__init__.py +0 -0
  97. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/baseline.py +0 -0
  98. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/ci_gate.py +0 -0
  99. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/comparison_runner.py +0 -0
  100. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/coverage_analyzer.py +0 -0
  101. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/html_report.py +0 -0
  102. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/mcp_client.py +0 -0
  103. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/metamorphic.py +0 -0
  104. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/model_registry.py +0 -0
  105. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/models.py +0 -0
  106. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/multi_env.py +0 -0
  107. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/oauth_flows.py +0 -0
  108. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/prompt_mutation.py +0 -0
  109. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/report_generator.py +0 -0
  110. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/runner_tools.py +0 -0
  111. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/schema_diff.py +0 -0
  112. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/src/token_manager.py +0 -0
  113. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/storage.py +0 -0
  114. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/test_profiles.py +0 -0
  115. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/README.md +0 -0
  116. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/assets/index-30Ed2JCz.css +0 -0
  117. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/assets/index-6JiH0p1L.js +0 -0
  118. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/dist/index.html +0 -0
  119. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/index.html +0 -0
  120. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/package-lock.json +0 -0
  121. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/package.json +0 -0
  122. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/postcss.config.js +0 -0
  123. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/App.jsx +0 -0
  124. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/CommandPalette.jsx +0 -0
  125. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/CompareToolsTab.jsx +0 -0
  126. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/EditorStatusBar.jsx +0 -0
  127. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/EditorTabStrip.jsx +0 -0
  128. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ErrorAlert.jsx +0 -0
  129. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ErrorBoundary.jsx +0 -0
  130. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/LLMProfileSelector.jsx +0 -0
  131. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/LoadingSpinner.jsx +0 -0
  132. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/MCPProfileSelector.jsx +0 -0
  133. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/NotificationProvider.jsx +0 -0
  134. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/OptimizeDocsModal.jsx +0 -0
  135. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/OutputDiff.jsx +0 -0
  136. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ParameterCard.jsx +0 -0
  137. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/SchemaCodeViewer.jsx +0 -0
  138. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/SkeletonLoader.jsx +0 -0
  139. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/StreamingLogViewer.jsx +0 -0
  140. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestGenerationModal.jsx +0 -0
  141. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestProfileSelector.jsx +0 -0
  142. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestResultPanel.jsx +0 -0
  143. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TestStatusIndicator.jsx +0 -0
  144. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolCallTimeline.jsx +0 -0
  145. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolComparison.jsx +0 -0
  146. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/ToolDebugModal.jsx +0 -0
  147. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TraceView.jsx +0 -0
  148. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/TypeBadge.jsx +0 -0
  149. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/components/Wizard.jsx +0 -0
  150. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/contexts/TestRunContext.jsx +0 -0
  151. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/contexts/ThemeContext.jsx +0 -0
  152. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useEditorTheme.js +0 -0
  153. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useKeyboardShortcuts.js +0 -0
  154. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/hooks/useSafeFetch.js +0 -0
  155. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/index.css +0 -0
  156. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/main.jsx +0 -0
  157. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/AuthDebugger.jsx +0 -0
  158. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/ChatInterface.jsx +0 -0
  159. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/CompatibilityMatrix.jsx +0 -0
  160. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/Configuration.jsx +0 -0
  161. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/GenerationHistory.jsx +0 -0
  162. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/LLMProfiles.jsx +0 -0
  163. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPExplorer.jsx +0 -0
  164. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPHealth.jsx +0 -0
  165. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MCPProfiles.jsx +0 -0
  166. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/MetricsDashboard.jsx +0 -0
  167. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/ProfilesManager.jsx +0 -0
  168. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/Reports.jsx +0 -0
  169. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/RunComparison.jsx +0 -0
  170. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/SecurityDashboard.jsx +0 -0
  171. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/pages/TestManager.jsx +0 -0
  172. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/utils/__tests__/formatConverters.test.js +0 -0
  173. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/src/utils/formatConverters.js +0 -0
  174. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/tailwind.config.js +0 -0
  175. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy/ui/vite.config.js +0 -0
  176. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/SOURCES.txt +0 -0
  177. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/dependency_links.txt +0 -0
  178. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/entry_points.txt +0 -0
  179. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/requires.txt +0 -0
  180. {testmcpy-0.7.0 → testmcpy-0.7.2}/testmcpy.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: testmcpy
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
5
5
  Author: Amin Ghadersohi
6
6
  License-Expression: Apache-2.0
@@ -93,7 +93,7 @@ testmcpy = [
93
93
 
94
94
  [project]
95
95
  name = "testmcpy"
96
- version = "0.7.0"
96
+ version = "0.7.2"
97
97
  description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
98
98
  authors = [{name = "Amin Ghadersohi"}]
99
99
  license = "Apache-2.0"
@@ -11,6 +11,6 @@ try:
11
11
  __version__ = version("testmcpy")
12
12
  except Exception:
13
13
  # Fallback for development or when package not installed
14
- __version__ = "0.7.0"
14
+ __version__ = "0.7.2"
15
15
 
16
16
  __author__ = "testmcpy Contributors"
@@ -1356,6 +1356,30 @@ class BedrockProvider(LLMProvider):
1356
1356
  _claude_sdk_logger = logging.getLogger(__name__ + ".ClaudeSDKProvider")
1357
1357
 
1358
1358
 
1359
+ # Substrings that strongly indicate a tool-result is an error, even when
1360
+ # the SDK didn't flag is_error=True. Used by the retry-budget guard to
1361
+ # detect cases like MCP-side validation errors that come back as "successful"
1362
+ # tool results with error text in the body.
1363
+ _ERROR_PAYLOAD_MARKERS = (
1364
+ "validation error",
1365
+ "Unexpected keyword argument",
1366
+ "Missing required argument",
1367
+ "missing_argument",
1368
+ "unexpected_keyword_argument",
1369
+ '"error":',
1370
+ "'error':",
1371
+ "Error:",
1372
+ )
1373
+
1374
+
1375
+ def _looks_like_error_payload(content: Any) -> bool:
1376
+ """Heuristic: does this tool-result content look like an error?"""
1377
+ text = str(content) if content is not None else ""
1378
+ if not text:
1379
+ return False
1380
+ return any(marker in text for marker in _ERROR_PAYLOAD_MARKERS)
1381
+
1382
+
1359
1383
  class ClaudeSDKProvider(LLMProvider):
1360
1384
  """Claude Agent SDK provider with native MCP integration.
1361
1385
 
@@ -1712,10 +1736,21 @@ class ClaudeSDKProvider(LLMProvider):
1712
1736
  cost = 0.0
1713
1737
  raw_events = []
1714
1738
 
1739
+ # Retry budget: if the model keeps calling the SAME tool with the
1740
+ # SAME arguments and getting the SAME error, abort the query
1741
+ # rather than letting it spin until the wall-clock timeout fires.
1742
+ # Counts how many times each (tool_name, args, error) signature
1743
+ # has been seen; we abort when any signature crosses the
1744
+ # threshold below.
1745
+ error_signature_counts: dict[tuple[str, str, str], int] = {}
1746
+ max_repeats_per_signature = 3
1747
+ retry_budget_aborted = False
1748
+
1715
1749
  log(f"[ClaudeSDK] Starting query (model={self.model}, timeout={timeout}s)...")
1716
1750
 
1717
1751
  async def execute_query():
1718
1752
  nonlocal response_text, thinking_text, token_usage, cost
1753
+ nonlocal retry_budget_aborted
1719
1754
  message_count = 0
1720
1755
  # Track all text blocks per AssistantMessage so we can
1721
1756
  # identify the FINAL text response (after all tool calls)
@@ -1805,6 +1840,50 @@ class ClaudeSDKProvider(LLMProvider):
1805
1840
  f"[ClaudeSDK] Tool Result ({status}): {content_preview}"
1806
1841
  )
1807
1842
 
1843
+ # Retry-budget enforcement: if the model
1844
+ # keeps making the same call with the
1845
+ # same args and getting the same error,
1846
+ # abort to break out of the loop.
1847
+ if is_error or _looks_like_error_payload(content):
1848
+ matching_call = next(
1849
+ (
1850
+ tc
1851
+ for tc in tool_calls
1852
+ if tc.get("id") == tool_use_id
1853
+ ),
1854
+ None,
1855
+ )
1856
+ if matching_call:
1857
+ sig_args = json.dumps(
1858
+ matching_call.get("arguments", {}),
1859
+ sort_keys=True,
1860
+ default=str,
1861
+ )[:200]
1862
+ # Use a normalized prefix of the
1863
+ # error text — exact byte match
1864
+ # would be too brittle.
1865
+ sig_err = str(content)[:120]
1866
+ sig = (
1867
+ matching_call.get("name", ""),
1868
+ sig_args,
1869
+ sig_err,
1870
+ )
1871
+ error_signature_counts[sig] = (
1872
+ error_signature_counts.get(sig, 0) + 1
1873
+ )
1874
+ if (
1875
+ error_signature_counts[sig]
1876
+ >= max_repeats_per_signature
1877
+ ):
1878
+ log(
1879
+ f"[ClaudeSDK] Retry budget exhausted: "
1880
+ f"same call+error repeated "
1881
+ f"{max_repeats_per_signature}× — aborting "
1882
+ f"(tool={sig[0]}, error={sig_err[:80]!r})"
1883
+ )
1884
+ retry_budget_aborted = True
1885
+ return
1886
+
1808
1887
  elif isinstance(message, ResultMessage):
1809
1888
  if message.usage:
1810
1889
  usage = message.usage
@@ -1882,9 +1961,25 @@ class ClaudeSDKProvider(LLMProvider):
1882
1961
  mcp_tool_results.append(mcp_result)
1883
1962
 
1884
1963
  duration = time.time() - start_time
1964
+
1965
+ # If we aborted via the retry budget, surface that in the
1966
+ # response so evaluators see a clear, actionable error
1967
+ # rather than an empty / partial response.
1968
+ if retry_budget_aborted and not response_text:
1969
+ response_text = (
1970
+ f"Error: aborted after the model repeated the same tool call "
1971
+ f"and got the same error {max_repeats_per_signature}× in a row. "
1972
+ f"This usually means the prompt is priming the model toward a "
1973
+ f"wrong parameter name, the tool's schema mismatches the model's "
1974
+ f"expectation, or the resource being queried doesn't exist. See "
1975
+ f"the log lines marked '[ClaudeSDK] Tool Result (Error)' for the "
1976
+ f"specific error pattern."
1977
+ )
1978
+
1885
1979
  log(
1886
1980
  f"[ClaudeSDK] Done: {len(response_text)} chars, "
1887
1981
  f"{len(tool_calls)} tool calls, {len(mcp_tool_results)} results"
1982
+ + (" [retry budget aborted]" if retry_budget_aborted else "")
1888
1983
  )
1889
1984
 
1890
1985
  # Estimate cost from tokens if SDK didn't provide it (subscription billing)
@@ -1955,6 +2050,21 @@ class ClaudeSDKProvider(LLMProvider):
1955
2050
  _assistant_logger = logging.getLogger(__name__ + ".AssistantProvider")
1956
2051
 
1957
2052
 
2053
+ def _format_seconds(seconds: float) -> str:
2054
+ """Format a duration so sub-second values aren't rounded to ``0s``.
2055
+
2056
+ The SSE idle threshold is configurable down to fractions of a second
2057
+ (the unit tests override it to 0.3s). Using ``f"{x:.0f}s"`` would
2058
+ produce a misleading ``0s`` in those messages — switch to ms below
2059
+ 1s, decimals below 10s, and integer seconds otherwise.
2060
+ """
2061
+ if seconds < 1.0:
2062
+ return f"{seconds * 1000:.0f}ms"
2063
+ if seconds < 10.0:
2064
+ return f"{seconds:.1f}s"
2065
+ return f"{seconds:.0f}s"
2066
+
2067
+
1958
2068
  @dataclass
1959
2069
  class _SSEStreamState:
1960
2070
  """Mutable state accumulated as we parse a chatbot SSE response."""
@@ -2016,6 +2126,13 @@ class AssistantProvider(LLMProvider):
2016
2126
  _DEFAULT_CONVERSATIONS_PATH = "/api/v1/copilot/conversations"
2017
2127
  _DEFAULT_COMPLETIONS_PATH = "/api/v1/copilot/completions"
2018
2128
 
2129
+ # If the SSE stream emits no recognized event for this many seconds,
2130
+ # abort the stream. Defends against a chatbot backend that keeps the
2131
+ # connection open (preventing httpx's per-event read timeout from
2132
+ # firing) but stops emitting real progress. Observed in c29
2133
+ # (SC-105915). Class-level so subclasses / tests can override.
2134
+ SSE_IDLE_ABORT_SECONDS: float = 90.0
2135
+
2019
2136
  def __init__(
2020
2137
  self,
2021
2138
  model: str = "default",
@@ -2140,6 +2257,16 @@ class AssistantProvider(LLMProvider):
2140
2257
 
2141
2258
  log(f"[Assistant] POST {completions_url} (conversation={self._conversation_id})")
2142
2259
 
2260
+ # Idle abort: if the SSE stream emits NO recognized event within
2261
+ # this many seconds, give up. This catches a chatbot backend that
2262
+ # keeps the connection open (so httpx's per-event read timeout
2263
+ # never fires) but stops sending real progress — observed in
2264
+ # eval cycle c29 (SC-105915) where C00_9, C01_9, C02_7 hung
2265
+ # despite the per-test wall-clock added in v0.7.1.
2266
+ sse_idle_abort_seconds = self.SSE_IDLE_ABORT_SECONDS
2267
+ last_event_at = time.time()
2268
+ idle_aborted = False
2269
+
2143
2270
  state = _SSEStreamState()
2144
2271
  try:
2145
2272
  async with self._client.stream(
@@ -2153,7 +2280,40 @@ class AssistantProvider(LLMProvider):
2153
2280
  )
2154
2281
 
2155
2282
  current_event: str | None = None
2156
- async for raw_line in resp.aiter_lines():
2283
+ # Drive the line iterator manually so we can wrap each
2284
+ # await in asyncio.wait_for(...). httpx's aiter_lines()
2285
+ # blocks inside __anext__ when no bytes arrive — a plain
2286
+ # `async for` would be suspended forever. The wait_for
2287
+ # catches the case where the SSE connection stays open
2288
+ # but never sends another byte (real-world c29 hang).
2289
+ line_iter = resp.aiter_lines().__aiter__()
2290
+ budget_str = _format_seconds(sse_idle_abort_seconds)
2291
+ while True:
2292
+ elapsed = time.time() - last_event_at
2293
+ remaining = sse_idle_abort_seconds - elapsed
2294
+ if remaining <= 0:
2295
+ log(
2296
+ f"[Assistant] SSE idle abort: no recognized event for "
2297
+ f"{budget_str} — closing stream"
2298
+ )
2299
+ idle_aborted = True
2300
+ break
2301
+ try:
2302
+ raw_line = await asyncio.wait_for(line_iter.__anext__(), timeout=remaining)
2303
+ except StopAsyncIteration:
2304
+ break
2305
+ except asyncio.TimeoutError:
2306
+ # Budget is measured since the last *recognized* event.
2307
+ # Unrecognized noise (keepalives, malformed events) does
2308
+ # NOT reset last_event_at, so this fires correctly even
2309
+ # if bytes are arriving without real progress.
2310
+ log(
2311
+ f"[Assistant] SSE idle abort: no recognized event for "
2312
+ f"{budget_str} — closing stream"
2313
+ )
2314
+ idle_aborted = True
2315
+ break
2316
+
2157
2317
  line = raw_line.strip()
2158
2318
  if not line:
2159
2319
  current_event = None
@@ -2178,6 +2338,8 @@ class AssistantProvider(LLMProvider):
2178
2338
  continue
2179
2339
 
2180
2340
  self._handle_sse_event(current_event, data, state, log)
2341
+ # A real event arrived — reset the idle timer.
2342
+ last_event_at = time.time()
2181
2343
 
2182
2344
  except httpx.TimeoutException:
2183
2345
  duration = time.time() - start_time
@@ -2201,13 +2363,22 @@ class AssistantProvider(LLMProvider):
2201
2363
  duration = time.time() - start_time
2202
2364
  if state.got_error and not state.response_text:
2203
2365
  state.response_text = f"Error: {state.error_message}"
2366
+ elif idle_aborted and not state.response_text:
2367
+ # Surface the idle abort cleanly so evaluators don't see an
2368
+ # empty response with no explanation.
2369
+ state.response_text = (
2370
+ f"Error: SSE stream went idle for "
2371
+ f"{_format_seconds(sse_idle_abort_seconds)} without sending a "
2372
+ "final / error event. The chatbot backend kept the connection "
2373
+ "open but stopped emitting progress. Aborted to free the runner."
2374
+ )
2204
2375
 
2205
2376
  log(
2206
2377
  f"[Assistant] Done: {len(state.response_text)} chars, "
2207
2378
  f"{len(state.tool_calls)} tool calls, {state.token_event_count} tokens, "
2208
2379
  f"final={'yes' if state.got_final else 'no'}, "
2209
2380
  f"error={'yes' if state.got_error else 'no'}, "
2210
- f"{duration:.2f}s"
2381
+ f"{duration:.2f}s" + (" [SSE idle aborted]" if idle_aborted else "")
2211
2382
  )
2212
2383
 
2213
2384
  return LLMResult(
@@ -1360,20 +1360,68 @@ class TestRunner:
1360
1360
 
1361
1361
  return results
1362
1362
 
1363
+ # Safety margin added to each test's per-call timeout to compute the
1364
+ # wall-clock budget. Covers auth + conversation create + tool execution
1365
+ # + evaluator runtime. Class-level so tests can shorten it.
1366
+ WALL_CLOCK_SLACK_SECONDS: float = 60.0
1367
+
1363
1368
  async def _run_test_with_retry(
1364
1369
  self, test_case: TestCase, max_test_retries: int = 2
1365
1370
  ) -> TestResult:
1366
- """Run a test with retry logic for rate limit failures."""
1367
- for attempt in range(max_test_retries + 1):
1368
- # Dispatch to appropriate runner
1371
+ """Run a test with retry logic for rate limit failures.
1372
+
1373
+ Each test is wrapped in a wall-clock timeout so it can never hang
1374
+ the runner indefinitely. The wall-clock timeout is the test's
1375
+ declared ``timeout`` plus :attr:`WALL_CLOCK_SLACK_SECONDS` to
1376
+ absorb provider-side overhead (auth, conversation creation,
1377
+ evaluators); for CLI providers we also bump it up like
1378
+ :meth:`run_test` does for the inner LLM call.
1379
+
1380
+ Without this, providers that stream events (e.g. the assistant
1381
+ chatbot endpoint) can keep the test alive forever — the per-event
1382
+ httpx timeout resets on every received chunk, so a chatbot stuck
1383
+ in an infinite tool-call retry loop keeps streaming until the
1384
+ runner is killed externally.
1385
+ """
1386
+ # Compute the wall-clock budget once per test (used across retries).
1387
+ cli_providers = ("claude-sdk", "claude-cli", "claude-code", "codex-cli", "codex")
1388
+ per_call_timeout = test_case.timeout
1389
+ if self.provider in cli_providers:
1390
+ per_call_timeout = max(per_call_timeout, 120.0)
1391
+ wall_clock_timeout = per_call_timeout + self.WALL_CLOCK_SLACK_SECONDS
1392
+
1393
+ async def _dispatch() -> TestResult:
1369
1394
  if test_case.is_auth_only:
1370
- result = await self.run_auth_only_test(test_case)
1371
- elif test_case.is_load_test:
1372
- result = await self.run_load_test(test_case)
1373
- elif test_case.is_multi_turn:
1374
- result = await self.run_multi_turn_test(test_case)
1375
- else:
1376
- result = await self.run_test(test_case)
1395
+ return await self.run_auth_only_test(test_case)
1396
+ if test_case.is_load_test:
1397
+ return await self.run_load_test(test_case)
1398
+ if test_case.is_multi_turn:
1399
+ return await self.run_multi_turn_test(test_case)
1400
+ return await self.run_test(test_case)
1401
+
1402
+ for attempt in range(max_test_retries + 1):
1403
+ try:
1404
+ result = await asyncio.wait_for(_dispatch(), timeout=wall_clock_timeout)
1405
+ except asyncio.TimeoutError:
1406
+ if self.verbose:
1407
+ self._log(
1408
+ f" Test wall-clock timeout after {wall_clock_timeout:.1f}s — "
1409
+ "the provider was streaming or retrying without making progress"
1410
+ )
1411
+ result = TestResult(
1412
+ test_name=test_case.name,
1413
+ passed=False,
1414
+ score=0.0,
1415
+ duration=wall_clock_timeout,
1416
+ response=(
1417
+ f"Error: test wall-clock timeout after {wall_clock_timeout:.1f}s. "
1418
+ "The provider did not return a final result in time. This is a "
1419
+ "hard cap independent of the per-call timeout, used to break out "
1420
+ "of provider-side retry loops or stuck SSE streams."
1421
+ ),
1422
+ reason="wall-clock timeout",
1423
+ error="wall-clock timeout",
1424
+ )
1377
1425
 
1378
1426
  # Check if this was a rate limit failure
1379
1427
  is_rate_limit_failure = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: testmcpy
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
5
5
  Author: Amin Ghadersohi
6
6
  License-Expression: Apache-2.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes