testmcpy 0.2.17__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {testmcpy-0.2.17/testmcpy.egg-info → testmcpy-0.3.0}/PKG-INFO +9 -4
  2. {testmcpy-0.2.17 → testmcpy-0.3.0}/README.md +1 -3
  3. {testmcpy-0.2.17 → testmcpy-0.3.0}/pyproject.toml +14 -2
  4. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/__init__.py +1 -1
  5. testmcpy-0.3.0/testmcpy/agent/__init__.py +27 -0
  6. testmcpy-0.3.0/testmcpy/agent/hooks.py +184 -0
  7. testmcpy-0.3.0/testmcpy/agent/models.py +176 -0
  8. testmcpy-0.3.0/testmcpy/agent/orchestrator.py +195 -0
  9. testmcpy-0.3.0/testmcpy/agent/prompts.py +80 -0
  10. testmcpy-0.3.0/testmcpy/agent/tools.py +598 -0
  11. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/__init__.py +8 -0
  12. testmcpy-0.3.0/testmcpy/cli/commands/agent.py +223 -0
  13. testmcpy-0.3.0/testmcpy/cli/commands/baseline.py +456 -0
  14. testmcpy-0.3.0/testmcpy/cli/commands/export_db.py +349 -0
  15. testmcpy-0.3.0/testmcpy/cli/commands/metamorphic.py +201 -0
  16. testmcpy-0.3.0/testmcpy/cli/commands/multi_env.py +169 -0
  17. testmcpy-0.3.0/testmcpy/cli/commands/mutate.py +290 -0
  18. testmcpy-0.3.0/testmcpy/cli/commands/push.py +279 -0
  19. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/run.py +359 -14
  20. testmcpy-0.3.0/testmcpy/cli/commands/wizard.py +523 -0
  21. testmcpy-0.3.0/testmcpy/db.py +82 -0
  22. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/evals/__init__.py +15 -0
  23. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/evals/auth_evaluators.py +199 -5
  24. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/evals/base_evaluators.py +918 -2
  25. testmcpy-0.3.0/testmcpy/evals/evaluator_packs.py +238 -0
  26. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/mcp_profiles.py +103 -2
  27. testmcpy-0.3.0/testmcpy/migrate_json.py +191 -0
  28. testmcpy-0.3.0/testmcpy/models.py +344 -0
  29. testmcpy-0.3.0/testmcpy/server/api.py +1537 -0
  30. testmcpy-0.3.0/testmcpy/server/auth_middleware.py +51 -0
  31. testmcpy-0.3.0/testmcpy/server/routers/agent.py +188 -0
  32. testmcpy-0.3.0/testmcpy/server/routers/compare.py +138 -0
  33. testmcpy-0.3.0/testmcpy/server/routers/compatibility.py +175 -0
  34. testmcpy-0.3.0/testmcpy/server/routers/generation_logs.py +118 -0
  35. testmcpy-0.3.0/testmcpy/server/routers/health.py +123 -0
  36. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/mcp_profiles.py +5 -5
  37. testmcpy-0.3.0/testmcpy/server/routers/metrics.py +214 -0
  38. testmcpy-0.3.0/testmcpy/server/routers/results.py +417 -0
  39. testmcpy-0.3.0/testmcpy/server/routers/search.py +162 -0
  40. testmcpy-0.3.0/testmcpy/server/routers/security.py +192 -0
  41. testmcpy-0.3.0/testmcpy/server/routers/smoke_reports.py +71 -0
  42. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/tests.py +128 -10
  43. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/tools.py +450 -1
  44. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/state.py +48 -19
  45. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/websocket.py +26 -3
  46. testmcpy-0.3.0/testmcpy/src/baseline.py +364 -0
  47. testmcpy-0.3.0/testmcpy/src/ci_gate.py +69 -0
  48. testmcpy-0.3.0/testmcpy/src/comparison_runner.py +239 -0
  49. testmcpy-0.3.0/testmcpy/src/coverage_analyzer.py +343 -0
  50. testmcpy-0.3.0/testmcpy/src/html_report.py +443 -0
  51. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/llm_integration.py +1022 -836
  52. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/mcp_client.py +628 -26
  53. testmcpy-0.3.0/testmcpy/src/metamorphic.py +375 -0
  54. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/model_registry.py +46 -44
  55. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/models.py +4 -0
  56. testmcpy-0.3.0/testmcpy/src/multi_env.py +281 -0
  57. testmcpy-0.3.0/testmcpy/src/oauth_flows.py +331 -0
  58. testmcpy-0.3.0/testmcpy/src/prompt_mutation.py +276 -0
  59. testmcpy-0.3.0/testmcpy/src/report_generator.py +430 -0
  60. testmcpy-0.3.0/testmcpy/src/schema_diff.py +253 -0
  61. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/test_runner.py +228 -11
  62. testmcpy-0.3.0/testmcpy/src/token_manager.py +228 -0
  63. testmcpy-0.3.0/testmcpy/storage.py +1162 -0
  64. testmcpy-0.3.0/testmcpy/ui/dist/assets/index-C8j69QMM.js +287 -0
  65. testmcpy-0.3.0/testmcpy/ui/dist/assets/index-DFiQIkV-.css +1 -0
  66. testmcpy-0.3.0/testmcpy/ui/dist/index.html +22 -0
  67. testmcpy-0.3.0/testmcpy/ui/index.html +21 -0
  68. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/package-lock.json +518 -941
  69. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/package.json +2 -2
  70. testmcpy-0.3.0/testmcpy/ui/src/App.jsx +647 -0
  71. testmcpy-0.3.0/testmcpy/ui/src/components/CommandPalette.jsx +237 -0
  72. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/CompareToolsTab.jsx +3 -3
  73. testmcpy-0.3.0/testmcpy/ui/src/components/NotificationProvider.jsx +111 -0
  74. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/OptimizeDocsModal.jsx +270 -13
  75. testmcpy-0.3.0/testmcpy/ui/src/components/OutputDiff.jsx +131 -0
  76. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/SchemaCodeViewer.jsx +4 -2
  77. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/TestGenerationModal.jsx +7 -7
  78. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/TestResultPanel.jsx +3 -3
  79. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/ToolComparison.jsx +6 -6
  80. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/ToolDebugModal.jsx +9 -7
  81. testmcpy-0.3.0/testmcpy/ui/src/components/TraceView.jsx +180 -0
  82. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/TypeBadge.jsx +1 -1
  83. testmcpy-0.3.0/testmcpy/ui/src/components/Wizard.jsx +227 -0
  84. testmcpy-0.3.0/testmcpy/ui/src/contexts/ThemeContext.jsx +70 -0
  85. testmcpy-0.3.0/testmcpy/ui/src/hooks/useEditorTheme.js +13 -0
  86. testmcpy-0.3.0/testmcpy/ui/src/index.css +410 -0
  87. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/AuthDebugger.jsx +298 -10
  88. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/ChatInterface.jsx +421 -96
  89. testmcpy-0.3.0/testmcpy/ui/src/pages/CompatibilityMatrix.jsx +287 -0
  90. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/Configuration.jsx +8 -8
  91. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/GenerationHistory.jsx +21 -13
  92. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/LLMProfiles.jsx +448 -10
  93. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/MCPExplorer.jsx +307 -38
  94. testmcpy-0.3.0/testmcpy/ui/src/pages/MCPHealth.jsx +227 -0
  95. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/MCPProfiles.jsx +638 -37
  96. testmcpy-0.3.0/testmcpy/ui/src/pages/MetricsDashboard.jsx +346 -0
  97. testmcpy-0.3.0/testmcpy/ui/src/pages/Reports.jsx +1242 -0
  98. testmcpy-0.3.0/testmcpy/ui/src/pages/RunComparison.jsx +287 -0
  99. testmcpy-0.3.0/testmcpy/ui/src/pages/SecurityDashboard.jsx +279 -0
  100. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/TestManager.jsx +361 -192
  101. testmcpy-0.3.0/testmcpy/ui/tailwind.config.js +111 -0
  102. {testmcpy-0.2.17 → testmcpy-0.3.0/testmcpy.egg-info}/PKG-INFO +9 -4
  103. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy.egg-info/SOURCES.txt +52 -2
  104. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy.egg-info/requires.txt +9 -0
  105. testmcpy-0.2.17/testmcpy/server/api.py +0 -859
  106. testmcpy-0.2.17/testmcpy/server/routers/generation_logs.py +0 -185
  107. testmcpy-0.2.17/testmcpy/server/routers/results.py +0 -277
  108. testmcpy-0.2.17/testmcpy/server/routers/smoke_reports.py +0 -130
  109. testmcpy-0.2.17/testmcpy/storage.py +0 -1050
  110. testmcpy-0.2.17/testmcpy/ui/dist/assets/index-CaEBvXci.css +0 -1
  111. testmcpy-0.2.17/testmcpy/ui/dist/assets/index-mv4agCEg.js +0 -649
  112. testmcpy-0.2.17/testmcpy/ui/dist/index.html +0 -14
  113. testmcpy-0.2.17/testmcpy/ui/index.html +0 -13
  114. testmcpy-0.2.17/testmcpy/ui/src/App.jsx +0 -477
  115. testmcpy-0.2.17/testmcpy/ui/src/index.css +0 -265
  116. testmcpy-0.2.17/testmcpy/ui/src/pages/Reports.jsx +0 -572
  117. testmcpy-0.2.17/testmcpy/ui/tailwind.config.js +0 -92
  118. {testmcpy-0.2.17 → testmcpy-0.3.0}/LICENSE +0 -0
  119. {testmcpy-0.2.17 → testmcpy-0.3.0}/MANIFEST.in +0 -0
  120. {testmcpy-0.2.17 → testmcpy-0.3.0}/NOTICE +0 -0
  121. {testmcpy-0.2.17 → testmcpy-0.3.0}/setup.cfg +0 -0
  122. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/auth_debugger.py +0 -0
  123. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/auth_flow_recorder.py +0 -0
  124. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/app.py +0 -0
  125. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/__init__.py +0 -0
  126. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/mcp.py +0 -0
  127. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/server.py +0 -0
  128. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/tools.py +0 -0
  129. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/cli/commands/tui.py +0 -0
  130. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/config.py +0 -0
  131. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/__init__.py +0 -0
  132. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/chat_session.py +0 -0
  133. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/docs_optimizer.py +0 -0
  134. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/mcp_manager.py +0 -0
  135. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/tool_comparison.py +0 -0
  136. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/core/tool_discovery.py +0 -0
  137. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/error_handlers.py +0 -0
  138. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/__init__.py +0 -0
  139. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/base.py +0 -0
  140. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/curl.py +0 -0
  141. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/graphql.py +0 -0
  142. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/javascript_client.py +0 -0
  143. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/json_yaml.py +0 -0
  144. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/protobuf.py +0 -0
  145. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/python.py +0 -0
  146. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/python_client.py +0 -0
  147. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/thrift.py +0 -0
  148. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/typescript.py +0 -0
  149. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/formatters/typescript_client.py +0 -0
  150. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/llm_profiles.py +0 -0
  151. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/research/claude_sdk_detailed_exploration.py +0 -0
  152. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/research/claude_sdk_poc.py +0 -0
  153. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/research/claude_sdk_working_poc.py +0 -0
  154. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/research/test_ollama_tools.py +0 -0
  155. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/__init__.py +0 -0
  156. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/api.py.bak +0 -0
  157. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/helpers/__init__.py +0 -0
  158. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/helpers/mcp_config.py +0 -0
  159. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/models.py +0 -0
  160. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/__init__.py +0 -0
  161. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/auth.py +0 -0
  162. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/llm.py +0 -0
  163. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/server/routers/test_profiles.py +0 -0
  164. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/smoke_test.py +0 -0
  165. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/__init__.py +0 -0
  166. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/src/runner_tools.py +0 -0
  167. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/test_profiles.py +0 -0
  168. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/README.md +0 -0
  169. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/postcss.config.js +0 -0
  170. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/ErrorAlert.jsx +0 -0
  171. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/ErrorBoundary.jsx +0 -0
  172. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/LLMProfileSelector.jsx +0 -0
  173. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/LoadingSpinner.jsx +0 -0
  174. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/MCPProfileSelector.jsx +0 -0
  175. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/ParameterCard.jsx +0 -0
  176. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/SkeletonLoader.jsx +0 -0
  177. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/TestProfileSelector.jsx +0 -0
  178. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/components/TestStatusIndicator.jsx +0 -0
  179. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/contexts/TestRunContext.jsx +0 -0
  180. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/hooks/useKeyboardShortcuts.js +0 -0
  181. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/hooks/useSafeFetch.js +0 -0
  182. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/main.jsx +0 -0
  183. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/pages/ProfilesManager.jsx +0 -0
  184. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/utils/__tests__/formatConverters.test.js +0 -0
  185. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/src/utils/formatConverters.js +0 -0
  186. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy/ui/vite.config.js +0 -0
  187. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy.egg-info/dependency_links.txt +0 -0
  188. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy.egg-info/entry_points.txt +0 -0
  189. {testmcpy-0.2.17 → testmcpy-0.3.0}/testmcpy.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: testmcpy
3
- Version: 0.2.17
3
+ Version: 0.3.0
4
4
  Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
5
5
  Author: Amin Ghadersohi
6
6
  License-Expression: Apache-2.0
@@ -30,6 +30,8 @@ Requires-Dist: python-dotenv<2.0.0,>=1.0.0
30
30
  Requires-Dist: click<9.0.0,>=8.0.0
31
31
  Requires-Dist: shellingham<2.0.0,>=1.3.0
32
32
  Requires-Dist: textual<1.0.0,>=0.47.0
33
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.0
34
+ Requires-Dist: alembic<2.0.0,>=1.13.0
33
35
  Provides-Extra: dev
34
36
  Requires-Dist: ruff>=0.8.0; extra == "dev"
35
37
  Requires-Dist: mypy>=1.13.0; extra == "dev"
@@ -50,6 +52,11 @@ Provides-Extra: sdk
50
52
  Requires-Dist: claude-agent-sdk>=0.1.0; extra == "sdk"
51
53
  Provides-Extra: tui
52
54
  Requires-Dist: textual>=0.85.0; extra == "tui"
55
+ Provides-Extra: e2e
56
+ Requires-Dist: playwright>=1.40.0; extra == "e2e"
57
+ Requires-Dist: pytest-playwright>=0.4.0; extra == "e2e"
58
+ Provides-Extra: export
59
+ Requires-Dist: pandas<3.0.0,>=2.0.0; extra == "export"
53
60
  Provides-Extra: all
54
61
  Requires-Dist: fastapi<1.0.0,>=0.104.0; extra == "all"
55
62
  Requires-Dist: uvicorn[standard]<1.0.0,>=0.24.0; extra == "all"
@@ -626,6 +633,4 @@ By contributing, you agree that your contributions will be licensed under Apache
626
633
 
627
634
  ## Acknowledgments
628
635
 
629
- Built to enable better LLM testing and integration with Model Context Protocol services.
630
-
631
- Special thanks to the MCP community and all our contributors!
636
+ **Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).
@@ -566,6 +566,4 @@ By contributing, you agree that your contributions will be licensed under Apache
566
566
 
567
567
  ## Acknowledgments
568
568
 
569
- Built to enable better LLM testing and integration with Model Context Protocol services.
570
-
571
- Special thanks to the MCP community and all our contributors!
569
+ **Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).
@@ -43,13 +43,16 @@ check_untyped_defs = true
43
43
  ignore_missing_imports = true
44
44
 
45
45
  [tool.pytest.ini_options]
46
- testpaths = ["tests"]
46
+ testpaths = ["unit_tests", "integration_tests"]
47
47
  python_files = ["test_*.py"]
48
48
  addopts = [
49
49
  "-v",
50
50
  "--strict-markers",
51
51
  "--tb=short",
52
52
  ]
53
+ markers = [
54
+ "e2e: End-to-end UI tests (requires playwright)",
55
+ ]
53
56
 
54
57
  [tool.coverage.run]
55
58
  source = ["testmcpy"]
@@ -90,7 +93,7 @@ testmcpy = [
90
93
 
91
94
  [project]
92
95
  name = "testmcpy"
93
- version = "0.2.17"
96
+ version = "0.3.0"
94
97
  description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
95
98
  authors = [{name = "Amin Ghadersohi"}]
96
99
  license = "Apache-2.0"
@@ -118,6 +121,8 @@ dependencies = [
118
121
  "click>=8.0.0,<9.0.0",
119
122
  "shellingham>=1.3.0,<2.0.0",
120
123
  "textual>=0.47.0,<1.0.0",
124
+ "sqlalchemy>=2.0.0,<3.0.0",
125
+ "alembic>=1.13.0,<2.0.0",
121
126
  ]
122
127
 
123
128
  [project.optional-dependencies]
@@ -145,6 +150,13 @@ sdk = [
145
150
  tui = [
146
151
  "textual>=0.85.0",
147
152
  ]
153
+ e2e = [
154
+ "playwright>=1.40.0",
155
+ "pytest-playwright>=0.4.0",
156
+ ]
157
+ export = [
158
+ "pandas>=2.0.0,<3.0.0",
159
+ ]
148
160
  all = [
149
161
  "fastapi>=0.104.0,<1.0.0",
150
162
  "uvicorn[standard]>=0.24.0,<1.0.0",
@@ -11,6 +11,6 @@ try:
11
11
  __version__ = version("testmcpy")
12
12
  except Exception:
13
13
  # Fallback for development or when package not installed
14
- __version__ = "0.2.12"
14
+ __version__ = "0.3.0"
15
15
 
16
16
  __author__ = "testmcpy Contributors"
@@ -0,0 +1,27 @@
1
+ """
2
+ Test Execution Agent using Claude Agent SDK.
3
+
4
+ Provides an intelligent orchestrator that wraps testmcpy infrastructure
5
+ with reasoning, adaptability, and natural language interaction.
6
+
7
+ Note: Requires `claude-agent-sdk` package. Imports are lazy to avoid
8
+ crashing when the SDK is not installed.
9
+ """
10
+
11
+ from testmcpy.agent.models import AgentRunReport, AgentSession, ToolInvocation
12
+
13
+
14
def __getattr__(name):
    """Resolve ``TestExecutionAgent`` lazily on first attribute access.

    The orchestrator module is only imported when the attribute is actually
    requested, so importing this package does not fail when the optional
    ``claude-agent-sdk`` dependency is missing.

    Raises:
        AttributeError: for any attribute name other than ``TestExecutionAgent``.
    """
    if name != "TestExecutionAgent":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred import: keep the SDK-dependent module out of package import time.
    from testmcpy.agent.orchestrator import TestExecutionAgent

    return TestExecutionAgent
20
+
21
+
22
+ __all__ = [
23
+ "TestExecutionAgent",
24
+ "AgentRunReport",
25
+ "AgentSession",
26
+ "ToolInvocation",
27
+ ]
@@ -0,0 +1,184 @@
1
+ """
2
+ Agent hooks for monitoring and controlling the Test Execution Agent.
3
+
4
+ Uses the Claude Agent SDK hook system (PreToolUse, PostToolUse, Stop)
5
+ to track tool calls, detect loops, measure costs, and generate reports.
6
+ """
7
+
8
+ import time
9
+ from typing import Any
10
+
11
+ from testmcpy.agent.models import AgentSession, ToolInvocation
12
+
13
+ try:
14
+ from claude_agent_sdk import HookContext
15
+ except ImportError:
16
+ HookContext = Any # type: ignore[assignment,misc]
17
+
18
+ # Maximum identical consecutive tool calls before blocking
19
+ MAX_IDENTICAL_CALLS = 3
20
+
21
+
22
def create_hooks(session: AgentSession) -> dict[str, list[dict[str, Any]]]:
    """Create the PreToolUse, PostToolUse and Stop hooks bound to ``session``.

    Returns a hooks dict in the format expected by ClaudeAgentOptions:
        {
            "PreToolUse": [{"matcher": None, "hooks": [callback]}],
            "PostToolUse": [{"matcher": None, "hooks": [callback]}],
            "Stop": [{"matcher": None, "hooks": [callback]}],
        }

    Args:
        session: Shared AgentSession that the hooks update with tool
            invocations, test results, costs, tokens and errors.

    Returns:
        Mapping of hook event name to a single matcher entry whose callback
        closes over ``session``.
    """
    # Imported locally: this module's top-level imports do not include json.
    import json

    # Mutable state shared between hooks (not in session to keep session clean)
    _hook_state: dict[str, Any] = {
        "last_tool_name": None,
        "last_tool_args": None,
        "consecutive_identical": 0,
        "pending_start_times": {},  # tool_use_id -> start_time
    }

    def _payload_value(data: dict[str, Any], keys: tuple[str, ...], default: Any) -> Any:
        """Return the first non-None value found under ``keys``, else ``default``."""
        for key in keys:
            value = data.get(key)
            if value is not None:
                return value
        return default

    def _first_text(blocks: Any) -> str:
        """Return the ``text`` of the first content block, or "" if unavailable."""
        if isinstance(blocks, list) and blocks and isinstance(blocks[0], dict):
            text = blocks[0].get("text", "")
            if isinstance(text, str):
                return text
        return ""

    def _is_number(value: Any) -> bool:
        """True for int/float values, excluding bool."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _record_test_outcome(payload_text: str) -> None:
        """Parse an execute_test_case JSON payload and update the session.

        Malformed or unexpectedly shaped payloads are ignored on purpose:
        bookkeeping must never break the agent run.
        """
        try:
            parsed = json.loads(payload_text)
        except (json.JSONDecodeError, TypeError):
            return
        if not isinstance(parsed, dict):
            return

        if "passed" in parsed:
            session.record_test_result(parsed["passed"])

        # Track test execution cost separately from the orchestrator cost
        cost = parsed.get("cost")
        if _is_number(cost):
            session.test_execution_cost_usd += cost

        tokens = parsed.get("token_usage")
        if isinstance(tokens, dict):
            total = tokens.get("total", 0)
            if _is_number(total):
                session.test_execution_tokens += total

    async def pre_tool_use(
        input_data: dict[str, Any],
        tool_use_id: str | None,
        context: HookContext,
    ) -> dict[str, Any]:
        """Hook called before each tool use.

        - Detects loops (MAX_IDENTICAL_CALLS identical consecutive calls)
        - Records start time for duration tracking
        """
        # The SDK hook payload uses "tool_name"/"tool_input"; the older
        # "name"/"input" keys are still accepted as a fallback. Reading only
        # the fallback keys would make every call look identical ("" / {}).
        tool_name = _payload_value(input_data, ("tool_name", "name"), "")
        tool_args = _payload_value(input_data, ("tool_input", "input"), {})

        # Loop detection: track consecutive identical calls
        same_as_previous = (
            tool_name == _hook_state["last_tool_name"]
            and tool_args == _hook_state["last_tool_args"]
        )
        if same_as_previous:
            _hook_state["consecutive_identical"] += 1
        else:
            _hook_state["consecutive_identical"] = 1

        _hook_state["last_tool_name"] = tool_name
        _hook_state["last_tool_args"] = tool_args

        # Block if too many identical calls in a row
        if _hook_state["consecutive_identical"] >= MAX_IDENTICAL_CALLS:
            session.record_error(
                f"Loop detected: {tool_name} called {MAX_IDENTICAL_CALLS}+ times "
                f"with identical arguments"
            )
            message = (
                f"BLOCKED: You have called {tool_name} {MAX_IDENTICAL_CALLS} times "
                f"in a row with identical arguments. This looks like a loop. "
                f"Try a different approach or different arguments."
            )
            return {
                "decision": "block",
                "systemMessage": message,
                # Explicit PreToolUse permission decision, in addition to the
                # top-level "decision" field kept for compatibility.
                "hookSpecificOutput": {
                    "hookEventName": "PreToolUse",
                    "permissionDecision": "deny",
                    "permissionDecisionReason": message,
                },
            }

        # Record start time for this tool use (blocked calls are never timed)
        if tool_use_id:
            _hook_state["pending_start_times"][tool_use_id] = time.time()

        return {}

    async def post_tool_use(
        input_data: dict[str, Any],
        tool_use_id: str | None,
        context: HookContext,
    ) -> dict[str, Any]:
        """Hook called after each tool use.

        - Records tool invocation with timing
        - Tracks test results for execute_test_case
        - Accumulates costs and tokens
        """
        tool_name = _payload_value(input_data, ("tool_name", "name"), "")
        tool_input = _payload_value(input_data, ("tool_input", "input"), {})
        tool_result = _payload_value(input_data, ("tool_response", "result"), "")

        # Calculate duration; 0.0 when no matching start time was recorded
        duration_ms = 0.0
        if tool_use_id:
            start = _hook_state["pending_start_times"].pop(tool_use_id, None)
            if start is not None:
                duration_ms = (time.time() - start) * 1000

        # Determine if there was an error and extract the textual payload
        is_error = False
        result_text = ""
        if isinstance(tool_result, str):
            result_text = tool_result
            is_error = tool_result.startswith("Error:")
        elif isinstance(tool_result, dict):
            is_error = bool(tool_result.get("is_error", False))
            result_text = _first_text(tool_result.get("content", []))
        result_summary = result_text[:200]

        # Record the invocation
        session.record_tool_call(
            ToolInvocation(
                tool_name=tool_name,
                arguments=tool_input,
                result_summary=result_summary,
                is_error=is_error,
                duration_ms=duration_ms,
            )
        )

        # Track test results if this was execute_test_case. The full text is
        # parsed (not the 200-char summary) so long payloads stay valid JSON.
        if tool_name == "execute_test_case" and not is_error:
            _record_test_outcome(result_text)

        if is_error:
            session.record_error(f"Tool {tool_name} returned error: {result_summary}")

        return {}

    async def stop_hook(
        input_data: dict[str, Any],
        tool_use_id: str | None,
        context: HookContext,
    ) -> dict[str, Any]:
        """Hook called when the agent stops.

        Marks the session as completed and, when the payload carries a
        ``result`` dict, copies orchestrator cost and token usage from it.
        """
        session.complete()

        # Extract orchestrator cost from the result message if available.
        # NOTE(review): assumes total_cost_usd includes the test execution
        # cost, hence the subtraction - confirm against the SDK payload.
        result = input_data.get("result", {})
        if isinstance(result, dict):
            total_cost = result.get("total_cost_usd", 0.0)
            if total_cost:
                session.orchestrator_cost_usd = total_cost - session.test_execution_cost_usd

            usage = result.get("usage", {})
            if isinstance(usage, dict) and usage:
                session.orchestrator_tokens_input = usage.get("input_tokens", 0)
                session.orchestrator_tokens_output = usage.get("output_tokens", 0)

        return {}

    return {
        "PreToolUse": [{"matcher": None, "hooks": [pre_tool_use]}],
        "PostToolUse": [{"matcher": None, "hooks": [post_tool_use]}],
        "Stop": [{"matcher": None, "hooks": [stop_hook]}],
    }
@@ -0,0 +1,176 @@
1
+ """
2
+ Data models for the Test Execution Agent.
3
+
4
+ Defines session state, run reports, and tool invocation records
5
+ used by the agent hooks and orchestrator.
6
+ """
7
+
8
+ from dataclasses import asdict, dataclass, field
9
+ from datetime import datetime, timezone
10
+ from typing import Any
11
+
12
+
13
@dataclass
class ToolInvocation:
    """One tool call performed by the agent, with its outcome and timing."""

    # Name of the invoked tool and the arguments it received
    tool_name: str
    arguments: dict[str, Any]
    # Short text describing the tool's result
    result_summary: str
    is_error: bool = False
    duration_ms: float = 0.0
    # ISO-8601 timestamp; filled in with the current UTC time when left empty
    timestamp: str = ""

    def __post_init__(self):
        """Default ``timestamp`` to the current UTC time when none was given."""
        self.timestamp = self.timestamp or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
30
+
31
+
32
@dataclass
class AgentSession:
    """Running tally of everything that happens during one agent run.

    Hooks mutate this object as the run progresses; the final report is
    built from it.
    """

    # Test execution tracking
    tests_run: int = 0
    tests_passed: int = 0
    tests_failed: int = 0

    # Cost tracking (orchestrator vs test execution)
    orchestrator_cost_usd: float = 0.0
    test_execution_cost_usd: float = 0.0

    # Token tracking
    orchestrator_tokens_input: int = 0
    orchestrator_tokens_output: int = 0
    test_execution_tokens: int = 0

    # Tool call history and per-tool call counters
    tool_call_history: list[ToolInvocation] = field(default_factory=list)
    tool_call_counts: dict[str, int] = field(default_factory=dict)

    # Errors
    errors: list[str] = field(default_factory=list)

    # Timing (ISO-8601 strings; started_at defaults to creation time in UTC)
    started_at: str = ""
    completed_at: str = ""

    def __post_init__(self):
        """Stamp ``started_at`` with the current UTC time unless one was supplied."""
        self.started_at = self.started_at or datetime.now(timezone.utc).isoformat()

    def record_tool_call(self, invocation: ToolInvocation) -> None:
        """Append ``invocation`` to the history and bump its per-tool counter."""
        self.tool_call_history.append(invocation)
        name = invocation.tool_name
        seen_so_far = self.tool_call_counts.get(name, 0)
        self.tool_call_counts[name] = seen_so_far + 1

    def record_test_result(self, passed: bool) -> None:
        """Count one executed test as passed or failed."""
        self.tests_run += 1
        if not passed:
            self.tests_failed += 1
            return
        self.tests_passed += 1

    def record_error(self, error: str) -> None:
        """Append an error message to the session's error list."""
        self.errors.append(error)

    def complete(self) -> None:
        """Stamp ``completed_at`` with the current UTC time."""
        self.completed_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict[str, Any]:
        """Return the session as a plain dictionary, history entries included."""
        payload = asdict(self)
        payload["tool_call_history"] = [entry.to_dict() for entry in self.tool_call_history]
        return payload
95
+
96
+
97
@dataclass
class AgentRunReport:
    """Final report from an agent run.

    Separates orchestrator costs from test execution costs.
    """

    # Run metadata
    run_id: str = ""
    started_at: str = ""
    completed_at: str = ""
    duration_ms: float = 0.0

    # Test results summary
    tests_run: int = 0
    tests_passed: int = 0
    tests_failed: int = 0
    pass_rate: float = 0.0

    # Cost breakdown
    orchestrator_cost_usd: float = 0.0
    test_execution_cost_usd: float = 0.0
    total_cost_usd: float = 0.0

    # Token breakdown
    orchestrator_tokens_input: int = 0
    orchestrator_tokens_output: int = 0
    test_execution_tokens: int = 0

    # Agent activity
    total_tool_calls: int = 0
    tool_call_counts: dict[str, int] = field(default_factory=dict)
    tool_call_history: list[ToolInvocation] = field(default_factory=list)

    # Errors
    errors: list[str] = field(default_factory=list)

    # Agent's final analysis (text from the agent)
    analysis: str = ""

    # Number of agent turns
    num_turns: int = 0

    @classmethod
    def from_session(cls, session: AgentSession, run_id: str = "") -> "AgentRunReport":
        """Build a report from an agent session.

        Args:
            session: Session to summarize. If it has no completion time yet
                it is marked as completed now; an existing ``completed_at``
                is preserved.
            run_id: Identifier stored on the report.

        Returns:
            A report with copies of the session's counters, histories and
            errors. ``analysis`` and ``num_turns`` keep their defaults.
        """
        # Only stamp a completion time if nobody did so already. Re-stamping
        # would discard the recorded end of the run and inflate duration_ms.
        if not session.completed_at:
            session.complete()

        started = datetime.fromisoformat(session.started_at)
        completed = datetime.fromisoformat(session.completed_at)
        duration_ms = (completed - started).total_seconds() * 1000

        total_cost = session.orchestrator_cost_usd + session.test_execution_cost_usd
        # Avoid division by zero when no tests were executed
        pass_rate = session.tests_passed / session.tests_run if session.tests_run > 0 else 0.0

        return cls(
            run_id=run_id,
            started_at=session.started_at,
            completed_at=session.completed_at,
            duration_ms=duration_ms,
            tests_run=session.tests_run,
            tests_passed=session.tests_passed,
            tests_failed=session.tests_failed,
            pass_rate=pass_rate,
            orchestrator_cost_usd=session.orchestrator_cost_usd,
            test_execution_cost_usd=session.test_execution_cost_usd,
            total_cost_usd=total_cost,
            orchestrator_tokens_input=session.orchestrator_tokens_input,
            orchestrator_tokens_output=session.orchestrator_tokens_output,
            test_execution_tokens=session.test_execution_tokens,
            total_tool_calls=len(session.tool_call_history),
            # Shallow copies so later session mutations do not alter the report
            tool_call_counts=dict(session.tool_call_counts),
            tool_call_history=list(session.tool_call_history),
            errors=list(session.errors),
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dictionary, history entries included."""
        d = asdict(self)
        d["tool_call_history"] = [t.to_dict() for t in self.tool_call_history]
        return d
@@ -0,0 +1,195 @@
1
+ """
2
+ Test Execution Agent orchestrator.
3
+
4
+ Main entry point for creating and running the agent. Wires together
5
+ tools, hooks, prompts, and the Claude Agent SDK.
6
+ """
7
+
8
+ import uuid
9
+ from collections.abc import AsyncIterator
10
+ from datetime import datetime, timezone
11
+ from typing import Any
12
+
13
+ from testmcpy.agent.hooks import create_hooks
14
+ from testmcpy.agent.models import AgentRunReport, AgentSession
15
+ from testmcpy.agent.prompts import build_context_prompt
16
+ from testmcpy.agent.tools import ALL_TOOLS, set_tool_context
17
+
18
+ try:
19
+ from claude_agent_sdk import (
20
+ AssistantMessage,
21
+ ClaudeAgentOptions,
22
+ ClaudeSDKClient,
23
+ ResultMessage,
24
+ TextBlock,
25
+ create_sdk_mcp_server,
26
+ query,
27
+ )
28
+
29
+ _HAS_SDK = True
30
+ except ImportError:
31
+ _HAS_SDK = False
32
+
33
+
34
+ class TestExecutionAgent:
35
+ """Intelligent test execution agent powered by Claude Agent SDK.
36
+
37
+ Orchestrates testmcpy infrastructure through custom @tool functions,
38
+ providing reasoning, adaptability, and natural language interaction.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ mcp_profile: str | None = None,
44
+ mcp_url: str | None = None,
45
+ auth_config: dict[str, Any] | None = None,
46
+ models: list[str] | None = None,
47
+ storage_path: str | None = None,
48
+ max_turns: int = 50,
49
+ agent_model: str | None = None,
50
+ ):
51
+ """Initialize the agent.
52
+
53
+ Args:
54
+ mcp_profile: MCP service profile name
55
+ mcp_url: Direct MCP service URL (overrides profile)
56
+ auth_config: Authentication config dict
57
+ models: List of model names available for testing
58
+ storage_path: Path to SQLite storage database
59
+ max_turns: Maximum agent turns (default 50)
60
+ agent_model: Model for the agent itself (default: SDK default)
61
+ """
62
+ if not _HAS_SDK:
63
+ raise ImportError(
64
+ "claude_agent_sdk is required for the Test Execution Agent. "
65
+ "Install with: pip install testmcpy[sdk]"
66
+ )
67
+
68
+ self.mcp_profile = mcp_profile
69
+ self.mcp_url = mcp_url
70
+ self.auth_config = auth_config
71
+ self.models = models or []
72
+ self.storage_path = storage_path
73
+ self.max_turns = max_turns
74
+ self.agent_model = agent_model
75
+
76
+ # Configure shared tool context
77
+ set_tool_context(
78
+ mcp_profile=mcp_profile,
79
+ mcp_url=mcp_url,
80
+ auth_config=auth_config,
81
+ storage_path=storage_path,
82
+ )
83
+
84
+ def _build_options(self, session: AgentSession) -> ClaudeAgentOptions:
85
+ """Build ClaudeAgentOptions with tools, hooks, and configuration."""
86
+ # Create in-process MCP server with our custom tools
87
+ mcp_server = create_sdk_mcp_server(
88
+ name="testmcpy-agent-tools",
89
+ version="1.0.0",
90
+ tools=ALL_TOOLS,
91
+ )
92
+
93
+ # Build system prompt with context
94
+ system_prompt = build_context_prompt(
95
+ mcp_profile=self.mcp_profile,
96
+ models=self.models,
97
+ )
98
+
99
+ # Create hooks wired to the session
100
+ hooks = create_hooks(session)
101
+
102
+ options = ClaudeAgentOptions(
103
+ system_prompt=system_prompt,
104
+ permission_mode="bypassPermissions",
105
+ max_turns=self.max_turns,
106
+ mcp_servers={"testmcpy-agent-tools": mcp_server},
107
+ hooks=hooks,
108
+ )
109
+
110
+ if self.agent_model:
111
+ options.model = self.agent_model
112
+
113
+ return options
114
+
115
+ async def run(self, prompt: str) -> AgentRunReport:
116
+ """Execute a one-shot agent run.
117
+
118
+ The agent processes the prompt, uses tools as needed, and returns
119
+ a structured report of what it did.
120
+
121
+ Args:
122
+ prompt: Natural language instruction (e.g., "Run all tests in tests/example.yaml")
123
+
124
+ Returns:
125
+ AgentRunReport with test results, costs, and analysis
126
+ """
127
+ run_id = (
128
+ f"agent_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
129
+ )
130
+ session = AgentSession()
131
+ options = self._build_options(session)
132
+
133
+ # Collect the agent's text output for analysis
134
+ analysis_parts = []
135
+ num_turns = 0
136
+
137
+ async for message in query(prompt=prompt, options=options):
138
+ if isinstance(message, AssistantMessage):
139
+ for block in message.content:
140
+ if isinstance(block, TextBlock):
141
+ analysis_parts.append(block.text)
142
+
143
+ if isinstance(message, ResultMessage):
144
+ num_turns = message.num_turns
145
+ # Extract cost info from result
146
+ if message.total_cost_usd is not None:
147
+ session.orchestrator_cost_usd = max(
148
+ 0.0,
149
+ message.total_cost_usd - session.test_execution_cost_usd,
150
+ )
151
+ if message.usage:
152
+ session.orchestrator_tokens_input = message.usage.get("input_tokens", 0)
153
+ session.orchestrator_tokens_output = message.usage.get("output_tokens", 0)
154
+
155
+ # Build report
156
+ report = AgentRunReport.from_session(session, run_id=run_id)
157
+ report.analysis = "\n".join(analysis_parts)
158
+ report.num_turns = num_turns
159
+
160
+ return report
161
+
162
+ async def chat(self, prompt: str) -> AsyncIterator[dict[str, Any]]:
163
+ """Start an interactive chat session.
164
+
165
+ Yields message dicts as they arrive from the agent.
166
+ Suitable for streaming to a web UI or CLI.
167
+
168
+ Args:
169
+ prompt: Initial prompt to start the conversation
170
+
171
+ Yields:
172
+ Dicts with keys: type (text|tool_use|tool_result|result), content
173
+ """
174
+ session = AgentSession()
175
+ options = self._build_options(session)
176
+
177
+ async with ClaudeSDKClient(options=options) as client:
178
+ await client.query(prompt)
179
+
180
+ async for message in client.receive_response():
181
+ if isinstance(message, AssistantMessage):
182
+ for block in message.content:
183
+ if isinstance(block, TextBlock):
184
+ yield {"type": "text", "content": block.text}
185
+
186
+ elif isinstance(message, ResultMessage):
187
+ report = AgentRunReport.from_session(session)
188
+ report.num_turns = message.num_turns
189
+ if message.total_cost_usd is not None:
190
+ report.orchestrator_cost_usd = max(
191
+ 0.0,
192
+ message.total_cost_usd - session.test_execution_cost_usd,
193
+ )
194
+ report.total_cost_usd = message.total_cost_usd
195
+ yield {"type": "result", "content": report.to_dict()}