testmcpy-0.8.0.tar.gz → testmcpy-0.9.2.tar.gz

This diff shows the differences between two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (206)
  1. {testmcpy-0.8.0/testmcpy.egg-info → testmcpy-0.9.2}/PKG-INFO +8 -1
  2. {testmcpy-0.8.0 → testmcpy-0.9.2}/README.md +7 -0
  3. {testmcpy-0.8.0 → testmcpy-0.9.2}/pyproject.toml +1 -1
  4. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/models.py +6 -0
  5. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/api.py +151 -7
  6. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/results.py +2 -15
  7. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/runs.py +36 -2
  8. testmcpy-0.9.2/testmcpy/server/run_persistence.py +208 -0
  9. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/run_registry.py +65 -6
  10. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/websocket.py +133 -50
  11. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/llm_integration.py +5 -5
  12. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/storage.py +106 -23
  13. testmcpy-0.9.2/testmcpy/ui/dist/assets/index-BXP9_Odn.js +324 -0
  14. testmcpy-0.9.2/testmcpy/ui/dist/assets/index-D35cfDhp.css +1 -0
  15. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/dist/index.html +2 -2
  16. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/BackgroundRunsIndicator.jsx +13 -2
  17. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/contexts/TestRunContext.jsx +195 -96
  18. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/TestManager.jsx +22 -0
  19. {testmcpy-0.8.0 → testmcpy-0.9.2/testmcpy.egg-info}/PKG-INFO +8 -1
  20. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/SOURCES.txt +3 -2
  21. testmcpy-0.8.0/testmcpy/ui/dist/assets/index-9d2zHuWX.js +0 -324
  22. testmcpy-0.8.0/testmcpy/ui/dist/assets/index-CgmKHZdS.css +0 -1
  23. {testmcpy-0.8.0 → testmcpy-0.9.2}/LICENSE +0 -0
  24. {testmcpy-0.8.0 → testmcpy-0.9.2}/MANIFEST.in +0 -0
  25. {testmcpy-0.8.0 → testmcpy-0.9.2}/NOTICE +0 -0
  26. {testmcpy-0.8.0 → testmcpy-0.9.2}/setup.cfg +0 -0
  27. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/__init__.py +0 -0
  28. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/__main__.py +0 -0
  29. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/__init__.py +0 -0
  30. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/hooks.py +0 -0
  31. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/models.py +0 -0
  32. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/orchestrator.py +0 -0
  33. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/prompts.py +0 -0
  34. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/tools.py +0 -0
  35. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/analytics.py +0 -0
  36. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/auth_debugger.py +0 -0
  37. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/auth_flow_recorder.py +0 -0
  38. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/__init__.py +0 -0
  39. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/app.py +0 -0
  40. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/__init__.py +0 -0
  41. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/agent.py +0 -0
  42. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/analytics.py +0 -0
  43. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/badge.py +0 -0
  44. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/baseline.py +0 -0
  45. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/bench.py +0 -0
  46. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/conformance.py +0 -0
  47. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/export_db.py +0 -0
  48. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/mcp.py +0 -0
  49. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/metamorphic.py +0 -0
  50. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/multi_env.py +0 -0
  51. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/mutate.py +0 -0
  52. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/push.py +0 -0
  53. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/run.py +0 -0
  54. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/scan.py +0 -0
  55. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/score.py +0 -0
  56. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/server.py +0 -0
  57. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/tools.py +0 -0
  58. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/tui.py +0 -0
  59. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/wizard.py +0 -0
  60. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/config.py +0 -0
  61. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/__init__.py +0 -0
  62. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/chat_session.py +0 -0
  63. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/docs_optimizer.py +0 -0
  64. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/mcp_manager.py +0 -0
  65. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/tool_comparison.py +0 -0
  66. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/tool_discovery.py +0 -0
  67. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/db.py +0 -0
  68. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/error_handlers.py +0 -0
  69. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/__init__.py +0 -0
  70. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/auth_evaluators.py +0 -0
  71. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/base_evaluators.py +0 -0
  72. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/evaluator_packs.py +0 -0
  73. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/security_evaluators.py +0 -0
  74. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/__init__.py +0 -0
  75. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/base.py +0 -0
  76. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/curl.py +0 -0
  77. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/graphql.py +0 -0
  78. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/javascript_client.py +0 -0
  79. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/json_yaml.py +0 -0
  80. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/protobuf.py +0 -0
  81. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/python.py +0 -0
  82. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/python_client.py +0 -0
  83. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/thrift.py +0 -0
  84. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/typescript.py +0 -0
  85. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/typescript_client.py +0 -0
  86. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/llm_profiles.py +0 -0
  87. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/mcp_profiles.py +0 -0
  88. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/migrate_json.py +0 -0
  89. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_detailed_exploration.py +0 -0
  90. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_poc.py +0 -0
  91. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_working_poc.py +0 -0
  92. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/test_ollama_tools.py +0 -0
  93. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/__init__.py +0 -0
  94. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/rules.py +0 -0
  95. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/scanner.py +0 -0
  96. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/__init__.py +0 -0
  97. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/auth_middleware.py +0 -0
  98. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/helpers/__init__.py +0 -0
  99. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/helpers/mcp_config.py +0 -0
  100. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/models.py +0 -0
  101. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/__init__.py +0 -0
  102. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/agent.py +0 -0
  103. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/analytics.py +0 -0
  104. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/auth.py +0 -0
  105. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/compare.py +0 -0
  106. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/compatibility.py +0 -0
  107. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/generation_logs.py +0 -0
  108. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/health.py +0 -0
  109. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/llm.py +0 -0
  110. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/mcp_profiles.py +0 -0
  111. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/metrics.py +0 -0
  112. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/search.py +0 -0
  113. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/security.py +0 -0
  114. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/smoke_reports.py +0 -0
  115. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/test_profiles.py +0 -0
  116. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/tests.py +0 -0
  117. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/tools.py +0 -0
  118. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/state.py +0 -0
  119. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/smoke_test.py +0 -0
  120. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/__init__.py +0 -0
  121. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/baseline.py +0 -0
  122. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/ci_gate.py +0 -0
  123. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/comparison_runner.py +0 -0
  124. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/coverage_analyzer.py +0 -0
  125. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/emitters.py +0 -0
  126. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/html_report.py +0 -0
  127. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/mcp_client.py +0 -0
  128. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/metamorphic.py +0 -0
  129. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/model_registry.py +0 -0
  130. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/models.py +0 -0
  131. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/multi_env.py +0 -0
  132. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/oauth_flows.py +0 -0
  133. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/prompt_mutation.py +0 -0
  134. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/report_generator.py +0 -0
  135. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/runner_tools.py +0 -0
  136. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/schema_diff.py +0 -0
  137. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/test_runner.py +0 -0
  138. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/token_manager.py +0 -0
  139. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/usability_score.py +0 -0
  140. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/test_profiles.py +0 -0
  141. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/README.md +0 -0
  142. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/index.html +0 -0
  143. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/package-lock.json +0 -0
  144. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/package.json +0 -0
  145. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/postcss.config.js +0 -0
  146. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/App.jsx +0 -0
  147. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/Badge.jsx +0 -0
  148. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/CommandPalette.jsx +0 -0
  149. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/CompareToolsTab.jsx +0 -0
  150. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ConfirmDialog.jsx +0 -0
  151. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/EditorStatusBar.jsx +0 -0
  152. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/EditorTabStrip.jsx +0 -0
  153. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ErrorAlert.jsx +0 -0
  154. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ErrorBoundary.jsx +0 -0
  155. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/LLMProfileSelector.jsx +0 -0
  156. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/LoadingSpinner.jsx +0 -0
  157. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/MCPProfileSelector.jsx +0 -0
  158. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/NotificationProvider.jsx +0 -0
  159. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/OptimizeDocsModal.jsx +0 -0
  160. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/OutputDiff.jsx +0 -0
  161. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ParameterCard.jsx +0 -0
  162. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/SchemaCodeViewer.jsx +0 -0
  163. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/SkeletonLoader.jsx +0 -0
  164. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/StreamingLogViewer.jsx +0 -0
  165. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestGenerationModal.jsx +0 -0
  166. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestProfileSelector.jsx +0 -0
  167. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestResultPanel.jsx +0 -0
  168. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestStatusIndicator.jsx +0 -0
  169. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolCallTimeline.jsx +0 -0
  170. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolComparison.jsx +0 -0
  171. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolDebugModal.jsx +0 -0
  172. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TraceView.jsx +0 -0
  173. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TypeBadge.jsx +0 -0
  174. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/Wizard.jsx +0 -0
  175. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/__tests__/OutputDiff.test.jsx +0 -0
  176. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/contexts/ThemeContext.jsx +0 -0
  177. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useEditorTheme.js +0 -0
  178. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useKeyboardShortcuts.js +0 -0
  179. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useSafeFetch.js +0 -0
  180. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/index.css +0 -0
  181. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/main.jsx +0 -0
  182. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/AuthDebugger.jsx +0 -0
  183. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/ChatInterface.jsx +0 -0
  184. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Configuration.jsx +0 -0
  185. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/GenerationHistory.jsx +0 -0
  186. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/LLMProfiles.jsx +0 -0
  187. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/MCPExplorer.jsx +0 -0
  188. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/MCPProfiles.jsx +0 -0
  189. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Performance.jsx +0 -0
  190. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/ProfilesManager.jsx +0 -0
  191. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Reports.jsx +0 -0
  192. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/SecurityDashboard.jsx +0 -0
  193. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Servers.jsx +0 -0
  194. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/__tests__/ChatInterface.test.jsx +0 -0
  195. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/__tests__/Performance.test.jsx +0 -0
  196. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/test-setup.js +0 -0
  197. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/__tests__/formatConverters.test.js +0 -0
  198. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/formatConverters.js +0 -0
  199. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/formatters.js +0 -0
  200. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/tailwind.config.js +0 -0
  201. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/vite.config.js +0 -0
  202. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/vitest.config.js +0 -0
  203. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/dependency_links.txt +0 -0
  204. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/entry_points.txt +0 -0
  205. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/requires.txt +0 -0
  206. {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: testmcpy
3
- Version: 0.8.0
3
+ Version: 0.9.2
4
4
  Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
5
5
  Author: Amin Ghadersohi
6
6
  License-Expression: Apache-2.0
@@ -351,6 +351,13 @@ profiles:
351
351
 
352
352
  The setup command is **idempotent** — safe to run multiple times. Use `--force` to overwrite existing files.
353
353
 
354
+ **`TESTMCPY_CHAT_OAUTH_LOGIN`** (default `true`): when a chat message hits an
355
+ OAuth (`oauth_auto_discover`) MCP profile with no cached token, the server opens
356
+ the interactive browser OAuth flow and retries. This assumes a browser is
357
+ available on the machine running the server — in headless deployments set
358
+ `TESTMCPY_CHAT_OAUTH_LOGIN=false` so the request fails fast with a clear error
359
+ instead of blocking on a login that can never complete.
360
+
354
361
  ### 2. Explore Your MCP Service
355
362
 
356
363
  ```bash
@@ -271,6 +271,13 @@ profiles:
271
271
 
272
272
  The setup command is **idempotent** — safe to run multiple times. Use `--force` to overwrite existing files.
273
273
 
274
+ **`TESTMCPY_CHAT_OAUTH_LOGIN`** (default `true`): when a chat message hits an
275
+ OAuth (`oauth_auto_discover`) MCP profile with no cached token, the server opens
276
+ the interactive browser OAuth flow and retries. This assumes a browser is
277
+ available on the machine running the server — in headless deployments set
278
+ `TESTMCPY_CHAT_OAUTH_LOGIN=false` so the request fails fast with a clear error
279
+ instead of blocking on a login that can never complete.
280
+
274
281
  ### 2. Explore Your MCP Service
275
282
 
276
283
  ```bash
@@ -93,7 +93,7 @@ testmcpy = [
93
93
 
94
94
  [project]
95
95
  name = "testmcpy"
96
- version = "0.8.0"
96
+ version = "0.9.2"
97
97
  description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
98
98
  authors = [{name = "Amin Ghadersohi"}]
99
99
  license = "Apache-2.0"
@@ -139,6 +139,12 @@ class TestRunModel(Base):
139
139
  total_tokens: Mapped[int] = mapped_column(Integer, default=0)
140
140
  started_at: Mapped[str] = mapped_column(String, nullable=False)
141
141
  completed_at: Mapped[str | None] = mapped_column(String, nullable=True)
142
+ # Touched every ~30s while the run executes (UTC ISO, same format as
143
+ # the reconciliation cutoff it's compared against). Lets crash
144
+ # reconciliation distinguish a live run — possibly owned by another
145
+ # server sharing this DB — from a dead one, instead of guessing from
146
+ # started_at age.
147
+ heartbeat_at: Mapped[str | None] = mapped_column(String, nullable=True)
142
148
  metadata_: Mapped[dict | None] = mapped_column("metadata", JSON, nullable=True)
143
149
  created_at: Mapped[datetime] = mapped_column(
144
150
  DateTime, nullable=False, default=lambda: datetime.now(timezone.utc)
@@ -12,6 +12,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning, module="websocket
12
12
  warnings.filterwarnings("ignore", category=DeprecationWarning, module="websockets.legacy")
13
13
  warnings.filterwarnings("ignore", category=DeprecationWarning, module="uvicorn")
14
14
 
15
+ import contextlib # noqa: E402
15
16
  from contextlib import asynccontextmanager # noqa: E402
16
17
  from datetime import datetime # noqa: E402
17
18
  from enum import Enum # noqa: E402
@@ -150,6 +151,22 @@ def _get_init_lock(cache_key: str) -> asyncio.Lock:
150
151
  return _client_init_locks[cache_key]
151
152
 
152
153
 
154
+ def _primary_mcp_provider_kwargs(
155
+ clients_to_use: list[tuple[str, str, MCPClient]],
156
+ ) -> dict[str, Any]:
157
+ """mcp_url/auth kwargs from the FIRST selected MCP client.
158
+
159
+ SDK providers support a single MCP server; the Chat UI sends exactly one
160
+ "profileId:mcpName". Without these kwargs the providers fall back to the
161
+ DEFAULT profile's URL/auth, breaking chat for any other selected profile.
162
+ create_llm_provider filters these out for providers that don't accept them.
163
+ """
164
+ if not clients_to_use:
165
+ return {}
166
+ _profile_id, _mcp_name, client = clients_to_use[0]
167
+ return {"mcp_url": client.base_url, "auth": client.auth_config}
168
+
169
+
153
170
  async def get_mcp_clients_for_profile(profile_id: str) -> list[tuple[str, MCPClient]]:
154
171
  """
155
172
  Get or create MCP clients for all MCP servers in a profile.
@@ -298,12 +315,15 @@ async def get_mcp_client_for_server(profile_id: str, mcp_name: str) -> MCPClient
298
315
  return client
299
316
 
300
317
 
301
- async def clear_cached_client(cache_key: str) -> bool:
318
+ async def clear_cached_client(cache_key: str, record_failure: bool = True) -> bool:
302
319
  """
303
320
  Clear a cached MCP client by its cache key.
304
321
 
305
322
  Args:
306
323
  cache_key: Cache key in format "{profile_id}:{mcp_name}"
324
+ record_failure: When True (default), throttle the next reconnect via
325
+ back-off. Pass False for deliberate re-initialization (e.g. an
326
+ interactive OAuth re-login) where an immediate reconnect is wanted.
307
327
 
308
328
  Returns:
309
329
  True if a client was cleared, False if no client was cached
@@ -312,8 +332,9 @@ async def clear_cached_client(cache_key: str) -> bool:
312
332
 
313
333
  client = mcp_clients.pop(cache_key, None)
314
334
  if client:
315
- # Record a failure so the next reconnect is throttled via back-off.
316
- _record_failure(cache_key)
335
+ if record_failure:
336
+ # Record a failure so the next reconnect is throttled via back-off.
337
+ _record_failure(cache_key)
317
338
  try:
318
339
  await client.close()
319
340
  print(f"Cleared cached client '{cache_key}'")
@@ -323,6 +344,69 @@ async def clear_cached_client(cache_key: str) -> bool:
323
344
  return False
324
345
 
325
346
 
347
+ # Marker substring of the ValueError raised by BaseSDKProvider when an
348
+ # oauth_auto_discover profile has no cached token (see
349
+ # llm_integration.BaseSDKProvider._resolve_mcp_bearer_token).
350
+ _OAUTH_TOKEN_ERROR = "No usable cached OAuth token"
351
+
352
+
353
+ def _chat_oauth_login_enabled() -> bool:
354
+ """Feature flag for interactive OAuth login during chat (default ON).
355
+
356
+ Disable with TESTMCPY_CHAT_OAUTH_LOGIN=false (or 0/no). Read at call time
357
+ so tests can monkeypatch the environment.
358
+ """
359
+ return os.environ.get("TESTMCPY_CHAT_OAUTH_LOGIN", "true").strip().lower() not in (
360
+ "0",
361
+ "false",
362
+ "no",
363
+ )
364
+
365
+
366
+ async def _relogin_oauth_servers(server_keys: list[str]) -> dict[str, MCPClient]:
367
+ """Deliberate interactive re-auth for the given "profileId:mcpName" keys.
368
+
369
+ Drops cached clients WITHOUT recording back-off, clears any pre-existing
370
+ back-off state, and re-initializes. MCPClient.initialize() with
371
+ oauth_auto_discover opens the browser OAuth flow and caches the token via
372
+ fastmcp FileTokenStorage; duplicate popups are prevented by the per-key
373
+ init locks.
374
+
375
+ Returns the fresh clients keyed by cache key so callers can replace any
376
+ references to the old, now-closed client objects.
377
+ """
378
+ new_clients: dict[str, MCPClient] = {}
379
+ for cache_key in server_keys:
380
+ await clear_cached_client(cache_key, record_failure=False)
381
+ _clear_failure(cache_key) # earlier failures must not block deliberate re-auth
382
+ profile_id, mcp_name = cache_key.split(":", 1)
383
+ client = await get_mcp_client_for_server(profile_id, mcp_name)
384
+ if client:
385
+ new_clients[cache_key] = client
386
+ return new_clients
387
+
388
+
389
+ def _refresh_client_refs(
390
+ new_clients: dict[str, MCPClient],
391
+ clients_to_use: list[tuple[str, str, MCPClient]],
392
+ tool_to_client: dict[str, tuple[MCPClient, str, str]],
393
+ ) -> tuple[list[tuple[str, str, MCPClient]], dict[str, tuple[MCPClient, str, str]]]:
394
+ """Swap re-logged-in clients into the chat endpoints' lookup structures.
395
+
396
+ After _relogin_oauth_servers the old client objects are closed; tool
397
+ execution through tool_to_client must use the replacements.
398
+ """
399
+ refreshed_clients = [
400
+ (pid, name, new_clients.get(f"{pid}:{name}", client))
401
+ for pid, name, client in clients_to_use
402
+ ]
403
+ refreshed_tools = {
404
+ tool: (new_clients.get(f"{pid}:{name}", client), pid, name)
405
+ for tool, (client, pid, name) in tool_to_client.items()
406
+ }
407
+ return refreshed_clients, refreshed_tools
408
+
409
+
326
410
  def is_auth_error(error_msg: str) -> bool:
327
411
  """Check if an error message indicates an authentication failure."""
328
412
  error_lower = error_msg.lower()
@@ -395,6 +479,30 @@ async def lifespan(app: FastAPI):
395
479
  except SQLAlchemyError as e:
396
480
  print(f"Warning: could not reconcile stale runs: {e}")
397
481
 
482
+ # …and keep reconciling while we run, so a crashed sibling server (or
483
+ # a row orphaned by an event-loop death that didn't restart the
484
+ # process) flips to 'interrupted' within minutes rather than at the
485
+ # next restart. Heartbeat-only (no started_at fallback): legacy rows
486
+ # without heartbeats carry local-naive timestamps that can't be
487
+ # compared reliably against a UTC cutoff.
488
+ async def _stale_run_sweeper() -> None:
489
+ from testmcpy.storage import get_storage
490
+
491
+ while True:
492
+ await _asyncio.sleep(60)
493
+ try:
494
+ get_storage().mark_stale_runs_interrupted(no_heartbeat_older_than_hours=None)
495
+ except _asyncio.CancelledError:
496
+ raise
497
+ except Exception as sweep_err: # noqa: BLE001 — long-lived loop:
498
+ # any escaping error (not just SQLAlchemyError — e.g. an
499
+ # OSError on first-time DB-path init) would otherwise kill
500
+ # the sweeper permanently and silently, reverting crash
501
+ # reconciliation to startup-only. (PR #90 review)
502
+ print(f"Warning: stale-run sweep failed: {sweep_err}")
503
+
504
+ sweeper_task = _asyncio.create_task(_stale_run_sweeper())
505
+
398
506
  # Startup
399
507
  try:
400
508
  mcp_url = config.get_mcp_url()
@@ -410,6 +518,10 @@ async def lifespan(app: FastAPI):
410
518
  yield
411
519
 
412
520
  # Shutdown
521
+ sweeper_task.cancel()
522
+ with contextlib.suppress(_asyncio.CancelledError):
523
+ await sweeper_task
524
+
413
525
  if mcp_client:
414
526
  await mcp_client.close()
415
527
 
@@ -907,9 +1019,25 @@ async def chat(request: ChatRequest) -> ChatResponse:
907
1019
  provider_kwargs = {}
908
1020
  if api_key:
909
1021
  provider_kwargs["api_key"] = api_key
910
- llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1022
+ provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
911
1023
  print("[Chat] Initializing LLM provider...")
912
- await llm_provider.initialize()
1024
+ try:
1025
+ llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1026
+ await llm_provider.initialize()
1027
+ except ValueError as e:
1028
+ if not (_chat_oauth_login_enabled() and _OAUTH_TOKEN_ERROR in str(e)):
1029
+ raise
1030
+ print("[Chat] No cached OAuth token; triggering interactive OAuth login...")
1031
+ new_clients = await _relogin_oauth_servers(accessed_servers)
1032
+ # The old client objects are closed now — swap in the replacements
1033
+ # so tool execution doesn't hit a closed client.
1034
+ clients_to_use, tool_to_client = _refresh_client_refs(
1035
+ new_clients, clients_to_use, tool_to_client
1036
+ )
1037
+ provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
1038
+ llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1039
+ # Single retry; a second failure falls to the existing handlers.
1040
+ await llm_provider.initialize()
913
1041
  print(
914
1042
  f"[Chat] LLM provider initialized. Generating response with {len(all_tools)} tools..."
915
1043
  )
@@ -1180,8 +1308,24 @@ async def chat_stream(request: ChatRequest):
1180
1308
  provider_kwargs: dict = {}
1181
1309
  if api_key:
1182
1310
  provider_kwargs["api_key"] = api_key
1183
- llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1184
- await llm_provider.initialize()
1311
+ provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
1312
+ try:
1313
+ llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1314
+ await llm_provider.initialize()
1315
+ except ValueError as e:
1316
+ if not (_chat_oauth_login_enabled() and _OAUTH_TOKEN_ERROR in str(e)):
1317
+ raise
1318
+ yield send_event("status", "Waiting for OAuth login in browser...")
1319
+ new_clients = await _relogin_oauth_servers(accessed_servers)
1320
+ # The old client objects are closed now — swap in the replacements
1321
+ # so tool execution doesn't hit a closed client.
1322
+ clients_to_use, tool_to_client = _refresh_client_refs(
1323
+ new_clients, clients_to_use, tool_to_client
1324
+ )
1325
+ provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
1326
+ llm_provider = create_llm_provider(provider, model, **provider_kwargs)
1327
+ # Single retry; a second failure falls to the existing handlers.
1328
+ await llm_provider.initialize()
1185
1329
 
1186
1330
  # --- Detect if provider is SDK-based (handles its own agentic loop) ---
1187
1331
  from testmcpy.src.llm_integration import ClaudeSDKProvider
@@ -11,6 +11,7 @@ from typing import Any
11
11
  from fastapi import APIRouter, HTTPException
12
12
  from pydantic import BaseModel
13
13
 
14
+ from testmcpy.server.run_persistence import question_result_kwargs
14
15
  from testmcpy.storage import get_storage
15
16
 
16
17
  router = APIRouter(prefix="/api/results", tags=["results"])
@@ -98,21 +99,7 @@ def save_test_run_to_file(data: dict[str, Any]) -> dict[str, Any]:
98
99
 
99
100
  # Save individual question results
100
101
  for r in results:
101
- storage.save_question_result(
102
- run_id=run_id,
103
- question_id=r.get("test_name", r.get("question_id", "unknown")),
104
- passed=r.get("passed", False),
105
- score=r.get("score", 0.0),
106
- answer=r.get("response", r.get("answer")),
107
- tool_uses=r.get("tool_calls", r.get("tool_uses")),
108
- tool_results=r.get("tool_results"),
109
- tokens_input=(r.get("token_usage") or {}).get("input", 0),
110
- tokens_output=(r.get("token_usage") or {}).get("output", 0),
111
- duration_ms=int(r.get("duration", 0) * 1000),
112
- evaluations=r.get("evaluations"),
113
- error=r.get("error"),
114
- cost_usd=r.get("cost", r.get("cost_usd", 0.0)),
115
- )
102
+ storage.save_question_result(run_id=run_id, **question_result_kwargs(r))
116
103
 
117
104
  # Complete the run
118
105
  storage.complete_run(run_id, datetime.now().isoformat())
@@ -11,8 +11,11 @@ from __future__ import annotations
11
11
  from typing import Any
12
12
 
13
13
  from fastapi import APIRouter, HTTPException
14
+ from sqlalchemy.exc import SQLAlchemyError
14
15
 
15
16
  from testmcpy.server import run_registry
17
+ from testmcpy.server.run_persistence import wire_status_for_db_status
18
+ from testmcpy.storage import get_storage
16
19
 
17
20
  router = APIRouter(prefix="/api", tags=["runs"])
18
21
 
@@ -65,9 +68,40 @@ async def list_runs(active_only: bool = True) -> dict[str, Any]:
65
68
@router.get("/runs/{run_id}")
async def get_run(run_id: str) -> dict[str, Any]:
    # Live path: the run still has a handle in the in-memory registry.
    live = await run_registry.get_run(run_id)
    if live is not None:
        return _serialise(live)

    # Registry miss (GC'd after CLEANUP_TTL, or a server restart): consult
    # the results DB so a stale tab asking about its run receives the final
    # state rather than a 404. A DB failure is deliberately treated as a
    # miss (matching the WS twin, _attach_history_run) so the indicator's
    # poll loop never sees a 500.
    try:
        stored = get_storage().get_run(run_id)
    except SQLAlchemyError:
        stored = None
    if stored is None:
        raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")

    status = wire_status_for_db_status(stored.get("status"))
    question_rows = stored.get("question_results") or []
    meta = {
        "test_path": stored.get("test_id"),
        "model": stored.get("model"),
        "provider": stored.get("provider"),
    }
    return {
        "run_id": run_id,
        # Only single-run ids ever land a DB row today: directory-batch ids
        # persist per-file under fresh ids (see _attach_history_run), so
        # whatever resolves here is a single run by construction.
        "kind": "single",
        "status": status,
        "started_at": stored.get("started_at"),
        "finished_at": stored.get("completed_at"),
        "meta": meta,
        "summary": stored.get("summary"),
        "result_count": len(question_rows),
        "is_attached": False,
        # ``source: history`` tells the client this is a finished record,
        # not a live handle.
        "source": "history",
    }
71
105
 
72
106
 
73
107
  @router.post("/runs/{run_id}/stop")
@@ -0,0 +1,208 @@
1
+ """Incremental DB persistence for in-flight test runs.
2
+
3
+ Historically the WebSocket runner saved a run to the database only once,
4
+ at the very end (``save_test_run_to_file``) — a server crash at test 29/30
5
+ lost everything. ``RunRecord`` makes the DB the source of truth for
6
+ partial progress instead:
7
+
8
+ - ``begin()`` — creates the suite + a ``test_runs`` row (status=running)
9
+ as soon as the run starts executing.
10
+ - ``append()`` — writes one ``question_results`` row per completed test.
11
+ - ``finish()`` — stamps the terminal status (completed/error/stopped) and
12
+ the denormalized totals. Idempotent.
13
+
14
+ DB errors are swallowed (logged through the run's own log stream): a
15
+ persistence hiccup must degrade history, never kill a live run.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import uuid
21
+ from collections.abc import Callable
22
+ from datetime import datetime, timezone
23
+ from typing import Any
24
+
25
+ from sqlalchemy.exc import SQLAlchemyError
26
+
27
+ from testmcpy.storage import get_storage
28
+
29
+
30
def mint_run_id() -> str:
    """Mint a run id in the legacy ``<8-hex>_<timestamp>`` shape.

    The same shape is used by the run registry and
    ``save_test_run_to_file`` so identifiers minted on any code path can
    be correlated.
    """
    token = uuid.uuid4().hex[:8]
    # Local wall-clock time, second resolution: YYYYMMDD_HHMMSS.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{token}_{stamp}"
35
+
36
+
37
def question_result_kwargs(r: dict[str, Any]) -> dict[str, Any]:
    """Map a TestResult.to_dict() shape onto ``save_question_result`` kwargs.

    Single source of truth for the mapping: used by both the end-of-run
    ``save_test_run_to_file`` and the incremental ``RunRecord``.

    Args:
        r: One test result as a plain dict. Missing keys fall back to
            neutral defaults; a ``duration`` or token count that is present
            but ``None`` is treated the same as a missing one.

    Returns:
        Keyword arguments for ``save_question_result`` (everything except
        ``run_id``, which the caller supplies).
    """
    # LLM providers report token_usage as {prompt, completion, total}
    # (see llm_integration.py); the old mapping read input/output and
    # silently stored 0 for every UI-triggered run. Keep input/output as
    # a fallback for callers of POST /api/results/save that adopted the
    # old keys.
    usage = r.get("token_usage") or {}

    def _token_count(current_key: str, legacy_key: str) -> Any:
        # First non-None value wins, so a null under the current key does
        # not hide a real count stored under the legacy key.
        for key in (current_key, legacy_key):
            value = usage.get(key)
            if value is not None:
                return value
        return 0

    return {
        "question_id": r.get("test_name", r.get("question_id", "unknown")),
        "passed": r.get("passed", False),
        "score": r.get("score", 0.0),
        "answer": r.get("response", r.get("answer")),
        "tool_uses": r.get("tool_calls", r.get("tool_uses")),
        "tool_results": r.get("tool_results"),
        "tokens_input": _token_count("prompt", "input"),
        "tokens_output": _token_count("completion", "output"),
        # ``or 0`` guards an explicit ``duration: None``, which would
        # otherwise raise TypeError on the multiplication.
        "duration_ms": int((r.get("duration") or 0) * 1000),
        "evaluations": r.get("evaluations"),
        "error": r.get("error"),
        "cost_usd": r.get("cost", r.get("cost_usd", 0.0)),
    }
62
+
63
+
64
def ui_result_from_question_result(q: dict[str, Any]) -> dict[str, Any]:
    """Turn a stored question_results row back into the UI's TestResult shape.

    This is the inverse of ``question_result_kwargs``: it takes a row as
    returned by ``storage.get_run`` and rebuilds the wire dict that the
    UI's test_complete / all_complete handlers expect, including the live
    {prompt, completion, total} token_usage keys the client sums
    (TestRunContext reads token_usage.total).
    """
    # Null or missing counters collapse to zero so the total is always a sum
    # of numbers.
    prompt_tokens = q.get("tokens_input") or 0
    completion_tokens = q.get("tokens_output") or 0
    token_usage = {
        "prompt": prompt_tokens,
        "completion": completion_tokens,
        "total": prompt_tokens + completion_tokens,
    }
    # Stored in milliseconds; the wire shape carries seconds.
    duration_seconds = (q.get("duration_ms") or 0) / 1000

    return {
        "test_name": q.get("question_id"),
        "passed": bool(q.get("passed")),
        "score": q.get("score", 0.0),
        "response": q.get("answer"),
        "tool_calls": q.get("tool_uses") or [],
        "tool_results": q.get("tool_results") or [],
        "token_usage": token_usage,
        "duration": duration_seconds,
        "evaluations": q.get("evaluations") or [],
        "error": q.get("error"),
        "cost": q.get("cost_usd") or 0.0,
    }
89
+
90
+
91
# Terminal DB statuses are reported on the wire unchanged. Any other value
# on a run that has no registry handle (still 'running', or already
# 'interrupted') means the server died mid-run, so it is reported as
# interrupted.
_TERMINAL_WIRE_STATUS = {name: name for name in ("completed", "stopped", "error")}


def wire_status_for_db_status(db_status: str | None) -> str:
    """Translate a test_runs.status into the WebSocket/REST wire status.

    Only meaningful for a run that is NOT in the in-memory registry.
    Terminal statuses pass through; everything else (running /
    interrupted / NULL / unknown) is taken to mean the owning process
    died mid-run and is reported as ``interrupted``.
    """
    if not db_status:
        return "interrupted"
    return _TERMINAL_WIRE_STATUS.get(db_status, "interrupted")
103
+
104
+
105
def history_replay_messages(record: dict[str, Any]) -> list[dict[str, Any]]:
    """Synthesize the WebSocket message sequence for a history-only run.

    Used when attaching to a run that is no longer in the in-memory
    registry (GC'd after CLEANUP_TTL, or lost to a server restart) but
    lives in the results DB.

    Args:
        record: The stored run. Reads ``status``, ``run_id`` and
            ``question_results``; a missing or null ``question_results``
            is treated as an empty list.

    Returns:
        A ``run_started`` marker, one ``test_complete`` per stored result
        (so the UI rebuilds its per-test panels), and a terminal
        ``all_complete`` carrying the run's real status, including
        ``interrupted`` with partial results for runs that died mid-flight.
    """
    status = wire_status_for_db_status(record.get("status"))
    # ``or []`` rather than a .get() default: the default only applies when
    # the key is absent, so a key holding None would break the iteration.
    stored_rows = record.get("question_results") or []
    results = [ui_result_from_question_result(q) for q in stored_rows]
    passed = sum(1 for r in results if r["passed"])
    summary = {
        "total": len(results),
        "passed": passed,
        "failed": len(results) - passed,
        "total_cost": sum(r["cost"] for r in results),
        "status": status,
    }
    return [
        {
            "type": "run_started",
            "run_id": record.get("run_id"),
            "kind": "single",
            "reattached": True,
            "status": status,
            "source": "history",
        },
        *({"type": "test_complete", "test_name": r["test_name"], "result": r} for r in results),
        {"type": "all_complete", "status": status, "summary": summary, "results": results},
    ]
135
+
136
+
137
class RunRecord:
    """Write-through record of one run (one YAML file) in the results DB.

    Every write is best-effort. The first failure flags the record as
    broken and is reported once through ``log``; after that every call is
    a no-op, so a flaky DB neither spams the run log nor slows the run.
    """

    def __init__(self, run_id: str | None = None, log: Callable[[str], None] | None = None):
        # A falsy run_id (None or "") gets a freshly minted id.
        self.run_id = run_id if run_id else mint_run_id()
        # Default sink discards messages.
        self._log = log if log else (lambda msg: None)
        # Lifecycle flags: row created / terminal status stamped / DB gave up.
        self._began = self._finished = self._broken = False

    @property
    def _writable(self) -> bool:
        """True while the run row exists, is not finished, and the DB is healthy."""
        return self._began and not self._finished and not self._broken

    def _report_db_error(self, op: str, exc: SQLAlchemyError) -> None:
        # Flag first so the record stays disabled even if the log sink raises.
        self._broken = True
        self._log(f"⚠️ Results DB unavailable ({op}): {exc} — run continues without history")

    def begin(
        self,
        *,
        test_file: str,
        model: str,
        provider: str,
        mcp_profile: str | None = None,
        llm_profile: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Create the suite and the run row (status=running) up front.

        Does nothing if the record has already begun or is broken.
        """
        if self._began or self._broken:
            return
        try:
            store = get_storage()
            # The test file path doubles as suite id and suite name.
            store.save_suite(suite_id=test_file, name=test_file, questions=[])
            store.save_run(
                run_id=self.run_id,
                test_id=test_file,
                test_version=1,
                model=model,
                provider=provider,
                started_at=datetime.now(timezone.utc).isoformat(),
                mcp_profile_id=mcp_profile,
                llm_profile_id=llm_profile,
                metadata=metadata,
            )
        except SQLAlchemyError as exc:
            self._report_db_error("begin", exc)
        else:
            self._began = True

    def append(self, result: dict[str, Any]) -> None:
        """Persist one completed test right away (crash-safe progress)."""
        if not self._writable:
            return
        try:
            row_kwargs = question_result_kwargs(result)
            get_storage().save_question_result(run_id=self.run_id, **row_kwargs)
        except SQLAlchemyError as exc:
            self._report_db_error("append", exc)

    def finish(self, status: str) -> None:
        """Stamp the terminal status and denormalized totals.

        Idempotent: the first terminal status wins (e.g. ``stopped`` from
        the cancel path must not be overwritten by a later generic
        finalizer).
        """
        if not self._writable:
            return
        try:
            get_storage().finish_run(
                self.run_id,
                status=status,
                completed_at=datetime.now(timezone.utc).isoformat(),
            )
        except SQLAlchemyError as exc:
            self._report_db_error("finish", exc)
        else:
            self._finished = True