testmcpy 0.8.0__tar.gz → 0.9.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {testmcpy-0.8.0/testmcpy.egg-info → testmcpy-0.9.2}/PKG-INFO +8 -1
- {testmcpy-0.8.0 → testmcpy-0.9.2}/README.md +7 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/pyproject.toml +1 -1
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/models.py +6 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/api.py +151 -7
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/results.py +2 -15
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/runs.py +36 -2
- testmcpy-0.9.2/testmcpy/server/run_persistence.py +208 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/run_registry.py +65 -6
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/websocket.py +133 -50
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/llm_integration.py +5 -5
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/storage.py +106 -23
- testmcpy-0.9.2/testmcpy/ui/dist/assets/index-BXP9_Odn.js +324 -0
- testmcpy-0.9.2/testmcpy/ui/dist/assets/index-D35cfDhp.css +1 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/dist/index.html +2 -2
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/BackgroundRunsIndicator.jsx +13 -2
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/contexts/TestRunContext.jsx +195 -96
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/TestManager.jsx +22 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2/testmcpy.egg-info}/PKG-INFO +8 -1
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/SOURCES.txt +3 -2
- testmcpy-0.8.0/testmcpy/ui/dist/assets/index-9d2zHuWX.js +0 -324
- testmcpy-0.8.0/testmcpy/ui/dist/assets/index-CgmKHZdS.css +0 -1
- {testmcpy-0.8.0 → testmcpy-0.9.2}/LICENSE +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/MANIFEST.in +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/NOTICE +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/setup.cfg +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/__main__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/hooks.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/models.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/orchestrator.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/prompts.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/agent/tools.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/analytics.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/auth_debugger.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/auth_flow_recorder.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/app.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/agent.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/analytics.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/badge.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/baseline.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/bench.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/conformance.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/export_db.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/mcp.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/metamorphic.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/multi_env.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/mutate.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/push.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/run.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/scan.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/score.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/server.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/tools.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/tui.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/cli/commands/wizard.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/config.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/chat_session.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/docs_optimizer.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/mcp_manager.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/tool_comparison.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/core/tool_discovery.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/db.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/error_handlers.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/auth_evaluators.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/base_evaluators.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/evaluator_packs.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/evals/security_evaluators.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/base.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/curl.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/graphql.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/javascript_client.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/json_yaml.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/protobuf.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/python.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/python_client.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/thrift.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/typescript.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/formatters/typescript_client.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/llm_profiles.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/mcp_profiles.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/migrate_json.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_detailed_exploration.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_poc.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/claude_sdk_working_poc.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/research/test_ollama_tools.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/rules.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/security/scanner.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/auth_middleware.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/helpers/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/helpers/mcp_config.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/models.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/agent.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/analytics.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/auth.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/compare.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/compatibility.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/generation_logs.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/health.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/llm.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/mcp_profiles.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/metrics.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/search.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/security.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/smoke_reports.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/test_profiles.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/tests.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/routers/tools.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/server/state.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/smoke_test.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/__init__.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/baseline.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/ci_gate.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/comparison_runner.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/coverage_analyzer.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/emitters.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/html_report.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/mcp_client.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/metamorphic.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/model_registry.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/models.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/multi_env.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/oauth_flows.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/prompt_mutation.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/report_generator.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/runner_tools.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/schema_diff.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/test_runner.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/token_manager.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/src/usability_score.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/test_profiles.py +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/README.md +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/index.html +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/package-lock.json +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/package.json +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/postcss.config.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/App.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/Badge.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/CommandPalette.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/CompareToolsTab.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ConfirmDialog.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/EditorStatusBar.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/EditorTabStrip.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ErrorAlert.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ErrorBoundary.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/LLMProfileSelector.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/LoadingSpinner.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/MCPProfileSelector.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/NotificationProvider.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/OptimizeDocsModal.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/OutputDiff.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ParameterCard.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/SchemaCodeViewer.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/SkeletonLoader.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/StreamingLogViewer.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestGenerationModal.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestProfileSelector.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestResultPanel.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TestStatusIndicator.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolCallTimeline.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolComparison.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/ToolDebugModal.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TraceView.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/TypeBadge.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/Wizard.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/components/__tests__/OutputDiff.test.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/contexts/ThemeContext.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useEditorTheme.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useKeyboardShortcuts.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/hooks/useSafeFetch.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/index.css +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/main.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/AuthDebugger.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/ChatInterface.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Configuration.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/GenerationHistory.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/LLMProfiles.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/MCPExplorer.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/MCPProfiles.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Performance.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/ProfilesManager.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Reports.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/SecurityDashboard.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/Servers.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/__tests__/ChatInterface.test.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/pages/__tests__/Performance.test.jsx +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/test-setup.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/__tests__/formatConverters.test.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/formatConverters.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/src/utils/formatters.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/tailwind.config.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/vite.config.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy/ui/vitest.config.js +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/dependency_links.txt +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/entry_points.txt +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/requires.txt +0 -0
- {testmcpy-0.8.0 → testmcpy-0.9.2}/testmcpy.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: testmcpy
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.9.2
|
|
4
4
|
Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
|
|
5
5
|
Author: Amin Ghadersohi
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -351,6 +351,13 @@ profiles:
|
|
|
351
351
|
|
|
352
352
|
The setup command is **idempotent** — safe to run multiple times. Use `--force` to overwrite existing files.
|
|
353
353
|
|
|
354
|
+
**`TESTMCPY_CHAT_OAUTH_LOGIN`** (default `true`): when a chat message hits an
|
|
355
|
+
OAuth (`oauth_auto_discover`) MCP profile with no cached token, the server opens
|
|
356
|
+
the interactive browser OAuth flow and retries. This assumes a browser is
|
|
357
|
+
available on the machine running the server — in headless deployments set
|
|
358
|
+
`TESTMCPY_CHAT_OAUTH_LOGIN=false` so the request fails fast with a clear error
|
|
359
|
+
instead of blocking on a login that can never complete.
|
|
360
|
+
|
|
354
361
|
### 2. Explore Your MCP Service
|
|
355
362
|
|
|
356
363
|
```bash
|
|
@@ -271,6 +271,13 @@ profiles:
|
|
|
271
271
|
|
|
272
272
|
The setup command is **idempotent** — safe to run multiple times. Use `--force` to overwrite existing files.
|
|
273
273
|
|
|
274
|
+
**`TESTMCPY_CHAT_OAUTH_LOGIN`** (default `true`): when a chat message hits an
|
|
275
|
+
OAuth (`oauth_auto_discover`) MCP profile with no cached token, the server opens
|
|
276
|
+
the interactive browser OAuth flow and retries. This assumes a browser is
|
|
277
|
+
available on the machine running the server — in headless deployments set
|
|
278
|
+
`TESTMCPY_CHAT_OAUTH_LOGIN=false` so the request fails fast with a clear error
|
|
279
|
+
instead of blocking on a login that can never complete.
|
|
280
|
+
|
|
274
281
|
### 2. Explore Your MCP Service
|
|
275
282
|
|
|
276
283
|
```bash
|
|
@@ -93,7 +93,7 @@ testmcpy = [
|
|
|
93
93
|
|
|
94
94
|
[project]
|
|
95
95
|
name = "testmcpy"
|
|
96
|
-
version = "0.8.0"
|
|
96
|
+
version = "0.9.2"
|
|
97
97
|
description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
|
|
98
98
|
authors = [{name = "Amin Ghadersohi"}]
|
|
99
99
|
license = "Apache-2.0"
|
|
@@ -139,6 +139,12 @@ class TestRunModel(Base):
|
|
|
139
139
|
total_tokens: Mapped[int] = mapped_column(Integer, default=0)
|
|
140
140
|
started_at: Mapped[str] = mapped_column(String, nullable=False)
|
|
141
141
|
completed_at: Mapped[str | None] = mapped_column(String, nullable=True)
|
|
142
|
+
# Touched every ~30s while the run executes (UTC ISO, same format as
|
|
143
|
+
# the reconciliation cutoff it's compared against). Lets crash
|
|
144
|
+
# reconciliation distinguish a live run — possibly owned by another
|
|
145
|
+
# server sharing this DB — from a dead one, instead of guessing from
|
|
146
|
+
# started_at age.
|
|
147
|
+
heartbeat_at: Mapped[str | None] = mapped_column(String, nullable=True)
|
|
142
148
|
metadata_: Mapped[dict | None] = mapped_column("metadata", JSON, nullable=True)
|
|
143
149
|
created_at: Mapped[datetime] = mapped_column(
|
|
144
150
|
DateTime, nullable=False, default=lambda: datetime.now(timezone.utc)
|
|
@@ -12,6 +12,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning, module="websocket
|
|
|
12
12
|
warnings.filterwarnings("ignore", category=DeprecationWarning, module="websockets.legacy")
|
|
13
13
|
warnings.filterwarnings("ignore", category=DeprecationWarning, module="uvicorn")
|
|
14
14
|
|
|
15
|
+
import contextlib # noqa: E402
|
|
15
16
|
from contextlib import asynccontextmanager # noqa: E402
|
|
16
17
|
from datetime import datetime # noqa: E402
|
|
17
18
|
from enum import Enum # noqa: E402
|
|
@@ -150,6 +151,22 @@ def _get_init_lock(cache_key: str) -> asyncio.Lock:
|
|
|
150
151
|
return _client_init_locks[cache_key]
|
|
151
152
|
|
|
152
153
|
|
|
154
|
+
def _primary_mcp_provider_kwargs(
|
|
155
|
+
clients_to_use: list[tuple[str, str, MCPClient]],
|
|
156
|
+
) -> dict[str, Any]:
|
|
157
|
+
"""mcp_url/auth kwargs from the FIRST selected MCP client.
|
|
158
|
+
|
|
159
|
+
SDK providers support a single MCP server; the Chat UI sends exactly one
|
|
160
|
+
"profileId:mcpName". Without these kwargs the providers fall back to the
|
|
161
|
+
DEFAULT profile's URL/auth, breaking chat for any other selected profile.
|
|
162
|
+
create_llm_provider filters these out for providers that don't accept them.
|
|
163
|
+
"""
|
|
164
|
+
if not clients_to_use:
|
|
165
|
+
return {}
|
|
166
|
+
_profile_id, _mcp_name, client = clients_to_use[0]
|
|
167
|
+
return {"mcp_url": client.base_url, "auth": client.auth_config}
|
|
168
|
+
|
|
169
|
+
|
|
153
170
|
async def get_mcp_clients_for_profile(profile_id: str) -> list[tuple[str, MCPClient]]:
|
|
154
171
|
"""
|
|
155
172
|
Get or create MCP clients for all MCP servers in a profile.
|
|
@@ -298,12 +315,15 @@ async def get_mcp_client_for_server(profile_id: str, mcp_name: str) -> MCPClient
|
|
|
298
315
|
return client
|
|
299
316
|
|
|
300
317
|
|
|
301
|
-
async def clear_cached_client(cache_key: str) -> bool:
|
|
318
|
+
async def clear_cached_client(cache_key: str, record_failure: bool = True) -> bool:
|
|
302
319
|
"""
|
|
303
320
|
Clear a cached MCP client by its cache key.
|
|
304
321
|
|
|
305
322
|
Args:
|
|
306
323
|
cache_key: Cache key in format "{profile_id}:{mcp_name}"
|
|
324
|
+
record_failure: When True (default), throttle the next reconnect via
|
|
325
|
+
back-off. Pass False for deliberate re-initialization (e.g. an
|
|
326
|
+
interactive OAuth re-login) where an immediate reconnect is wanted.
|
|
307
327
|
|
|
308
328
|
Returns:
|
|
309
329
|
True if a client was cleared, False if no client was cached
|
|
@@ -312,8 +332,9 @@ async def clear_cached_client(cache_key: str) -> bool:
|
|
|
312
332
|
|
|
313
333
|
client = mcp_clients.pop(cache_key, None)
|
|
314
334
|
if client:
|
|
315
|
-
|
|
316
|
-
|
|
335
|
+
if record_failure:
|
|
336
|
+
# Record a failure so the next reconnect is throttled via back-off.
|
|
337
|
+
_record_failure(cache_key)
|
|
317
338
|
try:
|
|
318
339
|
await client.close()
|
|
319
340
|
print(f"Cleared cached client '{cache_key}'")
|
|
@@ -323,6 +344,69 @@ async def clear_cached_client(cache_key: str) -> bool:
|
|
|
323
344
|
return False
|
|
324
345
|
|
|
325
346
|
|
|
347
|
+
# Marker substring of the ValueError raised by BaseSDKProvider when an
|
|
348
|
+
# oauth_auto_discover profile has no cached token (see
|
|
349
|
+
# llm_integration.BaseSDKProvider._resolve_mcp_bearer_token).
|
|
350
|
+
_OAUTH_TOKEN_ERROR = "No usable cached OAuth token"
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _chat_oauth_login_enabled() -> bool:
|
|
354
|
+
"""Feature flag for interactive OAuth login during chat (default ON).
|
|
355
|
+
|
|
356
|
+
Disable with TESTMCPY_CHAT_OAUTH_LOGIN=false (or 0/no). Read at call time
|
|
357
|
+
so tests can monkeypatch the environment.
|
|
358
|
+
"""
|
|
359
|
+
return os.environ.get("TESTMCPY_CHAT_OAUTH_LOGIN", "true").strip().lower() not in (
|
|
360
|
+
"0",
|
|
361
|
+
"false",
|
|
362
|
+
"no",
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
async def _relogin_oauth_servers(server_keys: list[str]) -> dict[str, MCPClient]:
|
|
367
|
+
"""Deliberate interactive re-auth for the given "profileId:mcpName" keys.
|
|
368
|
+
|
|
369
|
+
Drops cached clients WITHOUT recording back-off, clears any pre-existing
|
|
370
|
+
back-off state, and re-initializes. MCPClient.initialize() with
|
|
371
|
+
oauth_auto_discover opens the browser OAuth flow and caches the token via
|
|
372
|
+
fastmcp FileTokenStorage; duplicate popups are prevented by the per-key
|
|
373
|
+
init locks.
|
|
374
|
+
|
|
375
|
+
Returns the fresh clients keyed by cache key so callers can replace any
|
|
376
|
+
references to the old, now-closed client objects.
|
|
377
|
+
"""
|
|
378
|
+
new_clients: dict[str, MCPClient] = {}
|
|
379
|
+
for cache_key in server_keys:
|
|
380
|
+
await clear_cached_client(cache_key, record_failure=False)
|
|
381
|
+
_clear_failure(cache_key) # earlier failures must not block deliberate re-auth
|
|
382
|
+
profile_id, mcp_name = cache_key.split(":", 1)
|
|
383
|
+
client = await get_mcp_client_for_server(profile_id, mcp_name)
|
|
384
|
+
if client:
|
|
385
|
+
new_clients[cache_key] = client
|
|
386
|
+
return new_clients
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _refresh_client_refs(
|
|
390
|
+
new_clients: dict[str, MCPClient],
|
|
391
|
+
clients_to_use: list[tuple[str, str, MCPClient]],
|
|
392
|
+
tool_to_client: dict[str, tuple[MCPClient, str, str]],
|
|
393
|
+
) -> tuple[list[tuple[str, str, MCPClient]], dict[str, tuple[MCPClient, str, str]]]:
|
|
394
|
+
"""Swap re-logged-in clients into the chat endpoints' lookup structures.
|
|
395
|
+
|
|
396
|
+
After _relogin_oauth_servers the old client objects are closed; tool
|
|
397
|
+
execution through tool_to_client must use the replacements.
|
|
398
|
+
"""
|
|
399
|
+
refreshed_clients = [
|
|
400
|
+
(pid, name, new_clients.get(f"{pid}:{name}", client))
|
|
401
|
+
for pid, name, client in clients_to_use
|
|
402
|
+
]
|
|
403
|
+
refreshed_tools = {
|
|
404
|
+
tool: (new_clients.get(f"{pid}:{name}", client), pid, name)
|
|
405
|
+
for tool, (client, pid, name) in tool_to_client.items()
|
|
406
|
+
}
|
|
407
|
+
return refreshed_clients, refreshed_tools
|
|
408
|
+
|
|
409
|
+
|
|
326
410
|
def is_auth_error(error_msg: str) -> bool:
|
|
327
411
|
"""Check if an error message indicates an authentication failure."""
|
|
328
412
|
error_lower = error_msg.lower()
|
|
@@ -395,6 +479,30 @@ async def lifespan(app: FastAPI):
|
|
|
395
479
|
except SQLAlchemyError as e:
|
|
396
480
|
print(f"Warning: could not reconcile stale runs: {e}")
|
|
397
481
|
|
|
482
|
+
# …and keep reconciling while we run, so a crashed sibling server (or
|
|
483
|
+
# a row orphaned by an event-loop death that didn't restart the
|
|
484
|
+
# process) flips to 'interrupted' within minutes rather than at the
|
|
485
|
+
# next restart. Heartbeat-only (no started_at fallback): legacy rows
|
|
486
|
+
# without heartbeats carry local-naive timestamps that can't be
|
|
487
|
+
# compared reliably against a UTC cutoff.
|
|
488
|
+
async def _stale_run_sweeper() -> None:
|
|
489
|
+
from testmcpy.storage import get_storage
|
|
490
|
+
|
|
491
|
+
while True:
|
|
492
|
+
await _asyncio.sleep(60)
|
|
493
|
+
try:
|
|
494
|
+
get_storage().mark_stale_runs_interrupted(no_heartbeat_older_than_hours=None)
|
|
495
|
+
except _asyncio.CancelledError:
|
|
496
|
+
raise
|
|
497
|
+
except Exception as sweep_err: # noqa: BLE001 — long-lived loop:
|
|
498
|
+
# any escaping error (not just SQLAlchemyError — e.g. an
|
|
499
|
+
# OSError on first-time DB-path init) would otherwise kill
|
|
500
|
+
# the sweeper permanently and silently, reverting crash
|
|
501
|
+
# reconciliation to startup-only. (PR #90 review)
|
|
502
|
+
print(f"Warning: stale-run sweep failed: {sweep_err}")
|
|
503
|
+
|
|
504
|
+
sweeper_task = _asyncio.create_task(_stale_run_sweeper())
|
|
505
|
+
|
|
398
506
|
# Startup
|
|
399
507
|
try:
|
|
400
508
|
mcp_url = config.get_mcp_url()
|
|
@@ -410,6 +518,10 @@ async def lifespan(app: FastAPI):
|
|
|
410
518
|
yield
|
|
411
519
|
|
|
412
520
|
# Shutdown
|
|
521
|
+
sweeper_task.cancel()
|
|
522
|
+
with contextlib.suppress(_asyncio.CancelledError):
|
|
523
|
+
await sweeper_task
|
|
524
|
+
|
|
413
525
|
if mcp_client:
|
|
414
526
|
await mcp_client.close()
|
|
415
527
|
|
|
@@ -907,9 +1019,25 @@ async def chat(request: ChatRequest) -> ChatResponse:
|
|
|
907
1019
|
provider_kwargs = {}
|
|
908
1020
|
if api_key:
|
|
909
1021
|
provider_kwargs["api_key"] = api_key
|
|
910
|
-
|
|
1022
|
+
provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
|
|
911
1023
|
print("[Chat] Initializing LLM provider...")
|
|
912
|
-
|
|
1024
|
+
try:
|
|
1025
|
+
llm_provider = create_llm_provider(provider, model, **provider_kwargs)
|
|
1026
|
+
await llm_provider.initialize()
|
|
1027
|
+
except ValueError as e:
|
|
1028
|
+
if not (_chat_oauth_login_enabled() and _OAUTH_TOKEN_ERROR in str(e)):
|
|
1029
|
+
raise
|
|
1030
|
+
print("[Chat] No cached OAuth token; triggering interactive OAuth login...")
|
|
1031
|
+
new_clients = await _relogin_oauth_servers(accessed_servers)
|
|
1032
|
+
# The old client objects are closed now — swap in the replacements
|
|
1033
|
+
# so tool execution doesn't hit a closed client.
|
|
1034
|
+
clients_to_use, tool_to_client = _refresh_client_refs(
|
|
1035
|
+
new_clients, clients_to_use, tool_to_client
|
|
1036
|
+
)
|
|
1037
|
+
provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
|
|
1038
|
+
llm_provider = create_llm_provider(provider, model, **provider_kwargs)
|
|
1039
|
+
# Single retry; a second failure falls to the existing handlers.
|
|
1040
|
+
await llm_provider.initialize()
|
|
913
1041
|
print(
|
|
914
1042
|
f"[Chat] LLM provider initialized. Generating response with {len(all_tools)} tools..."
|
|
915
1043
|
)
|
|
@@ -1180,8 +1308,24 @@ async def chat_stream(request: ChatRequest):
|
|
|
1180
1308
|
provider_kwargs: dict = {}
|
|
1181
1309
|
if api_key:
|
|
1182
1310
|
provider_kwargs["api_key"] = api_key
|
|
1183
|
-
|
|
1184
|
-
|
|
1311
|
+
provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
|
|
1312
|
+
try:
|
|
1313
|
+
llm_provider = create_llm_provider(provider, model, **provider_kwargs)
|
|
1314
|
+
await llm_provider.initialize()
|
|
1315
|
+
except ValueError as e:
|
|
1316
|
+
if not (_chat_oauth_login_enabled() and _OAUTH_TOKEN_ERROR in str(e)):
|
|
1317
|
+
raise
|
|
1318
|
+
yield send_event("status", "Waiting for OAuth login in browser...")
|
|
1319
|
+
new_clients = await _relogin_oauth_servers(accessed_servers)
|
|
1320
|
+
# The old client objects are closed now — swap in the replacements
|
|
1321
|
+
# so tool execution doesn't hit a closed client.
|
|
1322
|
+
clients_to_use, tool_to_client = _refresh_client_refs(
|
|
1323
|
+
new_clients, clients_to_use, tool_to_client
|
|
1324
|
+
)
|
|
1325
|
+
provider_kwargs.update(_primary_mcp_provider_kwargs(clients_to_use))
|
|
1326
|
+
llm_provider = create_llm_provider(provider, model, **provider_kwargs)
|
|
1327
|
+
# Single retry; a second failure falls to the existing handlers.
|
|
1328
|
+
await llm_provider.initialize()
|
|
1185
1329
|
|
|
1186
1330
|
# --- Detect if provider is SDK-based (handles its own agentic loop) ---
|
|
1187
1331
|
from testmcpy.src.llm_integration import ClaudeSDKProvider
|
|
@@ -11,6 +11,7 @@ from typing import Any
|
|
|
11
11
|
from fastapi import APIRouter, HTTPException
|
|
12
12
|
from pydantic import BaseModel
|
|
13
13
|
|
|
14
|
+
from testmcpy.server.run_persistence import question_result_kwargs
|
|
14
15
|
from testmcpy.storage import get_storage
|
|
15
16
|
|
|
16
17
|
router = APIRouter(prefix="/api/results", tags=["results"])
|
|
@@ -98,21 +99,7 @@ def save_test_run_to_file(data: dict[str, Any]) -> dict[str, Any]:
|
|
|
98
99
|
|
|
99
100
|
# Save individual question results
|
|
100
101
|
for r in results:
|
|
101
|
-
storage.save_question_result(
|
|
102
|
-
run_id=run_id,
|
|
103
|
-
question_id=r.get("test_name", r.get("question_id", "unknown")),
|
|
104
|
-
passed=r.get("passed", False),
|
|
105
|
-
score=r.get("score", 0.0),
|
|
106
|
-
answer=r.get("response", r.get("answer")),
|
|
107
|
-
tool_uses=r.get("tool_calls", r.get("tool_uses")),
|
|
108
|
-
tool_results=r.get("tool_results"),
|
|
109
|
-
tokens_input=(r.get("token_usage") or {}).get("input", 0),
|
|
110
|
-
tokens_output=(r.get("token_usage") or {}).get("output", 0),
|
|
111
|
-
duration_ms=int(r.get("duration", 0) * 1000),
|
|
112
|
-
evaluations=r.get("evaluations"),
|
|
113
|
-
error=r.get("error"),
|
|
114
|
-
cost_usd=r.get("cost", r.get("cost_usd", 0.0)),
|
|
115
|
-
)
|
|
102
|
+
storage.save_question_result(run_id=run_id, **question_result_kwargs(r))
|
|
116
103
|
|
|
117
104
|
# Complete the run
|
|
118
105
|
storage.complete_run(run_id, datetime.now().isoformat())
|
|
@@ -11,8 +11,11 @@ from __future__ import annotations
|
|
|
11
11
|
from typing import Any
|
|
12
12
|
|
|
13
13
|
from fastapi import APIRouter, HTTPException
|
|
14
|
+
from sqlalchemy.exc import SQLAlchemyError
|
|
14
15
|
|
|
15
16
|
from testmcpy.server import run_registry
|
|
17
|
+
from testmcpy.server.run_persistence import wire_status_for_db_status
|
|
18
|
+
from testmcpy.storage import get_storage
|
|
16
19
|
|
|
17
20
|
router = APIRouter(prefix="/api", tags=["runs"])
|
|
18
21
|
|
|
@@ -65,9 +68,40 @@ async def list_runs(active_only: bool = True) -> dict[str, Any]:
|
|
|
65
68
|
@router.get("/runs/{run_id}")
async def get_run(run_id: str) -> dict[str, Any]:
    """Return the state of one run, live if possible, otherwise from history.

    A run still held by the in-memory registry is serialised directly.
    On a registry miss the results DB is consulted and a finished-record
    payload tagged ``source: history`` is returned.

    Raises:
        HTTPException: 404 when the run is in neither the registry nor the
            results DB (a DB error is treated the same as a miss).
    """
    live_handle = await run_registry.get_run(run_id)
    if live_handle is not None:
        return _serialise(live_handle)

    # Registry miss (GC'd after CLEANUP_TTL, or a server restart): fall back
    # to the results DB so a stale tab asking about its run gets the final
    # state instead of a 404. A DB hiccup is treated as a miss (matching the
    # WS twin, _attach_history_run) rather than surfacing a 500 to the
    # indicator's poll loop.
    try:
        stored = get_storage().get_run(run_id)
    except SQLAlchemyError:
        stored = None

    if stored is None:
        raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")

    wire_status = wire_status_for_db_status(stored.get("status"))
    stored_results = stored.get("question_results") or []
    run_meta = {
        "test_path": stored.get("test_id"),
        "model": stored.get("model"),
        "provider": stored.get("provider"),
    }
    return {
        "run_id": run_id,
        # Only single-run ids ever land a DB row today; directory-batch ids
        # persist per-file under fresh ids (see _attach_history_run), so
        # anything resolvable here is a single run by construction.
        "kind": "single",
        "status": wire_status,
        "started_at": stored.get("started_at"),
        "finished_at": stored.get("completed_at"),
        "meta": run_meta,
        "summary": stored.get("summary"),
        "result_count": len(stored_results),
        "is_attached": False,
        # Tells the client this is a finished record, not a live handle.
        "source": "history",
    }
|
|
71
105
|
|
|
72
106
|
|
|
73
107
|
@router.post("/runs/{run_id}/stop")
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Incremental DB persistence for in-flight test runs.
|
|
2
|
+
|
|
3
|
+
Historically the WebSocket runner saved a run to the database only once,
|
|
4
|
+
at the very end (``save_test_run_to_file``) — a server crash at test 29/30
|
|
5
|
+
lost everything. ``RunRecord`` makes the DB the source of truth for
|
|
6
|
+
partial progress instead:
|
|
7
|
+
|
|
8
|
+
- ``begin()`` — creates the suite + a ``test_runs`` row (status=running)
|
|
9
|
+
as soon as the run starts executing.
|
|
10
|
+
- ``append()`` — writes one ``question_results`` row per completed test.
|
|
11
|
+
- ``finish()`` — stamps the terminal status (completed/error/stopped) and
|
|
12
|
+
the denormalized totals. Idempotent.
|
|
13
|
+
|
|
14
|
+
DB errors are swallowed (logged through the run's own log stream): a
|
|
15
|
+
persistence hiccup must degrade history, never kill a live run.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import uuid
|
|
21
|
+
from collections.abc import Callable
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
from sqlalchemy.exc import SQLAlchemyError
|
|
26
|
+
|
|
27
|
+
from testmcpy.storage import get_storage
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def mint_run_id() -> str:
    """Mint a run id in the legacy ``<8-hex>_<timestamp>`` shape.

    The same shape is shared with the run registry and
    ``save_test_run_to_file`` so every code path mints correlatable
    identifiers. The prefix is the first eight hex digits of a random
    UUID4; the suffix is the current naive local time formatted as
    ``YYYYmmdd_HHMMSS``.
    """
    hex_prefix = uuid.uuid4().hex[:8]
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{hex_prefix}_{stamp}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def question_result_kwargs(r: dict[str, Any]) -> dict[str, Any]:
    """Map a ``TestResult.to_dict()`` shape onto ``save_question_result`` kwargs.

    Single source of truth for the mapping, used by both the end-of-run
    ``save_test_run_to_file`` and the incremental ``RunRecord``.

    Args:
        r: One test result as a plain dict. Missing keys fall back to
            neutral defaults.

    Returns:
        Keyword arguments for ``save_question_result`` (everything except
        ``run_id``, which the caller supplies).
    """
    # LLM providers report token_usage as {prompt, completion, total}
    # (see llm_integration.py); the old mapping read input/output and
    # silently stored 0 for every UI-triggered run. Keep input/output as
    # a fallback for callers of POST /api/results/save that adopted the
    # old keys.
    usage = r.get("token_usage") or {}

    def _token_count(current_key: str, legacy_key: str) -> Any:
        # An explicit None under the current key must not mask a usable
        # legacy value, and must never be stored as NULL instead of 0.
        for key in (current_key, legacy_key):
            count = usage.get(key)
            if count is not None:
                return count
        return 0

    # ``duration`` may be present but None (e.g. a test that errored before
    # timing finished); treat that as 0 rather than raising TypeError, which
    # callers guarding only against DB errors would not catch.
    duration_seconds = r.get("duration") or 0

    return {
        "question_id": r.get("test_name", r.get("question_id", "unknown")),
        "passed": r.get("passed", False),
        "score": r.get("score", 0.0),
        "answer": r.get("response", r.get("answer")),
        "tool_uses": r.get("tool_calls", r.get("tool_uses")),
        "tool_results": r.get("tool_results"),
        "tokens_input": _token_count("prompt", "input"),
        "tokens_output": _token_count("completion", "output"),
        "duration_ms": int(duration_seconds * 1000),
        "evaluations": r.get("evaluations"),
        "error": r.get("error"),
        "cost_usd": r.get("cost", r.get("cost_usd", 0.0)),
    }
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def ui_result_from_question_result(q: dict[str, Any]) -> dict[str, Any]:
    """Turn a stored ``question_results`` row back into the UI wire shape.

    This is the inverse of ``question_result_kwargs``: it takes a row as
    returned by ``storage.get_run`` and rebuilds the TestResult dict the
    UI's test_complete / all_complete handlers expect, including the live
    ``{prompt, completion, total}`` token_usage keys the client sums
    (TestRunContext reads ``token_usage.total``).

    Missing or NULL numeric columns collapse to zero and missing or NULL
    list columns collapse to an empty list.
    """
    prompt_tokens = q.get("tokens_input") or 0
    completion_tokens = q.get("tokens_output") or 0
    token_usage = {
        "prompt": prompt_tokens,
        "completion": completion_tokens,
        "total": prompt_tokens + completion_tokens,
    }
    # Stored in milliseconds; the wire shape carries seconds.
    duration_seconds = (q.get("duration_ms") or 0) / 1000

    wire: dict[str, Any] = {}
    wire["test_name"] = q.get("question_id")
    wire["passed"] = bool(q.get("passed"))
    wire["score"] = q.get("score", 0.0)
    wire["response"] = q.get("answer")
    wire["tool_calls"] = q.get("tool_uses") or []
    wire["tool_results"] = q.get("tool_results") or []
    wire["token_usage"] = token_usage
    wire["duration"] = duration_seconds
    wire["evaluations"] = q.get("evaluations") or []
    wire["error"] = q.get("error")
    wire["cost"] = q.get("cost_usd") or 0.0
    return wire
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# DB statuses that are already terminal on the wire and pass through
# unchanged. A DB row still 'running' (or already 'interrupted') with no
# registry handle means the server died mid-run, so it is reported as
# interrupted.
_TERMINAL_WIRE_STATUS = {name: name for name in ("completed", "stopped", "error")}


def wire_status_for_db_status(db_status: str | None) -> str:
    """Translate a ``test_runs.status`` into the WebSocket/REST wire status.

    Intended for a run that is NOT in the in-memory registry. Terminal
    statuses (completed / stopped / error) pass straight through; anything
    else (running / interrupted / NULL / unknown) means the owning process
    died mid-run and is reported as ``interrupted``.
    """
    lookup_key = db_status or ""
    if lookup_key in _TERMINAL_WIRE_STATUS:
        return _TERMINAL_WIRE_STATUS[lookup_key]
    return "interrupted"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def history_replay_messages(record: dict[str, Any]) -> list[dict[str, Any]]:
    """Synthesize the WebSocket message sequence for a history-only run.

    Used when attaching to a run that's no longer in the in-memory registry
    (GC'd after CLEANUP_TTL, or lost to a server restart) but lives in the
    results DB.

    Args:
        record: A stored run dict; ``status``, ``run_id`` and
            ``question_results`` are read. A missing or NULL
            ``question_results`` is treated as "no results yet".

    Returns:
        In order: one ``run_started`` marker, one ``test_complete`` per
        stored result (so the UI rebuilds its per-test panels), and a
        terminal ``all_complete`` carrying the run's real status, including
        ``interrupted`` with partial results for runs that died mid-flight.
    """
    status = wire_status_for_db_status(record.get("status"))

    # ``or []`` rather than a .get() default: the default only applies when
    # the key is absent, and a key present with None would otherwise raise
    # TypeError on iteration.
    stored_rows = record.get("question_results") or []
    results = [ui_result_from_question_result(row) for row in stored_rows]

    # Totals are recomputed from the replayed rows so a partially persisted
    # (interrupted) run reports exactly what it replays.
    passed = sum(1 for r in results if r["passed"])
    summary = {
        "total": len(results),
        "passed": passed,
        "failed": len(results) - passed,
        "total_cost": sum(r["cost"] for r in results),
        "status": status,
    }

    run_started = {
        "type": "run_started",
        "run_id": record.get("run_id"),
        "kind": "single",
        "reattached": True,
        "status": status,
        "source": "history",
    }
    per_test = [
        {"type": "test_complete", "test_name": r["test_name"], "result": r} for r in results
    ]
    all_complete = {
        "type": "all_complete",
        "status": status,
        "summary": summary,
        "results": results,
    }
    return [run_started, *per_test, all_complete]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class RunRecord:
    """Write-through record of one run (one YAML file) in the results DB.

    Lifecycle: ``begin()`` creates the suite and run rows, ``append()``
    stores one result per completed test, ``finish()`` stamps the terminal
    status.

    Every write is best-effort. The first ``SQLAlchemyError`` marks the
    record broken and is reported once through ``log``; after that all
    calls no-op so a flaky DB doesn't spam the run log or slow the run
    down.
    """

    def __init__(self, run_id: str | None = None, log: Callable[[str], None] | None = None):
        # A falsy run_id (None or "") gets a freshly minted id.
        self.run_id = run_id if run_id else mint_run_id()
        # Without a sink, messages are silently discarded.
        self._log = log if log else (lambda msg: None)
        self._began = False  # begin() has written the run row
        self._finished = False  # finish() has stamped a terminal status
        self._broken = False  # a DB write failed; all later calls no-op

    def _report_db_error(self, op: str, exc: SQLAlchemyError) -> None:
        """Mark the record broken and log the failed operation."""
        self._broken = True
        self._log(f"⚠️ Results DB unavailable ({op}): {exc} — run continues without history")

    def begin(
        self,
        *,
        test_file: str,
        model: str,
        provider: str,
        mcp_profile: str | None = None,
        llm_profile: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Create the suite + the run row up front; no-op if begun or broken."""
        if self._broken or self._began:
            return
        try:
            store = get_storage()
            # The test file path doubles as the suite id and name.
            store.save_suite(suite_id=test_file, name=test_file, questions=[])
            started_at = datetime.now(timezone.utc).isoformat()
            store.save_run(
                run_id=self.run_id,
                test_id=test_file,
                test_version=1,
                model=model,
                provider=provider,
                started_at=started_at,
                mcp_profile_id=mcp_profile,
                llm_profile_id=llm_profile,
                metadata=metadata,
            )
        except SQLAlchemyError as exc:
            self._report_db_error("begin", exc)
        else:
            self._began = True

    def append(self, result: dict[str, Any]) -> None:
        """Persist one completed test immediately (crash-safe progress)."""
        writable = self._began and not self._finished and not self._broken
        if not writable:
            return
        try:
            row_kwargs = question_result_kwargs(result)
            get_storage().save_question_result(run_id=self.run_id, **row_kwargs)
        except SQLAlchemyError as exc:
            self._report_db_error("append", exc)

    def finish(self, status: str) -> None:
        """Stamp the terminal status and completion time. Idempotent.

        The first terminal status wins (e.g. ``stopped`` from the cancel
        path must not be overwritten by a later generic finalizer).
        """
        writable = self._began and not self._finished and not self._broken
        if not writable:
            return
        try:
            store = get_storage()
            completed_at = datetime.now(timezone.utc).isoformat()
            store.finish_run(self.run_id, status=status, completed_at=completed_at)
        except SQLAlchemyError as exc:
            self._report_db_error("finish", exc)
        else:
            self._finished = True
|