PyPI - testmcpy - Versions diffs - 0.2.16__tar.gz → 0.3.0__tar.gz - Mend

testmcpy 0.2.16.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (189) hide show

{testmcpy-0.2.16/testmcpy.egg-info → testmcpy-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: testmcpy
-Version: 0.2.16
+Version: 0.3.0
 Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
 Author: Amin Ghadersohi
 License-Expression: Apache-2.0
@@ -30,6 +30,8 @@ Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: click<9.0.0,>=8.0.0
 Requires-Dist: shellingham<2.0.0,>=1.3.0
 Requires-Dist: textual<1.0.0,>=0.47.0
+Requires-Dist: sqlalchemy<3.0.0,>=2.0.0
+Requires-Dist: alembic<2.0.0,>=1.13.0
 Provides-Extra: dev
 Requires-Dist: ruff>=0.8.0; extra == "dev"
 Requires-Dist: mypy>=1.13.0; extra == "dev"
@@ -50,6 +52,11 @@ Provides-Extra: sdk
 Requires-Dist: claude-agent-sdk>=0.1.0; extra == "sdk"
 Provides-Extra: tui
 Requires-Dist: textual>=0.85.0; extra == "tui"
+Provides-Extra: e2e
+Requires-Dist: playwright>=1.40.0; extra == "e2e"
+Requires-Dist: pytest-playwright>=0.4.0; extra == "e2e"
+Provides-Extra: export
+Requires-Dist: pandas<3.0.0,>=2.0.0; extra == "export"
 Provides-Extra: all
 Requires-Dist: fastapi<1.0.0,>=0.104.0; extra == "all"
 Requires-Dist: uvicorn[standard]<1.0.0,>=0.24.0; extra == "all"
@@ -626,6 +633,4 @@ By contributing, you agree that your contributions will be licensed under Apache
 ## Acknowledgments
-Built to enable better LLM testing and integration with Model Context Protocol services.
-Special thanks to the MCP community and all our contributors!
+**Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).

{testmcpy-0.2.16 → testmcpy-0.3.0}/README.md RENAMED Viewed

@@ -566,6 +566,4 @@ By contributing, you agree that your contributions will be licensed under Apache
 ## Acknowledgments
-Built to enable better LLM testing and integration with Model Context Protocol services.
-Special thanks to the MCP community and all our contributors!
+**Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).

{testmcpy-0.2.16 → testmcpy-0.3.0}/pyproject.toml RENAMED Viewed

@@ -43,13 +43,16 @@ check_untyped_defs = true
 ignore_missing_imports = true
 [tool.pytest.ini_options]
-testpaths = ["tests"]
+testpaths = ["unit_tests", "integration_tests"]
 python_files = ["test_*.py"]
 addopts = [
     "-v",
     "--strict-markers",
     "--tb=short",
 ]
+markers = [
+    "e2e: End-to-end UI tests (requires playwright)",
+]
 [tool.coverage.run]
 source = ["testmcpy"]
@@ -90,7 +93,7 @@ testmcpy = [
 [project]
 name = "testmcpy"
-version = "0.2.16"
+version = "0.3.0"
 description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
 authors = [{name = "Amin Ghadersohi"}]
 license = "Apache-2.0"
@@ -118,6 +121,8 @@ dependencies = [
     "click>=8.0.0,<9.0.0",
     "shellingham>=1.3.0,<2.0.0",
     "textual>=0.47.0,<1.0.0",
+    "sqlalchemy>=2.0.0,<3.0.0",
+    "alembic>=1.13.0,<2.0.0",
 ]
 [project.optional-dependencies]
@@ -145,6 +150,13 @@ sdk = [
 tui = [
     "textual>=0.85.0",
 ]
+e2e = [
+    "playwright>=1.40.0",
+    "pytest-playwright>=0.4.0",
+]
+export = [
+    "pandas>=2.0.0,<3.0.0",
+]
 all = [
     "fastapi>=0.104.0,<1.0.0",
     "uvicorn[standard]>=0.24.0,<1.0.0",

{testmcpy-0.2.16 → testmcpy-0.3.0}/testmcpy/__init__.py RENAMED Viewed

@@ -11,6 +11,6 @@ try:
     __version__ = version("testmcpy")
 except Exception:
     # Fallback for development or when package not installed
-    __version__ = "0.2.12"
+    __version__ = "0.3.0"
 __author__ = "testmcpy Contributors"

testmcpy-0.3.0/testmcpy/agent/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""
+Test Execution Agent using Claude Agent SDK.
+Provides an intelligent orchestrator that wraps testmcpy infrastructure
+with reasoning, adaptability, and natural language interaction.
+Note: Requires `claude-agent-sdk` package. Imports are lazy to avoid
+crashing when the SDK is not installed.
+"""
+from testmcpy.agent.models import AgentRunReport, AgentSession, ToolInvocation
+def __getattr__(name):
+    """Module-level attribute hook that resolves ``TestExecutionAgent`` on demand.
+    The orchestrator module is imported only when ``TestExecutionAgent`` is
+    requested from this package; any other unknown name raises AttributeError.
+    """
+    if name == "TestExecutionAgent":
+        # Deferred import: runs only when the attribute is actually accessed.
+        from testmcpy.agent.orchestrator import TestExecutionAgent
+        return TestExecutionAgent
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+__all__ = [
+    "TestExecutionAgent",
+    "AgentRunReport",
+    "AgentSession",
+    "ToolInvocation",
+]

testmcpy-0.3.0/testmcpy/agent/hooks.py ADDED Viewed

@@ -0,0 +1,184 @@
+"""
+Agent hooks for monitoring and controlling the Test Execution Agent.
+Uses the Claude Agent SDK hook system (PreToolUse, PostToolUse, Stop)
+to track tool calls, detect loops, measure costs, and generate reports.
+"""
+import time
+from typing import Any
+from testmcpy.agent.models import AgentSession, ToolInvocation
+try:
+    from claude_agent_sdk import HookContext
+except ImportError:
+    HookContext = Any  # type: ignore[assignment,misc]
+# Maximum identical consecutive tool calls before blocking
+MAX_IDENTICAL_CALLS = 3
+def create_hooks(session: AgentSession) -> dict[str, list[dict[str, Any]]]:
+    """Create all hooks wired to a shared AgentSession.
+    Args:
+        session: Session object mutated by the hooks (tool calls, test
+            results, costs, tokens, errors, completion time).
+    Returns a hooks dict in the format expected by ClaudeAgentOptions:
+        {
+            "PreToolUse": [{"matcher": None, "hooks": [callback]}],
+            "PostToolUse": [{"matcher": None, "hooks": [callback]}],
+            "Stop": [{"matcher": None, "hooks": [callback]}],
+        }
+    """
+    # Mutable state shared between hooks (not in session to keep session clean)
+    _hook_state: dict[str, Any] = {
+        "last_tool_name": None,
+        "last_tool_args": None,
+        "consecutive_identical": 0,
+        "pending_start_times": {},  # tool_use_id -> start_time
+    }
+    async def pre_tool_use(
+        input_data: dict[str, Any],
+        tool_use_id: str | None,
+        context: HookContext,
+    ) -> dict[str, Any]:
+        """Hook called before each tool use.
+        - Detects loops (MAX_IDENTICAL_CALLS or more identical consecutive calls)
+        - Records start time for duration tracking
+        Returns a "block" decision when a loop is detected, otherwise ``{}``.
+        """
+        tool_name = input_data.get("name", "")
+        tool_args = input_data.get("input", {})
+        # Loop detection: track consecutive identical calls
+        if (
+            tool_name == _hook_state["last_tool_name"]
+            and tool_args == _hook_state["last_tool_args"]
+        ):
+            _hook_state["consecutive_identical"] += 1
+        else:
+            _hook_state["consecutive_identical"] = 1
+        _hook_state["last_tool_name"] = tool_name
+        _hook_state["last_tool_args"] = tool_args
+        # Block if too many identical calls in a row
+        if _hook_state["consecutive_identical"] >= MAX_IDENTICAL_CALLS:
+            session.record_error(
+                f"Loop detected: {tool_name} called {MAX_IDENTICAL_CALLS}+ times "
+                f"with identical arguments"
+            )
+            return {
+                "decision": "block",
+                "systemMessage": (
+                    f"BLOCKED: You have called {tool_name} {MAX_IDENTICAL_CALLS} times "
+                    f"in a row with identical arguments. This looks like a loop. "
+                    f"Try a different approach or different arguments."
+                ),
+            }
+        # Record start time for this tool use (blocked calls never get one)
+        if tool_use_id:
+            _hook_state["pending_start_times"][tool_use_id] = time.time()
+        return {}
+    async def post_tool_use(
+        input_data: dict[str, Any],
+        tool_use_id: str | None,
+        context: HookContext,
+    ) -> dict[str, Any]:
+        """Hook called after each tool use.
+        - Records tool invocation with timing
+        - Tracks test results for execute_test_case
+        - Accumulates costs and tokens
+        Always returns ``{}``; malformed result payloads are ignored rather
+        than raised so a bad tool result cannot abort the agent run.
+        """
+        import json
+        tool_name = input_data.get("name", "")
+        tool_input = input_data.get("input", {})
+        tool_result = input_data.get("result", "")
+        # Calculate duration (stays 0.0 when no start time was recorded)
+        duration_ms = 0.0
+        if tool_use_id and tool_use_id in _hook_state["pending_start_times"]:
+            start = _hook_state["pending_start_times"].pop(tool_use_id)
+            duration_ms = (time.time() - start) * 1000
+        # Determine if there was an error
+        is_error = False
+        result_summary = ""
+        if isinstance(tool_result, str):
+            result_summary = tool_result[:200]
+            is_error = tool_result.startswith("Error:")
+        elif isinstance(tool_result, dict):
+            is_error = tool_result.get("is_error", False)
+            content = tool_result.get("content", [])
+            if content and isinstance(content, list):
+                first = content[0]
+                if isinstance(first, dict):
+                    # "text" may be absent or None; coerce so slicing cannot fail
+                    result_summary = str(first.get("text") or "")[:200]
+        # Record the invocation
+        invocation = ToolInvocation(
+            tool_name=tool_name,
+            arguments=tool_input,
+            result_summary=result_summary,
+            is_error=is_error,
+            duration_ms=duration_ms,
+        )
+        session.record_tool_call(invocation)
+        # Track test results if this was execute_test_case.
+        # For string results, ``not is_error`` already excludes "Error:" payloads.
+        if tool_name == "execute_test_case" and not is_error and isinstance(tool_result, str):
+            try:
+                parsed = json.loads(tool_result)
+            except json.JSONDecodeError:
+                # Best effort: a non-JSON result simply carries no test outcome
+                parsed = None
+            # Only a JSON object can carry the pass/fail fields; a bare JSON
+            # string or list must not be probed with ``in``.
+            if isinstance(parsed, dict) and "passed" in parsed:
+                session.record_test_result(parsed["passed"])
+                # Track test execution cost separately
+                cost = parsed.get("cost")
+                if isinstance(cost, (int, float)):
+                    session.test_execution_cost_usd += cost
+                tokens = parsed.get("token_usage")
+                if isinstance(tokens, dict):
+                    total = tokens.get("total", 0)
+                    if isinstance(total, int):
+                        session.test_execution_tokens += total
+        if is_error:
+            session.record_error(f"Tool {tool_name} returned error: {result_summary}")
+        return {}
+    async def stop_hook(
+        input_data: dict[str, Any],
+        tool_use_id: str | None,
+        context: HookContext,
+    ) -> dict[str, Any]:
+        """Hook called when the agent stops.
+        Marks the session complete and copies cost/token figures from the
+        result payload, when present, onto the session.
+        """
+        session.complete()
+        # Extract orchestrator cost from the result message if available
+        result = input_data.get("result", {})
+        if isinstance(result, dict):
+            total_cost = result.get("total_cost_usd", 0.0)
+            if total_cost:
+                # Clamp at zero so the orchestrator share never goes negative
+                # when test execution cost exceeds the reported total.
+                session.orchestrator_cost_usd = max(
+                    0.0, total_cost - session.test_execution_cost_usd
+                )
+            usage = result.get("usage", {})
+            if usage:
+                session.orchestrator_tokens_input = usage.get("input_tokens", 0)
+                session.orchestrator_tokens_output = usage.get("output_tokens", 0)
+        return {}
+    return {
+        "PreToolUse": [{"matcher": None, "hooks": [pre_tool_use]}],
+        "PostToolUse": [{"matcher": None, "hooks": [post_tool_use]}],
+        "Stop": [{"matcher": None, "hooks": [stop_hook]}],
+    }

testmcpy-0.3.0/testmcpy/agent/models.py ADDED Viewed

@@ -0,0 +1,176 @@
+"""
+Data models for the Test Execution Agent.
+Defines session state, run reports, and tool invocation records
+used by the agent hooks and orchestrator.
+"""
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+@dataclass
+class ToolInvocation:
+    """Record of a single tool call made by the agent.
+    ``timestamp`` is filled with the current UTC time in ISO-8601 form when
+    the caller leaves it empty.
+    """
+    # Name of the tool that was called
+    tool_name: str
+    # Arguments the tool was called with
+    arguments: dict[str, Any]
+    # Short text summary of the tool's result
+    result_summary: str
+    # True when the call is considered to have failed
+    is_error: bool = False
+    # Duration of the call in milliseconds
+    duration_ms: float = 0.0
+    # ISO-8601 timestamp; empty string means "set it at construction time"
+    timestamp: str = ""
+    def __post_init__(self):
+        # Default the timestamp to "now" (UTC) only when none was supplied
+        if not self.timestamp:
+            self.timestamp = datetime.now(timezone.utc).isoformat()
+    def to_dict(self) -> dict[str, Any]:
+        """Return the record as a plain dict keyed by field name."""
+        return asdict(self)
+@dataclass
+class AgentSession:
+    """Mutable state accumulated during an agent run.
+    Used by hooks to track progress and build the final report.
+    ``started_at`` is set to the current UTC time at construction when left
+    empty; ``completed_at`` stays empty until ``complete()`` is called.
+    """
+    # Test execution tracking
+    tests_run: int = 0
+    tests_passed: int = 0
+    tests_failed: int = 0
+    # Cost tracking (orchestrator vs test execution)
+    orchestrator_cost_usd: float = 0.0
+    test_execution_cost_usd: float = 0.0
+    # Token tracking
+    orchestrator_tokens_input: int = 0
+    orchestrator_tokens_output: int = 0
+    test_execution_tokens: int = 0
+    # Tool call history (in call order) and per-tool-name call counts
+    tool_call_history: list[ToolInvocation] = field(default_factory=list)
+    tool_call_counts: dict[str, int] = field(default_factory=dict)
+    # Errors
+    errors: list[str] = field(default_factory=list)
+    # Timing (ISO-8601 strings, UTC)
+    started_at: str = ""
+    completed_at: str = ""
+    def __post_init__(self):
+        # Default the start time to "now" (UTC) only when none was supplied
+        if not self.started_at:
+            self.started_at = datetime.now(timezone.utc).isoformat()
+    def record_tool_call(self, invocation: ToolInvocation) -> None:
+        """Append the invocation to the history and bump its tool's counter."""
+        self.tool_call_history.append(invocation)
+        self.tool_call_counts[invocation.tool_name] = (
+            self.tool_call_counts.get(invocation.tool_name, 0) + 1
+        )
+    def record_test_result(self, passed: bool) -> None:
+        """Count one executed test as passed or failed."""
+        self.tests_run += 1
+        if passed:
+            self.tests_passed += 1
+        else:
+            self.tests_failed += 1
+    def record_error(self, error: str) -> None:
+        """Append an error message to the session's error list."""
+        self.errors.append(error)
+    def complete(self) -> None:
+        """Set ``completed_at`` to the current UTC time.
+        Each call overwrites any previously stored completion time.
+        """
+        self.completed_at = datetime.now(timezone.utc).isoformat()
+    def to_dict(self) -> dict[str, Any]:
+        """Return the session as a plain dict keyed by field name."""
+        d = asdict(self)
+        # Replace the history entry with each invocation's own to_dict() output
+        d["tool_call_history"] = [t.to_dict() for t in self.tool_call_history]
+        return d
+@dataclass
+class AgentRunReport:
+    """Final report from an agent run.
+    Separates orchestrator costs from test execution costs.
+    """
+    # Run metadata
+    run_id: str = ""
+    started_at: str = ""
+    completed_at: str = ""
+    duration_ms: float = 0.0
+    # Test results summary
+    tests_run: int = 0
+    tests_passed: int = 0
+    tests_failed: int = 0
+    # Fraction of tests passed (tests_passed / tests_run), 0.0 when none ran
+    pass_rate: float = 0.0
+    # Cost breakdown
+    orchestrator_cost_usd: float = 0.0
+    test_execution_cost_usd: float = 0.0
+    total_cost_usd: float = 0.0
+    # Token breakdown
+    orchestrator_tokens_input: int = 0
+    orchestrator_tokens_output: int = 0
+    test_execution_tokens: int = 0
+    # Agent activity
+    total_tool_calls: int = 0
+    tool_call_counts: dict[str, int] = field(default_factory=dict)
+    tool_call_history: list[ToolInvocation] = field(default_factory=list)
+    # Errors
+    errors: list[str] = field(default_factory=list)
+    # Agent's final analysis (text from the agent)
+    analysis: str = ""
+    # Number of agent turns
+    num_turns: int = 0
+    @classmethod
+    def from_session(cls, session: AgentSession, run_id: str = "") -> "AgentRunReport":
+        """Build a report from an agent session.
+        Calls ``session.complete()`` first, so the session's ``completed_at``
+        is overwritten with the current time even if it was already set.
+        ``analysis`` and ``num_turns`` are not populated here and keep their
+        defaults.
+        """
+        session.complete()
+        started = datetime.fromisoformat(session.started_at)
+        completed = datetime.fromisoformat(session.completed_at)
+        duration_ms = (completed - started).total_seconds() * 1000
+        total_cost = session.orchestrator_cost_usd + session.test_execution_cost_usd
+        # Guard against division by zero when no tests were recorded
+        pass_rate = session.tests_passed / session.tests_run if session.tests_run > 0 else 0.0
+        return cls(
+            run_id=run_id,
+            started_at=session.started_at,
+            completed_at=session.completed_at,
+            duration_ms=duration_ms,
+            tests_run=session.tests_run,
+            tests_passed=session.tests_passed,
+            tests_failed=session.tests_failed,
+            pass_rate=pass_rate,
+            orchestrator_cost_usd=session.orchestrator_cost_usd,
+            test_execution_cost_usd=session.test_execution_cost_usd,
+            total_cost_usd=total_cost,
+            orchestrator_tokens_input=session.orchestrator_tokens_input,
+            orchestrator_tokens_output=session.orchestrator_tokens_output,
+            test_execution_tokens=session.test_execution_tokens,
+            total_tool_calls=len(session.tool_call_history),
+            # Shallow copies so later session mutations do not alter the report's containers
+            tool_call_counts=dict(session.tool_call_counts),
+            tool_call_history=list(session.tool_call_history),
+            errors=list(session.errors),
+        )
+    def to_dict(self) -> dict[str, Any]:
+        """Return the report as a plain dict keyed by field name."""
+        d = asdict(self)
+        # Replace the history entry with each invocation's own to_dict() output
+        d["tool_call_history"] = [t.to_dict() for t in self.tool_call_history]
+        return d

testmcpy-0.3.0/testmcpy/agent/orchestrator.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""
+Test Execution Agent orchestrator.
+Main entry point for creating and running the agent. Wires together
+tools, hooks, prompts, and the Claude Agent SDK.
+"""
+import uuid
+from collections.abc import AsyncIterator
+from datetime import datetime, timezone
+from typing import Any
+from testmcpy.agent.hooks import create_hooks
+from testmcpy.agent.models import AgentRunReport, AgentSession
+from testmcpy.agent.prompts import build_context_prompt
+from testmcpy.agent.tools import ALL_TOOLS, set_tool_context
+try:
+    from claude_agent_sdk import (
+        AssistantMessage,
+        ClaudeAgentOptions,
+        ClaudeSDKClient,
+        ResultMessage,
+        TextBlock,
+        create_sdk_mcp_server,
+        query,
+    )
+    _HAS_SDK = True
+except ImportError:
+    _HAS_SDK = False
+class TestExecutionAgent:
+    """Intelligent test execution agent powered by Claude Agent SDK.
+    Orchestrates testmcpy infrastructure through custom @tool functions,
+    providing reasoning, adaptability, and natural language interaction.
+    """
+    def __init__(
+        self,
+        mcp_profile: str | None = None,
+        mcp_url: str | None = None,
+        auth_config: dict[str, Any] | None = None,
+        models: list[str] | None = None,
+        storage_path: str | None = None,
+        max_turns: int = 50,
+        agent_model: str | None = None,
+    ):
+        """Initialize the agent.
+        Args:
+            mcp_profile: MCP service profile name
+            mcp_url: Direct MCP service URL (overrides profile)
+            auth_config: Authentication config dict
+            models: List of model names available for testing
+            storage_path: Path to SQLite storage database
+            max_turns: Maximum agent turns (default 50)
+            agent_model: Model for the agent itself (default: SDK default)
+        Raises:
+            ImportError: If claude_agent_sdk is not installed.
+        """
+        if not _HAS_SDK:
+            raise ImportError(
+                "claude_agent_sdk is required for the Test Execution Agent. "
+                "Install with: pip install testmcpy[sdk]"
+            )
+        self.mcp_profile = mcp_profile
+        self.mcp_url = mcp_url
+        self.auth_config = auth_config
+        self.models = models or []
+        self.storage_path = storage_path
+        self.max_turns = max_turns
+        self.agent_model = agent_model
+        # Configure shared tool context
+        set_tool_context(
+            mcp_profile=mcp_profile,
+            mcp_url=mcp_url,
+            auth_config=auth_config,
+            storage_path=storage_path,
+        )
+    # The return annotation is quoted on purpose: annotations are evaluated
+    # when the class body runs, and ClaudeAgentOptions is undefined when the
+    # optional SDK import failed. Unquoted, importing this module without the
+    # SDK would raise NameError before __init__ could raise its ImportError.
+    def _build_options(self, session: AgentSession) -> "ClaudeAgentOptions":
+        """Build ClaudeAgentOptions with tools, hooks, and configuration.
+        Args:
+            session: Session the created hooks will record into.
+        Returns:
+            Options carrying the system prompt, the in-process tool server,
+            the hooks, ``max_turns`` and, when set, the agent model.
+        """
+        # Create in-process MCP server with our custom tools
+        mcp_server = create_sdk_mcp_server(
+            name="testmcpy-agent-tools",
+            version="1.0.0",
+            tools=ALL_TOOLS,
+        )
+        # Build system prompt with context
+        system_prompt = build_context_prompt(
+            mcp_profile=self.mcp_profile,
+            models=self.models,
+        )
+        # Create hooks wired to the session
+        hooks = create_hooks(session)
+        options = ClaudeAgentOptions(
+            system_prompt=system_prompt,
+            permission_mode="bypassPermissions",
+            max_turns=self.max_turns,
+            mcp_servers={"testmcpy-agent-tools": mcp_server},
+            hooks=hooks,
+        )
+        # Leave the SDK's default model in place unless one was requested
+        if self.agent_model:
+            options.model = self.agent_model
+        return options
+    async def run(self, prompt: str) -> AgentRunReport:
+        """Execute a one-shot agent run.
+        The agent processes the prompt, uses tools as needed, and returns
+        a structured report of what it did.
+        Args:
+            prompt: Natural language instruction (e.g., "Run all tests in tests/example.yaml")
+        Returns:
+            AgentRunReport with test results, costs, and analysis
+        """
+        run_id = (
+            f"agent_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
+        )
+        session = AgentSession()
+        options = self._build_options(session)
+        # Collect the agent's text output for analysis
+        analysis_parts = []
+        num_turns = 0
+        async for message in query(prompt=prompt, options=options):
+            if isinstance(message, AssistantMessage):
+                for block in message.content:
+                    if isinstance(block, TextBlock):
+                        analysis_parts.append(block.text)
+            if isinstance(message, ResultMessage):
+                num_turns = message.num_turns
+                # Extract cost info from result; the orchestrator share is the
+                # total minus test execution cost, clamped at zero.
+                if message.total_cost_usd is not None:
+                    session.orchestrator_cost_usd = max(
+                        0.0,
+                        message.total_cost_usd - session.test_execution_cost_usd,
+                    )
+                if message.usage:
+                    session.orchestrator_tokens_input = message.usage.get("input_tokens", 0)
+                    session.orchestrator_tokens_output = message.usage.get("output_tokens", 0)
+        # Build report
+        report = AgentRunReport.from_session(session, run_id=run_id)
+        report.analysis = "\n".join(analysis_parts)
+        report.num_turns = num_turns
+        return report
+    async def chat(self, prompt: str) -> AsyncIterator[dict[str, Any]]:
+        """Start an interactive chat session.
+        Yields message dicts as they arrive from the agent.
+        Suitable for streaming to a web UI or CLI.
+        Args:
+            prompt: Initial prompt to start the conversation
+        Yields:
+            Dicts with keys ``type`` and ``content``. This method emits
+            ``"text"`` items (assistant text blocks) and one ``"result"``
+            item whose content is the run report as a dict.
+        """
+        session = AgentSession()
+        options = self._build_options(session)
+        async with ClaudeSDKClient(options=options) as client:
+            await client.query(prompt)
+            async for message in client.receive_response():
+                if isinstance(message, AssistantMessage):
+                    for block in message.content:
+                        if isinstance(block, TextBlock):
+                            yield {"type": "text", "content": block.text}
+                elif isinstance(message, ResultMessage):
+                    report = AgentRunReport.from_session(session)
+                    report.num_turns = message.num_turns
+                    if message.total_cost_usd is not None:
+                        # Override the session-derived figures with the SDK's total
+                        report.orchestrator_cost_usd = max(
+                            0.0,
+                            message.total_cost_usd - session.test_execution_cost_usd,
+                        )
+                        report.total_cost_usd = message.total_cost_usd
+                    yield {"type": "result", "content": report.to_dict()}

testmcpy 0.2.16__tar.gz → 0.3.0__tar.gz

testmcpy 0.2.16.tar.gz → 0.3.0.tar.gz