PyPI - testmcpy - Versions diffs - 0.3.2__tar.gz → 0.5.0__tar.gz - Mend

testmcpy 0.3.2.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181) hide show

{testmcpy-0.3.2/testmcpy.egg-info → testmcpy-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: testmcpy
-Version: 0.3.2
+Version: 0.5.0
 Summary: A comprehensive testing framework for validating LLM tool calling capabilities with MCP services
 Author: Amin Ghadersohi
 License-Expression: Apache-2.0
@@ -404,6 +404,33 @@ testmcpy run tests/ --model claude-haiku-4-5
 **Common options:** `--profile`, `--llm-profile`, `--model`, `--provider`, `--timeout`, `--verbose`, `--output`
+### Inline MCP Auth (No Config File Needed)
+Pass MCP auth credentials directly on the command line, bypassing `.mcp_services.yaml`:
+```bash
+# JWT auth (e.g., Preset workspaces)
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type jwt \
+  --jwt-url https://auth.example.com/v1/auth/ \
+  --jwt-token $MCP_JWT_TOKEN \
+  --jwt-secret $MCP_JWT_SECRET
+# Bearer token auth
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type bearer \
+  --auth-token $MCP_BEARER_TOKEN
+# No auth (public MCP endpoint)
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type none
+```
+Environment variables are also supported: `MCP_AUTH_TOKEN`, `MCP_JWT_URL`, `MCP_JWT_TOKEN`, `MCP_JWT_SECRET`.
 ## Web Interface
 Optional React-based UI with 15+ pages for visual testing and analytics:
@@ -549,4 +576,4 @@ Apache License 2.0 — See [LICENSE](LICENSE) for details.
 ---
-**Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).
+**Built by [@aminghadersohi](https://github.com/aminghadersohi)** at [Preset](https://preset.io).

{testmcpy-0.3.2 → testmcpy-0.5.0}/README.md RENAMED Viewed

@@ -334,6 +334,33 @@ testmcpy run tests/ --model claude-haiku-4-5
 **Common options:** `--profile`, `--llm-profile`, `--model`, `--provider`, `--timeout`, `--verbose`, `--output`
+### Inline MCP Auth (No Config File Needed)
+Pass MCP auth credentials directly on the command line, bypassing `.mcp_services.yaml`:
+```bash
+# JWT auth (e.g., Preset workspaces)
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type jwt \
+  --jwt-url https://auth.example.com/v1/auth/ \
+  --jwt-token $MCP_JWT_TOKEN \
+  --jwt-secret $MCP_JWT_SECRET
+# Bearer token auth
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type bearer \
+  --auth-token $MCP_BEARER_TOKEN
+# No auth (public MCP endpoint)
+testmcpy run tests/ \
+  --mcp-url https://workspace.example.com/mcp \
+  --auth-type none
+```
+Environment variables are also supported: `MCP_AUTH_TOKEN`, `MCP_JWT_URL`, `MCP_JWT_TOKEN`, `MCP_JWT_SECRET`.
 ## Web Interface
 Optional React-based UI with 15+ pages for visual testing and analytics:
@@ -479,4 +506,4 @@ Apache License 2.0 — See [LICENSE](LICENSE) for details.
 ---
-**Built by [@aminghadersohi](https://github.com/aminghadersohi)** ([Preset](https://preset.io), [Apache Superset](https://github.com/apache/superset)).
+**Built by [@aminghadersohi](https://github.com/aminghadersohi)** at [Preset](https://preset.io).

{testmcpy-0.3.2 → testmcpy-0.5.0}/pyproject.toml RENAMED Viewed

@@ -93,7 +93,7 @@ testmcpy = [
 [project]
 name = "testmcpy"
-version = "0.3.2"
+version = "0.5.0"
 description = "A comprehensive testing framework for validating LLM tool calling capabilities with MCP services"
 authors = [{name = "Amin Ghadersohi"}]
 license = "Apache-2.0"

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/__init__.py RENAMED Viewed

@@ -11,6 +11,6 @@ try:
     __version__ = version("testmcpy")
 except Exception:
     # Fallback for development or when package not installed
-    __version__ = "0.3.2"
+    __version__ = "0.5.0"
 __author__ = "testmcpy Contributors"

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/cli/commands/push.py RENAMED Viewed

@@ -17,7 +17,7 @@ def push(
         None,
         "--server",
         "-s",
-        help="Remote testmcpy server URL (e.g. https://testmcpy.sandbox.preset.io)",
+        help="Remote testmcpy server URL (e.g. https://testmcpy.example.com)",
     ),
     api_key: Optional[str] = typer.Option(
         None,
@@ -66,7 +66,7 @@ def push(
             Panel(
                 "[red]No server URL specified.[/red]\n\n"
                 "Provide the remote testmcpy server URL via:\n"
-                "  [cyan]--server https://testmcpy.sandbox.preset.io[/cyan]\n"
+                "  [cyan]--server https://testmcpy.example.com[/cyan]\n"
                 "  or\n"
                 "  [cyan]export TESTMCPY_SERVER_URL=https://...[/cyan]",
                 title="Missing Server",

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/cli/commands/run.py RENAMED Viewed

@@ -2,6 +2,7 @@
 import asyncio
 import json
+import uuid
 from pathlib import Path
 from typing import Optional
@@ -220,12 +221,63 @@ def run(
         "--system-prompt-file",
         help="File containing the system prompt text",
     ),
+    # Inline MCP auth options (bypass .mcp_services.yaml)
+    auth_type: Optional[str] = typer.Option(
+        None,
+        "--auth-type",
+        help="MCP auth type: jwt, bearer, oauth, api_key, none",
+    ),
+    auth_token: Optional[str] = typer.Option(
+        None,
+        "--auth-token",
+        envvar="MCP_AUTH_TOKEN",
+        help="Bearer token or API key for MCP auth",
+    ),
+    jwt_url: Optional[str] = typer.Option(
+        None,
+        "--jwt-url",
+        envvar="MCP_JWT_URL",
+        help="JWT auth endpoint URL (for --auth-type jwt)",
+    ),
+    jwt_token: Optional[str] = typer.Option(
+        None,
+        "--jwt-token",
+        envvar="MCP_JWT_TOKEN",
+        help="JWT API token / key name (for --auth-type jwt)",
+    ),
+    jwt_secret: Optional[str] = typer.Option(
+        None,
+        "--jwt-secret",
+        envvar="MCP_JWT_SECRET",
+        help="JWT API secret (for --auth-type jwt)",
+    ),
 ):
     """
     Run test cases against MCP service.
     This command executes test cases defined in YAML/JSON files.
     """
+    # Generate session ID to group multiple runs from the same CLI invocation
+    session_id = str(uuid.uuid4())
+    # Build inline auth dict if --auth-type is provided
+    inline_auth = None
+    if auth_type:
+        inline_auth = {"type": auth_type}
+        if auth_type == "jwt":
+            if jwt_url:
+                inline_auth["api_url"] = jwt_url
+            if jwt_token:
+                inline_auth["api_token"] = jwt_token
+            if jwt_secret:
+                inline_auth["api_secret"] = jwt_secret
+        elif auth_type == "bearer":
+            if auth_token:
+                inline_auth["token"] = auth_token
+        elif auth_type == "api_key":
+            if auth_token:
+                inline_auth["api_key"] = auth_token
     # Load config with profile if specified
     if profile:
         from testmcpy.config import Config
@@ -251,17 +303,25 @@ def run(
         # Get authenticated MCP client
         mcp_client = None
-        effective_profile = profile
-        if not effective_profile:
-            # Use default profile from config
-            mcp_config = load_mcp_yaml()
-            effective_profile = mcp_config.get("default")
-        if effective_profile:
-            try:
-                mcp_client = await get_or_create_mcp_client(effective_profile)
-            except Exception as e:
-                console.print(f"[yellow]Warning: Could not load MCP profile: {e}[/yellow]")
+        if inline_auth and effective_mcp_url:
+            # Use inline auth flags — bypass profile system entirely
+            from testmcpy.src.mcp_client import MCPClient
+            mcp_client = MCPClient(effective_mcp_url, auth=inline_auth)
+            await mcp_client.initialize()
+        else:
+            effective_profile = profile
+            if not effective_profile:
+                # Use default profile from config
+                mcp_config = load_mcp_yaml()
+                effective_profile = mcp_config.get("default")
+            if effective_profile:
+                try:
+                    mcp_client = await get_or_create_mcp_client(effective_profile)
+                except Exception as e:
+                    console.print(f"[yellow]Warning: Could not load MCP profile: {e}[/yellow]")
         # Load test cases and detect suite-level provider override
         test_cases = []
@@ -342,6 +402,11 @@ def run(
                     f"[yellow]Suite-level provider config:[/yellow] {suite_provider_config}"
                 )
+        def cli_log_callback(msg: str) -> None:
+            """Print runner/provider log messages to console in real-time."""
+            if verbose:
+                console.print(f"  [dim]{msg}[/dim]")
         runner = TestRunner(
             model=effective_model,
             provider=effective_provider,
@@ -350,6 +415,7 @@ def run(
             verbose=verbose,
             hide_tool_output=hide_tool_output,
             provider_config=suite_provider_config,
+            log_callback=cli_log_callback if verbose else None,
         )
         console.print(f"\n[bold]Found {len(test_cases)} test case(s)[/bold]")
@@ -384,11 +450,35 @@ def run(
                     f"  [dim]Prompt: {test_case.prompt[:80]}{'...' if len(test_case.prompt) > 80 else ''}[/dim]"
                 )
-            # Run the test
-            from rich.status import Status
-            with Status("[yellow]Executing test...[/yellow]", console=console):
+            # Run the test — show live progress instead of static spinner
+            if verbose:
+                # In verbose mode, let _log() print directly (no spinner overlay)
                 result = await runner._run_test_with_retry(test_case)
+            else:
+                # In non-verbose mode, update spinner with runner progress
+                from rich.status import Status
+                _status = Status("[yellow]Executing test...[/yellow]", console=console)
+                _status.start()
+                def update_status(msg: str, _s: Status = _status) -> None:
+                    msg_lower = msg.lower()
+                    if "tool call" in msg_lower or "tool_call" in msg_lower:
+                        _s.update(f"[yellow]Tool call: {msg.split('.')[-1].strip()[:60]}[/yellow]")
+                    elif "running test" in msg_lower:
+                        _s.update("[yellow]Running...[/yellow]")
+                    elif "executing" in msg_lower:
+                        _s.update("[yellow]Executing tool calls...[/yellow]")
+                    elif "evaluating" in msg_lower or "evaluator" in msg_lower:
+                        _s.update("[yellow]Evaluating results...[/yellow]")
+                old_callback = runner.log_callback
+                runner.log_callback = update_status
+                try:
+                    result = await runner._run_test_with_retry(test_case)
+                finally:
+                    runner.log_callback = old_callback
+                    _status.stop()
             results.append(result)
@@ -487,6 +577,9 @@ def run(
                     "passed": total_passed,
                     "failed": len(results) - total_passed,
                 },
+                "metadata": {
+                    "session_id": session_id,
+                },
             }
             save_result = save_test_run_to_file(save_data)
             console.print(f"[dim]Results saved: {save_result.get('run_id', '?')}[/dim]")

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/evals/base_evaluators.py RENAMED Viewed

@@ -187,6 +187,25 @@ class ExecutionSuccessful(BaseEvaluator):
     def description(self) -> str:
         return "Checks if tool execution completed without errors"
+    # Tools blocked by the SDK harness — errors from these are expected and should be ignored
+    _BLOCKED_TOOLS = {
+        "Read",
+        "Bash",
+        "Edit",
+        "Write",
+        "Grep",
+        "Glob",
+        "ToolSearch",
+        "Skill",
+        "TodoWrite",
+        "Agent",
+        "WebFetch",
+        "WebSearch",
+        "NotebookEdit",
+        "EnterWorktree",
+        "ExitWorktree",
+    }
     def evaluate(self, context: dict[str, Any]) -> EvalResult:
         tool_results = context.get("tool_results", [])
@@ -196,9 +215,15 @@ class ExecutionSuccessful(BaseEvaluator):
         errors = []
         for result in tool_results:
             if result.is_error:
-                errors.append(
-                    {"tool": result.tool_call_id, "error": result.error_message or "Unknown error"}
-                )
+                error_msg = result.error_message or "Unknown error"
+                # Skip errors from blocked/disallowed tools (expected failures)
+                if "No such tool available" in error_msg or "not enabled" in error_msg:
+                    continue
+                # Also check tool name against known blocked tools
+                tool_id = result.tool_call_id or ""
+                if any(blocked in tool_id for blocked in self._BLOCKED_TOOLS):
+                    continue
+                errors.append({"tool": tool_id, "error": error_msg})
         if errors:
             return EvalResult(

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/server/api.py RENAMED Viewed

@@ -261,7 +261,7 @@ def is_connection_error(error_msg: str) -> bool:
     Only returns True for errors where the MCP session is truly dead and
     the cached client must be discarded. Auth errors (401/403) are NOT
-    included because the PresetOAuth transport handles token refresh and
+    included because the MCPOAuth transport handles token refresh and
     re-auth internally — evicting the client on an expired token would
     just trigger a new browser OAuth popup.
     """

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/server/routers/compare.py RENAMED Viewed

@@ -57,6 +57,9 @@ async def compare_runs(request: CompareRequest) -> dict[str, Any]:
                 "passed": passed,
                 "failed": total - passed,
                 "pass_rate": round((passed / total * 100) if total > 0 else 0, 1),
+                "total_cost": run["summary"].get("total_cost_usd", 0),
+                "total_tokens": run["summary"].get("total_tokens", 0),
+                "total_duration_ms": run["summary"].get("total_duration_ms", 0),
             }
         )
@@ -94,6 +97,7 @@ async def compare_runs(request: CompareRequest) -> dict[str, Any]:
                     "answer_snippet": answer_snippet,
                     "tokens_input": match.get("tokens_input", 0),
                     "tokens_output": match.get("tokens_output", 0),
+                    "cost_usd": match.get("cost_usd", 0),
                 }
             else:
                 cells[run["run_id"]] = {

{testmcpy-0.3.2 → testmcpy-0.5.0}/testmcpy/server/routers/results.py RENAMED Viewed

@@ -87,6 +87,7 @@ def save_test_run_to_file(data: dict[str, Any]) -> dict[str, Any]:
         provider=provider,
         started_at=started_at,
         mcp_profile_id=data.get("mcp_profile"),
+        metadata=data.get("metadata"),
     )
     # Save individual question results
@@ -120,13 +121,33 @@ async def save_test_run(data: dict[str, Any]) -> dict[str, Any]:
 @router.get("/list")
-async def list_test_runs(test_file: str | None = None, limit: int = 50) -> dict[str, Any]:
+async def list_test_runs(
+    test_file: str | None = None,
+    model: str | None = None,
+    provider: str | None = None,
+    date_from: str | None = None,
+    date_to: str | None = None,
+    sort_by: str = "started_at",
+    sort_order: str = "desc",
+    limit: int = 50,
+    offset: int = 0,
+) -> dict[str, Any]:
     """
-    List all test runs, optionally filtered by test file.
+    List all test runs with filtering, sorting, and pagination.
     Returns metadata only (not full results).
     """
     storage = get_storage()
-    runs_data = storage.list_runs(test_id=test_file, limit=limit)
+    runs_data = storage.list_runs(
+        test_id=test_file,
+        model=model,
+        provider=provider,
+        date_from=date_from,
+        date_to=date_to,
+        sort_by=sort_by,
+        sort_order=sort_order,
+        limit=limit,
+        offset=offset,
+    )
     runs = []
     for run in runs_data:
@@ -143,13 +164,77 @@ async def list_test_runs(test_file: str | None = None, limit: int = 50) -> dict[
                 "total_tests": run["total_questions"],
                 "passed": run["passed_questions"],
                 "failed": run["total_questions"] - run["passed_questions"],
-                "total_cost": 0.0,
-                "total_tokens": 0,
-                "total_duration": 0.0,
+                "total_cost": run.get("total_cost", 0.0),
+                "total_tokens": run.get("total_tokens", 0),
+                "total_duration": round((run.get("total_duration_ms", 0) or 0) / 1000, 2),
+                "session_id": run.get("metadata", {}).get("session_id"),
+            }
+        )
+    # total reflects page size; a full count query would be needed for true pagination
+    # For now, signal "there may be more" if we hit the limit
+    return {"runs": runs, "total": len(runs), "has_more": len(runs) >= limit}
+@router.get("/filters")
+async def get_filter_options() -> dict[str, Any]:
+    """Get distinct values for filter dropdowns (models, providers, test files)."""
+    storage = get_storage()
+    return storage.get_filter_options()
+@router.get("/sessions")
+async def list_sessions(limit: int = 20, run_limit: int = 200) -> dict[str, Any]:
+    """List runs grouped by session_id, with aggregate stats per session.
+    Only examines the most recent `run_limit` runs to keep the query fast."""
+    storage = get_storage()
+    all_runs = storage.list_runs(limit=run_limit)
+    # Group by session_id
+    sessions: dict[str, list] = {}
+    ungrouped = []
+    for run in all_runs:
+        sid = run.get("metadata", {}).get("session_id")
+        if sid:
+            sessions.setdefault(sid, []).append(run)
+        else:
+            ungrouped.append(run)
+    # Build session summaries
+    result = []
+    for sid, runs in sorted(
+        sessions.items(), key=lambda x: x[1][0].get("started_at", ""), reverse=True
+    ):
+        total_q = sum(r["total_questions"] for r in runs)
+        passed_q = sum(r["passed_questions"] for r in runs)
+        result.append(
+            {
+                "session_id": sid,
+                "run_count": len(runs),
+                "models": list({r["model"] for r in runs}),
+                "providers": list({r["provider"] for r in runs}),
+                "test_files": [r["test_id"] for r in runs],
+                "started_at": min(r["started_at"] for r in runs if r.get("started_at")),
+                "total_tests": total_q,
+                "passed": passed_q,
+                "failed": total_q - passed_q,
+                "pass_rate": round(passed_q / total_q * 100, 1) if total_q > 0 else 0,
+                "total_cost": round(sum(r.get("total_cost", 0) for r in runs), 4),
+                "total_tokens": sum(r.get("total_tokens", 0) for r in runs),
+                "runs": [
+                    {
+                        "run_id": r["run_id"],
+                        "test_file": r["test_id"],
+                        "passed": r["passed_questions"],
+                        "failed": r["total_questions"] - r["passed_questions"],
+                        "pass_rate": r["pass_rate"],
+                    }
+                    for r in runs
+                ],
             }
         )
-    return {"runs": runs, "total": len(runs)}
+    return {"sessions": result[:limit], "total": len(result)}
 @router.get("/run/{run_id}")

testmcpy 0.3.2__tar.gz → 0.5.0__tar.gz

testmcpy 0.3.2.tar.gz → 0.5.0.tar.gz