mcpbr 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/analytics/__init__.py +79 -0
  3. mcpbr/analytics/ab_testing.py +403 -0
  4. mcpbr/analytics/anomaly.py +213 -0
  5. mcpbr/analytics/comparison.py +548 -0
  6. mcpbr/analytics/correlation.py +280 -0
  7. mcpbr/analytics/database.py +386 -0
  8. mcpbr/analytics/difficulty.py +238 -0
  9. mcpbr/analytics/error_analysis.py +408 -0
  10. mcpbr/analytics/leaderboard.py +285 -0
  11. mcpbr/analytics/metrics.py +279 -0
  12. mcpbr/analytics/regression_detector.py +472 -0
  13. mcpbr/analytics/statistical.py +476 -0
  14. mcpbr/analytics/trends.py +156 -0
  15. mcpbr/cli.py +604 -0
  16. mcpbr/config.py +37 -1
  17. mcpbr/docker_env.py +2 -1
  18. mcpbr/docker_prewarm.py +2 -1
  19. mcpbr/dry_run.py +2 -1
  20. mcpbr/gpu_support.py +2 -1
  21. mcpbr/graceful_degradation.py +277 -0
  22. mcpbr/languages.py +228 -0
  23. mcpbr/logging_config.py +207 -0
  24. mcpbr/models.py +66 -0
  25. mcpbr/preflight.py +2 -1
  26. mcpbr/pricing.py +72 -0
  27. mcpbr/providers.py +316 -3
  28. mcpbr/reports/__init__.py +17 -0
  29. mcpbr/reports/enhanced_markdown.py +389 -0
  30. mcpbr/reports/html_report.py +796 -0
  31. mcpbr/reports/pdf_report.py +533 -0
  32. mcpbr/sdk.py +264 -0
  33. mcpbr/smoke_test.py +2 -1
  34. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/METADATA +8 -1
  35. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/RECORD +45 -24
  36. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  37. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  38. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  39. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  40. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  41. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  42. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  43. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/WHEEL +0 -0
  44. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/entry_points.txt +0 -0
  45. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py CHANGED
@@ -3,4 +3,23 @@
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
  """
 
- __version__ = "0.3.23"
+ __version__ = "0.7.0"
+
+ from .sdk import (
+     BenchmarkResult,
+     MCPBenchmark,
+     get_version,
+     list_benchmarks,
+     list_models,
+     list_providers,
+ )
+
+ __all__ = [
+     "__version__",
+     "BenchmarkResult",
+     "MCPBenchmark",
+     "get_version",
+     "list_benchmarks",
+     "list_models",
+     "list_providers",
+ ]
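With this change the package root re-exports the new SDK entry points, so they are importable directly from mcpbr rather than from mcpbr.sdk. A minimal usage sketch follows; the exact signatures live in mcpbr/sdk.py (not shown in this hunk), so the assumption that the list_* helpers take no required arguments and return iterables of names is illustrative only:

import mcpbr

# Assumed to mirror mcpbr.__version__ ("0.7.0" in this release).
print(mcpbr.get_version())

# Assumed to enumerate supported providers and benchmarks by name.
for provider in mcpbr.list_providers():
    print(provider)
for benchmark in mcpbr.list_benchmarks():
    print(benchmark)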
mcpbr/analytics/__init__.py ADDED
@@ -0,0 +1,79 @@
+ """Analytics module for mcpbr historical results tracking and analysis.
+
+ Provides:
+ - SQLite-based historical results database
+ - Time-series trend analysis
+ - Statistical significance testing
+ - Multi-model comparison engine
+ - Error pattern analysis and anomaly detection
+ - Correlation analysis and difficulty estimation
+ - A/B testing framework
+ - Leaderboard generation
+ - Performance regression detection
+ - Custom metrics registry
+ """
+
+ from __future__ import annotations
+
+ from .ab_testing import ABTest, run_ab_test
+ from .anomaly import detect_anomalies, detect_metric_anomalies
+ from .comparison import ComparisonEngine, compare_results_files, format_comparison_table
+ from .correlation import (
+     analyze_metric_correlations,
+     find_strong_correlations,
+     pearson_correlation,
+     spearman_correlation,
+ )
+ from .database import ResultsDatabase
+ from .difficulty import (
+     aggregate_difficulty_stats,
+     estimate_difficulty,
+     estimate_task_difficulty_score,
+ )
+ from .error_analysis import ErrorPatternAnalyzer, identify_flaky_tasks
+ from .leaderboard import Leaderboard, generate_leaderboard
+ from .metrics import MetricDefinition, MetricsRegistry
+ from .regression_detector import RegressionDetector
+ from .statistical import (
+     bootstrap_confidence_interval,
+     chi_squared_test,
+     compare_resolution_rates,
+     effect_size_cohens_d,
+     mann_whitney_u,
+     permutation_test,
+ )
+ from .trends import calculate_moving_average, calculate_trends, detect_trend_direction
+
+ __all__ = [
+     "ABTest",
+     "ComparisonEngine",
+     "ErrorPatternAnalyzer",
+     "Leaderboard",
+     "MetricDefinition",
+     "MetricsRegistry",
+     "RegressionDetector",
+     "ResultsDatabase",
+     "aggregate_difficulty_stats",
+     "analyze_metric_correlations",
+     "bootstrap_confidence_interval",
+     "calculate_moving_average",
+     "calculate_trends",
+     "chi_squared_test",
+     "compare_resolution_rates",
+     "compare_results_files",
+     "detect_anomalies",
+     "detect_metric_anomalies",
+     "detect_trend_direction",
+     "effect_size_cohens_d",
+     "estimate_difficulty",
+     "estimate_task_difficulty_score",
+     "find_strong_correlations",
+     "format_comparison_table",
+     "generate_leaderboard",
+     "identify_flaky_tasks",
+     "mann_whitney_u",
+     "pearson_correlation",
+     "permutation_test",
+     "run_ab_test",
+     "spearman_correlation",
+ ]
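Only the exported names are visible in this hunk, not their signatures, so the following sketch of the correlation helpers is a guess at the call shape (two equal-length numeric sequences in, a coefficient out) rather than documented API:

from mcpbr.analytics import pearson_correlation, spearman_correlation

# Hypothetical inputs: per-task cost vs. runtime from one evaluation run.
costs = [0.12, 0.40, 0.33, 0.08, 0.51]
runtimes = [35.0, 88.0, 61.0, 20.0, 97.0]

# Assumed signature; check mcpbr/analytics/correlation.py for the actual one.
print(pearson_correlation(costs, runtimes))
print(spearman_correlation(costs, runtimes))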
mcpbr/analytics/ab_testing.py ADDED
@@ -0,0 +1,403 @@
+ """A/B testing framework for comparing MCP server configurations.
+
+ Provides tools for statistically comparing two evaluation runs (control vs.
+ treatment) to determine which configuration performs better on resolution rate,
+ cost, and other metrics.
+ """
+
+ from __future__ import annotations
+
+ import math
+ from typing import Any
+
+
+ def _normal_cdf(x: float) -> float:
+     """Approximate the standard normal cumulative distribution function.
+
+     Uses the Abramowitz & Stegun rational approximation (formula 26.2.17)
+     which is accurate to about 1e-5.
+
+     Args:
+         x: The z-score value.
+
+     Returns:
+         Probability that a standard normal variable is less than or equal to *x*.
+     """
+     if x < -8.0:
+         return 0.0
+     if x > 8.0:
+         return 1.0
+
+     sign = 1.0
+     if x < 0:
+         sign = -1.0
+         x = -x
+
+     t = 1.0 / (1.0 + 0.2316419 * x)
+     t2 = t * t
+     t3 = t2 * t
+     t4 = t3 * t
+     t5 = t4 * t
+
+     pdf = math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi)
+     cdf_upper = pdf * (
+         0.319381530 * t - 0.356563782 * t2 + 1.781477937 * t3 - 1.821255978 * t4 + 1.330274429 * t5
+     )
+
+     if sign > 0:
+         return 1.0 - cdf_upper
+     else:
+         return cdf_upper
+
+
+ def _chi_squared_test(
+     resolved_a: int,
+     total_a: int,
+     resolved_b: int,
+     total_b: int,
+ ) -> dict[str, Any]:
+     """Perform a chi-squared test for independence on two resolution rates.
+
+     Constructs a 2x2 contingency table of resolved/unresolved counts for
+     groups A and B and computes the chi-squared statistic with Yates'
+     continuity correction. The p-value is derived from the chi-squared
+     distribution with 1 degree of freedom using a normal CDF approximation.
+
+     Args:
+         resolved_a: Number of resolved tasks in group A.
+         total_a: Total tasks in group A.
+         resolved_b: Number of resolved tasks in group B.
+         total_b: Total tasks in group B.
+
+     Returns:
+         Dictionary with ``chi_squared``, ``p_value``, ``significant``
+         (at alpha = 0.05), and ``degrees_of_freedom``.
+     """
+     unresolved_a = total_a - resolved_a
+     unresolved_b = total_b - resolved_b
+     grand_total = total_a + total_b
+
+     if grand_total == 0:
+         return {
+             "chi_squared": 0.0,
+             "p_value": 1.0,
+             "significant": False,
+             "degrees_of_freedom": 1,
+         }
+
+     # Expected values for the 2x2 table
+     total_resolved = resolved_a + resolved_b
+     total_unresolved = unresolved_a + unresolved_b
+
+     expected = [
+         [total_resolved * total_a / grand_total, total_unresolved * total_a / grand_total],
+         [total_resolved * total_b / grand_total, total_unresolved * total_b / grand_total],
+     ]
+
+     observed = [
+         [resolved_a, unresolved_a],
+         [resolved_b, unresolved_b],
+     ]
+
+     chi2 = 0.0
+     for i in range(2):
+         for j in range(2):
+             e = expected[i][j]
+             if e > 0:
+                 # Yates' continuity correction
+                 diff = abs(observed[i][j] - e) - 0.5
+                 if diff < 0:
+                     diff = 0.0
+                 chi2 += (diff * diff) / e
+
+     # Convert chi-squared (1 df) to p-value:
+     # if X ~ chi2(1), then X = Z**2 for Z ~ N(0,1), so P(X > c) = 2 * (1 - Phi(sqrt(c))).
+     if chi2 > 0:
+         z = math.sqrt(chi2)
+         p_value = 2.0 * (1.0 - _normal_cdf(z))
+     else:
+         p_value = 1.0
+
+     return {
+         "chi_squared": round(chi2, 6),
+         "p_value": round(p_value, 6),
+         "significant": p_value < 0.05,
+         "degrees_of_freedom": 1,
+     }
+
+
+ def _extract_metrics(results_data: dict[str, Any]) -> dict[str, Any]:
+     """Extract key metrics from a results_data dictionary.
+
+     Args:
+         results_data: Evaluation results with ``summary.mcp`` and ``tasks``.
+
+     Returns:
+         Dictionary with ``resolved``, ``total``, ``rate``, ``cost``,
+         ``model``, ``provider``, ``avg_tokens``, and ``avg_runtime``.
+     """
+     summary = results_data.get("summary", {}).get("mcp", {})
+     tasks = results_data.get("tasks", [])
+     metadata = results_data.get("metadata", {})
+     config = metadata.get("config", {})
+
+     resolved = summary.get("resolved", 0)
+     total = summary.get("total", 0)
+     rate = summary.get("rate", 0.0)
+     cost = summary.get("total_cost", 0.0)
+
+     total_tokens = 0
+     total_runtime = 0.0
+     task_count = len(tasks)
+
+     for task in tasks:
+         mcp = task.get("mcp", {})
+         tokens = mcp.get("tokens", {})
+         total_tokens += tokens.get("input", 0) + tokens.get("output", 0)
+         total_runtime += mcp.get("runtime_seconds", 0.0)
+
+     avg_tokens = total_tokens // task_count if task_count > 0 else 0
+     avg_runtime = total_runtime / task_count if task_count > 0 else 0.0
+
+     return {
+         "resolved": resolved,
+         "total": total,
+         "rate": rate,
+         "cost": cost,
+         "model": config.get("model", "unknown"),
+         "provider": config.get("provider", "unknown"),
+         "avg_tokens": avg_tokens,
+         "avg_runtime": round(avg_runtime, 2),
+     }
+
+
+ class ABTest:
+     """A/B testing framework for comparing two MCP server configurations.
+
+     Creates a structured comparison between a control group (A) and treatment
+     group (B), running chi-squared significance testing on resolution rates
+     and comparing cost metrics.
+
+     Example::
+
+         test = ABTest("Model Comparison")
+         test.add_control(results_baseline)
+         test.add_treatment(results_candidate)
+         analysis = test.analyze()
+         print(test.format_report())
+     """
+
+     def __init__(
+         self,
+         name: str,
+         control_label: str = "A",
+         treatment_label: str = "B",
+     ) -> None:
+         """Initialize the A/B test.
+
+         Args:
+             name: Human-readable name for this test.
+             control_label: Label for the control group (default ``"A"``).
+             treatment_label: Label for the treatment group (default ``"B"``).
+         """
+         self.name = name
+         self.control_label = control_label
+         self.treatment_label = treatment_label
+         self._control: dict[str, Any] | None = None
+         self._treatment: dict[str, Any] | None = None
+         self._analysis: dict[str, Any] | None = None
+
+     def add_control(self, results_data: dict[str, Any]) -> None:
+         """Add the control group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the control
+                 configuration.
+         """
+         self._control = results_data
+         self._analysis = None
+
+     def add_treatment(self, results_data: dict[str, Any]) -> None:
+         """Add the treatment group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the treatment
+                 configuration.
+         """
+         self._treatment = results_data
+         self._analysis = None
+
+     def analyze(self) -> dict[str, Any]:
+         """Run the A/B test analysis.
+
+         Compares resolution rates using a chi-squared test, and reports
+         differences in cost and other metrics.
+
+         Returns:
+             Dictionary containing:
+             - ``test_name``: The test name.
+             - ``control``: Metrics for the control group.
+             - ``treatment``: Metrics for the treatment group.
+             - ``rate_difference``: Absolute difference in resolution rates.
+             - ``rate_relative_change``: Percentage change in resolution rate.
+             - ``cost_difference``: Difference in total cost.
+             - ``statistical_significance``: Chi-squared test results.
+             - ``winner``: ``"control"``, ``"treatment"``, or
+               ``"no_significant_difference"``.
+             - ``recommendation``: Human-readable recommendation.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._control is None:
+             raise ValueError("Control group results not set. Call add_control() first.")
+         if self._treatment is None:
+             raise ValueError("Treatment group results not set. Call add_treatment() first.")
+
+         ctrl = _extract_metrics(self._control)
+         treat = _extract_metrics(self._treatment)
+
+         rate_diff = treat["rate"] - ctrl["rate"]
+         rate_relative = (rate_diff / ctrl["rate"] * 100.0) if ctrl["rate"] > 0 else 0.0
+         cost_diff = treat["cost"] - ctrl["cost"]
+
+         significance = _chi_squared_test(
+             ctrl["resolved"],
+             ctrl["total"],
+             treat["resolved"],
+             treat["total"],
+         )
+
+         # Determine winner
+         if significance["significant"]:
+             if treat["rate"] > ctrl["rate"]:
+                 winner = "treatment"
+             elif treat["rate"] < ctrl["rate"]:
+                 winner = "control"
+             else:
+                 winner = "no_significant_difference"
+         else:
+             winner = "no_significant_difference"
+
+         # Build recommendation
+         if winner == "treatment":
+             recommendation = (
+                 f"Treatment ({self.treatment_label}) shows a statistically significant "
+                 f"improvement of {rate_relative:+.1f}% in resolution rate. "
+                 f"Recommend adopting the treatment configuration."
+             )
+         elif winner == "control":
+             recommendation = (
+                 f"Control ({self.control_label}) performs significantly better. "
+                 f"Treatment ({self.treatment_label}) shows a {rate_relative:+.1f}% change "
+                 f"in resolution rate. Recommend keeping the control configuration."
+             )
+         else:
+             recommendation = (
+                 f"No statistically significant difference detected between "
+                 f"{self.control_label} and {self.treatment_label} "
+                 f"(p={significance['p_value']:.4f}). Consider increasing sample size "
+                 f"or testing with a larger effect."
+             )
+
+         self._analysis = {
+             "test_name": self.name,
+             "control": {
+                 "label": self.control_label,
+                 "resolved": ctrl["resolved"],
+                 "total": ctrl["total"],
+                 "rate": ctrl["rate"],
+                 "cost": ctrl["cost"],
+             },
+             "treatment": {
+                 "label": self.treatment_label,
+                 "resolved": treat["resolved"],
+                 "total": treat["total"],
+                 "rate": treat["rate"],
+                 "cost": treat["cost"],
+             },
+             "rate_difference": round(rate_diff, 6),
+             "rate_relative_change": round(rate_relative, 2),
+             "cost_difference": round(cost_diff, 4),
+             "statistical_significance": significance,
+             "winner": winner,
+             "recommendation": recommendation,
+         }
+
+         return self._analysis
+
+     def format_report(self) -> str:
+         """Format the analysis results as a human-readable report.
+
+         Calls :meth:`analyze` automatically if it has not been called yet.
+
+         Returns:
+             Multi-line string containing the formatted A/B test report.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._analysis is None:
+             self.analyze()
+
+         assert self._analysis is not None # for type checker
+         a = self._analysis
+
+         ctrl = a["control"]
+         treat = a["treatment"]
+         sig = a["statistical_significance"]
+
+         lines = [
+             f"{'=' * 60}",
+             f"A/B Test Report: {a['test_name']}",
+             f"{'=' * 60}",
+             "",
+             f"Control ({ctrl['label']}):",
+             f" Resolution Rate: {ctrl['rate']:.1%} ({ctrl['resolved']}/{ctrl['total']})",
+             f" Total Cost: ${ctrl['cost']:.4f}",
+             "",
+             f"Treatment ({treat['label']}):",
+             f" Resolution Rate: {treat['rate']:.1%} ({treat['resolved']}/{treat['total']})",
+             f" Total Cost: ${treat['cost']:.4f}",
+             "",
+             "Comparison:",
+             f" Rate Difference: {a['rate_difference']:+.4f} "
+             f"({a['rate_relative_change']:+.1f}%)",
+             f" Cost Difference: ${a['cost_difference']:+.4f}",
+             "",
+             "Statistical Significance:",
+             f" Chi-squared: {sig['chi_squared']:.4f}",
+             f" p-value: {sig['p_value']:.6f}",
+             f" Significant: {'Yes' if sig['significant'] else 'No'} (alpha=0.05)",
+             "",
+             f"Winner: {a['winner']}",
+             "",
+             f"Recommendation: {a['recommendation']}",
+             f"{'=' * 60}",
+         ]
+
+         return "\n".join(lines)
+
+
+ def run_ab_test(
+     results_a: dict[str, Any],
+     results_b: dict[str, Any],
+     test_name: str = "A/B Test",
+ ) -> dict[str, Any]:
+     """Convenience function to run a quick A/B test comparison.
+
+     Creates an :class:`ABTest` instance, adds the control and treatment
+     data, and returns the analysis results.
+
+     Args:
+         results_a: Evaluation results for the control (A) group.
+         results_b: Evaluation results for the treatment (B) group.
+         test_name: Name for the test (default ``"A/B Test"``).
+
+     Returns:
+         Analysis dictionary from :meth:`ABTest.analyze`.
+     """
+     test = ABTest(test_name)
+     test.add_control(results_a)
+     test.add_treatment(results_b)
+     return test.analyze()
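The new module supports a short end-to-end comparison. The sketch below feeds ABTest two hand-written results dictionaries shaped the way _extract_metrics reads them (summary.mcp counts plus a tasks list); the counts, costs, and model names are made up for illustration:

from mcpbr.analytics import ABTest, run_ab_test

control = {
    "summary": {"mcp": {"resolved": 30, "total": 100, "rate": 0.30, "total_cost": 12.50}},
    "tasks": [],
    "metadata": {"config": {"model": "model-a", "provider": "example"}},
}
treatment = {
    "summary": {"mcp": {"resolved": 45, "total": 100, "rate": 0.45, "total_cost": 14.10}},
    "tasks": [],
    "metadata": {"config": {"model": "model-b", "provider": "example"}},
}

test = ABTest("Illustrative comparison", control_label="baseline", treatment_label="candidate")
test.add_control(control)
test.add_treatment(treatment)
analysis = test.analyze()
print(analysis["winner"], analysis["statistical_significance"]["p_value"])
print(test.format_report())

# run_ab_test() wraps the same three calls and returns the analysis dictionary directly,
# so it reaches the same verdict for the same inputs.
assert run_ab_test(control, treatment)["winner"] == analysis["winner"]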