mcpbr 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/analytics/__init__.py +79 -0
  3. mcpbr/analytics/ab_testing.py +403 -0
  4. mcpbr/analytics/anomaly.py +213 -0
  5. mcpbr/analytics/comparison.py +548 -0
  6. mcpbr/analytics/correlation.py +280 -0
  7. mcpbr/analytics/database.py +386 -0
  8. mcpbr/analytics/difficulty.py +238 -0
  9. mcpbr/analytics/error_analysis.py +408 -0
  10. mcpbr/analytics/leaderboard.py +285 -0
  11. mcpbr/analytics/metrics.py +279 -0
  12. mcpbr/analytics/regression_detector.py +472 -0
  13. mcpbr/analytics/statistical.py +476 -0
  14. mcpbr/analytics/trends.py +156 -0
  15. mcpbr/cli.py +604 -0
  16. mcpbr/config.py +37 -1
  17. mcpbr/docker_env.py +2 -1
  18. mcpbr/docker_prewarm.py +2 -1
  19. mcpbr/dry_run.py +2 -1
  20. mcpbr/gpu_support.py +2 -1
  21. mcpbr/graceful_degradation.py +277 -0
  22. mcpbr/languages.py +228 -0
  23. mcpbr/logging_config.py +207 -0
  24. mcpbr/models.py +66 -0
  25. mcpbr/preflight.py +2 -1
  26. mcpbr/pricing.py +72 -0
  27. mcpbr/providers.py +316 -3
  28. mcpbr/reports/__init__.py +17 -0
  29. mcpbr/reports/enhanced_markdown.py +389 -0
  30. mcpbr/reports/html_report.py +796 -0
  31. mcpbr/reports/pdf_report.py +533 -0
  32. mcpbr/sdk.py +264 -0
  33. mcpbr/smoke_test.py +2 -1
  34. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/METADATA +8 -1
  35. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/RECORD +45 -24
  36. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  37. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  38. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  39. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  40. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  41. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  42. {mcpbr-0.5.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  43. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/WHEEL +0 -0
  44. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/entry_points.txt +0 -0
  45. {mcpbr-0.5.0.dist-info → mcpbr-0.7.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py CHANGED
@@ -3,4 +3,23 @@
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
  """
 
- __version__ = "0.3.23"
+ __version__ = "0.7.0"
+
+ from .sdk import (
+     BenchmarkResult,
+     MCPBenchmark,
+     get_version,
+     list_benchmarks,
+     list_models,
+     list_providers,
+ )
+
+ __all__ = [
+     "__version__",
+     "BenchmarkResult",
+     "MCPBenchmark",
+     "get_version",
+     "list_benchmarks",
+     "list_models",
+     "list_providers",
+ ]
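With this change the package root re-exports the new SDK entry points, so they are importable directly from mcpbr rather than from mcpbr.sdk. A minimal usage sketch follows; the exact signatures live in mcpbr/sdk.py (not shown in this hunk), so the assumption that the list_* helpers take no required arguments and return iterables of names is illustrative only:

import mcpbr

# Assumed to mirror mcpbr.__version__ ("0.7.0" in this release).
print(mcpbr.get_version())

# Assumed to enumerate supported providers and benchmarks by name.
for provider in mcpbr.list_providers():
    print(provider)
for benchmark in mcpbr.list_benchmarks():
    print(benchmark)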
mcpbr/analytics/__init__.py ADDED
@@ -0,0 +1,79 @@
+ """Analytics module for mcpbr historical results tracking and analysis.
+
+ Provides:
+ - SQLite-based historical results database
+ - Time-series trend analysis
+ - Statistical significance testing
+ - Multi-model comparison engine
+ - Error pattern analysis and anomaly detection
+ - Correlation analysis and difficulty estimation
+ - A/B testing framework
+ - Leaderboard generation
+ - Performance regression detection
+ - Custom metrics registry
+ """
+
+ from __future__ import annotations
+
+ from .ab_testing import ABTest, run_ab_test
+ from .anomaly import detect_anomalies, detect_metric_anomalies
+ from .comparison import ComparisonEngine, compare_results_files, format_comparison_table
+ from .correlation import (
+     analyze_metric_correlations,
+     find_strong_correlations,
+     pearson_correlation,
+     spearman_correlation,
+ )
+ from .database import ResultsDatabase
+ from .difficulty import (
+     aggregate_difficulty_stats,
+     estimate_difficulty,
+     estimate_task_difficulty_score,
+ )
+ from .error_analysis import ErrorPatternAnalyzer, identify_flaky_tasks
+ from .leaderboard import Leaderboard, generate_leaderboard
+ from .metrics import MetricDefinition, MetricsRegistry
+ from .regression_detector import RegressionDetector
+ from .statistical import (
+     bootstrap_confidence_interval,
+     chi_squared_test,
+     compare_resolution_rates,
+     effect_size_cohens_d,
+     mann_whitney_u,
+     permutation_test,
+ )
+ from .trends import calculate_moving_average, calculate_trends, detect_trend_direction
+
+ __all__ = [
+     "ABTest",
+     "ComparisonEngine",
+     "ErrorPatternAnalyzer",
+     "Leaderboard",
+     "MetricDefinition",
+     "MetricsRegistry",
+     "RegressionDetector",
+     "ResultsDatabase",
+     "aggregate_difficulty_stats",
+     "analyze_metric_correlations",
+     "bootstrap_confidence_interval",
+     "calculate_moving_average",
+     "calculate_trends",
+     "chi_squared_test",
+     "compare_resolution_rates",
+     "compare_results_files",
+     "detect_anomalies",
+     "detect_metric_anomalies",
+     "detect_trend_direction",
+     "effect_size_cohens_d",
+     "estimate_difficulty",
+     "estimate_task_difficulty_score",
+     "find_strong_correlations",
+     "format_comparison_table",
+     "generate_leaderboard",
+     "identify_flaky_tasks",
+     "mann_whitney_u",
+     "pearson_correlation",
+     "permutation_test",
+     "run_ab_test",
+     "spearman_correlation",
+ ]
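Only the exported names are visible in this hunk, not their signatures, so the following sketch of the correlation helpers is a guess at the call shape (two equal-length numeric sequences in, a coefficient out) rather than documented API:

from mcpbr.analytics import pearson_correlation, spearman_correlation

# Hypothetical inputs: per-task cost vs. runtime from one evaluation run.
costs = [0.12, 0.40, 0.33, 0.08, 0.51]
runtimes = [35.0, 88.0, 61.0, 20.0, 97.0]

# Assumed signature; check mcpbr/analytics/correlation.py for the actual one.
print(pearson_correlation(costs, runtimes))
print(spearman_correlation(costs, runtimes))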
mcpbr/analytics/ab_testing.py ADDED
@@ -0,0 +1,403 @@
+ """A/B testing framework for comparing MCP server configurations.
+
+ Provides tools for statistically comparing two evaluation runs (control vs.
+ treatment) to determine which configuration performs better on resolution rate,
+ cost, and other metrics.
+ """
+
+ from __future__ import annotations
+
+ import math
+ from typing import Any
+
+
+ def _normal_cdf(x: float) -> float:
+     """Approximate the standard normal cumulative distribution function.
+
+     Uses the Abramowitz & Stegun rational approximation (formula 26.2.17)
+     which is accurate to about 1e-5.
+
+     Args:
+         x: The z-score value.
+
+     Returns:
+         Probability that a standard normal variable is less than or equal to *x*.
+     """
+     if x < -8.0:
+         return 0.0
+     if x > 8.0:
+         return 1.0
+
+     sign = 1.0
+     if x < 0:
+         sign = -1.0
+         x = -x
+
+     t = 1.0 / (1.0 + 0.2316419 * x)
+     t2 = t * t
+     t3 = t2 * t
+     t4 = t3 * t
+     t5 = t4 * t
+
+     pdf = math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi)
+     cdf_upper = pdf * (
+         0.319381530 * t - 0.356563782 * t2 + 1.781477937 * t3 - 1.821255978 * t4 + 1.330274429 * t5
+     )
+
+     if sign > 0:
+         return 1.0 - cdf_upper
+     else:
+         return cdf_upper
+
+
+ def _chi_squared_test(
+     resolved_a: int,
+     total_a: int,
+     resolved_b: int,
+     total_b: int,
+ ) -> dict[str, Any]:
+     """Perform a chi-squared test for independence on two resolution rates.
+
+     Constructs a 2x2 contingency table of resolved/unresolved counts for
+     groups A and B and computes the chi-squared statistic with Yates'
+     continuity correction. The p-value is derived from the chi-squared
+     distribution with 1 degree of freedom using a normal CDF approximation.
+
+     Args:
+         resolved_a: Number of resolved tasks in group A.
+         total_a: Total tasks in group A.
+         resolved_b: Number of resolved tasks in group B.
+         total_b: Total tasks in group B.
+
+     Returns:
+         Dictionary with ``chi_squared``, ``p_value``, ``significant``
+         (at alpha = 0.05), and ``degrees_of_freedom``.
+     """
+     unresolved_a = total_a - resolved_a
+     unresolved_b = total_b - resolved_b
+     grand_total = total_a + total_b
+
+     if grand_total == 0:
+         return {
+             "chi_squared": 0.0,
+             "p_value": 1.0,
+             "significant": False,
+             "degrees_of_freedom": 1,
+         }
+
+     # Expected values for the 2x2 table
+     total_resolved = resolved_a + resolved_b
+     total_unresolved = unresolved_a + unresolved_b
+
+     expected = [
+         [total_resolved * total_a / grand_total, total_unresolved * total_a / grand_total],
+         [total_resolved * total_b / grand_total, total_unresolved * total_b / grand_total],
+     ]
+
+     observed = [
+         [resolved_a, unresolved_a],
+         [resolved_b, unresolved_b],
+     ]
+
+     chi2 = 0.0
+     for i in range(2):
+         for j in range(2):
+             e = expected[i][j]
+             if e > 0:
+                 # Yates' continuity correction
+                 diff = abs(observed[i][j] - e) - 0.5
+                 if diff < 0:
+                     diff = 0.0
+                 chi2 += (diff * diff) / e
+
+     # Convert chi-squared (1 df) to p-value:
+     # if X ~ chi2(1), then X = Z**2 for Z ~ N(0,1), so P(X > c) = 2 * (1 - Phi(sqrt(c))).
+     if chi2 > 0:
+         z = math.sqrt(chi2)
+         p_value = 2.0 * (1.0 - _normal_cdf(z))
+     else:
+         p_value = 1.0
+
+     return {
+         "chi_squared": round(chi2, 6),
+         "p_value": round(p_value, 6),
+         "significant": p_value < 0.05,
+         "degrees_of_freedom": 1,
+     }
+
+
+ def _extract_metrics(results_data: dict[str, Any]) -> dict[str, Any]:
+     """Extract key metrics from a results_data dictionary.
+
+     Args:
+         results_data: Evaluation results with ``summary.mcp`` and ``tasks``.
+
+     Returns:
+         Dictionary with ``resolved``, ``total``, ``rate``, ``cost``,
+         ``model``, ``provider``, ``avg_tokens``, and ``avg_runtime``.
+     """
+     summary = results_data.get("summary", {}).get("mcp", {})
+     tasks = results_data.get("tasks", [])
+     metadata = results_data.get("metadata", {})
+     config = metadata.get("config", {})
+
+     resolved = summary.get("resolved", 0)
+     total = summary.get("total", 0)
+     rate = summary.get("rate", 0.0)
+     cost = summary.get("total_cost", 0.0)
+
+     total_tokens = 0
+     total_runtime = 0.0
+     task_count = len(tasks)
+
+     for task in tasks:
+         mcp = task.get("mcp", {})
+         tokens = mcp.get("tokens", {})
+         total_tokens += tokens.get("input", 0) + tokens.get("output", 0)
+         total_runtime += mcp.get("runtime_seconds", 0.0)
+
+     avg_tokens = total_tokens // task_count if task_count > 0 else 0
+     avg_runtime = total_runtime / task_count if task_count > 0 else 0.0
+
+     return {
+         "resolved": resolved,
+         "total": total,
+         "rate": rate,
+         "cost": cost,
+         "model": config.get("model", "unknown"),
+         "provider": config.get("provider", "unknown"),
+         "avg_tokens": avg_tokens,
+         "avg_runtime": round(avg_runtime, 2),
+     }
+
+
+ class ABTest:
+     """A/B testing framework for comparing two MCP server configurations.
+
+     Creates a structured comparison between a control group (A) and treatment
+     group (B), running chi-squared significance testing on resolution rates
+     and comparing cost metrics.
+
+     Example::
+
+         test = ABTest("Model Comparison")
+         test.add_control(results_baseline)
+         test.add_treatment(results_candidate)
+         analysis = test.analyze()
+         print(test.format_report())
+     """
+
+     def __init__(
+         self,
+         name: str,
+         control_label: str = "A",
+         treatment_label: str = "B",
+     ) -> None:
+         """Initialize the A/B test.
+
+         Args:
+             name: Human-readable name for this test.
+             control_label: Label for the control group (default ``"A"``).
+             treatment_label: Label for the treatment group (default ``"B"``).
+         """
+         self.name = name
+         self.control_label = control_label
+         self.treatment_label = treatment_label
+         self._control: dict[str, Any] | None = None
+         self._treatment: dict[str, Any] | None = None
+         self._analysis: dict[str, Any] | None = None
+
+     def add_control(self, results_data: dict[str, Any]) -> None:
+         """Add the control group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the control
+                 configuration.
+         """
+         self._control = results_data
+         self._analysis = None
+
+     def add_treatment(self, results_data: dict[str, Any]) -> None:
+         """Add the treatment group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the treatment
+                 configuration.
+         """
+         self._treatment = results_data
+         self._analysis = None
+
+     def analyze(self) -> dict[str, Any]:
+         """Run the A/B test analysis.
+
+         Compares resolution rates using a chi-squared test, and reports
+         differences in cost and other metrics.
+
+         Returns:
+             Dictionary containing:
+             - ``test_name``: The test name.
+             - ``control``: Metrics for the control group.
+             - ``treatment``: Metrics for the treatment group.
+             - ``rate_difference``: Absolute difference in resolution rates.
+             - ``rate_relative_change``: Percentage change in resolution rate.
+             - ``cost_difference``: Difference in total cost.
+             - ``statistical_significance``: Chi-squared test results.
+             - ``winner``: ``"control"``, ``"treatment"``, or
+               ``"no_significant_difference"``.
+             - ``recommendation``: Human-readable recommendation.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._control is None:
+             raise ValueError("Control group results not set. Call add_control() first.")
+         if self._treatment is None:
+             raise ValueError("Treatment group results not set. Call add_treatment() first.")
+
+         ctrl = _extract_metrics(self._control)
+         treat = _extract_metrics(self._treatment)
+
+         rate_diff = treat["rate"] - ctrl["rate"]
+         rate_relative = (rate_diff / ctrl["rate"] * 100.0) if ctrl["rate"] > 0 else 0.0
+         cost_diff = treat["cost"] - ctrl["cost"]
+
+         significance = _chi_squared_test(
+             ctrl["resolved"],
+             ctrl["total"],
+             treat["resolved"],
+             treat["total"],
+         )
+
+         # Determine winner
+         if significance["significant"]:
+             if treat["rate"] > ctrl["rate"]:
+                 winner = "treatment"
+             elif treat["rate"] < ctrl["rate"]:
+                 winner = "control"
+             else:
+                 winner = "no_significant_difference"
+         else:
+             winner = "no_significant_difference"
+
+         # Build recommendation
+         if winner == "treatment":
+             recommendation = (
+                 f"Treatment ({self.treatment_label}) shows a statistically significant "
+                 f"improvement of {rate_relative:+.1f}% in resolution rate. "
+                 f"Recommend adopting the treatment configuration."
+             )
+         elif winner == "control":
+             recommendation = (
+                 f"Control ({self.control_label}) performs significantly better. "
+                 f"Treatment ({self.treatment_label}) shows a {rate_relative:+.1f}% change "
+                 f"in resolution rate. Recommend keeping the control configuration."
+             )
+         else:
+             recommendation = (
+                 f"No statistically significant difference detected between "
+                 f"{self.control_label} and {self.treatment_label} "
+                 f"(p={significance['p_value']:.4f}). Consider increasing sample size "
+                 f"or testing with a larger effect."
+             )
+
+         self._analysis = {
+             "test_name": self.name,
+             "control": {
+                 "label": self.control_label,
+                 "resolved": ctrl["resolved"],
+                 "total": ctrl["total"],
+                 "rate": ctrl["rate"],
+                 "cost": ctrl["cost"],
+             },
+             "treatment": {
+                 "label": self.treatment_label,
+                 "resolved": treat["resolved"],
+                 "total": treat["total"],
+                 "rate": treat["rate"],
+                 "cost": treat["cost"],
+             },
+             "rate_difference": round(rate_diff, 6),
+             "rate_relative_change": round(rate_relative, 2),
+             "cost_difference": round(cost_diff, 4),
+             "statistical_significance": significance,
+             "winner": winner,
+             "recommendation": recommendation,
+         }
+
+         return self._analysis
+
+     def format_report(self) -> str:
+         """Format the analysis results as a human-readable report.
+
+         Calls :meth:`analyze` automatically if it has not been called yet.
+
+         Returns:
+             Multi-line string containing the formatted A/B test report.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._analysis is None:
+             self.analyze()
+
+         assert self._analysis is not None # for type checker
+         a = self._analysis
+
+         ctrl = a["control"]
+         treat = a["treatment"]
+         sig = a["statistical_significance"]
+
+         lines = [
+             f"{'=' * 60}",
+             f"A/B Test Report: {a['test_name']}",
+             f"{'=' * 60}",
+             "",
+             f"Control ({ctrl['label']}):",
+             f" Resolution Rate: {ctrl['rate']:.1%} ({ctrl['resolved']}/{ctrl['total']})",
+             f" Total Cost: ${ctrl['cost']:.4f}",
+             "",
+             f"Treatment ({treat['label']}):",
+             f" Resolution Rate: {treat['rate']:.1%} ({treat['resolved']}/{treat['total']})",
+             f" Total Cost: ${treat['cost']:.4f}",
+             "",
+             "Comparison:",
+             f" Rate Difference: {a['rate_difference']:+.4f} "
+             f"({a['rate_relative_change']:+.1f}%)",
+             f" Cost Difference: ${a['cost_difference']:+.4f}",
+             "",
+             "Statistical Significance:",
+             f" Chi-squared: {sig['chi_squared']:.4f}",
+             f" p-value: {sig['p_value']:.6f}",
+             f" Significant: {'Yes' if sig['significant'] else 'No'} (alpha=0.05)",
+             "",
+             f"Winner: {a['winner']}",
+             "",
+             f"Recommendation: {a['recommendation']}",
+             f"{'=' * 60}",
+         ]
+
+         return "\n".join(lines)
+
+
+ def run_ab_test(
+     results_a: dict[str, Any],
+     results_b: dict[str, Any],
+     test_name: str = "A/B Test",
+ ) -> dict[str, Any]:
+     """Convenience function to run a quick A/B test comparison.
+
+     Creates an :class:`ABTest` instance, adds the control and treatment
+     data, and returns the analysis results.
+
+     Args:
+         results_a: Evaluation results for the control (A) group.
+         results_b: Evaluation results for the treatment (B) group.
+         test_name: Name for the test (default ``"A/B Test"``).
+
+     Returns:
+         Analysis dictionary from :meth:`ABTest.analyze`.
+     """
+     test = ABTest(test_name)
+     test.add_control(results_a)
+     test.add_treatment(results_b)
+     return test.analyze()
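The new module supports a short end-to-end comparison. The sketch below feeds ABTest two hand-written results dictionaries shaped the way _extract_metrics reads them (summary.mcp counts plus a tasks list); the counts, costs, and model names are made up for illustration:

from mcpbr.analytics import ABTest, run_ab_test

control = {
    "summary": {"mcp": {"resolved": 30, "total": 100, "rate": 0.30, "total_cost": 12.50}},
    "tasks": [],
    "metadata": {"config": {"model": "model-a", "provider": "example"}},
}
treatment = {
    "summary": {"mcp": {"resolved": 45, "total": 100, "rate": 0.45, "total_cost": 14.10}},
    "tasks": [],
    "metadata": {"config": {"model": "model-b", "provider": "example"}},
}

test = ABTest("Illustrative comparison", control_label="baseline", treatment_label="candidate")
test.add_control(control)
test.add_treatment(treatment)
analysis = test.analyze()
print(analysis["winner"], analysis["statistical_significance"]["p_value"])
print(test.format_report())

# run_ab_test() wraps the same three calls and returns the analysis dictionary directly,
# so it reaches the same verdict for the same inputs.
assert run_ab_test(control, treatment)["winner"] == analysis["winner"]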