mcpbr 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. mcpbr/__init__.py +1 -1
  2. mcpbr/analytics/__init__.py +79 -0
  3. mcpbr/analytics/ab_testing.py +403 -0
  4. mcpbr/analytics/anomaly.py +213 -0
  5. mcpbr/analytics/comparison.py +548 -0
  6. mcpbr/analytics/correlation.py +280 -0
  7. mcpbr/analytics/database.py +386 -0
  8. mcpbr/analytics/difficulty.py +238 -0
  9. mcpbr/analytics/error_analysis.py +408 -0
  10. mcpbr/analytics/leaderboard.py +285 -0
  11. mcpbr/analytics/metrics.py +279 -0
  12. mcpbr/analytics/regression_detector.py +472 -0
  13. mcpbr/analytics/statistical.py +476 -0
  14. mcpbr/analytics/trends.py +156 -0
  15. mcpbr/cli.py +604 -0
  16. mcpbr/reports/__init__.py +17 -0
  17. mcpbr/reports/enhanced_markdown.py +389 -0
  18. mcpbr/reports/html_report.py +796 -0
  19. mcpbr/reports/pdf_report.py +533 -0
  20. {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/METADATA +1 -1
  21. {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/RECORD +31 -14
  22. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  23. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  24. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  25. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  26. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  27. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  28. {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  29. {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/WHEEL +0 -0
  30. {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/entry_points.txt +0 -0
  31. {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py CHANGED
@@ -3,7 +3,7 @@
  A benchmark runner for evaluating MCP servers against SWE-bench tasks.
  """
 
- __version__ = "0.6.0"
+ __version__ = "0.7.0"
 
  from .sdk import (
      BenchmarkResult,
mcpbr/analytics/__init__.py ADDED
@@ -0,0 +1,79 @@
+ """Analytics module for mcpbr historical results tracking and analysis.
+
+ Provides:
+ - SQLite-based historical results database
+ - Time-series trend analysis
+ - Statistical significance testing
+ - Multi-model comparison engine
+ - Error pattern analysis and anomaly detection
+ - Correlation analysis and difficulty estimation
+ - A/B testing framework
+ - Leaderboard generation
+ - Performance regression detection
+ - Custom metrics registry
+ """
+
+ from __future__ import annotations
+
+ from .ab_testing import ABTest, run_ab_test
+ from .anomaly import detect_anomalies, detect_metric_anomalies
+ from .comparison import ComparisonEngine, compare_results_files, format_comparison_table
+ from .correlation import (
+     analyze_metric_correlations,
+     find_strong_correlations,
+     pearson_correlation,
+     spearman_correlation,
+ )
+ from .database import ResultsDatabase
+ from .difficulty import (
+     aggregate_difficulty_stats,
+     estimate_difficulty,
+     estimate_task_difficulty_score,
+ )
+ from .error_analysis import ErrorPatternAnalyzer, identify_flaky_tasks
+ from .leaderboard import Leaderboard, generate_leaderboard
+ from .metrics import MetricDefinition, MetricsRegistry
+ from .regression_detector import RegressionDetector
+ from .statistical import (
+     bootstrap_confidence_interval,
+     chi_squared_test,
+     compare_resolution_rates,
+     effect_size_cohens_d,
+     mann_whitney_u,
+     permutation_test,
+ )
+ from .trends import calculate_moving_average, calculate_trends, detect_trend_direction
+
+ __all__ = [
+     "ABTest",
+     "ComparisonEngine",
+     "ErrorPatternAnalyzer",
+     "Leaderboard",
+     "MetricDefinition",
+     "MetricsRegistry",
+     "RegressionDetector",
+     "ResultsDatabase",
+     "aggregate_difficulty_stats",
+     "analyze_metric_correlations",
+     "bootstrap_confidence_interval",
+     "calculate_moving_average",
+     "calculate_trends",
+     "chi_squared_test",
+     "compare_resolution_rates",
+     "compare_results_files",
+     "detect_anomalies",
+     "detect_metric_anomalies",
+     "detect_trend_direction",
+     "effect_size_cohens_d",
+     "estimate_difficulty",
+     "estimate_task_difficulty_score",
+     "find_strong_correlations",
+     "format_comparison_table",
+     "generate_leaderboard",
+     "identify_flaky_tasks",
+     "mann_whitney_u",
+     "pearson_correlation",
+     "permutation_test",
+     "run_ab_test",
+     "spearman_correlation",
+ ]
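
The package __init__ above only re-exports names, so everything in __all__ is importable from the package root. A minimal sketch of exercising the new top-level API; the file names, and the assumption that run results were saved as JSON dicts in the shape these helpers read, are illustrative rather than part of this release:

import json

from mcpbr.analytics import detect_metric_anomalies, run_ab_test

# Hypothetical paths; any two saved mcpbr results files would do.
with open("results_baseline.json") as f:
    baseline = json.load(f)
with open("results_candidate.json") as f:
    candidate = json.load(f)

# Compare the two runs, then flag outlier tasks in the candidate run.
analysis = run_ab_test(baseline, candidate, test_name="baseline vs candidate")
print(analysis["winner"], analysis["recommendation"])

outliers = detect_metric_anomalies(candidate)
print({metric: len(found) for metric, found in outliers.items()})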
mcpbr/analytics/ab_testing.py ADDED
@@ -0,0 +1,403 @@
+ """A/B testing framework for comparing MCP server configurations.
+
+ Provides tools for statistically comparing two evaluation runs (control vs.
+ treatment) to determine which configuration performs better on resolution rate,
+ cost, and other metrics.
+ """
+
+ from __future__ import annotations
+
+ import math
+ from typing import Any
+
+
+ def _normal_cdf(x: float) -> float:
+     """Approximate the standard normal cumulative distribution function.
+
+     Uses the Abramowitz & Stegun rational approximation (formula 26.2.17)
+     which is accurate to about 1e-5.
+
+     Args:
+         x: The z-score value.
+
+     Returns:
+         Probability that a standard normal variable is less than or equal to *x*.
+     """
+     if x < -8.0:
+         return 0.0
+     if x > 8.0:
+         return 1.0
+
+     sign = 1.0
+     if x < 0:
+         sign = -1.0
+         x = -x
+
+     t = 1.0 / (1.0 + 0.2316419 * x)
+     t2 = t * t
+     t3 = t2 * t
+     t4 = t3 * t
+     t5 = t4 * t
+
+     pdf = math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi)
+     cdf_upper = pdf * (
+         0.319381530 * t - 0.356563782 * t2 + 1.781477937 * t3 - 1.821255978 * t4 + 1.330274429 * t5
+     )
+
+     if sign > 0:
+         return 1.0 - cdf_upper
+     else:
+         return cdf_upper
+
+
+ def _chi_squared_test(
+     resolved_a: int,
+     total_a: int,
+     resolved_b: int,
+     total_b: int,
+ ) -> dict[str, Any]:
+     """Perform a chi-squared test for independence on two resolution rates.
+
+     Constructs a 2x2 contingency table of resolved/unresolved counts for
+     groups A and B and computes the chi-squared statistic with Yates'
+     continuity correction. The p-value is derived from the chi-squared
+     distribution with 1 degree of freedom using a normal CDF approximation.
+
+     Args:
+         resolved_a: Number of resolved tasks in group A.
+         total_a: Total tasks in group A.
+         resolved_b: Number of resolved tasks in group B.
+         total_b: Total tasks in group B.
+
+     Returns:
+         Dictionary with ``chi_squared``, ``p_value``, ``significant``
+         (at alpha = 0.05), and ``degrees_of_freedom``.
+     """
+     unresolved_a = total_a - resolved_a
+     unresolved_b = total_b - resolved_b
+     grand_total = total_a + total_b
+
+     if grand_total == 0:
+         return {
+             "chi_squared": 0.0,
+             "p_value": 1.0,
+             "significant": False,
+             "degrees_of_freedom": 1,
+         }
+
+     # Expected values for the 2x2 table
+     total_resolved = resolved_a + resolved_b
+     total_unresolved = unresolved_a + unresolved_b
+
+     expected = [
+         [total_resolved * total_a / grand_total, total_unresolved * total_a / grand_total],
+         [total_resolved * total_b / grand_total, total_unresolved * total_b / grand_total],
+     ]
+
+     observed = [
+         [resolved_a, unresolved_a],
+         [resolved_b, unresolved_b],
+     ]
+
+     chi2 = 0.0
+     for i in range(2):
+         for j in range(2):
+             e = expected[i][j]
+             if e > 0:
+                 # Yates' continuity correction
+                 diff = abs(observed[i][j] - e) - 0.5
+                 if diff < 0:
+                     diff = 0.0
+                 chi2 += (diff * diff) / e
+
+     # Convert chi-squared (1 df) to p-value via normal approximation:
+     # If X ~ chi2(1), then sqrt(X) ~ N(0,1) approximately.
+     if chi2 > 0:
+         z = math.sqrt(chi2)
+         p_value = 2.0 * (1.0 - _normal_cdf(z))
+     else:
+         p_value = 1.0
+
+     return {
+         "chi_squared": round(chi2, 6),
+         "p_value": round(p_value, 6),
+         "significant": p_value < 0.05,
+         "degrees_of_freedom": 1,
+     }
+
+
+ def _extract_metrics(results_data: dict[str, Any]) -> dict[str, Any]:
+     """Extract key metrics from a results_data dictionary.
+
+     Args:
+         results_data: Evaluation results with ``summary.mcp`` and ``tasks``.
+
+     Returns:
+         Dictionary with ``resolved``, ``total``, ``rate``, ``cost``,
+         ``model``, ``provider``, ``avg_tokens``, and ``avg_runtime``.
+     """
+     summary = results_data.get("summary", {}).get("mcp", {})
+     tasks = results_data.get("tasks", [])
+     metadata = results_data.get("metadata", {})
+     config = metadata.get("config", {})
+
+     resolved = summary.get("resolved", 0)
+     total = summary.get("total", 0)
+     rate = summary.get("rate", 0.0)
+     cost = summary.get("total_cost", 0.0)
+
+     total_tokens = 0
+     total_runtime = 0.0
+     task_count = len(tasks)
+
+     for task in tasks:
+         mcp = task.get("mcp", {})
+         tokens = mcp.get("tokens", {})
+         total_tokens += tokens.get("input", 0) + tokens.get("output", 0)
+         total_runtime += mcp.get("runtime_seconds", 0.0)
+
+     avg_tokens = total_tokens // task_count if task_count > 0 else 0
+     avg_runtime = total_runtime / task_count if task_count > 0 else 0.0
+
+     return {
+         "resolved": resolved,
+         "total": total,
+         "rate": rate,
+         "cost": cost,
+         "model": config.get("model", "unknown"),
+         "provider": config.get("provider", "unknown"),
+         "avg_tokens": avg_tokens,
+         "avg_runtime": round(avg_runtime, 2),
+     }
+
+
+ class ABTest:
+     """A/B testing framework for comparing two MCP server configurations.
+
+     Creates a structured comparison between a control group (A) and treatment
+     group (B), running chi-squared significance testing on resolution rates
+     and comparing cost metrics.
+
+     Example::
+
+         test = ABTest("Model Comparison")
+         test.add_control(results_baseline)
+         test.add_treatment(results_candidate)
+         analysis = test.analyze()
+         print(test.format_report())
+     """
+
+     def __init__(
+         self,
+         name: str,
+         control_label: str = "A",
+         treatment_label: str = "B",
+     ) -> None:
+         """Initialize the A/B test.
+
+         Args:
+             name: Human-readable name for this test.
+             control_label: Label for the control group (default ``"A"``).
+             treatment_label: Label for the treatment group (default ``"B"``).
+         """
+         self.name = name
+         self.control_label = control_label
+         self.treatment_label = treatment_label
+         self._control: dict[str, Any] | None = None
+         self._treatment: dict[str, Any] | None = None
+         self._analysis: dict[str, Any] | None = None
+
+     def add_control(self, results_data: dict[str, Any]) -> None:
+         """Add the control group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the control
+                 configuration.
+         """
+         self._control = results_data
+         self._analysis = None
+
+     def add_treatment(self, results_data: dict[str, Any]) -> None:
+         """Add the treatment group results.
+
+         Args:
+             results_data: Evaluation results dictionary for the treatment
+                 configuration.
+         """
+         self._treatment = results_data
+         self._analysis = None
+
+     def analyze(self) -> dict[str, Any]:
+         """Run the A/B test analysis.
+
+         Compares resolution rates using a chi-squared test, and reports
+         differences in cost and other metrics.
+
+         Returns:
+             Dictionary containing:
+             - ``test_name``: The test name.
+             - ``control``: Metrics for the control group.
+             - ``treatment``: Metrics for the treatment group.
+             - ``rate_difference``: Absolute difference in resolution rates.
+             - ``rate_relative_change``: Percentage change in resolution rate.
+             - ``cost_difference``: Difference in total cost.
+             - ``statistical_significance``: Chi-squared test results.
+             - ``winner``: ``"control"``, ``"treatment"``, or
+               ``"no_significant_difference"``.
+             - ``recommendation``: Human-readable recommendation.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._control is None:
+             raise ValueError("Control group results not set. Call add_control() first.")
+         if self._treatment is None:
+             raise ValueError("Treatment group results not set. Call add_treatment() first.")
+
+         ctrl = _extract_metrics(self._control)
+         treat = _extract_metrics(self._treatment)
+
+         rate_diff = treat["rate"] - ctrl["rate"]
+         rate_relative = (rate_diff / ctrl["rate"] * 100.0) if ctrl["rate"] > 0 else 0.0
+         cost_diff = treat["cost"] - ctrl["cost"]
+
+         significance = _chi_squared_test(
+             ctrl["resolved"],
+             ctrl["total"],
+             treat["resolved"],
+             treat["total"],
+         )
+
+         # Determine winner
+         if significance["significant"]:
+             if treat["rate"] > ctrl["rate"]:
+                 winner = "treatment"
+             elif treat["rate"] < ctrl["rate"]:
+                 winner = "control"
+             else:
+                 winner = "no_significant_difference"
+         else:
+             winner = "no_significant_difference"
+
+         # Build recommendation
+         if winner == "treatment":
+             recommendation = (
+                 f"Treatment ({self.treatment_label}) shows a statistically significant "
+                 f"improvement of {rate_relative:+.1f}% in resolution rate. "
+                 f"Recommend adopting the treatment configuration."
+             )
+         elif winner == "control":
+             recommendation = (
+                 f"Control ({self.control_label}) performs significantly better. "
+                 f"Treatment ({self.treatment_label}) shows a {rate_relative:+.1f}% change "
+                 f"in resolution rate. Recommend keeping the control configuration."
+             )
+         else:
+             recommendation = (
+                 f"No statistically significant difference detected between "
+                 f"{self.control_label} and {self.treatment_label} "
+                 f"(p={significance['p_value']:.4f}). Consider increasing sample size "
+                 f"or testing with a larger effect."
+             )
+
+         self._analysis = {
+             "test_name": self.name,
+             "control": {
+                 "label": self.control_label,
+                 "resolved": ctrl["resolved"],
+                 "total": ctrl["total"],
+                 "rate": ctrl["rate"],
+                 "cost": ctrl["cost"],
+             },
+             "treatment": {
+                 "label": self.treatment_label,
+                 "resolved": treat["resolved"],
+                 "total": treat["total"],
+                 "rate": treat["rate"],
+                 "cost": treat["cost"],
+             },
+             "rate_difference": round(rate_diff, 6),
+             "rate_relative_change": round(rate_relative, 2),
+             "cost_difference": round(cost_diff, 4),
+             "statistical_significance": significance,
+             "winner": winner,
+             "recommendation": recommendation,
+         }
+
+         return self._analysis
+
+     def format_report(self) -> str:
+         """Format the analysis results as a human-readable report.
+
+         Calls :meth:`analyze` automatically if it has not been called yet.
+
+         Returns:
+             Multi-line string containing the formatted A/B test report.
+
+         Raises:
+             ValueError: If control or treatment data has not been added.
+         """
+         if self._analysis is None:
+             self.analyze()
+
+         assert self._analysis is not None  # for type checker
+         a = self._analysis
+
+         ctrl = a["control"]
+         treat = a["treatment"]
+         sig = a["statistical_significance"]
+
+         lines = [
+             f"{'=' * 60}",
+             f"A/B Test Report: {a['test_name']}",
+             f"{'=' * 60}",
+             "",
+             f"Control ({ctrl['label']}):",
+             f" Resolution Rate: {ctrl['rate']:.1%} ({ctrl['resolved']}/{ctrl['total']})",
+             f" Total Cost: ${ctrl['cost']:.4f}",
+             "",
+             f"Treatment ({treat['label']}):",
+             f" Resolution Rate: {treat['rate']:.1%} ({treat['resolved']}/{treat['total']})",
+             f" Total Cost: ${treat['cost']:.4f}",
+             "",
+             "Comparison:",
+             f" Rate Difference: {a['rate_difference']:+.4f} "
+             f"({a['rate_relative_change']:+.1f}%)",
+             f" Cost Difference: ${a['cost_difference']:+.4f}",
+             "",
+             "Statistical Significance:",
+             f" Chi-squared: {sig['chi_squared']:.4f}",
+             f" p-value: {sig['p_value']:.6f}",
+             f" Significant: {'Yes' if sig['significant'] else 'No'} (alpha=0.05)",
+             "",
+             f"Winner: {a['winner']}",
+             "",
+             f"Recommendation: {a['recommendation']}",
+             f"{'=' * 60}",
+         ]
+
+         return "\n".join(lines)
+
+
+ def run_ab_test(
+     results_a: dict[str, Any],
+     results_b: dict[str, Any],
+     test_name: str = "A/B Test",
+ ) -> dict[str, Any]:
+     """Convenience function to run a quick A/B test comparison.
+
+     Creates an :class:`ABTest` instance, adds the control and treatment
+     data, and returns the analysis results.
+
+     Args:
+         results_a: Evaluation results for the control (A) group.
+         results_b: Evaluation results for the treatment (B) group.
+         test_name: Name for the test (default ``"A/B Test"``).
+
+     Returns:
+         Analysis dictionary from :meth:`ABTest.analyze`.
+     """
+     test = ABTest(test_name)
+     test.add_control(results_a)
+     test.add_treatment(results_b)
+     return test.analyze()
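
For context on how the ABTest class above is meant to be driven, a small hedged sketch follows. The make_results helper and its numbers are invented purely to satisfy the dict shape that _extract_metrics reads (summary.mcp, tasks, metadata.config); real runs would supply full results files with per-task entries:

from mcpbr.analytics.ab_testing import ABTest


def make_results(resolved: int, total: int, total_cost: float) -> dict:
    # Trimmed to the keys _extract_metrics() reads; real results files also
    # carry per-task entries under "tasks".
    return {
        "summary": {
            "mcp": {
                "resolved": resolved,
                "total": total,
                "rate": resolved / total,
                "total_cost": total_cost,
            }
        },
        "tasks": [],
        "metadata": {"config": {"model": "example-model", "provider": "example"}},
    }


test = ABTest("prompt tweak", control_label="baseline", treatment_label="candidate")
test.add_control(make_results(10, 50, 4.10))
test.add_treatment(make_results(24, 50, 4.65))
# With these invented counts the candidate side comes out ahead at alpha = 0.05.
print(test.format_report())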
mcpbr/analytics/anomaly.py ADDED
@@ -0,0 +1,213 @@
+ """Anomaly detection for benchmark metrics.
+
+ Provides statistical methods (z-score, IQR, MAD) to identify outlier
+ values in benchmark cost, token, runtime, and iteration metrics.
+ """
+
+ from __future__ import annotations
+
+ import statistics
+ from typing import Any
+
+
+ def detect_anomalies(
+     values: list[float],
+     method: str = "zscore",
+     threshold: float = 2.0,
+ ) -> list[dict[str, Any]]:
+     """Detect anomalous values in a list of numeric measurements.
+
+     Args:
+         values: List of numeric values to analyse.
+         method: Detection method. One of ``"zscore"`` (z-score exceeds
+             threshold), ``"iqr"`` (IQR fence method), or ``"mad"``
+             (median absolute deviation). Defaults to ``"zscore"``.
+         threshold: Sensitivity threshold whose meaning depends on the
+             method. For ``"zscore"`` and ``"mad"`` it is the number of
+             standard deviations / MADs; for ``"iqr"`` it is the fence
+             multiplier (commonly 1.5). Defaults to 2.0.
+
+     Returns:
+         List of anomaly dicts, each containing:
+         - index: Position in the input list.
+         - value: The anomalous value.
+         - score: Computed score (z-score, IQR distance, or MAD score).
+         - method: The detection method used.
+
+     Raises:
+         ValueError: If *method* is not one of the supported methods.
+     """
+     dispatch = {
+         "zscore": _zscore_detect,
+         "iqr": _iqr_detect,
+         "mad": _mad_detect,
+     }
+     if method not in dispatch:
+         raise ValueError(f"Unknown method '{method}'. Supported: {', '.join(sorted(dispatch))}.")
+     return dispatch[method](values, threshold)
+
+
+ def detect_metric_anomalies(
+     results_data: dict[str, Any],
+ ) -> dict[str, list[dict[str, Any]]]:
+     """Detect anomalies across standard benchmark metrics.
+
+     Extracts cost, total tokens, runtime, and iteration counts from the
+     results data and runs z-score anomaly detection on each metric.
+
+     Args:
+         results_data: Benchmark results dict with a ``tasks`` key
+             containing a list of task dicts. Each task should have an
+             ``mcp`` dict with ``cost``, ``tokens`` (with ``input`` and
+             ``output``), ``runtime_seconds``, and ``iterations``.
+
+     Returns:
+         Dict mapping metric name (``"cost"``, ``"tokens"``,
+         ``"runtime"``, ``"iterations"``) to a list of anomaly dicts
+         returned by :func:`detect_anomalies`.
+     """
+     tasks = results_data.get("tasks", [])
+     if not tasks:
+         return {
+             "cost": [],
+             "tokens": [],
+             "runtime": [],
+             "iterations": [],
+         }
+
+     costs: list[float] = []
+     tokens: list[float] = []
+     runtimes: list[float] = []
+     iterations: list[float] = []
+
+     for task in tasks:
+         mcp = task.get("mcp", {})
+         costs.append(float(mcp.get("cost", 0.0)))
+         token_info = mcp.get("tokens", {})
+         total_tokens = float(token_info.get("input", 0)) + float(token_info.get("output", 0))
+         tokens.append(total_tokens)
+         runtimes.append(float(mcp.get("runtime_seconds", 0.0)))
+         iterations.append(float(mcp.get("iterations", 0)))
+
+     return {
+         "cost": detect_anomalies(costs),
+         "tokens": detect_anomalies(tokens),
+         "runtime": detect_anomalies(runtimes),
+         "iterations": detect_anomalies(iterations),
+     }
+
+
+ # ------------------------------------------------------------------
+ # Detection method implementations
+ # ------------------------------------------------------------------
+
+
+ def _zscore_detect(
+     values: list[float],
+     threshold: float,
+ ) -> list[dict[str, Any]]:
+     """Detect anomalies using z-score.
+
+     A value is anomalous if its absolute z-score exceeds *threshold*.
+
+     Args:
+         values: Numeric values to analyse.
+         threshold: Z-score cutoff.
+
+     Returns:
+         List of anomaly dicts with index, value, score, and method.
+     """
+     if len(values) < 2:
+         return []
+
+     mean = statistics.mean(values)
+     stdev = statistics.pstdev(values)
+     if stdev == 0:
+         return []
+
+     anomalies: list[dict[str, Any]] = []
+     for i, v in enumerate(values):
+         z = abs(v - mean) / stdev
+         if z > threshold:
+             anomalies.append({"index": i, "value": v, "score": z, "method": "zscore"})
+     return anomalies
+
+
+ def _iqr_detect(
+     values: list[float],
+     threshold: float,
+ ) -> list[dict[str, Any]]:
+     """Detect anomalies using the interquartile range (IQR) method.
+
+     Values outside ``[Q1 - threshold*IQR, Q3 + threshold*IQR]`` are
+     flagged. The score represents how many IQRs the value lies beyond
+     the nearest fence.
+
+     Args:
+         values: Numeric values to analyse.
+         threshold: IQR multiplier for the fence (commonly 1.5).
+
+     Returns:
+         List of anomaly dicts with index, value, score, and method.
+     """
+     if len(values) < 4:
+         return []
+
+     sorted_vals = sorted(values)
+     n = len(sorted_vals)
+     q1 = statistics.median(sorted_vals[: n // 2])
+     q3 = statistics.median(sorted_vals[(n + 1) // 2 :])
+     iqr = q3 - q1
+     if iqr == 0:
+         return []
+
+     lower_fence = q1 - threshold * iqr
+     upper_fence = q3 + threshold * iqr
+
+     anomalies: list[dict[str, Any]] = []
+     for i, v in enumerate(values):
+         if v < lower_fence:
+             score = (lower_fence - v) / iqr
+             anomalies.append({"index": i, "value": v, "score": score, "method": "iqr"})
+         elif v > upper_fence:
+             score = (v - upper_fence) / iqr
+             anomalies.append({"index": i, "value": v, "score": score, "method": "iqr"})
+     return anomalies
+
+
+ def _mad_detect(
+     values: list[float],
+     threshold: float,
+ ) -> list[dict[str, Any]]:
+     """Detect anomalies using median absolute deviation (MAD).
+
+     The modified z-score is computed as ``0.6745 * (x - median) / MAD``.
+     Values whose absolute modified z-score exceeds *threshold* are
+     flagged.
+
+     Args:
+         values: Numeric values to analyse.
+         threshold: Modified z-score cutoff.
+
+     Returns:
+         List of anomaly dicts with index, value, score, and method.
+     """
+     if len(values) < 2:
+         return []
+
+     med = statistics.median(values)
+     abs_devs = [abs(v - med) for v in values]
+     mad = statistics.median(abs_devs)
+     if mad == 0:
+         return []
+
+     # 0.6745 is the 0.75th quantile of the standard normal distribution,
+     # used to make MAD consistent with standard deviation for normal data.
+     consistency_constant = 0.6745
+
+     anomalies: list[dict[str, Any]] = []
+     for i, v in enumerate(values):
+         modified_z = consistency_constant * abs(v - med) / mad
+         if modified_z > threshold:
+             anomalies.append({"index": i, "value": v, "score": modified_z, "method": "mad"})
+     return anomalies
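
A short illustrative run of detect_anomalies on a hand-made runtime series, showing the three methods side by side. The numbers are invented; the last value is an obvious outlier that all three methods flag at these thresholds:

from mcpbr.analytics.anomaly import detect_anomalies

# Per-task runtimes in seconds; the last value is a clear outlier.
runtimes = [31.0, 28.5, 33.2, 30.1, 29.8, 32.4, 30.9, 118.0]

print(detect_anomalies(runtimes))                                # z-score, default threshold 2.0
print(detect_anomalies(runtimes, method="iqr", threshold=1.5))   # IQR fences
print(detect_anomalies(runtimes, method="mad", threshold=3.5))   # modified z-score

Each call returns a list of dicts like {"index": 7, "value": 118.0, "score": ..., "method": ...}, so downstream code can map anomalies back to the offending task.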