mcpbr 0.6.0-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/__init__.py +1 -1
- mcpbr/analytics/__init__.py +79 -0
- mcpbr/analytics/ab_testing.py +403 -0
- mcpbr/analytics/anomaly.py +213 -0
- mcpbr/analytics/comparison.py +548 -0
- mcpbr/analytics/correlation.py +280 -0
- mcpbr/analytics/database.py +386 -0
- mcpbr/analytics/difficulty.py +238 -0
- mcpbr/analytics/error_analysis.py +408 -0
- mcpbr/analytics/leaderboard.py +285 -0
- mcpbr/analytics/metrics.py +279 -0
- mcpbr/analytics/regression_detector.py +472 -0
- mcpbr/analytics/statistical.py +476 -0
- mcpbr/analytics/trends.py +156 -0
- mcpbr/cli.py +604 -0
- mcpbr/reports/__init__.py +17 -0
- mcpbr/reports/enhanced_markdown.py +389 -0
- mcpbr/reports/html_report.py +796 -0
- mcpbr/reports/pdf_report.py +533 -0
- {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/METADATA +1 -1
- {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/RECORD +31 -14
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.6.0.data → mcpbr-0.7.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.6.0.dist-info → mcpbr-0.7.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/__init__.py CHANGED

mcpbr/analytics/__init__.py ADDED
@@ -0,0 +1,79 @@
"""Analytics module for mcpbr historical results tracking and analysis.

Provides:
- SQLite-based historical results database
- Time-series trend analysis
- Statistical significance testing
- Multi-model comparison engine
- Error pattern analysis and anomaly detection
- Correlation analysis and difficulty estimation
- A/B testing framework
- Leaderboard generation
- Performance regression detection
- Custom metrics registry
"""

from __future__ import annotations

from .ab_testing import ABTest, run_ab_test
from .anomaly import detect_anomalies, detect_metric_anomalies
from .comparison import ComparisonEngine, compare_results_files, format_comparison_table
from .correlation import (
    analyze_metric_correlations,
    find_strong_correlations,
    pearson_correlation,
    spearman_correlation,
)
from .database import ResultsDatabase
from .difficulty import (
    aggregate_difficulty_stats,
    estimate_difficulty,
    estimate_task_difficulty_score,
)
from .error_analysis import ErrorPatternAnalyzer, identify_flaky_tasks
from .leaderboard import Leaderboard, generate_leaderboard
from .metrics import MetricDefinition, MetricsRegistry
from .regression_detector import RegressionDetector
from .statistical import (
    bootstrap_confidence_interval,
    chi_squared_test,
    compare_resolution_rates,
    effect_size_cohens_d,
    mann_whitney_u,
    permutation_test,
)
from .trends import calculate_moving_average, calculate_trends, detect_trend_direction

__all__ = [
    "ABTest",
    "ComparisonEngine",
    "ErrorPatternAnalyzer",
    "Leaderboard",
    "MetricDefinition",
    "MetricsRegistry",
    "RegressionDetector",
    "ResultsDatabase",
    "aggregate_difficulty_stats",
    "analyze_metric_correlations",
    "bootstrap_confidence_interval",
    "calculate_moving_average",
    "calculate_trends",
    "chi_squared_test",
    "compare_resolution_rates",
    "compare_results_files",
    "detect_anomalies",
    "detect_metric_anomalies",
    "detect_trend_direction",
    "effect_size_cohens_d",
    "estimate_difficulty",
    "estimate_task_difficulty_score",
    "find_strong_correlations",
    "format_comparison_table",
    "generate_leaderboard",
    "identify_flaky_tasks",
    "mann_whitney_u",
    "pearson_correlation",
    "permutation_test",
    "run_ab_test",
    "spearman_correlation",
]
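The package `__init__` re-exports the full analytics surface, so downstream code can import directly from `mcpbr.analytics`. A minimal sketch, assuming two mcpbr results files already exist on disk (the paths and their exact JSON layout here are illustrative, not part of this diff):

import json

from mcpbr.analytics import detect_metric_anomalies, run_ab_test

# Hypothetical paths; any mcpbr results JSON with "summary", "tasks",
# and "metadata" keys would do.
with open("results_baseline.json") as f:
    baseline = json.load(f)
with open("results_candidate.json") as f:
    candidate = json.load(f)

# Quick chi-squared comparison of the two runs, then per-metric outlier scan.
print(run_ab_test(baseline, candidate, test_name="baseline vs. candidate")["winner"])
print(detect_metric_anomalies(candidate)["runtime"])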
mcpbr/analytics/ab_testing.py ADDED
@@ -0,0 +1,403 @@
"""A/B testing framework for comparing MCP server configurations.

Provides tools for statistically comparing two evaluation runs (control vs.
treatment) to determine which configuration performs better on resolution rate,
cost, and other metrics.
"""

from __future__ import annotations

import math
from typing import Any


def _normal_cdf(x: float) -> float:
    """Approximate the standard normal cumulative distribution function.

    Uses the Abramowitz & Stegun rational approximation (formula 26.2.17),
    which is accurate to about 1e-5.

    Args:
        x: The z-score value.

    Returns:
        Probability that a standard normal variable is less than or equal to *x*.
    """
    if x < -8.0:
        return 0.0
    if x > 8.0:
        return 1.0

    sign = 1.0
    if x < 0:
        sign = -1.0
        x = -x

    t = 1.0 / (1.0 + 0.2316419 * x)
    t2 = t * t
    t3 = t2 * t
    t4 = t3 * t
    t5 = t4 * t

    pdf = math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi)
    cdf_upper = pdf * (
        0.319381530 * t - 0.356563782 * t2 + 1.781477937 * t3 - 1.821255978 * t4 + 1.330274429 * t5
    )

    if sign > 0:
        return 1.0 - cdf_upper
    else:
        return cdf_upper


def _chi_squared_test(
    resolved_a: int,
    total_a: int,
    resolved_b: int,
    total_b: int,
) -> dict[str, Any]:
    """Perform a chi-squared test for independence on two resolution rates.

    Constructs a 2x2 contingency table of resolved/unresolved counts for
    groups A and B and computes the chi-squared statistic with Yates'
    continuity correction. The p-value is derived from the chi-squared
    distribution with 1 degree of freedom using a normal CDF approximation.

    Args:
        resolved_a: Number of resolved tasks in group A.
        total_a: Total tasks in group A.
        resolved_b: Number of resolved tasks in group B.
        total_b: Total tasks in group B.

    Returns:
        Dictionary with ``chi_squared``, ``p_value``, ``significant``
        (at alpha = 0.05), and ``degrees_of_freedom``.
    """
    unresolved_a = total_a - resolved_a
    unresolved_b = total_b - resolved_b
    grand_total = total_a + total_b

    if grand_total == 0:
        return {
            "chi_squared": 0.0,
            "p_value": 1.0,
            "significant": False,
            "degrees_of_freedom": 1,
        }

    # Expected values for the 2x2 table
    total_resolved = resolved_a + resolved_b
    total_unresolved = unresolved_a + unresolved_b

    expected = [
        [total_resolved * total_a / grand_total, total_unresolved * total_a / grand_total],
        [total_resolved * total_b / grand_total, total_unresolved * total_b / grand_total],
    ]

    observed = [
        [resolved_a, unresolved_a],
        [resolved_b, unresolved_b],
    ]

    chi2 = 0.0
    for i in range(2):
        for j in range(2):
            e = expected[i][j]
            if e > 0:
                # Yates' continuity correction
                diff = abs(observed[i][j] - e) - 0.5
                if diff < 0:
                    diff = 0.0
                chi2 += (diff * diff) / e

    # Convert chi-squared (1 df) to p-value via normal approximation:
    # If X ~ chi2(1), then sqrt(X) ~ N(0,1) approximately.
    if chi2 > 0:
        z = math.sqrt(chi2)
        p_value = 2.0 * (1.0 - _normal_cdf(z))
    else:
        p_value = 1.0

    return {
        "chi_squared": round(chi2, 6),
        "p_value": round(p_value, 6),
        "significant": p_value < 0.05,
        "degrees_of_freedom": 1,
    }


def _extract_metrics(results_data: dict[str, Any]) -> dict[str, Any]:
    """Extract key metrics from a results_data dictionary.

    Args:
        results_data: Evaluation results with ``summary.mcp`` and ``tasks``.

    Returns:
        Dictionary with ``resolved``, ``total``, ``rate``, ``cost``,
        ``model``, ``provider``, ``avg_tokens``, and ``avg_runtime``.
    """
    summary = results_data.get("summary", {}).get("mcp", {})
    tasks = results_data.get("tasks", [])
    metadata = results_data.get("metadata", {})
    config = metadata.get("config", {})

    resolved = summary.get("resolved", 0)
    total = summary.get("total", 0)
    rate = summary.get("rate", 0.0)
    cost = summary.get("total_cost", 0.0)

    total_tokens = 0
    total_runtime = 0.0
    task_count = len(tasks)

    for task in tasks:
        mcp = task.get("mcp", {})
        tokens = mcp.get("tokens", {})
        total_tokens += tokens.get("input", 0) + tokens.get("output", 0)
        total_runtime += mcp.get("runtime_seconds", 0.0)

    avg_tokens = total_tokens // task_count if task_count > 0 else 0
    avg_runtime = total_runtime / task_count if task_count > 0 else 0.0

    return {
        "resolved": resolved,
        "total": total,
        "rate": rate,
        "cost": cost,
        "model": config.get("model", "unknown"),
        "provider": config.get("provider", "unknown"),
        "avg_tokens": avg_tokens,
        "avg_runtime": round(avg_runtime, 2),
    }


class ABTest:
    """A/B testing framework for comparing two MCP server configurations.

    Creates a structured comparison between a control group (A) and treatment
    group (B), running chi-squared significance testing on resolution rates
    and comparing cost metrics.

    Example::

        test = ABTest("Model Comparison")
        test.add_control(results_baseline)
        test.add_treatment(results_candidate)
        analysis = test.analyze()
        print(test.format_report())
    """

    def __init__(
        self,
        name: str,
        control_label: str = "A",
        treatment_label: str = "B",
    ) -> None:
        """Initialize the A/B test.

        Args:
            name: Human-readable name for this test.
            control_label: Label for the control group (default ``"A"``).
            treatment_label: Label for the treatment group (default ``"B"``).
        """
        self.name = name
        self.control_label = control_label
        self.treatment_label = treatment_label
        self._control: dict[str, Any] | None = None
        self._treatment: dict[str, Any] | None = None
        self._analysis: dict[str, Any] | None = None

    def add_control(self, results_data: dict[str, Any]) -> None:
        """Add the control group results.

        Args:
            results_data: Evaluation results dictionary for the control
                configuration.
        """
        self._control = results_data
        self._analysis = None

    def add_treatment(self, results_data: dict[str, Any]) -> None:
        """Add the treatment group results.

        Args:
            results_data: Evaluation results dictionary for the treatment
                configuration.
        """
        self._treatment = results_data
        self._analysis = None

    def analyze(self) -> dict[str, Any]:
        """Run the A/B test analysis.

        Compares resolution rates using a chi-squared test, and reports
        differences in cost and other metrics.

        Returns:
            Dictionary containing:
            - ``test_name``: The test name.
            - ``control``: Metrics for the control group.
            - ``treatment``: Metrics for the treatment group.
            - ``rate_difference``: Absolute difference in resolution rates.
            - ``rate_relative_change``: Percentage change in resolution rate.
            - ``cost_difference``: Difference in total cost.
            - ``statistical_significance``: Chi-squared test results.
            - ``winner``: ``"control"``, ``"treatment"``, or
              ``"no_significant_difference"``.
            - ``recommendation``: Human-readable recommendation.

        Raises:
            ValueError: If control or treatment data has not been added.
        """
        if self._control is None:
            raise ValueError("Control group results not set. Call add_control() first.")
        if self._treatment is None:
            raise ValueError("Treatment group results not set. Call add_treatment() first.")

        ctrl = _extract_metrics(self._control)
        treat = _extract_metrics(self._treatment)

        rate_diff = treat["rate"] - ctrl["rate"]
        rate_relative = (rate_diff / ctrl["rate"] * 100.0) if ctrl["rate"] > 0 else 0.0
        cost_diff = treat["cost"] - ctrl["cost"]

        significance = _chi_squared_test(
            ctrl["resolved"],
            ctrl["total"],
            treat["resolved"],
            treat["total"],
        )

        # Determine winner
        if significance["significant"]:
            if treat["rate"] > ctrl["rate"]:
                winner = "treatment"
            elif treat["rate"] < ctrl["rate"]:
                winner = "control"
            else:
                winner = "no_significant_difference"
        else:
            winner = "no_significant_difference"

        # Build recommendation
        if winner == "treatment":
            recommendation = (
                f"Treatment ({self.treatment_label}) shows a statistically significant "
                f"improvement of {rate_relative:+.1f}% in resolution rate. "
                f"Recommend adopting the treatment configuration."
            )
        elif winner == "control":
            recommendation = (
                f"Control ({self.control_label}) performs significantly better. "
                f"Treatment ({self.treatment_label}) shows a {rate_relative:+.1f}% change "
                f"in resolution rate. Recommend keeping the control configuration."
            )
        else:
            recommendation = (
                f"No statistically significant difference detected between "
                f"{self.control_label} and {self.treatment_label} "
                f"(p={significance['p_value']:.4f}). Consider increasing sample size "
                f"or testing with a larger effect."
            )

        self._analysis = {
            "test_name": self.name,
            "control": {
                "label": self.control_label,
                "resolved": ctrl["resolved"],
                "total": ctrl["total"],
                "rate": ctrl["rate"],
                "cost": ctrl["cost"],
            },
            "treatment": {
                "label": self.treatment_label,
                "resolved": treat["resolved"],
                "total": treat["total"],
                "rate": treat["rate"],
                "cost": treat["cost"],
            },
            "rate_difference": round(rate_diff, 6),
            "rate_relative_change": round(rate_relative, 2),
            "cost_difference": round(cost_diff, 4),
            "statistical_significance": significance,
            "winner": winner,
            "recommendation": recommendation,
        }

        return self._analysis

    def format_report(self) -> str:
        """Format the analysis results as a human-readable report.

        Calls :meth:`analyze` automatically if it has not been called yet.

        Returns:
            Multi-line string containing the formatted A/B test report.

        Raises:
            ValueError: If control or treatment data has not been added.
        """
        if self._analysis is None:
            self.analyze()

        assert self._analysis is not None  # for type checker
        a = self._analysis

        ctrl = a["control"]
        treat = a["treatment"]
        sig = a["statistical_significance"]

        lines = [
            f"{'=' * 60}",
            f"A/B Test Report: {a['test_name']}",
            f"{'=' * 60}",
            "",
            f"Control ({ctrl['label']}):",
            f"  Resolution Rate: {ctrl['rate']:.1%} ({ctrl['resolved']}/{ctrl['total']})",
            f"  Total Cost: ${ctrl['cost']:.4f}",
            "",
            f"Treatment ({treat['label']}):",
            f"  Resolution Rate: {treat['rate']:.1%} ({treat['resolved']}/{treat['total']})",
            f"  Total Cost: ${treat['cost']:.4f}",
            "",
            "Comparison:",
            f"  Rate Difference: {a['rate_difference']:+.4f} "
            f"({a['rate_relative_change']:+.1f}%)",
            f"  Cost Difference: ${a['cost_difference']:+.4f}",
            "",
            "Statistical Significance:",
            f"  Chi-squared: {sig['chi_squared']:.4f}",
            f"  p-value: {sig['p_value']:.6f}",
            f"  Significant: {'Yes' if sig['significant'] else 'No'} (alpha=0.05)",
            "",
            f"Winner: {a['winner']}",
            "",
            f"Recommendation: {a['recommendation']}",
            f"{'=' * 60}",
        ]

        return "\n".join(lines)


def run_ab_test(
    results_a: dict[str, Any],
    results_b: dict[str, Any],
    test_name: str = "A/B Test",
) -> dict[str, Any]:
    """Convenience function to run a quick A/B test comparison.

    Creates an :class:`ABTest` instance, adds the control and treatment
    data, and returns the analysis results.

    Args:
        results_a: Evaluation results for the control (A) group.
        results_b: Evaluation results for the treatment (B) group.
        test_name: Name for the test (default ``"A/B Test"``).

    Returns:
        Analysis dictionary from :meth:`ABTest.analyze`.
    """
    test = ABTest(test_name)
    test.add_control(results_a)
    test.add_treatment(results_b)
    return test.analyze()
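A minimal usage sketch of the new ABTest class, following the Example in its docstring. The control/treatment dicts below are hand-written stand-ins shaped like the ``summary``/``tasks``/``metadata`` structure that ``_extract_metrics`` reads, not real evaluation output:

from mcpbr.analytics import ABTest

# Illustrative results; real dicts come from mcpbr evaluation runs.
control = {
    "summary": {"mcp": {"resolved": 18, "total": 30, "rate": 0.60, "total_cost": 4.20}},
    "tasks": [],
    "metadata": {"config": {"model": "model-a", "provider": "provider-x"}},
}
treatment = {
    "summary": {"mcp": {"resolved": 24, "total": 30, "rate": 0.80, "total_cost": 5.10}},
    "tasks": [],
    "metadata": {"config": {"model": "model-b", "provider": "provider-x"}},
}

test = ABTest("Model Comparison")
test.add_control(control)
test.add_treatment(treatment)
analysis = test.analyze()     # chi-squared test on 18/30 vs. 24/30 with Yates' correction
print(test.format_report())   # report includes winner and recommendation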
mcpbr/analytics/anomaly.py ADDED
@@ -0,0 +1,213 @@
"""Anomaly detection for benchmark metrics.

Provides statistical methods (z-score, IQR, MAD) to identify outlier
values in benchmark cost, token, runtime, and iteration metrics.
"""

from __future__ import annotations

import statistics
from typing import Any


def detect_anomalies(
    values: list[float],
    method: str = "zscore",
    threshold: float = 2.0,
) -> list[dict[str, Any]]:
    """Detect anomalous values in a list of numeric measurements.

    Args:
        values: List of numeric values to analyse.
        method: Detection method. One of ``"zscore"`` (z-score exceeds
            threshold), ``"iqr"`` (IQR fence method), or ``"mad"``
            (median absolute deviation). Defaults to ``"zscore"``.
        threshold: Sensitivity threshold whose meaning depends on the
            method. For ``"zscore"`` and ``"mad"`` it is the number of
            standard deviations / MADs; for ``"iqr"`` it is the fence
            multiplier (commonly 1.5). Defaults to 2.0.

    Returns:
        List of anomaly dicts, each containing:
        - index: Position in the input list.
        - value: The anomalous value.
        - score: Computed score (z-score, IQR distance, or MAD score).
        - method: The detection method used.

    Raises:
        ValueError: If *method* is not one of the supported methods.
    """
    dispatch = {
        "zscore": _zscore_detect,
        "iqr": _iqr_detect,
        "mad": _mad_detect,
    }
    if method not in dispatch:
        raise ValueError(f"Unknown method '{method}'. Supported: {', '.join(sorted(dispatch))}.")
    return dispatch[method](values, threshold)


def detect_metric_anomalies(
    results_data: dict[str, Any],
) -> dict[str, list[dict[str, Any]]]:
    """Detect anomalies across standard benchmark metrics.

    Extracts cost, total tokens, runtime, and iteration counts from the
    results data and runs z-score anomaly detection on each metric.

    Args:
        results_data: Benchmark results dict with a ``tasks`` key
            containing a list of task dicts. Each task should have an
            ``mcp`` dict with ``cost``, ``tokens`` (with ``input`` and
            ``output``), ``runtime_seconds``, and ``iterations``.

    Returns:
        Dict mapping metric name (``"cost"``, ``"tokens"``,
        ``"runtime"``, ``"iterations"``) to a list of anomaly dicts
        returned by :func:`detect_anomalies`.
    """
    tasks = results_data.get("tasks", [])
    if not tasks:
        return {
            "cost": [],
            "tokens": [],
            "runtime": [],
            "iterations": [],
        }

    costs: list[float] = []
    tokens: list[float] = []
    runtimes: list[float] = []
    iterations: list[float] = []

    for task in tasks:
        mcp = task.get("mcp", {})
        costs.append(float(mcp.get("cost", 0.0)))
        token_info = mcp.get("tokens", {})
        total_tokens = float(token_info.get("input", 0)) + float(token_info.get("output", 0))
        tokens.append(total_tokens)
        runtimes.append(float(mcp.get("runtime_seconds", 0.0)))
        iterations.append(float(mcp.get("iterations", 0)))

    return {
        "cost": detect_anomalies(costs),
        "tokens": detect_anomalies(tokens),
        "runtime": detect_anomalies(runtimes),
        "iterations": detect_anomalies(iterations),
    }


# ------------------------------------------------------------------
# Detection method implementations
# ------------------------------------------------------------------


def _zscore_detect(
    values: list[float],
    threshold: float,
) -> list[dict[str, Any]]:
    """Detect anomalies using z-score.

    A value is anomalous if its absolute z-score exceeds *threshold*.

    Args:
        values: Numeric values to analyse.
        threshold: Z-score cutoff.

    Returns:
        List of anomaly dicts with index, value, score, and method.
    """
    if len(values) < 2:
        return []

    mean = statistics.mean(values)
    stdev = statistics.pstdev(values)
    if stdev == 0:
        return []

    anomalies: list[dict[str, Any]] = []
    for i, v in enumerate(values):
        z = abs(v - mean) / stdev
        if z > threshold:
            anomalies.append({"index": i, "value": v, "score": z, "method": "zscore"})
    return anomalies


def _iqr_detect(
    values: list[float],
    threshold: float,
) -> list[dict[str, Any]]:
    """Detect anomalies using the interquartile range (IQR) method.

    Values outside ``[Q1 - threshold*IQR, Q3 + threshold*IQR]`` are
    flagged. The score represents how many IQRs the value lies beyond
    the nearest fence.

    Args:
        values: Numeric values to analyse.
        threshold: IQR multiplier for the fence (commonly 1.5).

    Returns:
        List of anomaly dicts with index, value, score, and method.
    """
    if len(values) < 4:
        return []

    sorted_vals = sorted(values)
    n = len(sorted_vals)
    q1 = statistics.median(sorted_vals[: n // 2])
    q3 = statistics.median(sorted_vals[(n + 1) // 2 :])
    iqr = q3 - q1
    if iqr == 0:
        return []

    lower_fence = q1 - threshold * iqr
    upper_fence = q3 + threshold * iqr

    anomalies: list[dict[str, Any]] = []
    for i, v in enumerate(values):
        if v < lower_fence:
            score = (lower_fence - v) / iqr
            anomalies.append({"index": i, "value": v, "score": score, "method": "iqr"})
        elif v > upper_fence:
            score = (v - upper_fence) / iqr
            anomalies.append({"index": i, "value": v, "score": score, "method": "iqr"})
    return anomalies


def _mad_detect(
    values: list[float],
    threshold: float,
) -> list[dict[str, Any]]:
    """Detect anomalies using median absolute deviation (MAD).

    The modified z-score is computed as ``0.6745 * (x - median) / MAD``.
    Values whose absolute modified z-score exceeds *threshold* are
    flagged.

    Args:
        values: Numeric values to analyse.
        threshold: Modified z-score cutoff.

    Returns:
        List of anomaly dicts with index, value, score, and method.
    """
    if len(values) < 2:
        return []

    med = statistics.median(values)
    abs_devs = [abs(v - med) for v in values]
    mad = statistics.median(abs_devs)
    if mad == 0:
        return []

    # 0.6745 is the 0.75th quantile of the standard normal distribution,
    # used to make MAD consistent with standard deviation for normal data.
    consistency_constant = 0.6745

    anomalies: list[dict[str, Any]] = []
    for i, v in enumerate(values):
        modified_z = consistency_constant * abs(v - med) / mad
        if modified_z > threshold:
            anomalies.append({"index": i, "value": v, "score": modified_z, "method": "mad"})
    return anomalies